Diffstat (limited to 'src/gallium')
772 files changed, 199283 insertions, 0 deletions
diff --git a/src/gallium/Makefile b/src/gallium/Makefile
new file mode 100644
index 0000000000..36bd3623e7
--- /dev/null
+++ b/src/gallium/Makefile
@@ -0,0 +1,26 @@
+TOP = ../..
+include $(TOP)/configs/current
+
+
+SUBDIRS = auxiliary drivers
+# Note winsys/ needs to be built after src/mesa
+
+
+default: subdirs
+
+
+subdirs:
+	@for dir in $(SUBDIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE)) || exit 1 ; \
+		fi \
+	done
+
+
+clean:
+	rm -f `find . -name \*.[oa]`
+	rm -f `find . -name depend`
+
+
+# Dummy install target
+install:
diff --git a/src/gallium/Makefile.template b/src/gallium/Makefile.template
new file mode 100644
index 0000000000..4e462b5c97
--- /dev/null
+++ b/src/gallium/Makefile.template
@@ -0,0 +1,64 @@
+# -*-makefile-*-
+
+
+# We still have a dependency on the "dri" buffer manager. Most likely
+# the interface can be reused in non-dri environments, and also as a
+# frontend to simpler memory managers.
+#
+COMMON_SOURCES =
+
+OBJECTS = $(C_SOURCES:.c=.o) \
+	$(CPP_SOURCES:.cpp=.o) \
+	$(ASM_SOURCES:.S=.o)
+
+
+### Include directories
+INCLUDES = \
+	-I. \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/gallium/auxiliary \
+	-I$(TOP)/src/gallium/drivers \
+	-I$(TOP)/include \
+	$(DRIVER_INCLUDES)
+
+
+##### RULES #####
+
+.c.o:
+	$(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES) $< -o $@
+
+.cpp.o:
+	$(CXX) -c $(INCLUDES) $(CXXFLAGS) $(DRIVER_DEFINES) $< -o $@
+
+.S.o:
+	$(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES) $< -o $@
+
+
+##### TARGETS #####
+
+default: depend symlinks $(LIBNAME)
+
+
+$(LIBNAME): $(OBJECTS) Makefile $(TOP)/src/gallium/Makefile.template
+	$(TOP)/bin/mklib -o $@ -static $(OBJECTS) $(DRIVER_LIBS)
+
+
+depend: $(C_SOURCES) $(CPP_SOURCES) $(ASM_SOURCES) $(SYMLINKS)
+	rm -f depend
+	touch depend
+	$(MKDEP) $(MKDEP_OPTIONS) $(DRIVER_DEFINES) $(INCLUDES) $(C_SOURCES) $(CPP_SOURCES) \
+		$(ASM_SOURCES) 2> /dev/null
+
+
+# Emacs tags
+tags:
+	etags `find . -name \*.[ch]` `find ../include`
+
+
+# Remove .o and backup files
+clean::
+	-rm -f *.o */*.o *~ *.so *~ server/*.o $(SYMLINKS)
+	-rm -f depend depend.bak
+
+
+include depend
diff --git a/src/gallium/README.portability b/src/gallium/README.portability
new file mode 100644
index 0000000000..adecf4bb79
--- /dev/null
+++ b/src/gallium/README.portability
@@ -0,0 +1,109 @@
+      CROSS-PLATFORM PORTABILITY GUIDELINES FOR GALLIUM3D
+
+
+= General Considerations =
+
+The state tracker and winsys driver support a rather limited number of
+platforms. However, the pipe drivers are meant to run on a wide range of
+platforms. Hence the pipe drivers, the auxiliary modules, and all public
+headers in general should strictly follow these guidelines to ensure
+portability.
+
+
+= Compiler Support =
+
+* Include p_compiler.h.
+
+* Don't use the 'inline' keyword; use the INLINE macro from p_compiler.h instead.
+
+* Cast explicitly when converting to integer types of smaller sizes.
+
+* Cast explicitly when converting between float, double and integral types.
+
+* Don't use named struct initializers.
+
+* Don't use a variable number of macro arguments. Use static INLINE functions
+instead.
+
+* Don't use C99 features.
+
+= Standard Library =
+
+* Avoid including standard library headers. Most standard library functions are
+not available in Windows Kernel Mode. Use the appropriate p_*.h include.
+
+== Memory Allocation ==
+
+* Use MALLOC, CALLOC, FREE instead of the malloc, calloc, free functions.
+
+* Use the align_pointer() function defined in u_memory.h for aligning pointers
+  in a portable way (see the short sketch below).
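A minimal illustrative sketch of the allocation guidelines above (an editorial example, assuming CALLOC mirrors calloc's (count, size) argument order and that align_pointer() takes a pointer and an alignment, as the bullets above suggest):

#include "util/u_memory.h"

static void allocation_example(void)
{
   /* Portable replacements for malloc/calloc/free: */
   int   *scratch = MALLOC(64 * sizeof(int));
   float *zeroed  = CALLOC(16, sizeof(float));   /* assumed (count, size) order */

   /* Over-allocate, then align the pointer in a portable way: */
   char *raw     = MALLOC(256 + 16);
   void *aligned = align_pointer(raw, 16);       /* assumed (pointer, alignment) */

   (void) aligned;   /* use the aligned pointer, but free the original block */

   FREE(raw);
   FREE(zeroed);
   FREE(scratch);
}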
+
+== Debugging ==
+
+* Use the functions/macros in p_debug.h.
+
+* Don't include assert.h, call abort, printf, etc.
+
+
+= Code Style =
+
+== Inheritance in C ==
+
+The main thing we do is mimic inheritance by structure containment.
+
+Here's a silly made-up example:
+
+/* base class */
+struct buffer
+{
+   int size;
+   void (*validate)(struct buffer *buf);
+};
+
+/* sub-class of buffer */
+struct texture_buffer
+{
+   struct buffer base;   /* the base class, MUST COME FIRST! */
+   int format;
+   int width, height;
+};
+
+
+Then, we'll typically have cast-wrapper functions to convert base-class
+pointers to sub-class pointers where needed:
+
+static inline struct texture_buffer *texture_buffer(struct buffer *buf)
+{
+   return (struct texture_buffer *) buf;
+}
+
+
+To create/init a sub-classed object:
+
+struct buffer *create_texture_buffer(int w, int h, int format)
+{
+   struct texture_buffer *t = malloc(sizeof(*t));
+   t->format = format;
+   t->width = w;
+   t->height = h;
+   t->base.size = w * h;
+   t->base.validate = tex_validate;
+   return &t->base;
+}
+
+Example sub-class method:
+
+void tex_validate(struct buffer *buf)
+{
+   struct texture_buffer *tb = texture_buffer(buf);
+   assert(tb->format);
+   assert(tb->width);
+   assert(tb->height);
+}
+
+
+Note that we typically do not use typedefs to make "class names"; we use
+'struct whatever' everywhere.
+
+Gallium's pipe_context and the subclassed psb_context, etc. are prime examples
+of this. There are also many examples in Mesa and the Mesa state tracker.
diff --git a/src/gallium/SConscript b/src/gallium/SConscript
new file mode 100644
index 0000000000..6a3e7e77ed
--- /dev/null
+++ b/src/gallium/SConscript
@@ -0,0 +1,29 @@
+import os
+
+Import('*')
+
+env = env.Clone()
+
+auxiliaries = []
+
+Export('auxiliaries')
+
+
+if llvm:
+    SConscript(['auxiliary/gallivm/SConscript'])
+
+SConscript([
+    # NOTE: order matters!
+    'auxiliary/util/SConscript',
+    'auxiliary/rtasm/SConscript',
+    'auxiliary/tgsi/SConscript',
+    'auxiliary/cso_cache/SConscript',
+    'auxiliary/translate/SConscript',
+    'auxiliary/draw/SConscript',
+    'auxiliary/pipebuffer/SConscript',
+])
+
+for driver in env['drivers']:
+    SConscript(os.path.join('drivers', driver, 'SConscript'))
+
+SConscript('state_trackers/python/SConscript')
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
new file mode 100644
index 0000000000..eaa0f2fe4e
--- /dev/null
+++ b/src/gallium/auxiliary/Makefile
@@ -0,0 +1,20 @@
+TOP = ../../..
+include $(TOP)/configs/current
+
+
+SUBDIRS = $(GALLIUM_AUXILIARY_DIRS)
+
+
+default: subdirs
+
+
+subdirs:
+	@for dir in $(SUBDIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE)) || exit 1 ; \
+		fi \
+	done
+
+
+clean:
+	rm -f `find . -name \*.[oa]`
diff --git a/src/gallium/auxiliary/cso_cache/Makefile b/src/gallium/auxiliary/cso_cache/Makefile
new file mode 100644
index 0000000000..6bd6602088
--- /dev/null
+++ b/src/gallium/auxiliary/cso_cache/Makefile
@@ -0,0 +1,14 @@
+TOP = ../../../..
+include $(TOP)/configs/current + +LIBNAME = cso_cache + +C_SOURCES = \ + cso_context.c \ + cso_cache.c \ + cso_hash.c + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/auxiliary/cso_cache/SConscript b/src/gallium/auxiliary/cso_cache/SConscript new file mode 100644 index 0000000000..651e68a191 --- /dev/null +++ b/src/gallium/auxiliary/cso_cache/SConscript @@ -0,0 +1,11 @@ +Import('*') + +cso_cache = env.ConvenienceLibrary( + target = 'cso_cache', + source = [ + 'cso_context.c', + 'cso_cache.c', + 'cso_hash.c', + ]) + +auxiliaries.insert(0, cso_cache) diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.c b/src/gallium/auxiliary/cso_cache/cso_cache.c new file mode 100644 index 0000000000..6b1754ea00 --- /dev/null +++ b/src/gallium/auxiliary/cso_cache/cso_cache.c @@ -0,0 +1,406 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: Zack Rusin <zack@tungstengraphics.com> + */ + +#include "pipe/p_debug.h" + +#include "util/u_memory.h" + +#include "cso_cache.h" +#include "cso_hash.h" + + +struct cso_cache { + struct cso_hash *blend_hash; + struct cso_hash *depth_stencil_hash; + struct cso_hash *fs_hash; + struct cso_hash *vs_hash; + struct cso_hash *rasterizer_hash; + struct cso_hash *sampler_hash; + int max_size; + + cso_sanitize_callback sanitize_cb; + void *sanitize_data; +}; + +#if 1 +static unsigned hash_key(const void *key, unsigned key_size) +{ + unsigned *ikey = (unsigned *)key; + unsigned hash = 0, i; + + assert(key_size % 4 == 0); + + /* I'm sure this can be improved on: + */ + for (i = 0; i < key_size/4; i++) + hash ^= ikey[i]; + + return hash; +} +#else +static unsigned hash_key(const unsigned char *p, int n) +{ + unsigned h = 0; + unsigned g; + + while (n--) { + h = (h << 4) + *p++; + if ((g = (h & 0xf0000000)) != 0) + h ^= g >> 23; + h &= ~g; + } + return h; +} +#endif + +unsigned cso_construct_key(void *item, int item_size) +{ + return hash_key((item), item_size); +} + +static struct cso_hash *_cso_hash_for_type(struct cso_cache *sc, enum cso_cache_type type) +{ + struct cso_hash *hash = 0; + + switch(type) { + case CSO_BLEND: + hash = sc->blend_hash; + break; + case CSO_SAMPLER: + hash = sc->sampler_hash; + break; + case CSO_DEPTH_STENCIL_ALPHA: + hash = sc->depth_stencil_hash; + break; + case CSO_RASTERIZER: + hash = sc->rasterizer_hash; + break; + case CSO_FRAGMENT_SHADER: + hash = sc->fs_hash; + break; + case CSO_VERTEX_SHADER: + hash = sc->vs_hash; + break; + } + + return hash; +} + +static int _cso_size_for_type(enum cso_cache_type type) +{ + switch(type) { + case CSO_BLEND: + return sizeof(struct pipe_blend_state); + case CSO_SAMPLER: + return sizeof(struct pipe_sampler_state); + case CSO_DEPTH_STENCIL_ALPHA: + return sizeof(struct pipe_depth_stencil_alpha_state); + case CSO_RASTERIZER: + return sizeof(struct pipe_rasterizer_state); + case CSO_FRAGMENT_SHADER: + return sizeof(struct pipe_shader_state); + case CSO_VERTEX_SHADER: + return sizeof(struct pipe_shader_state); + } + return 0; +} + + +static void delete_blend_state(void *state, void *data) +{ + struct cso_blend *cso = (struct cso_blend *)state; + if (cso->delete_state) + cso->delete_state(cso->context, cso->data); + FREE(state); +} + +static void delete_depth_stencil_state(void *state, void *data) +{ + struct cso_depth_stencil_alpha *cso = (struct cso_depth_stencil_alpha *)state; + if (cso->delete_state) + cso->delete_state(cso->context, cso->data); + FREE(state); +} + +static void delete_sampler_state(void *state, void *data) +{ + struct cso_sampler *cso = (struct cso_sampler *)state; + if (cso->delete_state) + cso->delete_state(cso->context, cso->data); + FREE(state); +} + +static void delete_rasterizer_state(void *state, void *data) +{ + struct cso_rasterizer *cso = (struct cso_rasterizer *)state; + if (cso->delete_state) + cso->delete_state(cso->context, cso->data); + FREE(state); +} + +static void delete_fs_state(void *state, void *data) +{ + struct cso_fragment_shader *cso = (struct cso_fragment_shader *)state; + if (cso->delete_state) + cso->delete_state(cso->context, cso->data); + FREE(state); +} + +static void delete_vs_state(void *state, void *data) +{ + struct cso_vertex_shader *cso = (struct cso_vertex_shader *)state; + if (cso->delete_state) + cso->delete_state(cso->context, cso->data); + FREE(state); +} + + +static INLINE void 
delete_cso(void *state, enum cso_cache_type type) +{ + switch (type) { + case CSO_BLEND: + delete_blend_state(state, 0); + break; + case CSO_SAMPLER: + delete_sampler_state(state, 0); + break; + case CSO_DEPTH_STENCIL_ALPHA: + delete_depth_stencil_state(state, 0); + break; + case CSO_RASTERIZER: + delete_rasterizer_state(state, 0); + break; + case CSO_FRAGMENT_SHADER: + delete_fs_state(state, 0); + break; + case CSO_VERTEX_SHADER: + delete_vs_state(state, 0); + break; + default: + assert(0); + FREE(state); + } +} + + +static INLINE void sanitize_hash(struct cso_cache *sc, + struct cso_hash *hash, + enum cso_cache_type type, + int max_size) +{ + if (sc->sanitize_cb) + sc->sanitize_cb(hash, type, max_size, sc->sanitize_data); +} + + +static INLINE void sanitize_cb(struct cso_hash *hash, enum cso_cache_type type, + int max_size, void *user_data) +{ + /* if we're approach the maximum size, remove fourth of the entries + * otherwise every subsequent call will go through the same */ + int hash_size = cso_hash_size(hash); + int max_entries = (max_size > hash_size) ? max_size : hash_size; + int to_remove = (max_size < max_entries) * max_entries/4; + if (hash_size > max_size) + to_remove += hash_size - max_size; + while (to_remove) { + /*remove elements until we're good */ + /*fixme: currently we pick the nodes to remove at random*/ + struct cso_hash_iter iter = cso_hash_first_node(hash); + void *cso = cso_hash_take(hash, cso_hash_iter_key(iter)); + delete_cso(cso, type); + --to_remove; + } +} + +struct cso_hash_iter +cso_insert_state(struct cso_cache *sc, + unsigned hash_key, enum cso_cache_type type, + void *state) +{ + struct cso_hash *hash = _cso_hash_for_type(sc, type); + sanitize_hash(sc, hash, type, sc->max_size); + + return cso_hash_insert(hash, hash_key, state); +} + +struct cso_hash_iter +cso_find_state(struct cso_cache *sc, + unsigned hash_key, enum cso_cache_type type) +{ + struct cso_hash *hash = _cso_hash_for_type(sc, type); + + return cso_hash_find(hash, hash_key); +} + + +void *cso_hash_find_data_from_template( struct cso_hash *hash, + unsigned hash_key, + void *templ, + int size ) +{ + struct cso_hash_iter iter = cso_hash_find(hash, hash_key); + while (!cso_hash_iter_is_null(iter)) { + void *iter_data = cso_hash_iter_data(iter); + if (!memcmp(iter_data, templ, size)) { + /* We found a match + */ + return iter_data; + } + iter = cso_hash_iter_next(iter); + } + return NULL; +} + + +struct cso_hash_iter cso_find_state_template(struct cso_cache *sc, + unsigned hash_key, enum cso_cache_type type, + void *templ) +{ + struct cso_hash_iter iter = cso_find_state(sc, hash_key, type); + int size = _cso_size_for_type(type); + while (!cso_hash_iter_is_null(iter)) { + void *iter_data = cso_hash_iter_data(iter); + if (!memcmp(iter_data, templ, size)) + return iter; + iter = cso_hash_iter_next(iter); + } + return iter; +} + +void * cso_take_state(struct cso_cache *sc, + unsigned hash_key, enum cso_cache_type type) +{ + struct cso_hash *hash = _cso_hash_for_type(sc, type); + return cso_hash_take(hash, hash_key); +} + +struct cso_cache *cso_cache_create(void) +{ + struct cso_cache *sc = MALLOC_STRUCT(cso_cache); + if (sc == NULL) + return NULL; + + sc->max_size = 4096; + sc->blend_hash = cso_hash_create(); + sc->sampler_hash = cso_hash_create(); + sc->depth_stencil_hash = cso_hash_create(); + sc->rasterizer_hash = cso_hash_create(); + sc->fs_hash = cso_hash_create(); + sc->vs_hash = cso_hash_create(); + sc->sanitize_cb = sanitize_cb; + sc->sanitize_data = 0; + + return sc; +} + +void 
cso_for_each_state(struct cso_cache *sc, enum cso_cache_type type, + cso_state_callback func, void *user_data) +{ + struct cso_hash *hash = 0; + struct cso_hash_iter iter; + + switch (type) { + case CSO_BLEND: + hash = sc->blend_hash; + break; + case CSO_SAMPLER: + hash = sc->sampler_hash; + break; + case CSO_DEPTH_STENCIL_ALPHA: + hash = sc->depth_stencil_hash; + break; + case CSO_RASTERIZER: + hash = sc->rasterizer_hash; + break; + case CSO_FRAGMENT_SHADER: + hash = sc->fs_hash; + break; + case CSO_VERTEX_SHADER: + hash = sc->vs_hash; + break; + } + + iter = cso_hash_first_node(hash); + while (!cso_hash_iter_is_null(iter)) { + void *state = cso_hash_iter_data(iter); + iter = cso_hash_iter_next(iter); + if (state) { + func(state, user_data); + } + } +} + +void cso_cache_delete(struct cso_cache *sc) +{ + assert(sc); + /* delete driver data */ + cso_for_each_state(sc, CSO_BLEND, delete_blend_state, 0); + cso_for_each_state(sc, CSO_DEPTH_STENCIL_ALPHA, delete_depth_stencil_state, 0); + cso_for_each_state(sc, CSO_FRAGMENT_SHADER, delete_fs_state, 0); + cso_for_each_state(sc, CSO_VERTEX_SHADER, delete_vs_state, 0); + cso_for_each_state(sc, CSO_RASTERIZER, delete_rasterizer_state, 0); + cso_for_each_state(sc, CSO_SAMPLER, delete_sampler_state, 0); + + cso_hash_delete(sc->blend_hash); + cso_hash_delete(sc->sampler_hash); + cso_hash_delete(sc->depth_stencil_hash); + cso_hash_delete(sc->rasterizer_hash); + cso_hash_delete(sc->fs_hash); + cso_hash_delete(sc->vs_hash); + FREE(sc); +} + +void cso_set_maximum_cache_size(struct cso_cache *sc, int number) +{ + sc->max_size = number; + + sanitize_hash(sc, sc->blend_hash, CSO_BLEND, sc->max_size); + sanitize_hash(sc, sc->depth_stencil_hash, CSO_DEPTH_STENCIL_ALPHA, + sc->max_size); + sanitize_hash(sc, sc->fs_hash, CSO_FRAGMENT_SHADER, sc->max_size); + sanitize_hash(sc, sc->vs_hash, CSO_VERTEX_SHADER, sc->max_size); + sanitize_hash(sc, sc->rasterizer_hash, CSO_RASTERIZER, sc->max_size); + sanitize_hash(sc, sc->sampler_hash, CSO_SAMPLER, sc->max_size); +} + +int cso_maximum_cache_size(const struct cso_cache *sc) +{ + return sc->max_size; +} + +void cso_cache_set_sanitize_callback(struct cso_cache *sc, + cso_sanitize_callback cb, + void *user_data) +{ + sc->sanitize_cb = cb; + sc->sanitize_data = user_data; +} + diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.h b/src/gallium/auxiliary/cso_cache/cso_cache.h new file mode 100644 index 0000000000..6b5c230e8f --- /dev/null +++ b/src/gallium/auxiliary/cso_cache/cso_cache.h @@ -0,0 +1,176 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /** + * @file + * Constant State Object (CSO) cache. + * + * The basic idea is that the states are created via the + * create_state/bind_state/delete_state semantics. The driver is expected to + * perform as much of the Gallium state translation to whatever its internal + * representation is during the create call. Gallium then has a caching + * mechanism where it stores the created states. When the pipeline needs an + * actual state change, a bind call is issued. In the bind call the driver + * gets its already translated representation. + * + * Those semantics mean that the driver doesn't do the repeated translations + * of states on every frame, but only once, when a new state is actually + * created. + * + * Even on hardware that doesn't do any kind of state cache, it makes the + * driver look a lot neater, plus it avoids all the redundant state + * translations on every frame. + * + * Currently our constant state objects are: + * - alpha test + * - blend + * - depth stencil + * - fragment shader + * - rasterizer (old setup) + * - sampler + * - vertex shader + * + * Things that are not constant state objects include: + * - blend_color + * - clip_state + * - clear_color_state + * - constant_buffer + * - feedback_state + * - framebuffer_state + * - polygon_stipple + * - scissor_state + * - texture_state + * - viewport_state + * + * @author Zack Rusin <zack@tungstengraphics.com> + */ + +#ifndef CSO_CACHE_H +#define CSO_CACHE_H + +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +/* cso_hash.h is necessary for cso_hash_iter, as MSVC requires structures + * returned by value to be fully defined */ +#include "cso_hash.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +enum cso_cache_type { + CSO_BLEND, + CSO_SAMPLER, + CSO_DEPTH_STENCIL_ALPHA, + CSO_RASTERIZER, + CSO_FRAGMENT_SHADER, + CSO_VERTEX_SHADER +}; + +typedef void (*cso_state_callback)(void *ctx, void *obj); + +typedef void (*cso_sanitize_callback)(struct cso_hash *hash, + enum cso_cache_type type, + int max_size, + void *user_data); + +struct cso_cache; + +struct cso_blend { + struct pipe_blend_state state; + void *data; + cso_state_callback delete_state; + struct pipe_context *context; +}; + +struct cso_depth_stencil_alpha { + struct pipe_depth_stencil_alpha_state state; + void *data; + cso_state_callback delete_state; + struct pipe_context *context; +}; + +struct cso_rasterizer { + struct pipe_rasterizer_state state; + void *data; + cso_state_callback delete_state; + struct pipe_context *context; +}; + +struct cso_fragment_shader { + struct pipe_shader_state state; + void *data; + cso_state_callback delete_state; + struct pipe_context *context; +}; + +struct cso_vertex_shader { + struct pipe_shader_state state; + void *data; + cso_state_callback delete_state; + struct pipe_context *context; +}; + +struct cso_sampler { + struct pipe_sampler_state state; + void *data; + cso_state_callback delete_state; + struct pipe_context *context; +}; + +unsigned cso_construct_key(void *item, int item_size); + +struct cso_cache *cso_cache_create(void); +void cso_cache_delete(struct cso_cache *sc); + +void cso_cache_set_sanitize_callback(struct cso_cache *sc, + 
cso_sanitize_callback cb, + void *user_data); + +struct cso_hash_iter cso_insert_state(struct cso_cache *sc, + unsigned hash_key, enum cso_cache_type type, + void *state); +struct cso_hash_iter cso_find_state(struct cso_cache *sc, + unsigned hash_key, enum cso_cache_type type); +struct cso_hash_iter cso_find_state_template(struct cso_cache *sc, + unsigned hash_key, enum cso_cache_type type, + void *templ); +void cso_for_each_state(struct cso_cache *sc, enum cso_cache_type type, + cso_state_callback func, void *user_data); +void * cso_take_state(struct cso_cache *sc, unsigned hash_key, + enum cso_cache_type type); + +void cso_set_maximum_cache_size(struct cso_cache *sc, int number); +int cso_maximum_cache_size(const struct cso_cache *sc); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c new file mode 100644 index 0000000000..68508f24de --- /dev/null +++ b/src/gallium/auxiliary/cso_cache/cso_context.c @@ -0,0 +1,866 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /** + * @file + * + * Wrap the cso cache & hash mechanisms in a simplified + * pipe-driver-specific interface. + * + * @author Zack Rusin <zack@tungstengraphics.com> + * @author Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "pipe/p_state.h" +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "tgsi/tgsi_parse.h" + +#include "cso_cache/cso_context.h" +#include "cso_cache/cso_cache.h" +#include "cso_cache/cso_hash.h" + +struct cso_context { + struct pipe_context *pipe; + struct cso_cache *cache; + + struct { + void *samplers[PIPE_MAX_SAMPLERS]; + unsigned nr_samplers; + } hw; + + void *samplers[PIPE_MAX_SAMPLERS]; + unsigned nr_samplers; + + void *samplers_saved[PIPE_MAX_SAMPLERS]; + unsigned nr_samplers_saved; + + struct pipe_texture *textures[PIPE_MAX_SAMPLERS]; + uint nr_textures; + + struct pipe_texture *textures_saved[PIPE_MAX_SAMPLERS]; + uint nr_textures_saved; + + /** Current and saved state. + * The saved state is used as a 1-deep stack. 
+ */ + void *blend, *blend_saved; + void *depth_stencil, *depth_stencil_saved; + void *rasterizer, *rasterizer_saved; + void *fragment_shader, *fragment_shader_saved; + void *vertex_shader, *vertex_shader_saved; + + struct pipe_framebuffer_state fb, fb_saved; + struct pipe_viewport_state vp, vp_saved; + struct pipe_blend_color blend_color; +}; + + +static void +free_framebuffer_state(struct pipe_framebuffer_state *fb); + + +static boolean delete_blend_state(struct cso_context *ctx, void *state) +{ + struct cso_blend *cso = (struct cso_blend *)state; + + if (ctx->blend == cso->data) + return FALSE; + + if (cso->delete_state) + cso->delete_state(cso->context, cso->data); + FREE(state); + return TRUE; +} + +static boolean delete_depth_stencil_state(struct cso_context *ctx, void *state) +{ + struct cso_depth_stencil_alpha *cso = (struct cso_depth_stencil_alpha *)state; + + if (ctx->depth_stencil == cso->data) + return FALSE; + + if (cso->delete_state) + cso->delete_state(cso->context, cso->data); + FREE(state); + + return TRUE; +} + +static boolean delete_sampler_state(struct cso_context *ctx, void *state) +{ + struct cso_sampler *cso = (struct cso_sampler *)state; + if (cso->delete_state) + cso->delete_state(cso->context, cso->data); + FREE(state); + return TRUE; +} + +static boolean delete_rasterizer_state(struct cso_context *ctx, void *state) +{ + struct cso_rasterizer *cso = (struct cso_rasterizer *)state; + + if (ctx->rasterizer == cso->data) + return FALSE; + if (cso->delete_state) + cso->delete_state(cso->context, cso->data); + FREE(state); + return TRUE; +} + +static boolean delete_fs_state(struct cso_context *ctx, void *state) +{ + struct cso_fragment_shader *cso = (struct cso_fragment_shader *)state; + if (ctx->fragment_shader == cso->data) + return FALSE; + if (cso->delete_state) + cso->delete_state(cso->context, cso->data); + FREE(state); + return TRUE; +} + +static boolean delete_vs_state(struct cso_context *ctx, void *state) +{ + struct cso_vertex_shader *cso = (struct cso_vertex_shader *)state; + if (ctx->vertex_shader == cso->data) + return TRUE; + if (cso->delete_state) + cso->delete_state(cso->context, cso->data); + FREE(state); + return FALSE; +} + + +static INLINE boolean delete_cso(struct cso_context *ctx, + void *state, enum cso_cache_type type) +{ + switch (type) { + case CSO_BLEND: + return delete_blend_state(ctx, state); + break; + case CSO_SAMPLER: + return delete_sampler_state(ctx, state); + break; + case CSO_DEPTH_STENCIL_ALPHA: + return delete_depth_stencil_state(ctx, state); + break; + case CSO_RASTERIZER: + return delete_rasterizer_state(ctx, state); + break; + case CSO_FRAGMENT_SHADER: + return delete_fs_state(ctx, state); + break; + case CSO_VERTEX_SHADER: + return delete_vs_state(ctx, state); + break; + default: + assert(0); + FREE(state); + } + return FALSE; +} + +static INLINE void sanitize_hash(struct cso_hash *hash, enum cso_cache_type type, + int max_size, void *user_data) +{ + struct cso_context *ctx = (struct cso_context *)user_data; + /* if we're approach the maximum size, remove fourth of the entries + * otherwise every subsequent call will go through the same */ + int hash_size = cso_hash_size(hash); + int max_entries = (max_size > hash_size) ? 
max_size : hash_size; + int to_remove = (max_size < max_entries) * max_entries/4; + struct cso_hash_iter iter = cso_hash_first_node(hash); + if (hash_size > max_size) + to_remove += hash_size - max_size; + while (to_remove) { + /*remove elements until we're good */ + /*fixme: currently we pick the nodes to remove at random*/ + void *cso = cso_hash_iter_data(iter); + if (delete_cso(ctx, cso, type)) { + iter = cso_hash_erase(hash, iter); + --to_remove; + } else + iter = cso_hash_iter_next(iter); + } +} + + +struct cso_context *cso_create_context( struct pipe_context *pipe ) +{ + struct cso_context *ctx = CALLOC_STRUCT(cso_context); + if (ctx == NULL) + goto out; + + ctx->cache = cso_cache_create(); + if (ctx->cache == NULL) + goto out; + cso_cache_set_sanitize_callback(ctx->cache, + sanitize_hash, + ctx); + + ctx->pipe = pipe; + + /* Enable for testing: */ + if (0) cso_set_maximum_cache_size( ctx->cache, 4 ); + + return ctx; + +out: + cso_destroy_context( ctx ); + return NULL; +} + + +/** + * Prior to context destruction, this function unbinds all state objects. + */ +void cso_release_all( struct cso_context *ctx ) +{ + unsigned i; + + if (ctx->pipe) { + ctx->pipe->bind_blend_state( ctx->pipe, NULL ); + ctx->pipe->bind_rasterizer_state( ctx->pipe, NULL ); + ctx->pipe->bind_sampler_states( ctx->pipe, 0, NULL ); + ctx->pipe->bind_depth_stencil_alpha_state( ctx->pipe, NULL ); + ctx->pipe->bind_fs_state( ctx->pipe, NULL ); + ctx->pipe->bind_vs_state( ctx->pipe, NULL ); + } + + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { + pipe_texture_reference(&ctx->textures[i], NULL); + pipe_texture_reference(&ctx->textures_saved[i], NULL); + } + + free_framebuffer_state(&ctx->fb); + free_framebuffer_state(&ctx->fb_saved); + + if (ctx->cache) { + cso_cache_delete( ctx->cache ); + ctx->cache = NULL; + } +} + + +void cso_destroy_context( struct cso_context *ctx ) +{ + if (ctx) { + //cso_release_all( ctx ); + FREE( ctx ); + } +} + + +/* Those function will either find the state of the given template + * in the cache or they will create a new state from the given + * template, insert it in the cache and return it. + */ + +/* + * If the driver returns 0 from the create method then they will assign + * the data member of the cso to be the template itself. 
+ */ + +enum pipe_error cso_set_blend(struct cso_context *ctx, + const struct pipe_blend_state *templ) +{ + unsigned hash_key = cso_construct_key((void*)templ, sizeof(struct pipe_blend_state)); + struct cso_hash_iter iter = cso_find_state_template(ctx->cache, + hash_key, CSO_BLEND, + (void*)templ); + void *handle; + + if (cso_hash_iter_is_null(iter)) { + struct cso_blend *cso = MALLOC(sizeof(struct cso_blend)); + if (!cso) + return PIPE_ERROR_OUT_OF_MEMORY; + + memcpy(&cso->state, templ, sizeof(*templ)); + cso->data = ctx->pipe->create_blend_state(ctx->pipe, &cso->state); + cso->delete_state = (cso_state_callback)ctx->pipe->delete_blend_state; + cso->context = ctx->pipe; + + iter = cso_insert_state(ctx->cache, hash_key, CSO_BLEND, cso); + if (cso_hash_iter_is_null(iter)) { + FREE(cso); + return PIPE_ERROR_OUT_OF_MEMORY; + } + + handle = cso->data; + } + else { + handle = ((struct cso_blend *)cso_hash_iter_data(iter))->data; + } + + if (ctx->blend != handle) { + ctx->blend = handle; + ctx->pipe->bind_blend_state(ctx->pipe, handle); + } + return PIPE_OK; +} + +void cso_save_blend(struct cso_context *ctx) +{ + assert(!ctx->blend_saved); + ctx->blend_saved = ctx->blend; +} + +void cso_restore_blend(struct cso_context *ctx) +{ + if (ctx->blend != ctx->blend_saved) { + ctx->blend = ctx->blend_saved; + ctx->pipe->bind_blend_state(ctx->pipe, ctx->blend_saved); + } + ctx->blend_saved = NULL; +} + + + +enum pipe_error cso_single_sampler(struct cso_context *ctx, + unsigned idx, + const struct pipe_sampler_state *templ) +{ + void *handle = NULL; + + if (templ != NULL) { + unsigned hash_key = cso_construct_key((void*)templ, sizeof(struct pipe_sampler_state)); + struct cso_hash_iter iter = cso_find_state_template(ctx->cache, + hash_key, CSO_SAMPLER, + (void*)templ); + + if (cso_hash_iter_is_null(iter)) { + struct cso_sampler *cso = MALLOC(sizeof(struct cso_sampler)); + if (!cso) + return PIPE_ERROR_OUT_OF_MEMORY; + + memcpy(&cso->state, templ, sizeof(*templ)); + cso->data = ctx->pipe->create_sampler_state(ctx->pipe, &cso->state); + cso->delete_state = (cso_state_callback)ctx->pipe->delete_sampler_state; + cso->context = ctx->pipe; + + iter = cso_insert_state(ctx->cache, hash_key, CSO_SAMPLER, cso); + if (cso_hash_iter_is_null(iter)) { + FREE(cso); + return PIPE_ERROR_OUT_OF_MEMORY; + } + + handle = cso->data; + } + else { + handle = ((struct cso_sampler *)cso_hash_iter_data(iter))->data; + } + } + + ctx->samplers[idx] = handle; + return PIPE_OK; +} + +void cso_single_sampler_done( struct cso_context *ctx ) +{ + unsigned i; + + /* find highest non-null sampler */ + for (i = PIPE_MAX_SAMPLERS; i > 0; i--) { + if (ctx->samplers[i - 1] != NULL) + break; + } + + ctx->nr_samplers = i; + + if (ctx->hw.nr_samplers != ctx->nr_samplers || + memcmp(ctx->hw.samplers, + ctx->samplers, + ctx->nr_samplers * sizeof(void *)) != 0) + { + memcpy(ctx->hw.samplers, ctx->samplers, ctx->nr_samplers * sizeof(void *)); + ctx->hw.nr_samplers = ctx->nr_samplers; + + ctx->pipe->bind_sampler_states(ctx->pipe, ctx->nr_samplers, ctx->samplers); + } +} + +/* + * If the function encouters any errors it will return the + * last one. Done to always try to set as many samplers + * as possible. 
+ */ +enum pipe_error cso_set_samplers( struct cso_context *ctx, + unsigned nr, + const struct pipe_sampler_state **templates ) +{ + unsigned i; + enum pipe_error temp, error = PIPE_OK; + + /* TODO: fastpath + */ + + for (i = 0; i < nr; i++) { + temp = cso_single_sampler( ctx, i, templates[i] ); + if (temp != PIPE_OK) + error = temp; + } + + for ( ; i < ctx->nr_samplers; i++) { + temp = cso_single_sampler( ctx, i, NULL ); + if (temp != PIPE_OK) + error = temp; + } + + cso_single_sampler_done( ctx ); + + return error; +} + +void cso_save_samplers(struct cso_context *ctx) +{ + ctx->nr_samplers_saved = ctx->nr_samplers; + memcpy(ctx->samplers_saved, ctx->samplers, sizeof(ctx->samplers)); +} + +void cso_restore_samplers(struct cso_context *ctx) +{ + ctx->nr_samplers = ctx->nr_samplers_saved; + memcpy(ctx->samplers, ctx->samplers_saved, sizeof(ctx->samplers)); + cso_single_sampler_done( ctx ); +} + + +enum pipe_error cso_set_sampler_textures( struct cso_context *ctx, + uint count, + struct pipe_texture **textures ) +{ + uint i; + + ctx->nr_textures = count; + + for (i = 0; i < count; i++) + pipe_texture_reference(&ctx->textures[i], textures[i]); + for ( ; i < PIPE_MAX_SAMPLERS; i++) + pipe_texture_reference(&ctx->textures[i], NULL); + + ctx->pipe->set_sampler_textures(ctx->pipe, count, textures); + + return PIPE_OK; +} + +void cso_save_sampler_textures( struct cso_context *ctx ) +{ + uint i; + + ctx->nr_textures_saved = ctx->nr_textures; + for (i = 0; i < ctx->nr_textures; i++) { + assert(!ctx->textures_saved[i]); + pipe_texture_reference(&ctx->textures_saved[i], ctx->textures[i]); + } +} + +void cso_restore_sampler_textures( struct cso_context *ctx ) +{ + uint i; + + ctx->nr_textures = ctx->nr_textures_saved; + + for (i = 0; i < ctx->nr_textures; i++) { + pipe_texture_reference(&ctx->textures[i], NULL); + ctx->textures[i] = ctx->textures_saved[i]; + ctx->textures_saved[i] = NULL; + } + for ( ; i < PIPE_MAX_SAMPLERS; i++) + pipe_texture_reference(&ctx->textures[i], NULL); + + ctx->pipe->set_sampler_textures(ctx->pipe, ctx->nr_textures, ctx->textures); + + ctx->nr_textures_saved = 0; +} + + + +enum pipe_error cso_set_depth_stencil_alpha(struct cso_context *ctx, + const struct pipe_depth_stencil_alpha_state *templ) +{ + unsigned hash_key = cso_construct_key((void*)templ, + sizeof(struct pipe_depth_stencil_alpha_state)); + struct cso_hash_iter iter = cso_find_state_template(ctx->cache, + hash_key, + CSO_DEPTH_STENCIL_ALPHA, + (void*)templ); + void *handle; + + if (cso_hash_iter_is_null(iter)) { + struct cso_depth_stencil_alpha *cso = MALLOC(sizeof(struct cso_depth_stencil_alpha)); + if (!cso) + return PIPE_ERROR_OUT_OF_MEMORY; + + memcpy(&cso->state, templ, sizeof(*templ)); + cso->data = ctx->pipe->create_depth_stencil_alpha_state(ctx->pipe, &cso->state); + cso->delete_state = (cso_state_callback)ctx->pipe->delete_depth_stencil_alpha_state; + cso->context = ctx->pipe; + + iter = cso_insert_state(ctx->cache, hash_key, CSO_DEPTH_STENCIL_ALPHA, cso); + if (cso_hash_iter_is_null(iter)) { + FREE(cso); + return PIPE_ERROR_OUT_OF_MEMORY; + } + + handle = cso->data; + } + else { + handle = ((struct cso_depth_stencil_alpha *)cso_hash_iter_data(iter))->data; + } + + if (ctx->depth_stencil != handle) { + ctx->depth_stencil = handle; + ctx->pipe->bind_depth_stencil_alpha_state(ctx->pipe, handle); + } + return PIPE_OK; +} + +void cso_save_depth_stencil_alpha(struct cso_context *ctx) +{ + assert(!ctx->depth_stencil_saved); + ctx->depth_stencil_saved = ctx->depth_stencil; +} + +void 
cso_restore_depth_stencil_alpha(struct cso_context *ctx) +{ + if (ctx->depth_stencil != ctx->depth_stencil_saved) { + ctx->depth_stencil = ctx->depth_stencil_saved; + ctx->pipe->bind_depth_stencil_alpha_state(ctx->pipe, ctx->depth_stencil_saved); + } + ctx->depth_stencil_saved = NULL; +} + + + +enum pipe_error cso_set_rasterizer(struct cso_context *ctx, + const struct pipe_rasterizer_state *templ) +{ + unsigned hash_key = cso_construct_key((void*)templ, + sizeof(struct pipe_rasterizer_state)); + struct cso_hash_iter iter = cso_find_state_template(ctx->cache, + hash_key, CSO_RASTERIZER, + (void*)templ); + void *handle = NULL; + + if (cso_hash_iter_is_null(iter)) { + struct cso_rasterizer *cso = MALLOC(sizeof(struct cso_rasterizer)); + if (!cso) + return PIPE_ERROR_OUT_OF_MEMORY; + + memcpy(&cso->state, templ, sizeof(*templ)); + cso->data = ctx->pipe->create_rasterizer_state(ctx->pipe, &cso->state); + cso->delete_state = (cso_state_callback)ctx->pipe->delete_rasterizer_state; + cso->context = ctx->pipe; + + iter = cso_insert_state(ctx->cache, hash_key, CSO_RASTERIZER, cso); + if (cso_hash_iter_is_null(iter)) { + FREE(cso); + return PIPE_ERROR_OUT_OF_MEMORY; + } + + handle = cso->data; + } + else { + handle = ((struct cso_rasterizer *)cso_hash_iter_data(iter))->data; + } + + if (ctx->rasterizer != handle) { + ctx->rasterizer = handle; + ctx->pipe->bind_rasterizer_state(ctx->pipe, handle); + } + return PIPE_OK; +} + +void cso_save_rasterizer(struct cso_context *ctx) +{ + assert(!ctx->rasterizer_saved); + ctx->rasterizer_saved = ctx->rasterizer; +} + +void cso_restore_rasterizer(struct cso_context *ctx) +{ + if (ctx->rasterizer != ctx->rasterizer_saved) { + ctx->rasterizer = ctx->rasterizer_saved; + ctx->pipe->bind_rasterizer_state(ctx->pipe, ctx->rasterizer_saved); + } + ctx->rasterizer_saved = NULL; +} + + + +enum pipe_error cso_set_fragment_shader_handle(struct cso_context *ctx, + void *handle ) +{ + if (ctx->fragment_shader != handle) { + ctx->fragment_shader = handle; + ctx->pipe->bind_fs_state(ctx->pipe, handle); + } + return PIPE_OK; +} + +void cso_delete_fragment_shader(struct cso_context *ctx, void *handle ) +{ + if (handle == ctx->fragment_shader) { + /* unbind before deleting */ + ctx->pipe->bind_fs_state(ctx->pipe, NULL); + ctx->fragment_shader = NULL; + } + ctx->pipe->delete_fs_state(ctx->pipe, handle); +} + +/* Not really working: + */ +#if 0 +enum pipe_error cso_set_fragment_shader(struct cso_context *ctx, + const struct pipe_shader_state *templ) +{ + const struct tgsi_token *tokens = templ->tokens; + unsigned num_tokens = tgsi_num_tokens(tokens); + size_t tokens_size = num_tokens*sizeof(struct tgsi_token); + unsigned hash_key = cso_construct_key((void*)tokens, tokens_size); + struct cso_hash_iter iter = cso_find_state_template(ctx->cache, + hash_key, + CSO_FRAGMENT_SHADER, + (void*)tokens); + void *handle = NULL; + + if (cso_hash_iter_is_null(iter)) { + struct cso_fragment_shader *cso = MALLOC(sizeof(struct cso_fragment_shader) + tokens_size); + struct tgsi_token *cso_tokens = (struct tgsi_token *)((char *)cso + sizeof(*cso)); + + if (!cso) + return PIPE_ERROR_OUT_OF_MEMORY; + + memcpy(cso_tokens, tokens, tokens_size); + cso->state.tokens = cso_tokens; + cso->data = ctx->pipe->create_fs_state(ctx->pipe, &cso->state); + cso->delete_state = (cso_state_callback)ctx->pipe->delete_fs_state; + cso->context = ctx->pipe; + + iter = cso_insert_state(ctx->cache, hash_key, CSO_FRAGMENT_SHADER, cso); + if (cso_hash_iter_is_null(iter)) { + FREE(cso); + return PIPE_ERROR_OUT_OF_MEMORY; + } + 
+ handle = cso->data; + } + else { + handle = ((struct cso_fragment_shader *)cso_hash_iter_data(iter))->data; + } + + return cso_set_fragment_shader_handle( ctx, handle ); +} +#endif + +void cso_save_fragment_shader(struct cso_context *ctx) +{ + assert(!ctx->fragment_shader_saved); + ctx->fragment_shader_saved = ctx->fragment_shader; +} + +void cso_restore_fragment_shader(struct cso_context *ctx) +{ + if (ctx->fragment_shader_saved != ctx->fragment_shader) { + ctx->pipe->bind_fs_state(ctx->pipe, ctx->fragment_shader_saved); + ctx->fragment_shader = ctx->fragment_shader_saved; + } + ctx->fragment_shader_saved = NULL; +} + + +enum pipe_error cso_set_vertex_shader_handle(struct cso_context *ctx, + void *handle ) +{ + if (ctx->vertex_shader != handle) { + ctx->vertex_shader = handle; + ctx->pipe->bind_vs_state(ctx->pipe, handle); + } + return PIPE_OK; +} + +void cso_delete_vertex_shader(struct cso_context *ctx, void *handle ) +{ + if (handle == ctx->vertex_shader) { + /* unbind before deleting */ + ctx->pipe->bind_vs_state(ctx->pipe, NULL); + ctx->vertex_shader = NULL; + } + ctx->pipe->delete_vs_state(ctx->pipe, handle); +} + + +/* Not really working: + */ +#if 0 +enum pipe_error cso_set_vertex_shader(struct cso_context *ctx, + const struct pipe_shader_state *templ) +{ + unsigned hash_key = cso_construct_key((void*)templ, + sizeof(struct pipe_shader_state)); + struct cso_hash_iter iter = cso_find_state_template(ctx->cache, + hash_key, CSO_VERTEX_SHADER, + (void*)templ); + void *handle = NULL; + + if (cso_hash_iter_is_null(iter)) { + struct cso_vertex_shader *cso = MALLOC(sizeof(struct cso_vertex_shader)); + + if (!cso) + return PIPE_ERROR_OUT_OF_MEMORY; + + memcpy(cso->state, templ, sizeof(*templ)); + cso->data = ctx->pipe->create_vs_state(ctx->pipe, &cso->state); + cso->delete_state = (cso_state_callback)ctx->pipe->delete_vs_state; + cso->context = ctx->pipe; + + iter = cso_insert_state(ctx->cache, hash_key, CSO_VERTEX_SHADER, cso); + if (cso_hash_iter_is_null(iter)) { + FREE(cso); + return PIPE_ERROR_OUT_OF_MEMORY; + } + + handle = cso->data; + } + else { + handle = ((struct cso_vertex_shader *)cso_hash_iter_data(iter))->data; + } + + return cso_set_vertex_shader_handle( ctx, handle ); +} +#endif + + + +void cso_save_vertex_shader(struct cso_context *ctx) +{ + assert(!ctx->vertex_shader_saved); + ctx->vertex_shader_saved = ctx->vertex_shader; +} + +void cso_restore_vertex_shader(struct cso_context *ctx) +{ + if (ctx->vertex_shader_saved != ctx->vertex_shader) { + ctx->pipe->bind_vs_state(ctx->pipe, ctx->vertex_shader_saved); + ctx->vertex_shader = ctx->vertex_shader_saved; + } + ctx->vertex_shader_saved = NULL; +} + + +/** + * Copy framebuffer state from src to dst with refcounting of surfaces. 
+ */ +static void +copy_framebuffer_state(struct pipe_framebuffer_state *dst, + const struct pipe_framebuffer_state *src) +{ + uint i; + + dst->width = src->width; + dst->height = src->height; + dst->num_cbufs = src->num_cbufs; + for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { + pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]); + } + pipe_surface_reference(&dst->zsbuf, src->zsbuf); +} + + +static void +free_framebuffer_state(struct pipe_framebuffer_state *fb) +{ + uint i; + + for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { + pipe_surface_reference(&fb->cbufs[i], NULL); + } + pipe_surface_reference(&fb->zsbuf, NULL); +} + + +enum pipe_error cso_set_framebuffer(struct cso_context *ctx, + const struct pipe_framebuffer_state *fb) +{ + if (memcmp(&ctx->fb, fb, sizeof(*fb)) != 0) { + copy_framebuffer_state(&ctx->fb, fb); + ctx->pipe->set_framebuffer_state(ctx->pipe, fb); + } + return PIPE_OK; +} + +void cso_save_framebuffer(struct cso_context *ctx) +{ + copy_framebuffer_state(&ctx->fb_saved, &ctx->fb); +} + +void cso_restore_framebuffer(struct cso_context *ctx) +{ + if (memcmp(&ctx->fb, &ctx->fb_saved, sizeof(ctx->fb))) { + copy_framebuffer_state(&ctx->fb, &ctx->fb_saved); + ctx->pipe->set_framebuffer_state(ctx->pipe, &ctx->fb); + free_framebuffer_state(&ctx->fb_saved); + } +} + + +enum pipe_error cso_set_viewport(struct cso_context *ctx, + const struct pipe_viewport_state *vp) +{ + if (memcmp(&ctx->vp, vp, sizeof(*vp))) { + ctx->vp = *vp; + ctx->pipe->set_viewport_state(ctx->pipe, vp); + } + return PIPE_OK; +} + +void cso_save_viewport(struct cso_context *ctx) +{ + ctx->vp_saved = ctx->vp; +} + + +void cso_restore_viewport(struct cso_context *ctx) +{ + if (memcmp(&ctx->vp, &ctx->vp_saved, sizeof(ctx->vp))) { + ctx->vp = ctx->vp_saved; + ctx->pipe->set_viewport_state(ctx->pipe, &ctx->vp); + } +} + + + + +enum pipe_error cso_set_blend_color(struct cso_context *ctx, + const struct pipe_blend_color *bc) +{ + if (memcmp(&ctx->blend_color, bc, sizeof(ctx->blend_color))) { + ctx->blend_color = *bc; + ctx->pipe->set_blend_color(ctx->pipe, bc); + } + return PIPE_OK; +} diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h new file mode 100644 index 0000000000..b04e98bfa1 --- /dev/null +++ b/src/gallium/auxiliary/cso_cache/cso_context.h @@ -0,0 +1,145 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CSO_CONTEXT_H +#define CSO_CONTEXT_H + +#include "pipe/p_context.h" +#include "pipe/p_state.h" +#include "pipe/p_error.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +struct cso_context; + +struct cso_context *cso_create_context( struct pipe_context *pipe ); + +void cso_release_all( struct cso_context *ctx ); + +void cso_destroy_context( struct cso_context *cso ); + + + +enum pipe_error cso_set_blend( struct cso_context *cso, + const struct pipe_blend_state *blend ); +void cso_save_blend(struct cso_context *cso); +void cso_restore_blend(struct cso_context *cso); + + + +enum pipe_error cso_set_depth_stencil_alpha( struct cso_context *cso, + const struct pipe_depth_stencil_alpha_state *dsa ); +void cso_save_depth_stencil_alpha(struct cso_context *cso); +void cso_restore_depth_stencil_alpha(struct cso_context *cso); + + + +enum pipe_error cso_set_rasterizer( struct cso_context *cso, + const struct pipe_rasterizer_state *rasterizer ); +void cso_save_rasterizer(struct cso_context *cso); +void cso_restore_rasterizer(struct cso_context *cso); + + + +enum pipe_error cso_set_samplers( struct cso_context *cso, + unsigned count, + const struct pipe_sampler_state **states ); +void cso_save_samplers(struct cso_context *cso); +void cso_restore_samplers(struct cso_context *cso); + +/* Alternate interface to support state trackers that like to modify + * samplers one at a time: + */ +enum pipe_error cso_single_sampler( struct cso_context *cso, + unsigned nr, + const struct pipe_sampler_state *states ); + +void cso_single_sampler_done( struct cso_context *cso ); + + + +enum pipe_error cso_set_sampler_textures( struct cso_context *cso, + uint count, + struct pipe_texture **textures ); +void cso_save_sampler_textures( struct cso_context *cso ); +void cso_restore_sampler_textures( struct cso_context *cso ); + + + +/* These aren't really sensible -- most of the time the api provides + * object semantics for shaders anyway, and the cases where it doesn't + * (eg mesa's internall-generated texenv programs), it will be up to + * the state tracker to implement their own specialized caching. 
+ */ +enum pipe_error cso_set_fragment_shader_handle(struct cso_context *ctx, + void *handle ); +void cso_delete_fragment_shader(struct cso_context *ctx, void *handle ); +/* +enum pipe_error cso_set_fragment_shader( struct cso_context *cso, + const struct pipe_shader_state *shader ); +*/ +void cso_save_fragment_shader(struct cso_context *cso); +void cso_restore_fragment_shader(struct cso_context *cso); + + +enum pipe_error cso_set_vertex_shader_handle(struct cso_context *ctx, + void *handle ); +void cso_delete_vertex_shader(struct cso_context *ctx, void *handle ); +/* +enum pipe_error cso_set_vertex_shader( struct cso_context *cso, + const struct pipe_shader_state *shader ); +*/ +void cso_save_vertex_shader(struct cso_context *cso); +void cso_restore_vertex_shader(struct cso_context *cso); + + + +enum pipe_error cso_set_framebuffer(struct cso_context *cso, + const struct pipe_framebuffer_state *fb); +void cso_save_framebuffer(struct cso_context *cso); +void cso_restore_framebuffer(struct cso_context *cso); + + +enum pipe_error cso_set_viewport(struct cso_context *cso, + const struct pipe_viewport_state *vp); +void cso_save_viewport(struct cso_context *cso); +void cso_restore_viewport(struct cso_context *cso); + + +enum pipe_error cso_set_blend_color(struct cso_context *cso, + const struct pipe_blend_color *bc); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.c b/src/gallium/auxiliary/cso_cache/cso_hash.c new file mode 100644 index 0000000000..4e7664f9bf --- /dev/null +++ b/src/gallium/auxiliary/cso_cache/cso_hash.c @@ -0,0 +1,439 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Zack Rusin <zack@tungstengraphics.com> + */ + +#include "pipe/p_debug.h" +#include "util/u_memory.h" + +#include "cso_hash.h" + +#define MAX(a, b) ((a > b) ? 
(a) : (b)) + +static const int MinNumBits = 4; + +static const unsigned char prime_deltas[] = { + 0, 0, 1, 3, 1, 5, 3, 3, 1, 9, 7, 5, 3, 9, 25, 3, + 1, 21, 3, 21, 7, 15, 9, 5, 3, 29, 15, 0, 0, 0, 0, 0 +}; + +static int primeForNumBits(int numBits) +{ + return (1 << numBits) + prime_deltas[numBits]; +} + +/* + Returns the smallest integer n such that + primeForNumBits(n) >= hint. +*/ +static int countBits(int hint) +{ + int numBits = 0; + int bits = hint; + + while (bits > 1) { + bits >>= 1; + numBits++; + } + + if (numBits >= (int)sizeof(prime_deltas)) { + numBits = sizeof(prime_deltas) - 1; + } else if (primeForNumBits(numBits) < hint) { + ++numBits; + } + return numBits; +} + +struct cso_node { + struct cso_node *next; + unsigned key; + void *value; +}; + +struct cso_hash_data { + struct cso_node *fakeNext; + struct cso_node **buckets; + int size; + int nodeSize; + short userNumBits; + short numBits; + int numBuckets; +}; + +struct cso_hash { + union { + struct cso_hash_data *d; + struct cso_node *e; + } data; +}; + +static void *cso_data_allocate_node(struct cso_hash_data *hash) +{ + return MALLOC(hash->nodeSize); +} + +static void cso_free_node(struct cso_node *node) +{ + FREE(node); +} + +static struct cso_node * +cso_hash_create_node(struct cso_hash *hash, + unsigned akey, void *avalue, + struct cso_node **anextNode) +{ + struct cso_node *node = cso_data_allocate_node(hash->data.d); + + if (!node) + return NULL; + + node->key = akey; + node->value = avalue; + + node->next = (struct cso_node*)(*anextNode); + *anextNode = node; + ++hash->data.d->size; + return node; +} + +static void cso_data_rehash(struct cso_hash_data *hash, int hint) +{ + if (hint < 0) { + hint = countBits(-hint); + if (hint < MinNumBits) + hint = MinNumBits; + hash->userNumBits = (short)hint; + while (primeForNumBits(hint) < (hash->size >> 1)) + ++hint; + } else if (hint < MinNumBits) { + hint = MinNumBits; + } + + if (hash->numBits != hint) { + struct cso_node *e = (struct cso_node *)(hash); + struct cso_node **oldBuckets = hash->buckets; + int oldNumBuckets = hash->numBuckets; + int i = 0; + + hash->numBits = (short)hint; + hash->numBuckets = primeForNumBits(hint); + hash->buckets = MALLOC(sizeof(struct cso_node*) * hash->numBuckets); + for (i = 0; i < hash->numBuckets; ++i) + hash->buckets[i] = e; + + for (i = 0; i < oldNumBuckets; ++i) { + struct cso_node *firstNode = oldBuckets[i]; + while (firstNode != e) { + unsigned h = firstNode->key; + struct cso_node *lastNode = firstNode; + struct cso_node *afterLastNode; + struct cso_node **beforeFirstNode; + + while (lastNode->next != e && lastNode->next->key == h) + lastNode = lastNode->next; + + afterLastNode = lastNode->next; + beforeFirstNode = &hash->buckets[h % hash->numBuckets]; + while (*beforeFirstNode != e) + beforeFirstNode = &(*beforeFirstNode)->next; + lastNode->next = *beforeFirstNode; + *beforeFirstNode = firstNode; + firstNode = afterLastNode; + } + } + FREE(oldBuckets); + } +} + +static void cso_data_might_grow(struct cso_hash_data *hash) +{ + if (hash->size >= hash->numBuckets) + cso_data_rehash(hash, hash->numBits + 1); +} + +static void cso_data_has_shrunk(struct cso_hash_data *hash) +{ + if (hash->size <= (hash->numBuckets >> 3) && + hash->numBits > hash->userNumBits) { + int max = MAX(hash->numBits-2, hash->userNumBits); + cso_data_rehash(hash, max); + } +} + +static struct cso_node *cso_data_first_node(struct cso_hash_data *hash) +{ + struct cso_node *e = (struct cso_node *)(hash); + struct cso_node **bucket = hash->buckets; + int n = 
hash->numBuckets; + while (n--) { + if (*bucket != e) + return *bucket; + ++bucket; + } + return e; +} + +static struct cso_node **cso_hash_find_node(struct cso_hash *hash, unsigned akey) +{ + struct cso_node **node; + + if (hash->data.d->numBuckets) { + node = (struct cso_node **)(&hash->data.d->buckets[akey % hash->data.d->numBuckets]); + assert(*node == hash->data.e || (*node)->next); + while (*node != hash->data.e && (*node)->key != akey) + node = &(*node)->next; + } else { + node = (struct cso_node **)((const struct cso_node * const *)(&hash->data.e)); + } + return node; +} + +struct cso_hash_iter cso_hash_insert(struct cso_hash *hash, + unsigned key, void *data) +{ + cso_data_might_grow(hash->data.d); + + { + struct cso_node **nextNode = cso_hash_find_node(hash, key); + struct cso_node *node = cso_hash_create_node(hash, key, data, nextNode); + if (!node) { + struct cso_hash_iter null_iter = {hash, 0}; + return null_iter; + } + + { + struct cso_hash_iter iter = {hash, node}; + return iter; + } + } +} + +struct cso_hash * cso_hash_create(void) +{ + struct cso_hash *hash = MALLOC_STRUCT(cso_hash); + if (!hash) + return NULL; + + hash->data.d = MALLOC_STRUCT(cso_hash_data); + if (!hash->data.d) { + FREE(hash); + return NULL; + } + + hash->data.d->fakeNext = 0; + hash->data.d->buckets = 0; + hash->data.d->size = 0; + hash->data.d->nodeSize = sizeof(struct cso_node); + hash->data.d->userNumBits = (short)MinNumBits; + hash->data.d->numBits = 0; + hash->data.d->numBuckets = 0; + + return hash; +} + +void cso_hash_delete(struct cso_hash *hash) +{ + struct cso_node *e_for_x = (struct cso_node *)(hash->data.d); + struct cso_node **bucket = (struct cso_node **)(hash->data.d->buckets); + int n = hash->data.d->numBuckets; + while (n--) { + struct cso_node *cur = *bucket++; + while (cur != e_for_x) { + struct cso_node *next = cur->next; + cso_free_node(cur); + cur = next; + } + } + FREE(hash->data.d->buckets); + FREE(hash->data.d); + FREE(hash); +} + +struct cso_hash_iter cso_hash_find(struct cso_hash *hash, + unsigned key) +{ + struct cso_node **nextNode = cso_hash_find_node(hash, key); + struct cso_hash_iter iter = {hash, *nextNode}; + return iter; +} + +unsigned cso_hash_iter_key(struct cso_hash_iter iter) +{ + if (!iter.node || iter.hash->data.e == iter.node) + return 0; + return iter.node->key; +} + +void * cso_hash_iter_data(struct cso_hash_iter iter) +{ + if (!iter.node || iter.hash->data.e == iter.node) + return 0; + return iter.node->value; +} + +static struct cso_node *cso_hash_data_next(struct cso_node *node) +{ + union { + struct cso_node *next; + struct cso_node *e; + struct cso_hash_data *d; + } a; + int start; + struct cso_node **bucket; + int n; + + a.next = node->next; + if (!a.next) { + debug_printf("iterating beyond the last element\n"); + return 0; + } + if (a.next->next) + return a.next; + + start = (node->key % a.d->numBuckets) + 1; + bucket = a.d->buckets + start; + n = a.d->numBuckets - start; + while (n--) { + if (*bucket != a.e) + return *bucket; + ++bucket; + } + return a.e; +} + + +static struct cso_node *cso_hash_data_prev(struct cso_node *node) +{ + union { + struct cso_node *e; + struct cso_hash_data *d; + } a; + int start; + struct cso_node *sentinel; + struct cso_node **bucket; + + a.e = node; + while (a.e->next) + a.e = a.e->next; + + if (node == a.e) + start = a.d->numBuckets - 1; + else + start = node->key % a.d->numBuckets; + + sentinel = node; + bucket = a.d->buckets + start; + while (start >= 0) { + if (*bucket != sentinel) { + struct cso_node *prev = *bucket; 
+ while (prev->next != sentinel) + prev = prev->next; + return prev; + } + + sentinel = a.e; + --bucket; + --start; + } + debug_printf("iterating backward beyond first element\n"); + return a.e; +} + +struct cso_hash_iter cso_hash_iter_next(struct cso_hash_iter iter) +{ + struct cso_hash_iter next = {iter.hash, cso_hash_data_next(iter.node)}; + return next; +} + +int cso_hash_iter_is_null(struct cso_hash_iter iter) +{ + if (!iter.node || iter.node == iter.hash->data.e) + return 1; + return 0; +} + +void * cso_hash_take(struct cso_hash *hash, + unsigned akey) +{ + struct cso_node **node = cso_hash_find_node(hash, akey); + if (*node != hash->data.e) { + void *t = (*node)->value; + struct cso_node *next = (*node)->next; + cso_free_node(*node); + *node = next; + --hash->data.d->size; + cso_data_has_shrunk(hash->data.d); + return t; + } + return 0; +} + +struct cso_hash_iter cso_hash_iter_prev(struct cso_hash_iter iter) +{ + struct cso_hash_iter prev = {iter.hash, + cso_hash_data_prev(iter.node)}; + return prev; +} + +struct cso_hash_iter cso_hash_first_node(struct cso_hash *hash) +{ + struct cso_hash_iter iter = {hash, cso_data_first_node(hash->data.d)}; + return iter; +} + +int cso_hash_size(struct cso_hash *hash) +{ + return hash->data.d->size; +} + +struct cso_hash_iter cso_hash_erase(struct cso_hash *hash, struct cso_hash_iter iter) +{ + struct cso_hash_iter ret = iter; + struct cso_node *node = iter.node; + struct cso_node **node_ptr; + + if (node == hash->data.e) + return iter; + + ret = cso_hash_iter_next(ret); + node_ptr = (struct cso_node**)(&hash->data.d->buckets[node->key % hash->data.d->numBuckets]); + while (*node_ptr != node) + node_ptr = &(*node_ptr)->next; + *node_ptr = node->next; + cso_free_node(node); + --hash->data.d->size; + return ret; +} + +boolean cso_hash_contains(struct cso_hash *hash, unsigned key) +{ + struct cso_node **node = cso_hash_find_node(hash, key); + return (*node != hash->data.e); +} diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.h b/src/gallium/auxiliary/cso_cache/cso_hash.h new file mode 100644 index 0000000000..5891c325fa --- /dev/null +++ b/src/gallium/auxiliary/cso_cache/cso_hash.h @@ -0,0 +1,129 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
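cso_hash_find() only returns an iterator positioned at the head of the key's collision chain; picking the exact entry is left to the caller (or to cso_hash_find_data_from_template(), declared in the header below but implemented elsewhere). A minimal sketch of such a lookup against the iterator API, assuming the stored payloads can be compared byte-for-byte with memcmp; lookup_exact() is a hypothetical helper, not part of the module:

static void *
lookup_exact(struct cso_hash *hash, unsigned key, const void *templ, int size)
{
   struct cso_hash_iter iter = cso_hash_find(hash, key);

   /* walk the collision chain while the keys still match and compare
    * payloads; iterating further would wander into the next bucket
    */
   while (!cso_hash_iter_is_null(iter) && cso_hash_iter_key(iter) == key) {
      void *data = cso_hash_iter_data(iter);
      if (data && memcmp(data, templ, size) == 0)
         return data;
      iter = cso_hash_iter_next(iter);
   }
   return NULL;
}

Entries found this way are removed either with cso_hash_take() (which unlinks the node and may shrink the table) or with cso_hash_erase() and the iterator itself; in both cases the stored value is not freed by the hash.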
+ * + **************************************************************************/ + +/** + * @file + * Hash table implementation. + * + * This file provides a hash implementation that is capable of dealing + * with collisions. It stores colliding entries in linked list. All + * functions operating on the hash return an iterator. The iterator + * itself points to the collision list. If there wasn't any collision + * the list will have just one entry, otherwise client code should + * iterate over the entries to find the exact entry among ones that + * had the same key (e.g. memcmp could be used on the data to check + * that) + * + * @author Zack Rusin <zack@tungstengraphics.com> + */ + +#ifndef CSO_HASH_H +#define CSO_HASH_H + +#include "pipe/p_compiler.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct cso_hash; +struct cso_node; + + +struct cso_hash_iter { + struct cso_hash *hash; + struct cso_node *node; +}; + + +struct cso_hash *cso_hash_create(void); +void cso_hash_delete(struct cso_hash *hash); + + +int cso_hash_size(struct cso_hash *hash); + + +/** + * Adds a data with the given key to the hash. If entry with the given + * key is already in the hash, this current entry is instered before it + * in the collision list. + * Function returns iterator pointing to the inserted item in the hash. + */ +struct cso_hash_iter cso_hash_insert(struct cso_hash *hash, unsigned key, + void *data); +/** + * Removes the item pointed to by the current iterator from the hash. + * Note that the data itself is not erased and if it was a malloc'ed pointer + * it will have to be freed after calling this function by the callee. + * Function returns iterator pointing to the item after the removed one in + * the hash. + */ +struct cso_hash_iter cso_hash_erase(struct cso_hash *hash, struct cso_hash_iter iter); + +void *cso_hash_take(struct cso_hash *hash, unsigned key); + + + +struct cso_hash_iter cso_hash_first_node(struct cso_hash *hash); + +/** + * Return an iterator pointing to the first entry in the collision list. + */ +struct cso_hash_iter cso_hash_find(struct cso_hash *hash, unsigned key); + +/** + * Returns true if a value with the given key exists in the hash + */ +boolean cso_hash_contains(struct cso_hash *hash, unsigned key); + + +int cso_hash_iter_is_null(struct cso_hash_iter iter); +unsigned cso_hash_iter_key(struct cso_hash_iter iter); +void *cso_hash_iter_data(struct cso_hash_iter iter); + + +struct cso_hash_iter cso_hash_iter_next(struct cso_hash_iter iter); +struct cso_hash_iter cso_hash_iter_prev(struct cso_hash_iter iter); + + +/** + * Convenience routine to iterate over the collision list while doing a memory + * comparison to see which entry in the list is a direct copy of our template + * and returns that entry. + */ +void *cso_hash_find_data_from_template( struct cso_hash *hash, + unsigned hash_key, + void *templ, + int size ); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile new file mode 100644 index 0000000000..bdbf5a08ed --- /dev/null +++ b/src/gallium/auxiliary/draw/Makefile @@ -0,0 +1,50 @@ +TOP = ../../../.. 
+include $(TOP)/configs/current + +LIBNAME = draw + +C_SOURCES = \ + draw_context.c \ + draw_pipe.c \ + draw_pipe_aaline.c \ + draw_pipe_aapoint.c \ + draw_pipe_clip.c \ + draw_pipe_cull.c \ + draw_pipe_flatshade.c \ + draw_pipe_offset.c \ + draw_pipe_pstipple.c \ + draw_pipe_stipple.c \ + draw_pipe_twoside.c \ + draw_pipe_unfilled.c \ + draw_pipe_util.c \ + draw_pipe_validate.c \ + draw_pipe_vbuf.c \ + draw_pipe_wide_line.c \ + draw_pipe_wide_point.c \ + draw_pt.c \ + draw_pt_elts.c \ + draw_pt_emit.c \ + draw_pt_fetch.c \ + draw_pt_fetch_emit.c \ + draw_pt_fetch_shade_emit.c \ + draw_pt_fetch_shade_pipeline.c \ + draw_pt_post_vs.c \ + draw_pt_util.c \ + draw_pt_varray.c \ + draw_pt_vcache.c \ + draw_vertex.c \ + draw_vs.c \ + draw_vs_varient.c \ + draw_vs_aos.c \ + draw_vs_aos_io.c \ + draw_vs_aos_machine.c \ + draw_vs_exec.c \ + draw_vs_llvm.c \ + draw_vs_ppc.c \ + draw_vs_sse.c + + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/auxiliary/draw/SConscript b/src/gallium/auxiliary/draw/SConscript new file mode 100644 index 0000000000..5f05aa324a --- /dev/null +++ b/src/gallium/auxiliary/draw/SConscript @@ -0,0 +1,46 @@ +Import('*') + +draw = env.ConvenienceLibrary( + target = 'draw', + source = [ + 'draw_context.c', + 'draw_pipe.c', + 'draw_pipe_aaline.c', + 'draw_pipe_aapoint.c', + 'draw_pipe_clip.c', + 'draw_pipe_cull.c', + 'draw_pipe_flatshade.c', + 'draw_pipe_offset.c', + 'draw_pipe_pstipple.c', + 'draw_pipe_stipple.c', + 'draw_pipe_twoside.c', + 'draw_pipe_unfilled.c', + 'draw_pipe_util.c', + 'draw_pipe_validate.c', + 'draw_pipe_vbuf.c', + 'draw_pipe_wide_line.c', + 'draw_pipe_wide_point.c', + 'draw_pt.c', + 'draw_pt_elts.c', + 'draw_pt_emit.c', + 'draw_pt_fetch.c', + 'draw_pt_fetch_emit.c', + 'draw_pt_fetch_shade_emit.c', + 'draw_pt_fetch_shade_pipeline.c', + 'draw_pt_post_vs.c', + 'draw_pt_util.c', + 'draw_pt_varray.c', + 'draw_pt_vcache.c', + 'draw_vertex.c', + 'draw_vs.c', + 'draw_vs_aos.c', + 'draw_vs_aos_io.c', + 'draw_vs_aos_machine.c', + 'draw_vs_exec.c', + 'draw_vs_llvm.c', + 'draw_vs_ppc.c', + 'draw_vs_sse.c', + 'draw_vs_varient.c' + ]) + +auxiliaries.insert(0, draw) diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c new file mode 100644 index 0000000000..7bd4a2e221 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -0,0 +1,429 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "util/u_memory.h" +#include "util/u_math.h" +#include "draw_context.h" +#include "draw_vbuf.h" +#include "draw_vs.h" +#include "draw_pt.h" +#include "draw_pipe.h" + + +struct draw_context *draw_create( void ) +{ + struct draw_context *draw = CALLOC_STRUCT( draw_context ); + if (draw == NULL) + goto fail; + + ASSIGN_4V( draw->plane[0], -1, 0, 0, 1 ); + ASSIGN_4V( draw->plane[1], 1, 0, 0, 1 ); + ASSIGN_4V( draw->plane[2], 0, -1, 0, 1 ); + ASSIGN_4V( draw->plane[3], 0, 1, 0, 1 ); + ASSIGN_4V( draw->plane[4], 0, 0, 1, 1 ); /* yes these are correct */ + ASSIGN_4V( draw->plane[5], 0, 0, -1, 1 ); /* mesa's a bit wonky */ + draw->nr_planes = 6; + + + draw->reduced_prim = ~0; /* != any of PIPE_PRIM_x */ + + + if (!draw_pipeline_init( draw )) + goto fail; + + if (!draw_pt_init( draw )) + goto fail; + + if (!draw_vs_init( draw )) + goto fail; + + return draw; + +fail: + draw_destroy( draw ); + return NULL; +} + + +void draw_destroy( struct draw_context *draw ) +{ + if (!draw) + return; + + + + /* Not so fast -- we're just borrowing this at the moment. + * + if (draw->render) + draw->render->destroy( draw->render ); + */ + + draw_pipeline_destroy( draw ); + draw_pt_destroy( draw ); + draw_vs_destroy( draw ); + + FREE( draw ); +} + + + +void draw_flush( struct draw_context *draw ) +{ + draw_do_flush( draw, DRAW_FLUSH_BACKEND ); +} + + +/** + * Specify the Minimum Resolvable Depth factor for polygon offset. + * This factor potentially depends on the number of Z buffer bits, + * the rasterization algorithm and the arithmetic performed on Z + * values between vertex shading and rasterization. It will vary + * from one driver to another. + */ +void draw_set_mrd(struct draw_context *draw, double mrd) +{ + draw->mrd = mrd; +} + + +/** + * Register new primitive rasterization/rendering state. + * This causes the drawing pipeline to be rebuilt. + */ +void draw_set_rasterizer_state( struct draw_context *draw, + const struct pipe_rasterizer_state *raster ) +{ + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + + draw->rasterizer = raster; + draw->bypass_clipping = + ((draw->rasterizer && draw->rasterizer->bypass_clipping) || + draw->driver.bypass_clipping); +} + + +void draw_set_driver_clipping( struct draw_context *draw, + boolean bypass_clipping ) +{ + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + + draw->driver.bypass_clipping = bypass_clipping; + draw->bypass_clipping = (draw->rasterizer->bypass_clipping || + draw->driver.bypass_clipping); +} + + +/** + * Plug in the primitive rendering/rasterization stage (which is the last + * stage in the drawing pipeline). + * This is provided by the device driver. + */ +void draw_set_rasterize_stage( struct draw_context *draw, + struct draw_stage *stage ) +{ + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + + draw->pipeline.rasterize = stage; +} + + +/** + * Set the draw module's clipping state. 
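The functions above make up the context-management side of the interface. A sketch of how a hypothetical driver could wire them together at startup; struct my_driver and my_create_render_stage() are illustrative names, not part of the module, and the MRD value is purely an example:

struct my_driver {                        /* hypothetical driver context */
   struct pipe_rasterizer_state rasterizer;
   struct draw_context *draw;
};

struct draw_stage *my_create_render_stage(struct my_driver *drv);  /* hypothetical */

static boolean
my_driver_init_draw(struct my_driver *drv)
{
   struct draw_context *draw = draw_create();
   if (!draw)
      return FALSE;

   /* the driver's rendering stage terminates the pipeline */
   draw_set_rasterize_stage(draw, my_create_render_stage(drv));

   /* the draw module keeps this pointer, so it must stay valid for as
    * long as the state is bound
    */
   draw_set_rasterizer_state(draw, &drv->rasterizer);

   /* minimum resolvable depth for polygon offset, e.g. for a 16-bit
    * depth buffer; the right value is driver specific
    */
   draw_set_mrd(draw, 1.0 / 65535.0);

   drv->draw = draw;
   return TRUE;
}

Note that each state-setting entry point above begins with draw_do_flush(DRAW_FLUSH_STATE_CHANGE), so any buffered primitives are drained before the new state takes effect.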
+ */ +void draw_set_clip_state( struct draw_context *draw, + const struct pipe_clip_state *clip ) +{ + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + + assert(clip->nr <= PIPE_MAX_CLIP_PLANES); + memcpy(&draw->plane[6], clip->ucp, clip->nr * sizeof(clip->ucp[0])); + draw->nr_planes = 6 + clip->nr; +} + + +/** + * Set the draw module's viewport state. + */ +void draw_set_viewport_state( struct draw_context *draw, + const struct pipe_viewport_state *viewport ) +{ + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + draw->viewport = *viewport; /* struct copy */ + draw->identity_viewport = (viewport->scale[0] == 1.0f && + viewport->scale[1] == 1.0f && + viewport->scale[2] == 1.0f && + viewport->scale[3] == 1.0f && + viewport->translate[0] == 0.0f && + viewport->translate[1] == 0.0f && + viewport->translate[2] == 0.0f && + viewport->translate[3] == 0.0f); + + draw_vs_set_viewport( draw, viewport ); +} + + + +void +draw_set_vertex_buffers(struct draw_context *draw, + unsigned count, + const struct pipe_vertex_buffer *buffers) +{ + assert(count <= PIPE_MAX_ATTRIBS); + + memcpy(draw->pt.vertex_buffer, buffers, count * sizeof(buffers[0])); + draw->pt.nr_vertex_buffers = count; +} + + +void +draw_set_vertex_elements(struct draw_context *draw, + unsigned count, + const struct pipe_vertex_element *elements) +{ + assert(count <= PIPE_MAX_ATTRIBS); + + memcpy(draw->pt.vertex_element, elements, count * sizeof(elements[0])); + draw->pt.nr_vertex_elements = count; +} + + +/** + * Tell drawing context where to find mapped vertex buffers. + */ +void +draw_set_mapped_vertex_buffer(struct draw_context *draw, + unsigned attr, const void *buffer) +{ + draw->pt.user.vbuffer[attr] = buffer; +} + + +void +draw_set_mapped_constant_buffer(struct draw_context *draw, + const void *buffer, + unsigned size ) +{ + draw->pt.user.constants = buffer; + draw_vs_set_constants( draw, (const float (*)[4])buffer, size ); +} + + +/** + * Tells the draw module to draw points with triangles if their size + * is greater than this threshold. + */ +void +draw_wide_point_threshold(struct draw_context *draw, float threshold) +{ + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + draw->pipeline.wide_point_threshold = threshold; +} + + +/** + * Tells the draw module to draw lines with triangles if their width + * is greater than this threshold. + */ +void +draw_wide_line_threshold(struct draw_context *draw, float threshold) +{ + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + draw->pipeline.wide_line_threshold = threshold; +} + + +/** + * Tells the draw module whether or not to implement line stipple. + */ +void +draw_enable_line_stipple(struct draw_context *draw, boolean enable) +{ + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + draw->pipeline.line_stipple = enable; +} + + +/** + * Tells draw module whether to convert points to quads for sprite mode. + */ +void +draw_enable_point_sprites(struct draw_context *draw, boolean enable) +{ + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + draw->pipeline.point_sprite = enable; +} + + +void +draw_set_force_passthrough( struct draw_context *draw, boolean enable ) +{ + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + draw->force_passthrough = enable; +} + + +/** + * Ask the draw module for the location/slot of the given vertex attribute in + * a post-transformed vertex. + * + * With this function, drivers that use the draw module should have no reason + * to track the current vertex shader. 
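A sketch of the per-draw vertex-data setup these entry points imply; my_bind_vertex_data() and its arguments are hypothetical, and the mapped pointer is assumed to remain valid until the primitives have been flushed:

static void
my_bind_vertex_data(struct draw_context *draw,
                    const struct pipe_vertex_buffer *vbuf,
                    const struct pipe_vertex_element *velems,
                    unsigned num_elems,
                    const void *mapped_vbuf,
                    const void *constants, unsigned const_size)
{
   /* describe the vertex layout */
   draw_set_vertex_buffers(draw, 1, vbuf);
   draw_set_vertex_elements(draw, num_elems, velems);

   /* CPU-visible pointer the draw module fetches vertices through */
   draw_set_mapped_vertex_buffer(draw, 0, mapped_vbuf);

   /* vertex shader constants, laid out as float[4] slots */
   draw_set_mapped_constant_buffer(draw, constants, const_size);
}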
+ * + * Note that the draw module may sometimes generate vertices with extra + * attributes (such as texcoords for AA lines). The driver can call this + * function to find those attributes. + * + * Zero is returned if the attribute is not found since this is + * a don't care / undefined situtation. Returning -1 would be a bit more + * work for the drivers. + */ +int +draw_find_vs_output(const struct draw_context *draw, + uint semantic_name, uint semantic_index) +{ + const struct draw_vertex_shader *vs = draw->vs.vertex_shader; + uint i; + for (i = 0; i < vs->info.num_outputs; i++) { + if (vs->info.output_semantic_name[i] == semantic_name && + vs->info.output_semantic_index[i] == semantic_index) + return i; + } + + /* XXX there may be more than one extra vertex attrib. + * For example, simulated gl_FragCoord and gl_PointCoord. + */ + if (draw->extra_vp_outputs.semantic_name == semantic_name && + draw->extra_vp_outputs.semantic_index == semantic_index) { + return draw->extra_vp_outputs.slot; + } + return 0; +} + + +/** + * Return number of vertex shader outputs. + */ +uint +draw_num_vs_outputs(const struct draw_context *draw) +{ + uint count = draw->vs.vertex_shader->info.num_outputs; + if (draw->extra_vp_outputs.slot > 0) + count++; + return count; +} + + +/** + * Provide TGSI sampler objects for vertex shaders that use texture fetches. + * This might only be used by software drivers for the time being. + */ +void +draw_texture_samplers(struct draw_context *draw, + uint num_samplers, + struct tgsi_sampler **samplers) +{ + draw->vs.num_samplers = num_samplers; + draw->vs.samplers = samplers; +} + + + + +void draw_set_render( struct draw_context *draw, + struct vbuf_render *render ) +{ + draw->render = render; +} + +void draw_set_edgeflags( struct draw_context *draw, + const unsigned *edgeflag ) +{ + draw->pt.user.edgeflag = edgeflag; +} + + + + +/** + * Tell the drawing context about the index/element buffer to use + * (ala glDrawElements) + * If no element buffer is to be used (i.e. glDrawArrays) then this + * should be called with eltSize=0 and elements=NULL. + * + * \param draw the drawing context + * \param eltSize size of each element (1, 2 or 4 bytes) + * \param elements the element buffer ptr + */ +void +draw_set_mapped_element_buffer_range( struct draw_context *draw, + unsigned eltSize, + unsigned min_index, + unsigned max_index, + const void *elements ) +{ + draw->pt.user.elts = elements; + draw->pt.user.eltSize = eltSize; + draw->pt.user.min_index = min_index; + draw->pt.user.max_index = max_index; +} + + +void +draw_set_mapped_element_buffer( struct draw_context *draw, + unsigned eltSize, + const void *elements ) +{ + draw->pt.user.elts = elements; + draw->pt.user.eltSize = eltSize; + draw->pt.user.min_index = 0; + draw->pt.user.max_index = 0xffffffff; +} + + +/* Revamp me please: + */ +void draw_do_flush( struct draw_context *draw, unsigned flags ) +{ + if (!draw->suspend_flushing) + { + assert(!draw->flushing); /* catch inadvertant recursion */ + + draw->flushing = TRUE; + + draw_pipeline_flush( draw, flags ); + + draw->reduced_prim = ~0; /* is reduced_prim needed any more? */ + + draw->flushing = FALSE; + } +} diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h new file mode 100644 index 0000000000..d529e4e9a2 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_context.h @@ -0,0 +1,182 @@ + +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. 
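Putting the element-buffer entry points together with draw_arrays() and draw_flush() (both declared in draw_context.h), an indexed draw in the style of glDrawElements could look like the following; my_draw_elements() is an illustrative helper and assumes 2-byte indices:

static void
my_draw_elements(struct draw_context *draw, unsigned prim,
                 const ushort *indices, unsigned count)
{
   /* eltSize = 2; the non-_range variant assumes the full
    * 0..0xffffffff index range
    */
   draw_set_mapped_element_buffer(draw, 2, indices);

   draw_arrays(draw, prim, 0, count);

   /* drain the pipeline, then detach the mapping (eltSize = 0 and a
    * NULL pointer mean "no element buffer", i.e. glDrawArrays mode)
    */
   draw_flush(draw);
   draw_set_mapped_element_buffer(draw, 0, NULL);
}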
+ * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \brief Public interface into the drawing module. + */ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + + +#ifndef DRAW_CONTEXT_H +#define DRAW_CONTEXT_H + + +#include "pipe/p_state.h" + + +struct pipe_context; +struct draw_context; +struct draw_stage; +struct draw_vertex_shader; +struct tgsi_sampler; + + +struct draw_context *draw_create( void ); + +void draw_destroy( struct draw_context *draw ); + +void draw_set_viewport_state( struct draw_context *draw, + const struct pipe_viewport_state *viewport ); + +void draw_set_clip_state( struct draw_context *pipe, + const struct pipe_clip_state *clip ); + +void draw_set_rasterizer_state( struct draw_context *draw, + const struct pipe_rasterizer_state *raster ); + +void draw_set_rasterize_stage( struct draw_context *draw, + struct draw_stage *stage ); + +void draw_wide_point_threshold(struct draw_context *draw, float threshold); + +void draw_wide_line_threshold(struct draw_context *draw, float threshold); + +void draw_enable_line_stipple(struct draw_context *draw, boolean enable); + +void draw_enable_point_sprites(struct draw_context *draw, boolean enable); + +void draw_set_mrd(struct draw_context *draw, double mrd); + +boolean +draw_install_aaline_stage(struct draw_context *draw, struct pipe_context *pipe); + +boolean +draw_install_aapoint_stage(struct draw_context *draw, struct pipe_context *pipe); + +boolean +draw_install_pstipple_stage(struct draw_context *draw, struct pipe_context *pipe); + + +int +draw_find_vs_output(const struct draw_context *draw, + uint semantic_name, uint semantic_index); + +uint +draw_num_vs_outputs(const struct draw_context *draw); + + +void +draw_texture_samplers(struct draw_context *draw, + uint num_samplers, + struct tgsi_sampler **samplers); + + + +/* + * Vertex shader functions + */ + +struct draw_vertex_shader * +draw_create_vertex_shader(struct draw_context *draw, + const struct pipe_shader_state *shader); +void draw_bind_vertex_shader(struct draw_context *draw, + struct draw_vertex_shader *dvs); +void draw_delete_vertex_shader(struct draw_context *draw, + struct draw_vertex_shader *dvs); + + + +/* + * Vertex data functions + */ + +void draw_set_vertex_buffers(struct draw_context *draw, + unsigned count, + const struct pipe_vertex_buffer *buffers); + +void 
draw_set_vertex_elements(struct draw_context *draw, + unsigned count, + const struct pipe_vertex_element *elements); + +void +draw_set_mapped_element_buffer_range( struct draw_context *draw, + unsigned eltSize, + unsigned min_index, + unsigned max_index, + const void *elements ); + +void draw_set_mapped_element_buffer( struct draw_context *draw, + unsigned eltSize, + const void *elements ); + +void draw_set_mapped_vertex_buffer(struct draw_context *draw, + unsigned attr, const void *buffer); + +void draw_set_mapped_constant_buffer(struct draw_context *draw, + const void *buffer, + unsigned size ); + +void draw_set_edgeflags( struct draw_context *draw, + const unsigned *edgeflag ); + + +/*********************************************************************** + * draw_prim.c + */ + +void draw_arrays(struct draw_context *draw, unsigned prim, + unsigned start, unsigned count); + +void draw_flush(struct draw_context *draw); + + +/******************************************************************************* + * Driver backend interface + */ +struct vbuf_render; +void draw_set_render( struct draw_context *draw, + struct vbuf_render *render ); + +void draw_set_driver_clipping( struct draw_context *draw, + boolean bypass_clipping ); + +void draw_set_force_passthrough( struct draw_context *draw, + boolean enable ); + +/******************************************************************************* + * Draw pipeline + */ +boolean draw_need_pipeline(const struct draw_context *draw, + const struct pipe_rasterizer_state *rasterizer, + unsigned prim ); + + + +#endif /* DRAW_CONTEXT_H */ diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c new file mode 100644 index 0000000000..3cde9d36d3 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe.c @@ -0,0 +1,287 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
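A sketch of how a hypothetical hardware driver registers its backend; struct vbuf_render itself is defined in draw_vbuf.h, which is not part of this file, and my_setup_backend() is an illustrative name:

static void
my_setup_backend(struct draw_context *draw, struct vbuf_render *render)
{
   /* all post-transform vertices end up at this backend */
   draw_set_render(draw, render);

   /* tell the draw module the hardware performs clipping and the
    * viewport transform, so those stages can be bypassed
    */
   draw_set_driver_clipping(draw, TRUE);
}

Per primitive, draw_need_pipeline() reports whether the software stages (wide points/lines, stippling, unfilled polygons and so on) are required for the given rasterizer state; when it returns FALSE the vertices can be handed straight to the backend.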
+ * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "draw/draw_private.h" +#include "draw/draw_pipe.h" + + + +boolean draw_pipeline_init( struct draw_context *draw ) +{ + /* create pipeline stages */ + draw->pipeline.wide_line = draw_wide_line_stage( draw ); + draw->pipeline.wide_point = draw_wide_point_stage( draw ); + draw->pipeline.stipple = draw_stipple_stage( draw ); + draw->pipeline.unfilled = draw_unfilled_stage( draw ); + draw->pipeline.twoside = draw_twoside_stage( draw ); + draw->pipeline.offset = draw_offset_stage( draw ); + draw->pipeline.clip = draw_clip_stage( draw ); + draw->pipeline.flatshade = draw_flatshade_stage( draw ); + draw->pipeline.cull = draw_cull_stage( draw ); + draw->pipeline.validate = draw_validate_stage( draw ); + draw->pipeline.first = draw->pipeline.validate; + + if (!draw->pipeline.wide_line || + !draw->pipeline.wide_point || + !draw->pipeline.stipple || + !draw->pipeline.unfilled || + !draw->pipeline.twoside || + !draw->pipeline.offset || + !draw->pipeline.clip || + !draw->pipeline.flatshade || + !draw->pipeline.cull || + !draw->pipeline.validate) + return FALSE; + + /* these defaults are oriented toward the needs of softpipe */ + draw->pipeline.wide_point_threshold = 1000000.0; /* infinity */ + draw->pipeline.wide_line_threshold = 1.0; + draw->pipeline.line_stipple = TRUE; + draw->pipeline.point_sprite = TRUE; + + return TRUE; +} + + +void draw_pipeline_destroy( struct draw_context *draw ) +{ + if (draw->pipeline.wide_line) + draw->pipeline.wide_line->destroy( draw->pipeline.wide_line ); + if (draw->pipeline.wide_point) + draw->pipeline.wide_point->destroy( draw->pipeline.wide_point ); + if (draw->pipeline.stipple) + draw->pipeline.stipple->destroy( draw->pipeline.stipple ); + if (draw->pipeline.unfilled) + draw->pipeline.unfilled->destroy( draw->pipeline.unfilled ); + if (draw->pipeline.twoside) + draw->pipeline.twoside->destroy( draw->pipeline.twoside ); + if (draw->pipeline.offset) + draw->pipeline.offset->destroy( draw->pipeline.offset ); + if (draw->pipeline.clip) + draw->pipeline.clip->destroy( draw->pipeline.clip ); + if (draw->pipeline.flatshade) + draw->pipeline.flatshade->destroy( draw->pipeline.flatshade ); + if (draw->pipeline.cull) + draw->pipeline.cull->destroy( draw->pipeline.cull ); + if (draw->pipeline.validate) + draw->pipeline.validate->destroy( draw->pipeline.validate ); + if (draw->pipeline.aaline) + draw->pipeline.aaline->destroy( draw->pipeline.aaline ); + if (draw->pipeline.aapoint) + draw->pipeline.aapoint->destroy( draw->pipeline.aapoint ); + if (draw->pipeline.pstipple) + draw->pipeline.pstipple->destroy( draw->pipeline.pstipple ); + if (draw->pipeline.rasterize) + draw->pipeline.rasterize->destroy( draw->pipeline.rasterize ); +} + + + + + + + +static void do_point( struct draw_context *draw, + const char *v0 ) +{ + struct prim_header prim; + + prim.flags = 0; + prim.pad = 0; + prim.v[0] = (struct vertex_header *)v0; + + draw->pipeline.first->point( draw->pipeline.first, &prim ); +} + + +static void do_line( struct draw_context *draw, + ushort flags, + const char *v0, + const char *v1 ) +{ + struct prim_header prim; + + prim.flags = flags; + prim.pad = 0; + prim.v[0] = (struct vertex_header *)v0; + prim.v[1] = (struct vertex_header *)v1; + + draw->pipeline.first->line( draw->pipeline.first, &prim ); +} + + +static void do_triangle( struct draw_context *draw, + ushort flags, + char *v0, + char *v1, + char *v2 ) +{ 
+ struct prim_header prim; + + prim.v[0] = (struct vertex_header *)v0; + prim.v[1] = (struct vertex_header *)v1; + prim.v[2] = (struct vertex_header *)v2; + prim.flags = flags; + prim.pad = 0; + + draw->pipeline.first->tri( draw->pipeline.first, &prim ); +} + + + + +/* Code to run the pipeline on a fairly arbitary collection of vertices. + * + * Vertex headers must be pre-initialized with the + * UNDEFINED_VERTEX_ID, this code will cause that id to become + * overwritten, so it may have to be reset if there is the intention + * to reuse the vertices. + * + * This code provides a callback to reset the vertex id's which the + * draw_vbuf.c code uses when it has to perform a flush. + */ +void draw_pipeline_run( struct draw_context *draw, + unsigned prim, + struct vertex_header *vertices, + unsigned vertex_count, + unsigned stride, + const ushort *elts, + unsigned count ) +{ + char *verts = (char *)vertices; + unsigned i; + + draw->pipeline.verts = verts; + draw->pipeline.vertex_stride = stride; + draw->pipeline.vertex_count = vertex_count; + + switch (prim) { + case PIPE_PRIM_POINTS: + for (i = 0; i < count; i++) + do_point( draw, + verts + stride * elts[i] ); + break; + case PIPE_PRIM_LINES: + for (i = 0; i+1 < count; i += 2) + do_line( draw, + elts[i+0], /* flags */ + verts + stride * (elts[i+0] & ~DRAW_PIPE_FLAG_MASK), + verts + stride * elts[i+1]); + break; + case PIPE_PRIM_TRIANGLES: + for (i = 0; i+2 < count; i += 3) + do_triangle( draw, + elts[i+0], /* flags */ + verts + stride * (elts[i+0] & ~DRAW_PIPE_FLAG_MASK), + verts + stride * elts[i+1], + verts + stride * elts[i+2]); + break; + } + + draw->pipeline.verts = NULL; + draw->pipeline.vertex_count = 0; +} + +#define QUAD(i0,i1,i2,i3) \ + do_triangle( draw, \ + ( DRAW_PIPE_RESET_STIPPLE | \ + DRAW_PIPE_EDGE_FLAG_0 | \ + DRAW_PIPE_EDGE_FLAG_2 ), \ + verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK), \ + verts + stride * (i1), \ + verts + stride * (i3)); \ + do_triangle( draw, \ + ( DRAW_PIPE_EDGE_FLAG_0 | \ + DRAW_PIPE_EDGE_FLAG_1 ), \ + verts + stride * ((i1) & ~DRAW_PIPE_FLAG_MASK), \ + verts + stride * (i2), \ + verts + stride * (i3)) + +#define TRIANGLE(flags,i0,i1,i2) \ + do_triangle( draw, \ + flags, /* flags */ \ + verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK), \ + verts + stride * (i1), \ + verts + stride * (i2)) + +#define LINE(flags,i0,i1) \ + do_line( draw, \ + flags, \ + verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK), \ + verts + stride * (i1)) + +#define POINT(i0) \ + do_point( draw, \ + verts + stride * i0 ) + +#define FUNC pipe_run_linear +#define ARGS \ + struct draw_context *draw, \ + unsigned prim, \ + struct vertex_header *vertices, \ + unsigned stride + +#define LOCAL_VARS \ + char *verts = (char *)vertices; \ + boolean flatfirst = (draw->rasterizer->flatshade && \ + draw->rasterizer->flatshade_first); \ + unsigned i; \ + ushort flags + +#define FLUSH + +#include "draw_pt_decompose.h" + +void draw_pipeline_run_linear( struct draw_context *draw, + unsigned prim, + struct vertex_header *vertices, + unsigned count, + unsigned stride ) +{ + char *verts = (char *)vertices; + draw->pipeline.verts = verts; + draw->pipeline.vertex_stride = stride; + draw->pipeline.vertex_count = count; + + pipe_run_linear(draw, prim, vertices, stride, count); + + draw->pipeline.verts = NULL; + draw->pipeline.vertex_count = 0; +} + + +void draw_pipeline_flush( struct draw_context *draw, + unsigned flags ) +{ + draw->pipeline.first->flush( draw->pipeline.first, flags ); + draw->pipeline.first = draw->pipeline.validate; +} diff --git 
a/src/gallium/auxiliary/draw/draw_pipe.h b/src/gallium/auxiliary/draw/draw_pipe.h new file mode 100644 index 0000000000..dbad8f98ac --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe.h @@ -0,0 +1,125 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef DRAW_PIPE_H +#define DRAW_PIPE_H + +#include "pipe/p_compiler.h" +#include "draw_private.h" /* for sizeof(vertex_header) */ + + +/** + * Basic info for a point/line/triangle primitive. + */ +struct prim_header { + float det; /**< front/back face determinant */ + ushort flags; + ushort pad; + struct vertex_header *v[3]; /**< 1 to 3 vertex pointers */ +}; + + + +/** + * Base class for all primitive drawing stages. 
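draw_pipeline_run() above expects any per-primitive flags to be folded into the first element of each line or triangle, which is why the index is recovered with ~DRAW_PIPE_FLAG_MASK. A sketch of front-end code emitting such an element list; emit_tri() is a hypothetical helper, and the flag bits and mask are defined in draw_private.h (not shown here), so vertex indices must not overlap them:

static void
emit_tri(ushort *elts, unsigned *nr, ushort flags,
         ushort i0, ushort i1, ushort i2)
{
   elts[(*nr)++] = flags | i0;   /* flags ride on the first index only */
   elts[(*nr)++] = i1;
   elts[(*nr)++] = i2;
}

A triangle that resets the stipple counter and draws all three edges would be emitted with flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_1 | DRAW_PIPE_EDGE_FLAG_2.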
+ */ +struct draw_stage +{ + struct draw_context *draw; /**< parent context */ + + struct draw_stage *next; /**< next stage in pipeline */ + + struct vertex_header **tmp; /**< temp vert storage, such as for clipping */ + unsigned nr_tmps; + + void (*point)( struct draw_stage *, + struct prim_header * ); + + void (*line)( struct draw_stage *, + struct prim_header * ); + + void (*tri)( struct draw_stage *, + struct prim_header * ); + + void (*flush)( struct draw_stage *, + unsigned flags ); + + void (*reset_stipple_counter)( struct draw_stage * ); + + void (*destroy)( struct draw_stage * ); +}; + + +extern struct draw_stage *draw_unfilled_stage( struct draw_context *context ); +extern struct draw_stage *draw_twoside_stage( struct draw_context *context ); +extern struct draw_stage *draw_offset_stage( struct draw_context *context ); +extern struct draw_stage *draw_clip_stage( struct draw_context *context ); +extern struct draw_stage *draw_flatshade_stage( struct draw_context *context ); +extern struct draw_stage *draw_cull_stage( struct draw_context *context ); +extern struct draw_stage *draw_stipple_stage( struct draw_context *context ); +extern struct draw_stage *draw_wide_line_stage( struct draw_context *context ); +extern struct draw_stage *draw_wide_point_stage( struct draw_context *context ); +extern struct draw_stage *draw_validate_stage( struct draw_context *context ); + + +extern void draw_free_temp_verts( struct draw_stage *stage ); +extern boolean draw_alloc_temp_verts( struct draw_stage *stage, unsigned nr ); + +extern void draw_reset_vertex_ids( struct draw_context *draw ); + +void draw_pipe_passthrough_tri(struct draw_stage *stage, struct prim_header *header); +void draw_pipe_passthrough_line(struct draw_stage *stage, struct prim_header *header); +void draw_pipe_passthrough_point(struct draw_stage *stage, struct prim_header *header); + + + +/** + * Get a writeable copy of a vertex. + * \param stage drawing stage info + * \param vert the vertex to copy (source) + * \param idx index into stage's tmp[] array to put the copy (dest) + * \return pointer to the copied vertex + */ +static INLINE struct vertex_header * +dup_vert( struct draw_stage *stage, + const struct vertex_header *vert, + unsigned idx ) +{ + struct vertex_header *tmp = stage->tmp[idx]; + const uint vsize = sizeof(struct vertex_header) + + stage->draw->vs.num_vs_outputs * 4 * sizeof(float); + memcpy(tmp, vert, vsize); + tmp->vertex_id = UNDEFINED_VERTEX_ID; + return tmp; +} + +#endif diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c new file mode 100644 index 0000000000..20841bb5d6 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c @@ -0,0 +1,914 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
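The vtable above is everything a pipeline stage has to provide. Below is a skeleton of a hypothetical stage that only intercepts triangles; noop_stage and its functions are illustrative, assume the usual u_memory.h helpers, and leave it to whoever links the stage into the pipeline to set its next pointer:

struct noop_stage {
   struct draw_stage stage;   /* base class must come first, as in the
                               * aaline stage below */
};

static void noop_tri(struct draw_stage *stage, struct prim_header *header)
{
   /* inspect or adjust header->v[0..2] here, then pass the tri on */
   stage->next->tri(stage->next, header);
}

static void noop_flush(struct draw_stage *stage, unsigned flags)
{
   stage->next->flush(stage->next, flags);
}

static void noop_reset_stipple_counter(struct draw_stage *stage)
{
   stage->next->reset_stipple_counter(stage->next);
}

static void noop_destroy(struct draw_stage *stage)
{
   /* a stage that allocated temp verts would call
    * draw_free_temp_verts(stage) here
    */
   FREE(stage);
}

static struct draw_stage *noop_stage_create(struct draw_context *draw)
{
   struct noop_stage *s = CALLOC_STRUCT(noop_stage);
   if (!s)
      return NULL;

   /* a stage that makes writable vertex copies (see dup_vert above)
    * would also call draw_alloc_temp_verts() here
    */
   s->stage.draw = draw;
   s->stage.point = draw_pipe_passthrough_point;
   s->stage.line = draw_pipe_passthrough_line;
   s->stage.tri = noop_tri;
   s->stage.flush = noop_flush;
   s->stage.reset_stipple_counter = noop_reset_stipple_counter;
   s->stage.destroy = noop_destroy;
   return &s->stage;
}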
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * AA line stage: AA lines are converted to texture mapped triangles. + * + * Authors: Brian Paul + */ + + +#include "pipe/p_inlines.h" +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "tgsi/tgsi_transform.h" +#include "tgsi/tgsi_dump.h" + +#include "draw_context.h" +#include "draw_private.h" +#include "draw_pipe.h" + + +/** + * Max texture level for the alpha texture used for antialiasing + */ +#define MAX_TEXTURE_LEVEL 5 /* 32 x 32 */ + + +/** + * Subclass of pipe_shader_state to carry extra fragment shader info. + */ +struct aaline_fragment_shader +{ + struct pipe_shader_state state; + void *driver_fs; + void *aaline_fs; + void *aapoint_fs; /* not yet */ + void *sprite_fs; /* not yet */ + uint sampler_unit; + int generic_attrib; /**< texcoord/generic used for texture */ +}; + + +/** + * Subclass of draw_stage + */ +struct aaline_stage +{ + struct draw_stage stage; + + float half_line_width; + + /** For AA lines, this is the vertex attrib slot for the new texcoords */ + uint tex_slot; + /** position, not necessarily output zero */ + uint pos_slot; + + void *sampler_cso; + struct pipe_texture *texture; + uint num_samplers; + uint num_textures; + + + /* + * Currently bound state + */ + struct aaline_fragment_shader *fs; + struct { + void *sampler[PIPE_MAX_SAMPLERS]; + struct pipe_texture *texture[PIPE_MAX_SAMPLERS]; + } state; + + /* + * Driver interface/override functions + */ + void * (*driver_create_fs_state)(struct pipe_context *, + const struct pipe_shader_state *); + void (*driver_bind_fs_state)(struct pipe_context *, void *); + void (*driver_delete_fs_state)(struct pipe_context *, void *); + + void (*driver_bind_sampler_states)(struct pipe_context *, unsigned, + void **); + void (*driver_set_sampler_textures)(struct pipe_context *, unsigned, + struct pipe_texture **); + + struct pipe_context *pipe; +}; + + + +/** + * Subclass of tgsi_transform_context, used for transforming the + * user's fragment shader to add the special AA instructions. + */ +struct aa_transform_context { + struct tgsi_transform_context base; + uint tempsUsed; /**< bitmask */ + int colorOutput; /**< which output is the primary color */ + uint samplersUsed; /**< bitfield of samplers used */ + int freeSampler; /** an available sampler for the pstipple */ + int maxInput, maxGeneric; /**< max input index found */ + int colorTemp, texTemp; /**< temp registers */ + boolean firstInstruction; +}; + + +/** + * TGSI declaration transform callback. + * Look for a free sampler, a free input attrib, and two free temp regs. 
+ */ +static void +aa_transform_decl(struct tgsi_transform_context *ctx, + struct tgsi_full_declaration *decl) +{ + struct aa_transform_context *aactx = (struct aa_transform_context *) ctx; + + if (decl->Declaration.File == TGSI_FILE_OUTPUT && + decl->Semantic.SemanticName == TGSI_SEMANTIC_COLOR && + decl->Semantic.SemanticIndex == 0) { + aactx->colorOutput = decl->DeclarationRange.First; + } + else if (decl->Declaration.File == TGSI_FILE_SAMPLER) { + uint i; + for (i = decl->DeclarationRange.First; + i <= decl->DeclarationRange.Last; i++) { + aactx->samplersUsed |= 1 << i; + } + } + else if (decl->Declaration.File == TGSI_FILE_INPUT) { + if ((int) decl->DeclarationRange.Last > aactx->maxInput) + aactx->maxInput = decl->DeclarationRange.Last; + if (decl->Semantic.SemanticName == TGSI_SEMANTIC_GENERIC && + (int) decl->Semantic.SemanticIndex > aactx->maxGeneric) { + aactx->maxGeneric = decl->Semantic.SemanticIndex; + } + } + else if (decl->Declaration.File == TGSI_FILE_TEMPORARY) { + uint i; + for (i = decl->DeclarationRange.First; + i <= decl->DeclarationRange.Last; i++) { + aactx->tempsUsed |= (1 << i); + } + } + + ctx->emit_declaration(ctx, decl); +} + + +/** + * Find the lowest zero bit in the given word, or -1 if bitfield is all ones. + */ +static int +free_bit(uint bitfield) +{ + int i; + for (i = 0; i < 32; i++) { + if ((bitfield & (1 << i)) == 0) + return i; + } + return -1; +} + + +/** + * TGSI instruction transform callback. + * Replace writes to result.color w/ a temp reg. + * Upon END instruction, insert texture sampling code for antialiasing. + */ +static void +aa_transform_inst(struct tgsi_transform_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct aa_transform_context *aactx = (struct aa_transform_context *) ctx; + + if (aactx->firstInstruction) { + /* emit our new declarations before the first instruction */ + + struct tgsi_full_declaration decl; + uint i; + + /* find free sampler */ + aactx->freeSampler = free_bit(aactx->samplersUsed); + if (aactx->freeSampler >= PIPE_MAX_SAMPLERS) + aactx->freeSampler = PIPE_MAX_SAMPLERS - 1; + + /* find two free temp regs */ + for (i = 0; i < 32; i++) { + if ((aactx->tempsUsed & (1 << i)) == 0) { + /* found a free temp */ + if (aactx->colorTemp < 0) + aactx->colorTemp = i; + else if (aactx->texTemp < 0) + aactx->texTemp = i; + else + break; + } + } + assert(aactx->colorTemp >= 0); + assert(aactx->texTemp >= 0); + + /* declare new generic input/texcoord */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_INPUT; + /* XXX this could be linear... 
*/ + decl.Declaration.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE; + decl.Declaration.Semantic = 1; + decl.Semantic.SemanticName = TGSI_SEMANTIC_GENERIC; + decl.Semantic.SemanticIndex = aactx->maxGeneric + 1; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = aactx->maxInput + 1; + ctx->emit_declaration(ctx, &decl); + + /* declare new sampler */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_SAMPLER; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = aactx->freeSampler; + ctx->emit_declaration(ctx, &decl); + + /* declare new temp regs */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_TEMPORARY; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = aactx->texTemp; + ctx->emit_declaration(ctx, &decl); + + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_TEMPORARY; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = aactx->colorTemp; + ctx->emit_declaration(ctx, &decl); + + aactx->firstInstruction = FALSE; + } + + if (inst->Instruction.Opcode == TGSI_OPCODE_END && + aactx->colorOutput != -1) { + struct tgsi_full_instruction newInst; + + /* TEX */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_TEX; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullDstRegisters[0].DstRegister.Index = aactx->texTemp; + newInst.Instruction.NumSrcRegs = 2; + newInst.InstructionExtTexture.Texture = TGSI_TEXTURE_2D; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_INPUT; + newInst.FullSrcRegisters[0].SrcRegister.Index = aactx->maxInput + 1; + newInst.FullSrcRegisters[1].SrcRegister.File = TGSI_FILE_SAMPLER; + newInst.FullSrcRegisters[1].SrcRegister.Index = aactx->freeSampler; + + ctx->emit_instruction(ctx, &newInst); + + /* MOV rgb */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_MOV; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_OUTPUT; + newInst.FullDstRegisters[0].DstRegister.Index = aactx->colorOutput; + newInst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_XYZ; + newInst.Instruction.NumSrcRegs = 1; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[0].SrcRegister.Index = aactx->colorTemp; + ctx->emit_instruction(ctx, &newInst); + + /* MUL alpha */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_MUL; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_OUTPUT; + newInst.FullDstRegisters[0].DstRegister.Index = aactx->colorOutput; + newInst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_W; + newInst.Instruction.NumSrcRegs = 2; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[0].SrcRegister.Index = aactx->colorTemp; + newInst.FullSrcRegisters[1].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[1].SrcRegister.Index = aactx->texTemp; + ctx->emit_instruction(ctx, &newInst); + + /* END */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_END; + newInst.Instruction.NumDstRegs = 0; + newInst.Instruction.NumSrcRegs = 0; + ctx->emit_instruction(ctx, &newInst); + } + else { + /* Not an END instruction. + * Look for writes to result.color and replace with colorTemp reg. 
+ */ + uint i; + + for (i = 0; i < inst->Instruction.NumDstRegs; i++) { + struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i]; + if (dst->DstRegister.File == TGSI_FILE_OUTPUT && + dst->DstRegister.Index == aactx->colorOutput) { + dst->DstRegister.File = TGSI_FILE_TEMPORARY; + dst->DstRegister.Index = aactx->colorTemp; + } + } + + ctx->emit_instruction(ctx, inst); + } +} + + +/** + * Generate the frag shader we'll use for drawing AA lines. + * This will be the user's shader plus some texture/modulate instructions. + */ +static boolean +generate_aaline_fs(struct aaline_stage *aaline) +{ + const struct pipe_shader_state *orig_fs = &aaline->fs->state; + struct pipe_shader_state aaline_fs; + struct aa_transform_context transform; + +#define MAX 1000 + + aaline_fs = *orig_fs; /* copy to init */ + aaline_fs.tokens = MALLOC(sizeof(struct tgsi_token) * MAX); + if (aaline_fs.tokens == NULL) + return FALSE; + + memset(&transform, 0, sizeof(transform)); + transform.colorOutput = -1; + transform.maxInput = -1; + transform.maxGeneric = -1; + transform.colorTemp = -1; + transform.texTemp = -1; + transform.firstInstruction = TRUE; + transform.base.transform_instruction = aa_transform_inst; + transform.base.transform_declaration = aa_transform_decl; + + tgsi_transform_shader(orig_fs->tokens, + (struct tgsi_token *) aaline_fs.tokens, + MAX, &transform.base); + +#if 0 /* DEBUG */ + tgsi_dump(orig_fs->tokens, 0); + tgsi_dump(aaline_fs.tokens, 0); +#endif + + aaline->fs->sampler_unit = transform.freeSampler; + + aaline->fs->aaline_fs + = aaline->driver_create_fs_state(aaline->pipe, &aaline_fs); + if (aaline->fs->aaline_fs == NULL) + return FALSE; + + aaline->fs->generic_attrib = transform.maxGeneric + 1; + return TRUE; +} + + +/** + * Create the texture map we'll use for antialiasing the lines. + */ +static boolean +aaline_create_texture(struct aaline_stage *aaline) +{ + struct pipe_context *pipe = aaline->pipe; + struct pipe_screen *screen = pipe->screen; + struct pipe_texture texTemp; + uint level; + + memset(&texTemp, 0, sizeof(texTemp)); + texTemp.target = PIPE_TEXTURE_2D; + texTemp.format = PIPE_FORMAT_A8_UNORM; /* XXX verify supported by driver! */ + texTemp.last_level = MAX_TEXTURE_LEVEL; + texTemp.width[0] = 1 << MAX_TEXTURE_LEVEL; + texTemp.height[0] = 1 << MAX_TEXTURE_LEVEL; + texTemp.depth[0] = 1; + pf_get_block(texTemp.format, &texTemp.block); + + aaline->texture = screen->texture_create(screen, &texTemp); + if (!aaline->texture) + return FALSE; + + /* Fill in mipmap images. + * Basically each level is solid opaque, except for the outermost + * texels which are zero. Special case the 1x1 and 2x2 levels. + */ + for (level = 0; level <= MAX_TEXTURE_LEVEL; level++) { + struct pipe_surface *surface; + const uint size = aaline->texture->width[level]; + ubyte *data; + uint i, j; + + assert(aaline->texture->width[level] == aaline->texture->height[level]); + + /* This texture is new, no need to flush. 
+ */ + surface = screen->get_tex_surface(screen, aaline->texture, 0, level, 0, + PIPE_BUFFER_USAGE_CPU_WRITE); + data = screen->surface_map(screen, surface, PIPE_BUFFER_USAGE_CPU_WRITE); + if (data == NULL) + return FALSE; + + for (i = 0; i < size; i++) { + for (j = 0; j < size; j++) { + ubyte d; + if (size == 1) { + d = 255; + } + else if (size == 2) { + d = 200; /* tuneable */ + } + else if (i == 0 || j == 0 || i == size - 1 || j == size - 1) { + d = 0; + } + else { + d = 255; + } + data[i * surface->stride + j] = d; + } + } + + /* unmap */ + screen->surface_unmap(screen, surface); + screen->tex_surface_release(screen, &surface); + } + return TRUE; +} + + +/** + * Create the sampler CSO that'll be used for antialiasing. + * By using a mipmapped texture, we don't have to generate a different + * texture image for each line size. + */ +static boolean +aaline_create_sampler(struct aaline_stage *aaline) +{ + struct pipe_sampler_state sampler; + struct pipe_context *pipe = aaline->pipe; + + memset(&sampler, 0, sizeof(sampler)); + sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + sampler.min_mip_filter = PIPE_TEX_MIPFILTER_LINEAR; + sampler.min_img_filter = PIPE_TEX_FILTER_LINEAR; + sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR; + sampler.normalized_coords = 1; + sampler.min_lod = 0.0f; + sampler.max_lod = MAX_TEXTURE_LEVEL; + + aaline->sampler_cso = pipe->create_sampler_state(pipe, &sampler); + if (aaline->sampler_cso == NULL) + return FALSE; + + return TRUE; +} + + +/** + * When we're about to draw our first AA line in a batch, this function is + * called to tell the driver to bind our modified fragment shader. + */ +static boolean +bind_aaline_fragment_shader(struct aaline_stage *aaline) +{ + struct draw_context *draw = aaline->stage.draw; + + if (!aaline->fs->aaline_fs && + !generate_aaline_fs(aaline)) + return FALSE; + + draw->suspend_flushing = TRUE; + aaline->driver_bind_fs_state(aaline->pipe, aaline->fs->aaline_fs); + draw->suspend_flushing = FALSE; + + return TRUE; +} + + + +static INLINE struct aaline_stage * +aaline_stage( struct draw_stage *stage ) +{ + return (struct aaline_stage *) stage; +} + + +/** + * Draw a wide line by drawing a quad, using geometry which will + * fullfill GL's antialiased line requirements. 
+ */ +static void +aaline_line(struct draw_stage *stage, struct prim_header *header) +{ + const struct aaline_stage *aaline = aaline_stage(stage); + const float half_width = aaline->half_line_width; + struct prim_header tri; + struct vertex_header *v[8]; + uint texPos = aaline->tex_slot; + uint posPos = aaline->pos_slot; + float *pos, *tex; + float dx = header->v[1]->data[posPos][0] - header->v[0]->data[posPos][0]; + float dy = header->v[1]->data[posPos][1] - header->v[0]->data[posPos][1]; + double a = atan2(dy, dx); + float c_a = (float) cos(a), s_a = (float) sin(a); + uint i; + + /* XXX the ends of lines aren't quite perfect yet, but probably passable */ + dx = 0.5F * half_width; + dy = half_width; + + /* allocate/dup new verts */ + for (i = 0; i < 8; i++) { + v[i] = dup_vert(stage, header->v[i/4], i); + } + + /* + * Quad strip for line from v0 to v1 (*=endpoints): + * + * 1 3 5 7 + * +---+---------------------+---+ + * | | + * | *v0 v1* | + * | | + * +---+---------------------+---+ + * 0 2 4 6 + */ + + /* new verts */ + pos = v[0]->data[posPos]; + pos[0] += (-dx * c_a - dy * s_a); + pos[1] += (-dx * s_a + dy * c_a); + + pos = v[1]->data[posPos]; + pos[0] += (-dx * c_a - -dy * s_a); + pos[1] += (-dx * s_a + -dy * c_a); + + pos = v[2]->data[posPos]; + pos[0] += ( dx * c_a - dy * s_a); + pos[1] += ( dx * s_a + dy * c_a); + + pos = v[3]->data[posPos]; + pos[0] += ( dx * c_a - -dy * s_a); + pos[1] += ( dx * s_a + -dy * c_a); + + pos = v[4]->data[posPos]; + pos[0] += (-dx * c_a - dy * s_a); + pos[1] += (-dx * s_a + dy * c_a); + + pos = v[5]->data[posPos]; + pos[0] += (-dx * c_a - -dy * s_a); + pos[1] += (-dx * s_a + -dy * c_a); + + pos = v[6]->data[posPos]; + pos[0] += ( dx * c_a - dy * s_a); + pos[1] += ( dx * s_a + dy * c_a); + + pos = v[7]->data[posPos]; + pos[0] += ( dx * c_a - -dy * s_a); + pos[1] += ( dx * s_a + -dy * c_a); + + /* new texcoords */ + tex = v[0]->data[texPos]; + ASSIGN_4V(tex, 0, 0, 0, 1); + + tex = v[1]->data[texPos]; + ASSIGN_4V(tex, 0, 1, 0, 1); + + tex = v[2]->data[texPos]; + ASSIGN_4V(tex, .5, 0, 0, 1); + + tex = v[3]->data[texPos]; + ASSIGN_4V(tex, .5, 1, 0, 1); + + tex = v[4]->data[texPos]; + ASSIGN_4V(tex, .5, 0, 0, 1); + + tex = v[5]->data[texPos]; + ASSIGN_4V(tex, .5, 1, 0, 1); + + tex = v[6]->data[texPos]; + ASSIGN_4V(tex, 1, 0, 0, 1); + + tex = v[7]->data[texPos]; + ASSIGN_4V(tex, 1, 1, 0, 1); + + /* emit 6 tris for the quad strip */ + tri.v[0] = v[2]; tri.v[1] = v[1]; tri.v[2] = v[0]; + stage->next->tri( stage->next, &tri ); + + tri.v[0] = v[3]; tri.v[1] = v[1]; tri.v[2] = v[2]; + stage->next->tri( stage->next, &tri ); + + tri.v[0] = v[4]; tri.v[1] = v[3]; tri.v[2] = v[2]; + stage->next->tri( stage->next, &tri ); + + tri.v[0] = v[5]; tri.v[1] = v[3]; tri.v[2] = v[4]; + stage->next->tri( stage->next, &tri ); + + tri.v[0] = v[6]; tri.v[1] = v[5]; tri.v[2] = v[4]; + stage->next->tri( stage->next, &tri ); + + tri.v[0] = v[7]; tri.v[1] = v[5]; tri.v[2] = v[6]; + stage->next->tri( stage->next, &tri ); +} + + +static void +aaline_first_line(struct draw_stage *stage, struct prim_header *header) +{ + auto struct aaline_stage *aaline = aaline_stage(stage); + struct draw_context *draw = stage->draw; + struct pipe_context *pipe = aaline->pipe; + uint num_samplers; + + assert(draw->rasterizer->line_smooth); + + if (draw->rasterizer->line_width <= 3.0) + aaline->half_line_width = 1.5f; + else + aaline->half_line_width = 0.5f * draw->rasterizer->line_width; + + /* + * Bind (generate) our fragprog, sampler and texture + */ + if (!bind_aaline_fragment_shader(aaline)) { + 
stage->line = draw_pipe_passthrough_line; + stage->line(stage, header); + return; + } + + /* update vertex attrib info */ + aaline->tex_slot = draw->vs.num_vs_outputs; + aaline->pos_slot = draw->vs.position_output; + + /* advertise the extra post-transformed vertex attribute */ + draw->extra_vp_outputs.semantic_name = TGSI_SEMANTIC_GENERIC; + draw->extra_vp_outputs.semantic_index = aaline->fs->generic_attrib; + draw->extra_vp_outputs.slot = aaline->tex_slot; + + /* how many samplers? */ + /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */ + num_samplers = MAX2(aaline->num_textures, aaline->num_samplers); + num_samplers = MAX2(num_samplers, aaline->fs->sampler_unit + 1); + + aaline->state.sampler[aaline->fs->sampler_unit] = aaline->sampler_cso; + pipe_texture_reference(&aaline->state.texture[aaline->fs->sampler_unit], + aaline->texture); + + draw->suspend_flushing = TRUE; + aaline->driver_bind_sampler_states(pipe, num_samplers, aaline->state.sampler); + aaline->driver_set_sampler_textures(pipe, num_samplers, aaline->state.texture); + draw->suspend_flushing = FALSE; + + /* now really draw first line */ + stage->line = aaline_line; + stage->line(stage, header); +} + + +static void +aaline_flush(struct draw_stage *stage, unsigned flags) +{ + struct draw_context *draw = stage->draw; + struct aaline_stage *aaline = aaline_stage(stage); + struct pipe_context *pipe = aaline->pipe; + + stage->line = aaline_first_line; + stage->next->flush( stage->next, flags ); + + /* restore original frag shader, texture, sampler state */ + draw->suspend_flushing = TRUE; + aaline->driver_bind_fs_state(pipe, aaline->fs->driver_fs); + aaline->driver_bind_sampler_states(pipe, aaline->num_samplers, + aaline->state.sampler); + aaline->driver_set_sampler_textures(pipe, aaline->num_textures, + aaline->state.texture); + draw->suspend_flushing = FALSE; + + draw->extra_vp_outputs.slot = 0; +} + + +static void +aaline_reset_stipple_counter(struct draw_stage *stage) +{ + stage->next->reset_stipple_counter( stage->next ); +} + + +static void +aaline_destroy(struct draw_stage *stage) +{ + struct aaline_stage *aaline = aaline_stage(stage); + uint i; + + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { + pipe_texture_reference(&aaline->state.texture[i], NULL); + } + + if (aaline->sampler_cso) + aaline->pipe->delete_sampler_state(aaline->pipe, aaline->sampler_cso); + + if (aaline->texture) + pipe_texture_release(&aaline->texture); + + draw_free_temp_verts( stage ); + + FREE( stage ); +} + + +static struct aaline_stage * +draw_aaline_stage(struct draw_context *draw) +{ + struct aaline_stage *aaline = CALLOC_STRUCT(aaline_stage); + if (aaline == NULL) + return NULL; + + if (!draw_alloc_temp_verts( &aaline->stage, 8 )) + goto fail; + + aaline->stage.draw = draw; + aaline->stage.next = NULL; + aaline->stage.point = draw_pipe_passthrough_point; + aaline->stage.line = aaline_first_line; + aaline->stage.tri = draw_pipe_passthrough_tri; + aaline->stage.flush = aaline_flush; + aaline->stage.reset_stipple_counter = aaline_reset_stipple_counter; + aaline->stage.destroy = aaline_destroy; + + return aaline; + + fail: + if (aaline) + aaline_destroy(&aaline->stage); + + return NULL; +} + + +static struct aaline_stage * +aaline_stage_from_pipe(struct pipe_context *pipe) +{ + struct draw_context *draw = (struct draw_context *) pipe->draw; + return aaline_stage(draw->pipeline.aaline); +} + + +/** + * This function overrides the driver's create_fs_state() function and + * will typically be called by the state tracker. 
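+ * The user's shader is wrapped in an aaline_fragment_shader record; the
+ * AA-augmented variant is generated lazily (see generate_aaline_fs) the
+ * first time an AA line is actually drawn with this shader bound.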
+ */ +static void * +aaline_create_fs_state(struct pipe_context *pipe, + const struct pipe_shader_state *fs) +{ + struct aaline_stage *aaline = aaline_stage_from_pipe(pipe); + struct aaline_fragment_shader *aafs = CALLOC_STRUCT(aaline_fragment_shader); + if (aafs == NULL) + return NULL; + + aafs->state = *fs; + + /* pass-through */ + aafs->driver_fs = aaline->driver_create_fs_state(aaline->pipe, fs); + + return aafs; +} + + +static void +aaline_bind_fs_state(struct pipe_context *pipe, void *fs) +{ + struct aaline_stage *aaline = aaline_stage_from_pipe(pipe); + struct aaline_fragment_shader *aafs = (struct aaline_fragment_shader *) fs; + + /* save current */ + aaline->fs = aafs; + /* pass-through */ + aaline->driver_bind_fs_state(aaline->pipe, + (aafs ? aafs->driver_fs : NULL)); +} + + +static void +aaline_delete_fs_state(struct pipe_context *pipe, void *fs) +{ + struct aaline_stage *aaline = aaline_stage_from_pipe(pipe); + struct aaline_fragment_shader *aafs = (struct aaline_fragment_shader *) fs; + /* pass-through */ + aaline->driver_delete_fs_state(aaline->pipe, aafs->driver_fs); + FREE(aafs); +} + + +static void +aaline_bind_sampler_states(struct pipe_context *pipe, + unsigned num, void **sampler) +{ + struct aaline_stage *aaline = aaline_stage_from_pipe(pipe); + + /* save current */ + memcpy(aaline->state.sampler, sampler, num * sizeof(void *)); + aaline->num_samplers = num; + + /* pass-through */ + aaline->driver_bind_sampler_states(aaline->pipe, num, sampler); +} + + +static void +aaline_set_sampler_textures(struct pipe_context *pipe, + unsigned num, struct pipe_texture **texture) +{ + struct aaline_stage *aaline = aaline_stage_from_pipe(pipe); + uint i; + + /* save current */ + for (i = 0; i < num; i++) { + pipe_texture_reference(&aaline->state.texture[i], texture[i]); + } + for ( ; i < PIPE_MAX_SAMPLERS; i++) { + pipe_texture_reference(&aaline->state.texture[i], NULL); + } + aaline->num_textures = num; + + /* pass-through */ + aaline->driver_set_sampler_textures(aaline->pipe, num, texture); +} + + +/** + * Called by drivers that want to install this AA line prim stage + * into the draw module's pipeline. This will not be used if the + * hardware has native support for AA lines. 
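+ *
+ * A driver would typically call this from its context-creation code,
+ * e.g. (driver-side names purely illustrative):
+ *
+ *    draw_install_aaline_stage(softpipe->draw, &softpipe->pipe);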
+ */ +boolean +draw_install_aaline_stage(struct draw_context *draw, struct pipe_context *pipe) +{ + struct aaline_stage *aaline; + + pipe->draw = (void *) draw; + + /* + * Create / install AA line drawing / prim stage + */ + aaline = draw_aaline_stage( draw ); + if (!aaline) + goto fail; + + aaline->pipe = pipe; + + /* create special texture, sampler state */ + if (!aaline_create_texture(aaline)) + goto fail; + + if (!aaline_create_sampler(aaline)) + goto fail; + + /* save original driver functions */ + aaline->driver_create_fs_state = pipe->create_fs_state; + aaline->driver_bind_fs_state = pipe->bind_fs_state; + aaline->driver_delete_fs_state = pipe->delete_fs_state; + + aaline->driver_bind_sampler_states = pipe->bind_sampler_states; + aaline->driver_set_sampler_textures = pipe->set_sampler_textures; + + /* override the driver's functions */ + pipe->create_fs_state = aaline_create_fs_state; + pipe->bind_fs_state = aaline_bind_fs_state; + pipe->delete_fs_state = aaline_delete_fs_state; + + pipe->bind_sampler_states = aaline_bind_sampler_states; + pipe->set_sampler_textures = aaline_set_sampler_textures; + + /* Install once everything is known to be OK: + */ + draw->pipeline.aaline = &aaline->stage; + + return TRUE; + + fail: + if (aaline) + aaline->stage.destroy( &aaline->stage ); + + return FALSE; +} diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c new file mode 100644 index 0000000000..2c1cacbdb4 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c @@ -0,0 +1,875 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * AA point stage: AA points are converted to quads and rendered with a + * special fragment shader. Another approach would be to use a texture + * map image of a point, but experiments indicate the quality isn't nearly + * as good as this approach. + * + * Note: this looks a lot like draw_aaline.c but there's actually little + * if any code that can be shared. 
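+ *
+ * Each point becomes a screen-aligned quad carrying an extra generic
+ * attribute that varies from -1 to +1 across it.  The fragment shader
+ * uses that attribute to compute the fragment's distance from the point
+ * center, kills fragments outside the unit circle, and turns the
+ * distance into a coverage value that modulates the fragment's alpha.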
+ * + * Authors: Brian Paul + */ + + +#include "pipe/p_inlines.h" +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" + +#include "tgsi/tgsi_transform.h" +#include "tgsi/tgsi_dump.h" + +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "draw_context.h" +#include "draw_vs.h" +#include "draw_pipe.h" + + +/* + * Enabling NORMALIZE might give _slightly_ better results. + * Basically, it controls whether we compute distance as d=sqrt(x*x+y*y) or + * d=x*x+y*y. Since we're working with a unit circle, the later seems + * close enough and saves some costly instructions. + */ +#define NORMALIZE 0 + + +/** + * Subclass of pipe_shader_state to carry extra fragment shader info. + */ +struct aapoint_fragment_shader +{ + struct pipe_shader_state state; + void *driver_fs; /**< the regular shader */ + void *aapoint_fs; /**< the aa point-augmented shader */ + int generic_attrib; /**< The generic input attrib/texcoord we'll use */ +}; + + +/** + * Subclass of draw_stage + */ +struct aapoint_stage +{ + struct draw_stage stage; + + int psize_slot; + float radius; + + /** this is the vertex attrib slot for the new texcoords */ + uint tex_slot; + uint pos_slot; + + /* + * Currently bound state + */ + struct aapoint_fragment_shader *fs; + + /* + * Driver interface/override functions + */ + void * (*driver_create_fs_state)(struct pipe_context *, + const struct pipe_shader_state *); + void (*driver_bind_fs_state)(struct pipe_context *, void *); + void (*driver_delete_fs_state)(struct pipe_context *, void *); + + struct pipe_context *pipe; +}; + + + +/** + * Subclass of tgsi_transform_context, used for transforming the + * user's fragment shader to add the special AA instructions. + */ +struct aa_transform_context { + struct tgsi_transform_context base; + uint tempsUsed; /**< bitmask */ + int colorOutput; /**< which output is the primary color */ + int maxInput, maxGeneric; /**< max input index found */ + int tmp0, colorTemp; /**< temp registers */ + boolean firstInstruction; +}; + + +/** + * TGSI declaration transform callback. + * Look for two free temp regs and available input reg for new texcoords. + */ +static void +aa_transform_decl(struct tgsi_transform_context *ctx, + struct tgsi_full_declaration *decl) +{ + struct aa_transform_context *aactx = (struct aa_transform_context *) ctx; + + if (decl->Declaration.File == TGSI_FILE_OUTPUT && + decl->Semantic.SemanticName == TGSI_SEMANTIC_COLOR && + decl->Semantic.SemanticIndex == 0) { + aactx->colorOutput = decl->DeclarationRange.First; + } + else if (decl->Declaration.File == TGSI_FILE_INPUT) { + if ((int) decl->DeclarationRange.Last > aactx->maxInput) + aactx->maxInput = decl->DeclarationRange.Last; + if (decl->Semantic.SemanticName == TGSI_SEMANTIC_GENERIC && + (int) decl->Semantic.SemanticIndex > aactx->maxGeneric) { + aactx->maxGeneric = decl->Semantic.SemanticIndex; + } + } + else if (decl->Declaration.File == TGSI_FILE_TEMPORARY) { + uint i; + for (i = decl->DeclarationRange.First; + i <= decl->DeclarationRange.Last; i++) { + aactx->tempsUsed |= (1 << i); + } + } + + ctx->emit_declaration(ctx, decl); +} + + +/** + * TGSI instruction transform callback. + * Replace writes to result.color w/ a temp reg. + * Upon END instruction, insert texture sampling code for antialiasing. 
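+ * More precisely, no texture is sampled for AA points: the coverage
+ * factor is computed arithmetically from the quad's generic attribute
+ * by code emitted ahead of the user's first instruction, and the code
+ * added at END only modulates the output color's alpha by it.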
+ */ +static void +aa_transform_inst(struct tgsi_transform_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct aa_transform_context *aactx = (struct aa_transform_context *) ctx; + struct tgsi_full_instruction newInst; + + if (aactx->firstInstruction) { + /* emit our new declarations before the first instruction */ + + struct tgsi_full_declaration decl; + const int texInput = aactx->maxInput + 1; + int tmp0; + uint i; + + /* find two free temp regs */ + for (i = 0; i < 32; i++) { + if ((aactx->tempsUsed & (1 << i)) == 0) { + /* found a free temp */ + if (aactx->tmp0 < 0) + aactx->tmp0 = i; + else if (aactx->colorTemp < 0) + aactx->colorTemp = i; + else + break; + } + } + + assert(aactx->colorTemp != aactx->tmp0); + + tmp0 = aactx->tmp0; + + /* declare new generic input/texcoord */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_INPUT; + /* XXX this could be linear... */ + decl.Declaration.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE; + decl.Declaration.Semantic = 1; + decl.Semantic.SemanticName = TGSI_SEMANTIC_GENERIC; + decl.Semantic.SemanticIndex = aactx->maxGeneric + 1; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = texInput; + ctx->emit_declaration(ctx, &decl); + + /* declare new temp regs */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_TEMPORARY; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = tmp0; + ctx->emit_declaration(ctx, &decl); + + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_TEMPORARY; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = aactx->colorTemp; + ctx->emit_declaration(ctx, &decl); + + aactx->firstInstruction = FALSE; + + + /* + * Emit code to compute fragment coverage, kill if outside point radius + * + * Temp reg0 usage: + * t0.x = distance of fragment from center point + * t0.y = boolean, is t0.x > 1.0, also misc temp usage + * t0.z = temporary for computing 1/(1-k) value + * t0.w = final coverage value + */ + + /* MUL t0.xy, tex, tex; # compute x^2, y^2 */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_MUL; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullDstRegisters[0].DstRegister.Index = tmp0; + newInst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_XY; + newInst.Instruction.NumSrcRegs = 2; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_INPUT; + newInst.FullSrcRegisters[0].SrcRegister.Index = texInput; + newInst.FullSrcRegisters[1].SrcRegister.File = TGSI_FILE_INPUT; + newInst.FullSrcRegisters[1].SrcRegister.Index = texInput; + ctx->emit_instruction(ctx, &newInst); + + /* ADD t0.x, t0.x, t0.y; # x^2 + y^2 */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_ADD; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullDstRegisters[0].DstRegister.Index = tmp0; + newInst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X; + newInst.Instruction.NumSrcRegs = 2; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[0].SrcRegister.Index = tmp0; + newInst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + newInst.FullSrcRegisters[1].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[1].SrcRegister.Index = tmp0; + newInst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y; + ctx->emit_instruction(ctx, 
&newInst); + +#if NORMALIZE /* OPTIONAL normalization of length */ + /* RSQ t0.x, t0.x; */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_RSQ; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullDstRegisters[0].DstRegister.Index = tmp0; + newInst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X; + newInst.Instruction.NumSrcRegs = 1; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[0].SrcRegister.Index = tmp0; + ctx->emit_instruction(ctx, &newInst); + + /* RCP t0.x, t0.x; */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_RCP; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullDstRegisters[0].DstRegister.Index = tmp0; + newInst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X; + newInst.Instruction.NumSrcRegs = 1; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[0].SrcRegister.Index = tmp0; + ctx->emit_instruction(ctx, &newInst); +#endif + + /* SGT t0.y, t0.xxxx, t0.wwww; # bool b = d > 1 (NOTE t0.w == 1) */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_SGT; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullDstRegisters[0].DstRegister.Index = tmp0; + newInst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_Y; + newInst.Instruction.NumSrcRegs = 2; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[0].SrcRegister.Index = tmp0; + newInst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + newInst.FullSrcRegisters[1].SrcRegister.File = TGSI_FILE_INPUT; + newInst.FullSrcRegisters[1].SrcRegister.Index = texInput; + newInst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_W; + ctx->emit_instruction(ctx, &newInst); + + /* KIL -tmp0.yyyy; # if -tmp0.y < 0, KILL */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_KIL; + newInst.Instruction.NumDstRegs = 0; + newInst.Instruction.NumSrcRegs = 1; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[0].SrcRegister.Index = tmp0; + newInst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y; + newInst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y; + newInst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y; + newInst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y; + newInst.FullSrcRegisters[0].SrcRegister.Negate = 1; + ctx->emit_instruction(ctx, &newInst); + + + /* compute coverage factor = (1-d)/(1-k) */ + + /* SUB t0.z, tex.w, tex.z; # m = 1 - k */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_SUB; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullDstRegisters[0].DstRegister.Index = tmp0; + newInst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_Z; + newInst.Instruction.NumSrcRegs = 2; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_INPUT; + newInst.FullSrcRegisters[0].SrcRegister.Index = texInput; + newInst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_W; + newInst.FullSrcRegisters[1].SrcRegister.File = TGSI_FILE_INPUT; + newInst.FullSrcRegisters[1].SrcRegister.Index = 
texInput; + newInst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Z; + ctx->emit_instruction(ctx, &newInst); + + /* RCP t0.z, t0.z; # t0.z = 1 / m */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_RCP; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullDstRegisters[0].DstRegister.Index = tmp0; + newInst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_Z; + newInst.Instruction.NumSrcRegs = 1; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[0].SrcRegister.Index = tmp0; + newInst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Z; + ctx->emit_instruction(ctx, &newInst); + + /* SUB t0.y, 1, t0.x; # d = 1 - d */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_SUB; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullDstRegisters[0].DstRegister.Index = tmp0; + newInst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_Y; + newInst.Instruction.NumSrcRegs = 2; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_INPUT; + newInst.FullSrcRegisters[0].SrcRegister.Index = texInput; + newInst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_W; + newInst.FullSrcRegisters[1].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[1].SrcRegister.Index = tmp0; + newInst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + ctx->emit_instruction(ctx, &newInst); + + /* MUL t0.w, t0.y, t0.z; # coverage = d * m */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_MUL; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullDstRegisters[0].DstRegister.Index = tmp0; + newInst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_W; + newInst.Instruction.NumSrcRegs = 2; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[0].SrcRegister.Index = tmp0; + newInst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y; + newInst.FullSrcRegisters[1].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[1].SrcRegister.Index = tmp0; + newInst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Z; + ctx->emit_instruction(ctx, &newInst); + + /* SLE t0.y, t0.x, tex.z; # bool b = distance <= k */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_SLE; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullDstRegisters[0].DstRegister.Index = tmp0; + newInst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_Y; + newInst.Instruction.NumSrcRegs = 2; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[0].SrcRegister.Index = tmp0; + newInst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + newInst.FullSrcRegisters[1].SrcRegister.File = TGSI_FILE_INPUT; + newInst.FullSrcRegisters[1].SrcRegister.Index = texInput; + newInst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Z; + ctx->emit_instruction(ctx, &newInst); + + /* CMP t0.w, -t0.y, tex.w, t0.w; + * # if -t0.y < 0 then + * t0.w = 1 + * else + * t0.w = t0.w + */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_CMP; + newInst.Instruction.NumDstRegs = 1; + 
newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullDstRegisters[0].DstRegister.Index = tmp0; + newInst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_W; + newInst.Instruction.NumSrcRegs = 3; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[0].SrcRegister.Index = tmp0; + newInst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y; + newInst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y; + newInst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y; + newInst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y; + newInst.FullSrcRegisters[0].SrcRegister.Negate = 1; + newInst.FullSrcRegisters[1].SrcRegister.File = TGSI_FILE_INPUT; + newInst.FullSrcRegisters[1].SrcRegister.Index = texInput; + newInst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_W; + newInst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_W; + newInst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_W; + newInst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_W; + newInst.FullSrcRegisters[2].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[2].SrcRegister.Index = tmp0; + newInst.FullSrcRegisters[2].SrcRegister.SwizzleX = TGSI_SWIZZLE_W; + newInst.FullSrcRegisters[2].SrcRegister.SwizzleY = TGSI_SWIZZLE_W; + newInst.FullSrcRegisters[2].SrcRegister.SwizzleZ = TGSI_SWIZZLE_W; + newInst.FullSrcRegisters[2].SrcRegister.SwizzleW = TGSI_SWIZZLE_W; + ctx->emit_instruction(ctx, &newInst); + + } + + if (inst->Instruction.Opcode == TGSI_OPCODE_END) { + /* add alpha modulation code at tail of program */ + + /* MOV result.color.xyz, colorTemp; */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_MOV; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_OUTPUT; + newInst.FullDstRegisters[0].DstRegister.Index = aactx->colorOutput; + newInst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_XYZ; + newInst.Instruction.NumSrcRegs = 1; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[0].SrcRegister.Index = aactx->colorTemp; + ctx->emit_instruction(ctx, &newInst); + + /* MUL result.color.w, colorTemp, tmp0.w; */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_MUL; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_OUTPUT; + newInst.FullDstRegisters[0].DstRegister.Index = aactx->colorOutput; + newInst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_W; + newInst.Instruction.NumSrcRegs = 2; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[0].SrcRegister.Index = aactx->colorTemp; + newInst.FullSrcRegisters[1].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[1].SrcRegister.Index = aactx->tmp0; + ctx->emit_instruction(ctx, &newInst); + } + else { + /* Not an END instruction. + * Look for writes to result.color and replace with colorTemp reg. + */ + uint i; + + for (i = 0; i < inst->Instruction.NumDstRegs; i++) { + struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i]; + if (dst->DstRegister.File == TGSI_FILE_OUTPUT && + dst->DstRegister.Index == aactx->colorOutput) { + dst->DstRegister.File = TGSI_FILE_TEMPORARY; + dst->DstRegister.Index = aactx->colorTemp; + } + } + } + + ctx->emit_instruction(ctx, inst); +} + + +/** + * Generate the frag shader we'll use for drawing AA points. 
+ * This will be the user's shader plus some texture/modulate instructions. + */ +static boolean +generate_aapoint_fs(struct aapoint_stage *aapoint) +{ + const struct pipe_shader_state *orig_fs = &aapoint->fs->state; + struct pipe_shader_state aapoint_fs; + struct aa_transform_context transform; + +#define MAX 1000 + + aapoint_fs = *orig_fs; /* copy to init */ + aapoint_fs.tokens = MALLOC(sizeof(struct tgsi_token) * MAX); + if (aapoint_fs.tokens == NULL) + return FALSE; + + memset(&transform, 0, sizeof(transform)); + transform.colorOutput = -1; + transform.maxInput = -1; + transform.maxGeneric = -1; + transform.colorTemp = -1; + transform.tmp0 = -1; + transform.firstInstruction = TRUE; + transform.base.transform_instruction = aa_transform_inst; + transform.base.transform_declaration = aa_transform_decl; + + tgsi_transform_shader(orig_fs->tokens, + (struct tgsi_token *) aapoint_fs.tokens, + MAX, &transform.base); + +#if 0 /* DEBUG */ + printf("draw_aapoint, orig shader:\n"); + tgsi_dump(orig_fs->tokens, 0); + printf("draw_aapoint, new shader:\n"); + tgsi_dump(aapoint_fs.tokens, 0); +#endif + + aapoint->fs->aapoint_fs + = aapoint->driver_create_fs_state(aapoint->pipe, &aapoint_fs); + if (aapoint->fs->aapoint_fs == NULL) + return FALSE; + + aapoint->fs->generic_attrib = transform.maxGeneric + 1; + + return TRUE; +} + + +/** + * When we're about to draw our first AA point in a batch, this function is + * called to tell the driver to bind our modified fragment shader. + */ +static boolean +bind_aapoint_fragment_shader(struct aapoint_stage *aapoint) +{ + struct draw_context *draw = aapoint->stage.draw; + + if (!aapoint->fs->aapoint_fs && + !generate_aapoint_fs(aapoint)) + return FALSE; + + draw->suspend_flushing = TRUE; + aapoint->driver_bind_fs_state(aapoint->pipe, aapoint->fs->aapoint_fs); + draw->suspend_flushing = FALSE; + + return TRUE; +} + + + +static INLINE struct aapoint_stage * +aapoint_stage( struct draw_stage *stage ) +{ + return (struct aapoint_stage *) stage; +} + + + + +/** + * Draw an AA point by drawing a quad. + */ +static void +aapoint_point(struct draw_stage *stage, struct prim_header *header) +{ + const struct aapoint_stage *aapoint = aapoint_stage(stage); + struct prim_header tri; + struct vertex_header *v[4]; + uint texPos = aapoint->tex_slot; + uint pos_slot = aapoint->pos_slot; + float radius, *pos, *tex; + uint i; + float k; + + if (aapoint->psize_slot >= 0) { + radius = 0.5f * header->v[0]->data[aapoint->psize_slot][0]; + } + else { + radius = aapoint->radius; + } + + /* + * Note: the texcoords (generic attrib, really) we use are special: + * The S and T components simply vary from -1 to +1. + * The R component is k, below. + * The Q component is 1.0 and will used as a handy constant in the + * fragment shader. + */ + + /* + * k is the threshold distance from the point's center at which + * we begin alpha attenuation (the coverage value). + * Operating within a unit circle, we'll compute the fragment's + * distance 'd' from the center point using the texcoords. + * IF d > 1.0 THEN + * KILL fragment + * ELSE IF d > k THEN + * compute coverage in [0,1] proportional to d in [k, 1]. + * ELSE + * coverage = 1.0; // full coverage + * ENDIF + * + * Note: the ELSEIF and ELSE clauses are actually implemented with CMP to + * avoid using IF/ELSE/ENDIF TGSI opcodes. 
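+    *
+    * The attenuation band is the outermost pixel of the point, which in
+    * unit-circle coordinates begins at 1 - 1/radius.  With NORMALIZE the
+    * shader compares true distances, so k = 1 - 1/radius; without it the
+    * shader works with squared distances, so we pass
+    * k = (1 - 1/radius)^2 = 1 - 2/radius + 1/radius^2, which is what the
+    * expression below computes.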
+ */ + +#if !NORMALIZE + k = 1.0f / radius; + k = 1.0f - 2.0f * k + k * k; +#else + k = 1.0f - 1.0f / radius; +#endif + + /* allocate/dup new verts */ + for (i = 0; i < 4; i++) { + v[i] = dup_vert(stage, header->v[0], i); + } + + /* new verts */ + pos = v[0]->data[pos_slot]; + pos[0] -= radius; + pos[1] -= radius; + + pos = v[1]->data[pos_slot]; + pos[0] += radius; + pos[1] -= radius; + + pos = v[2]->data[pos_slot]; + pos[0] += radius; + pos[1] += radius; + + pos = v[3]->data[pos_slot]; + pos[0] -= radius; + pos[1] += radius; + + /* new texcoords */ + tex = v[0]->data[texPos]; + ASSIGN_4V(tex, -1, -1, k, 1); + + tex = v[1]->data[texPos]; + ASSIGN_4V(tex, 1, -1, k, 1); + + tex = v[2]->data[texPos]; + ASSIGN_4V(tex, 1, 1, k, 1); + + tex = v[3]->data[texPos]; + ASSIGN_4V(tex, -1, 1, k, 1); + + /* emit 2 tris for the quad strip */ + tri.v[0] = v[0]; + tri.v[1] = v[1]; + tri.v[2] = v[2]; + stage->next->tri( stage->next, &tri ); + + tri.v[0] = v[0]; + tri.v[1] = v[2]; + tri.v[2] = v[3]; + stage->next->tri( stage->next, &tri ); +} + + +static void +aapoint_first_point(struct draw_stage *stage, struct prim_header *header) +{ + auto struct aapoint_stage *aapoint = aapoint_stage(stage); + struct draw_context *draw = stage->draw; + + assert(draw->rasterizer->point_smooth); + + if (draw->rasterizer->point_size <= 2.0) + aapoint->radius = 1.0; + else + aapoint->radius = 0.5f * draw->rasterizer->point_size; + + /* + * Bind (generate) our fragprog. + */ + bind_aapoint_fragment_shader(aapoint); + + /* update vertex attrib info */ + aapoint->tex_slot = draw->vs.num_vs_outputs; + assert(aapoint->tex_slot > 0); /* output[0] is vertex pos */ + + aapoint->pos_slot = draw->vs.position_output; + + draw->extra_vp_outputs.semantic_name = TGSI_SEMANTIC_GENERIC; + draw->extra_vp_outputs.semantic_index = aapoint->fs->generic_attrib; + draw->extra_vp_outputs.slot = aapoint->tex_slot; + + /* find psize slot in post-transform vertex */ + aapoint->psize_slot = -1; + if (draw->rasterizer->point_size_per_vertex) { + /* find PSIZ vertex output */ + const struct draw_vertex_shader *vs = draw->vs.vertex_shader; + uint i; + for (i = 0; i < vs->info.num_outputs; i++) { + if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_PSIZE) { + aapoint->psize_slot = i; + break; + } + } + } + + /* now really draw first point */ + stage->point = aapoint_point; + stage->point(stage, header); +} + + +static void +aapoint_flush(struct draw_stage *stage, unsigned flags) +{ + struct draw_context *draw = stage->draw; + struct aapoint_stage *aapoint = aapoint_stage(stage); + struct pipe_context *pipe = aapoint->pipe; + + stage->point = aapoint_first_point; + stage->next->flush( stage->next, flags ); + + /* restore original frag shader */ + draw->suspend_flushing = TRUE; + aapoint->driver_bind_fs_state(pipe, aapoint->fs->driver_fs); + draw->suspend_flushing = FALSE; + + draw->extra_vp_outputs.slot = 0; +} + + +static void +aapoint_reset_stipple_counter(struct draw_stage *stage) +{ + stage->next->reset_stipple_counter( stage->next ); +} + + +static void +aapoint_destroy(struct draw_stage *stage) +{ + draw_free_temp_verts( stage ); + FREE( stage ); +} + + +static struct aapoint_stage * +draw_aapoint_stage(struct draw_context *draw) +{ + struct aapoint_stage *aapoint = CALLOC_STRUCT(aapoint_stage); + if (aapoint == NULL) + goto fail; + + if (!draw_alloc_temp_verts( &aapoint->stage, 4 )) + goto fail; + + aapoint->stage.draw = draw; + aapoint->stage.next = NULL; + aapoint->stage.point = aapoint_first_point; + aapoint->stage.line = 
draw_pipe_passthrough_line; + aapoint->stage.tri = draw_pipe_passthrough_tri; + aapoint->stage.flush = aapoint_flush; + aapoint->stage.reset_stipple_counter = aapoint_reset_stipple_counter; + aapoint->stage.destroy = aapoint_destroy; + + return aapoint; + + fail: + if (aapoint) + aapoint_destroy(&aapoint->stage); + + return NULL; + +} + + +static struct aapoint_stage * +aapoint_stage_from_pipe(struct pipe_context *pipe) +{ + struct draw_context *draw = (struct draw_context *) pipe->draw; + return aapoint_stage(draw->pipeline.aapoint); +} + + +/** + * This function overrides the driver's create_fs_state() function and + * will typically be called by the state tracker. + */ +static void * +aapoint_create_fs_state(struct pipe_context *pipe, + const struct pipe_shader_state *fs) +{ + struct aapoint_stage *aapoint = aapoint_stage_from_pipe(pipe); + struct aapoint_fragment_shader *aafs = CALLOC_STRUCT(aapoint_fragment_shader); + if (aafs == NULL) + return NULL; + + aafs->state = *fs; + + /* pass-through */ + aafs->driver_fs = aapoint->driver_create_fs_state(aapoint->pipe, fs); + + return aafs; +} + + +static void +aapoint_bind_fs_state(struct pipe_context *pipe, void *fs) +{ + struct aapoint_stage *aapoint = aapoint_stage_from_pipe(pipe); + struct aapoint_fragment_shader *aafs = (struct aapoint_fragment_shader *) fs; + /* save current */ + aapoint->fs = aafs; + /* pass-through */ + aapoint->driver_bind_fs_state(aapoint->pipe, + (aafs ? aafs->driver_fs : NULL)); +} + + +static void +aapoint_delete_fs_state(struct pipe_context *pipe, void *fs) +{ + struct aapoint_stage *aapoint = aapoint_stage_from_pipe(pipe); + struct aapoint_fragment_shader *aafs = (struct aapoint_fragment_shader *) fs; + /* pass-through */ + aapoint->driver_delete_fs_state(aapoint->pipe, aafs->driver_fs); + FREE(aafs); +} + + +/** + * Called by drivers that want to install this AA point prim stage + * into the draw module's pipeline. This will not be used if the + * hardware has native support for AA points. + */ +boolean +draw_install_aapoint_stage(struct draw_context *draw, + struct pipe_context *pipe) +{ + struct aapoint_stage *aapoint; + + pipe->draw = (void *) draw; + + /* + * Create / install AA point drawing / prim stage + */ + aapoint = draw_aapoint_stage( draw ); + if (aapoint == NULL) + goto fail; + + aapoint->pipe = pipe; + + /* save original driver functions */ + aapoint->driver_create_fs_state = pipe->create_fs_state; + aapoint->driver_bind_fs_state = pipe->bind_fs_state; + aapoint->driver_delete_fs_state = pipe->delete_fs_state; + + /* override the driver's functions */ + pipe->create_fs_state = aapoint_create_fs_state; + pipe->bind_fs_state = aapoint_bind_fs_state; + pipe->delete_fs_state = aapoint_delete_fs_state; + + draw->pipeline.aapoint = &aapoint->stage; + + return TRUE; + + fail: + if (aapoint) + aapoint->stage.destroy( &aapoint->stage ); + + return FALSE; +} diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c new file mode 100644 index 0000000000..3265dcd154 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c @@ -0,0 +1,515 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \brief Clipping stage + * + * \author Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "pipe/p_shader_tokens.h" + +#include "draw_vs.h" +#include "draw_pipe.h" + + +#ifndef IS_NEGATIVE +#define IS_NEGATIVE(X) ((X) < 0.0) +#endif + +#ifndef DIFFERENT_SIGNS +#define DIFFERENT_SIGNS(x, y) ((x) * (y) <= 0.0F && (x) - (y) != 0.0F) +#endif + +#ifndef MAX_CLIPPED_VERTICES +#define MAX_CLIPPED_VERTICES ((2 * (6 + PIPE_MAX_CLIP_PLANES))+1) +#endif + + + +struct clipper { + struct draw_stage stage; /**< base class */ + + /* Basically duplicate some of the flatshading logic here: + */ + boolean flat; + uint num_color_attribs; + uint color_attribs[4]; /* front/back primary/secondary colors */ + + float (*plane)[4]; +}; + + +/* This is a bit confusing: + */ +static INLINE struct clipper *clipper_stage( struct draw_stage *stage ) +{ + return (struct clipper *)stage; +} + + +#define LINTERP(T, OUT, IN) ((OUT) + (T) * ((IN) - (OUT))) + + +/* All attributes are float[4], so this is easy: + */ +static void interp_attr( float *fdst, + float t, + const float *fin, + const float *fout ) +{ + fdst[0] = LINTERP( t, fout[0], fin[0] ); + fdst[1] = LINTERP( t, fout[1], fin[1] ); + fdst[2] = LINTERP( t, fout[2], fin[2] ); + fdst[3] = LINTERP( t, fout[3], fin[3] ); +} + +static void copy_colors( struct draw_stage *stage, + struct vertex_header *dst, + const struct vertex_header *src ) +{ + const struct clipper *clipper = clipper_stage(stage); + uint i; + for (i = 0; i < clipper->num_color_attribs; i++) { + const uint attr = clipper->color_attribs[i]; + COPY_4FV(dst->data[attr], src->data[attr]); + } +} + + + +/* Interpolate between two vertices to produce a third. + */ +static void interp( const struct clipper *clip, + struct vertex_header *dst, + float t, + const struct vertex_header *out, + const struct vertex_header *in ) +{ + const unsigned nr_attrs = clip->stage.draw->vs.num_vs_outputs; + const unsigned pos_attr = clip->stage.draw->vs.position_output; + unsigned j; + + /* Vertex header. 
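+    * Start the new vertex with a clean header rather than inheriting
+    * clipmask, edgeflag or vertex_id from either source vertex.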
+ */ + { + dst->clipmask = 0; + dst->edgeflag = 0; /* will get overwritten later */ + dst->pad = 0; + dst->vertex_id = UNDEFINED_VERTEX_ID; + } + + /* Clip coordinates: interpolate normally + */ + { + interp_attr(dst->clip, t, in->clip, out->clip); + } + + /* Do the projective divide and insert window coordinates: + */ + { + const float *pos = dst->clip; + const float *scale = clip->stage.draw->viewport.scale; + const float *trans = clip->stage.draw->viewport.translate; + const float oow = 1.0f / pos[3]; + + dst->data[pos_attr][0] = pos[0] * oow * scale[0] + trans[0]; + dst->data[pos_attr][1] = pos[1] * oow * scale[1] + trans[1]; + dst->data[pos_attr][2] = pos[2] * oow * scale[2] + trans[2]; + dst->data[pos_attr][3] = oow; + } + + /* Other attributes + */ + for (j = 0; j < nr_attrs; j++) { + if (j != pos_attr) + interp_attr(dst->data[j], t, in->data[j], out->data[j]); + } +} + + +static void emit_poly( struct draw_stage *stage, + struct vertex_header **inlist, + unsigned n, + const struct prim_header *origPrim) +{ + struct prim_header header; + unsigned i; + + const ushort edge_first = DRAW_PIPE_EDGE_FLAG_2; + const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0; + const ushort edge_last = DRAW_PIPE_EDGE_FLAG_1; + + /* later stages may need the determinant, but only the sign matters */ + header.det = origPrim->det; + header.flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle; + header.pad = 0; + + for (i = 2; i < n; i++, header.flags = edge_middle) { + header.v[0] = inlist[i-1]; + header.v[1] = inlist[i]; + header.v[2] = inlist[0]; /* keep in v[2] for flatshading */ + + if (i == n-1) + header.flags |= edge_last; + + if (0) { + const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader; + uint j, k; + debug_printf("Clipped tri:\n"); + for (j = 0; j < 3; j++) { + for (k = 0; k < vs->info.num_outputs; k++) { + debug_printf(" Vert %d: Attr %d: %f %f %f %f\n", j, k, + header.v[j]->data[k][0], + header.v[j]->data[k][1], + header.v[j]->data[k][2], + header.v[j]->data[k][3]); + } + } + } + + stage->next->tri( stage->next, &header ); + } +} + +static INLINE float +dot4(const float *a, const float *b) +{ + return (a[0]*b[0] + + a[1]*b[1] + + a[2]*b[2] + + a[3]*b[3]); +} + + +/* Clip a triangle against the viewport and user clip planes. + */ +static void +do_clip_tri( struct draw_stage *stage, + struct prim_header *header, + unsigned clipmask ) +{ + struct clipper *clipper = clipper_stage( stage ); + struct vertex_header *a[MAX_CLIPPED_VERTICES]; + struct vertex_header *b[MAX_CLIPPED_VERTICES]; + struct vertex_header **inlist = a; + struct vertex_header **outlist = b; + unsigned tmpnr = 0; + unsigned n = 3; + unsigned i; + + inlist[0] = header->v[0]; + inlist[1] = header->v[1]; + inlist[2] = header->v[2]; + + while (clipmask && n >= 3) { + const unsigned plane_idx = ffs(clipmask)-1; + const float *plane = clipper->plane[plane_idx]; + struct vertex_header *vert_prev = inlist[0]; + float dp_prev = dot4( vert_prev->clip, plane ); + unsigned outcount = 0; + + clipmask &= ~(1<<plane_idx); + + inlist[n] = inlist[0]; /* prevent rotation of vertices */ + + for (i = 1; i <= n; i++) { + struct vertex_header *vert = inlist[i]; + + float dp = dot4( vert->clip, plane ); + + if (!IS_NEGATIVE(dp_prev)) { + outlist[outcount++] = vert_prev; + } + + if (DIFFERENT_SIGNS(dp, dp_prev)) { + struct vertex_header *new_vert = clipper->stage.tmp[tmpnr++]; + outlist[outcount++] = new_vert; + + if (IS_NEGATIVE(dp)) { + /* Going out of bounds. 
Avoid division by zero as we + * know dp != dp_prev from DIFFERENT_SIGNS, above. + */ + float t = dp / (dp - dp_prev); + interp( clipper, new_vert, t, vert, vert_prev ); + + /* Force edgeflag true in this case: + */ + new_vert->edgeflag = 1; + } else { + /* Coming back in. + */ + float t = dp_prev / (dp_prev - dp); + interp( clipper, new_vert, t, vert_prev, vert ); + + /* Copy starting vert's edgeflag: + */ + new_vert->edgeflag = vert_prev->edgeflag; + } + } + + vert_prev = vert; + dp_prev = dp; + } + + { + struct vertex_header **tmp = inlist; + inlist = outlist; + outlist = tmp; + n = outcount; + } + } + + /* If flat-shading, copy color to new provoking vertex. + */ + if (clipper->flat && inlist[0] != header->v[2]) { + if (1) { + inlist[0] = dup_vert(stage, inlist[0], tmpnr++); + } + + copy_colors(stage, inlist[0], header->v[2]); + } + + + + /* Emit the polygon as triangles to the setup stage: + */ + if (n >= 3) + emit_poly( stage, inlist, n, header ); +} + + +/* Clip a line against the viewport and user clip planes. + */ +static void +do_clip_line( struct draw_stage *stage, + struct prim_header *header, + unsigned clipmask ) +{ + const struct clipper *clipper = clipper_stage( stage ); + struct vertex_header *v0 = header->v[0]; + struct vertex_header *v1 = header->v[1]; + const float *pos0 = v0->clip; + const float *pos1 = v1->clip; + float t0 = 0.0F; + float t1 = 0.0F; + struct prim_header newprim; + + while (clipmask) { + const unsigned plane_idx = ffs(clipmask)-1; + const float *plane = clipper->plane[plane_idx]; + const float dp0 = dot4( pos0, plane ); + const float dp1 = dot4( pos1, plane ); + + if (dp1 < 0.0F) { + float t = dp1 / (dp1 - dp0); + t1 = MAX2(t1, t); + } + + if (dp0 < 0.0F) { + float t = dp0 / (dp0 - dp1); + t0 = MAX2(t0, t); + } + + if (t0 + t1 >= 1.0F) + return; /* discard */ + + clipmask &= ~(1 << plane_idx); /* turn off this plane's bit */ + } + + if (v0->clipmask) { + interp( clipper, stage->tmp[0], t0, v0, v1 ); + + if (clipper->flat) + copy_colors(stage, stage->tmp[0], v0); + + newprim.v[0] = stage->tmp[0]; + } + else { + newprim.v[0] = v0; + } + + if (v1->clipmask) { + interp( clipper, stage->tmp[1], t1, v1, v0 ); + newprim.v[1] = stage->tmp[1]; + } + else { + newprim.v[1] = v1; + } + + stage->next->line( stage->next, &newprim ); +} + + +static void +clip_point( struct draw_stage *stage, + struct prim_header *header ) +{ + if (header->v[0]->clipmask == 0) + stage->next->point( stage->next, header ); +} + + +static void +clip_line( struct draw_stage *stage, + struct prim_header *header ) +{ + unsigned clipmask = (header->v[0]->clipmask | + header->v[1]->clipmask); + + if (clipmask == 0) { + /* no clipping needed */ + stage->next->line( stage->next, header ); + } + else if ((header->v[0]->clipmask & + header->v[1]->clipmask) == 0) { + do_clip_line(stage, header, clipmask); + } + /* else, totally clipped */ +} + + +static void +clip_tri( struct draw_stage *stage, + struct prim_header *header ) +{ + unsigned clipmask = (header->v[0]->clipmask | + header->v[1]->clipmask | + header->v[2]->clipmask); + + if (clipmask == 0) { + /* no clipping needed */ + stage->next->tri( stage->next, header ); + } + else if ((header->v[0]->clipmask & + header->v[1]->clipmask & + header->v[2]->clipmask) == 0) { + do_clip_tri(stage, header, clipmask); + } +} + +/* Update state. Could further delay this until we hit the first + * primitive that really requires clipping. 
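+ *
+ * Here we just cache the rasterizer's flatshade setting and, when it is
+ * enabled, the list of color/back-color vertex outputs that need flat
+ * handling when clipping introduces new vertices.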
+ */ +static void +clip_init_state( struct draw_stage *stage ) +{ + struct clipper *clipper = clipper_stage( stage ); + + clipper->flat = stage->draw->rasterizer->flatshade ? TRUE : FALSE; + + if (clipper->flat) { + const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader; + uint i; + + clipper->num_color_attribs = 0; + for (i = 0; i < vs->info.num_outputs; i++) { + if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_COLOR || + vs->info.output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) { + clipper->color_attribs[clipper->num_color_attribs++] = i; + } + } + } + + stage->tri = clip_tri; + stage->line = clip_line; +} + + + +static void clip_first_tri( struct draw_stage *stage, + struct prim_header *header ) +{ + clip_init_state( stage ); + stage->tri( stage, header ); +} + +static void clip_first_line( struct draw_stage *stage, + struct prim_header *header ) +{ + clip_init_state( stage ); + stage->line( stage, header ); +} + + +static void clip_flush( struct draw_stage *stage, + unsigned flags ) +{ + stage->tri = clip_first_tri; + stage->line = clip_first_line; + stage->next->flush( stage->next, flags ); +} + + +static void clip_reset_stipple_counter( struct draw_stage *stage ) +{ + stage->next->reset_stipple_counter( stage->next ); +} + + +static void clip_destroy( struct draw_stage *stage ) +{ + draw_free_temp_verts( stage ); + FREE( stage ); +} + + +/** + * Allocate a new clipper stage. + * \return pointer to new stage object + */ +struct draw_stage *draw_clip_stage( struct draw_context *draw ) +{ + struct clipper *clipper = CALLOC_STRUCT(clipper); + if (clipper == NULL) + goto fail; + + if (!draw_alloc_temp_verts( &clipper->stage, MAX_CLIPPED_VERTICES+1 )) + goto fail; + + clipper->stage.draw = draw; + clipper->stage.point = clip_point; + clipper->stage.line = clip_first_line; + clipper->stage.tri = clip_first_tri; + clipper->stage.flush = clip_flush; + clipper->stage.reset_stipple_counter = clip_reset_stipple_counter; + clipper->stage.destroy = clip_destroy; + + clipper->plane = draw->plane; + + return &clipper->stage; + + fail: + if (clipper) + clipper->stage.destroy( &clipper->stage ); + + return NULL; +} diff --git a/src/gallium/auxiliary/draw/draw_pipe_cull.c b/src/gallium/auxiliary/draw/draw_pipe_cull.c new file mode 100644 index 0000000000..053be5f050 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe_cull.c @@ -0,0 +1,147 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \brief Drawing stage for polygon culling + */ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "util/u_memory.h" +#include "pipe/p_defines.h" +#include "draw_pipe.h" + + +struct cull_stage { + struct draw_stage stage; + unsigned winding; /**< which winding(s) to cull (one of PIPE_WINDING_x) */ +}; + + +static INLINE struct cull_stage *cull_stage( struct draw_stage *stage ) +{ + return (struct cull_stage *)stage; +} + + + + +static void cull_tri( struct draw_stage *stage, + struct prim_header *header ) +{ + const unsigned pos = stage->draw->vs.position_output; + + /* Window coords: */ + const float *v0 = header->v[0]->data[pos]; + const float *v1 = header->v[1]->data[pos]; + const float *v2 = header->v[2]->data[pos]; + + /* edge vectors e = v0 - v2, f = v1 - v2 */ + const float ex = v0[0] - v2[0]; + const float ey = v0[1] - v2[1]; + const float fx = v1[0] - v2[0]; + const float fy = v1[1] - v2[1]; + + /* det = cross(e,f).z */ + header->det = ex * fy - ey * fx; + + if (header->det != 0) { + /* if (det < 0 then Z points toward camera and triangle is + * counter-clockwise winding. + */ + unsigned winding = (header->det < 0) ? PIPE_WINDING_CCW : PIPE_WINDING_CW; + + if ((winding & cull_stage(stage)->winding) == 0) { + /* triangle is not culled, pass to next stage */ + stage->next->tri( stage->next, header ); + } + } +} + +static void cull_first_tri( struct draw_stage *stage, + struct prim_header *header ) +{ + struct cull_stage *cull = cull_stage(stage); + + cull->winding = stage->draw->rasterizer->cull_mode; + + stage->tri = cull_tri; + stage->tri( stage, header ); +} + + + +static void cull_flush( struct draw_stage *stage, unsigned flags ) +{ + stage->tri = cull_first_tri; + stage->next->flush( stage->next, flags ); +} + +static void cull_reset_stipple_counter( struct draw_stage *stage ) +{ + stage->next->reset_stipple_counter( stage->next ); +} + + +static void cull_destroy( struct draw_stage *stage ) +{ + draw_free_temp_verts( stage ); + FREE( stage ); +} + + +/** + * Create a new polygon culling stage. + */ +struct draw_stage *draw_cull_stage( struct draw_context *draw ) +{ + struct cull_stage *cull = CALLOC_STRUCT(cull_stage); + if (cull == NULL) + goto fail; + + if (!draw_alloc_temp_verts( &cull->stage, 0 )) + goto fail; + + cull->stage.draw = draw; + cull->stage.next = NULL; + cull->stage.point = draw_pipe_passthrough_point; + cull->stage.line = draw_pipe_passthrough_line; + cull->stage.tri = cull_first_tri; + cull->stage.flush = cull_flush; + cull->stage.reset_stipple_counter = cull_reset_stipple_counter; + cull->stage.destroy = cull_destroy; + + return &cull->stage; + + fail: + if (cull) + cull->stage.destroy( &cull->stage ); + + return NULL; +} diff --git a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c new file mode 100644 index 0000000000..43d1fecc4d --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c @@ -0,0 +1,281 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "pipe/p_shader_tokens.h" +#include "draw_vs.h" +#include "draw_pipe.h" + + +/** subclass of draw_stage */ +struct flat_stage +{ + struct draw_stage stage; + + uint num_color_attribs; + uint color_attribs[2]; /* front/back primary colors */ + + uint num_spec_attribs; + uint spec_attribs[2]; /* front/back secondary colors */ +}; + +#define COPY_3FV( DST, SRC ) \ +do { \ + (DST)[0] = (SRC)[0]; \ + (DST)[1] = (SRC)[1]; \ + (DST)[2] = (SRC)[2]; \ +} while (0) + + +static INLINE struct flat_stage * +flat_stage(struct draw_stage *stage) +{ + return (struct flat_stage *) stage; +} + + +/** Copy all the color attributes from 'src' vertex to 'dst' vertex */ +static INLINE void copy_colors( struct draw_stage *stage, + struct vertex_header *dst, + const struct vertex_header *src ) +{ + const struct flat_stage *flat = flat_stage(stage); + uint i; + + for (i = 0; i < flat->num_color_attribs; i++) { + const uint attr = flat->color_attribs[i]; + COPY_4FV(dst->data[attr], src->data[attr]); + } + + for (i = 0; i < flat->num_spec_attribs; i++) { + const uint attr = flat->spec_attribs[i]; + COPY_3FV(dst->data[attr], src->data[attr]); + } +} + + +/** Copy all the color attributes from src vertex to dst0 & dst1 vertices */ +static INLINE void copy_colors2( struct draw_stage *stage, + struct vertex_header *dst0, + struct vertex_header *dst1, + const struct vertex_header *src ) +{ + const struct flat_stage *flat = flat_stage(stage); + uint i; + for (i = 0; i < flat->num_color_attribs; i++) { + const uint attr = flat->color_attribs[i]; + COPY_4FV(dst0->data[attr], src->data[attr]); + COPY_4FV(dst1->data[attr], src->data[attr]); + } + + for (i = 0; i < flat->num_spec_attribs; i++) { + const uint attr = flat->spec_attribs[i]; + COPY_3FV(dst0->data[attr], src->data[attr]); + COPY_3FV(dst1->data[attr], src->data[attr]); + } +} + + +/** + * Flatshade tri. Required for clipping and when unfilled tris are + * active, otherwise handled by hardware. 
+ */ +static void flatshade_tri_0( struct draw_stage *stage, + struct prim_header *header ) +{ + struct prim_header tmp; + + tmp.det = header->det; + tmp.flags = header->flags; + tmp.pad = header->pad; + tmp.v[0] = header->v[0]; + tmp.v[1] = dup_vert(stage, header->v[1], 0); + tmp.v[2] = dup_vert(stage, header->v[2], 1); + + copy_colors2(stage, tmp.v[1], tmp.v[2], tmp.v[0]); + + stage->next->tri( stage->next, &tmp ); +} + + +static void flatshade_tri_2( struct draw_stage *stage, + struct prim_header *header ) +{ + struct prim_header tmp; + + tmp.det = header->det; + tmp.flags = header->flags; + tmp.pad = header->pad; + tmp.v[0] = dup_vert(stage, header->v[0], 0); + tmp.v[1] = dup_vert(stage, header->v[1], 1); + tmp.v[2] = header->v[2]; + + copy_colors2(stage, tmp.v[0], tmp.v[1], tmp.v[2]); + + stage->next->tri( stage->next, &tmp ); +} + + + + + +/** + * Flatshade line. Required for clipping. + */ +static void flatshade_line_0( struct draw_stage *stage, + struct prim_header *header ) +{ + struct prim_header tmp; + + tmp.v[0] = header->v[0]; + tmp.v[1] = dup_vert(stage, header->v[1], 0); + + copy_colors(stage, tmp.v[1], tmp.v[0]); + + stage->next->line( stage->next, &tmp ); +} + +static void flatshade_line_1( struct draw_stage *stage, + struct prim_header *header ) +{ + struct prim_header tmp; + + tmp.v[0] = dup_vert(stage, header->v[0], 0); + tmp.v[1] = header->v[1]; + + copy_colors(stage, tmp.v[0], tmp.v[1]); + + stage->next->line( stage->next, &tmp ); +} + + + + +static void flatshade_init_state( struct draw_stage *stage ) +{ + struct flat_stage *flat = flat_stage(stage); + const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader; + uint i; + + /* Find which vertex shader outputs are colors, make a list */ + flat->num_color_attribs = 0; + flat->num_spec_attribs = 0; + for (i = 0; i < vs->info.num_outputs; i++) { + if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_COLOR || + vs->info.output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) { + if (vs->info.output_semantic_index[i] == 0) + flat->color_attribs[flat->num_color_attribs++] = i; + else + flat->spec_attribs[flat->num_spec_attribs++] = i; + } + } + + /* Choose flatshade routine according to provoking vertex: + */ + if (stage->draw->rasterizer->flatshade_first) { + stage->line = flatshade_line_0; + stage->tri = flatshade_tri_0; + } + else { + stage->line = flatshade_line_1; + stage->tri = flatshade_tri_2; + } +} + +static void flatshade_first_tri( struct draw_stage *stage, + struct prim_header *header ) +{ + flatshade_init_state( stage ); + stage->tri( stage, header ); +} + +static void flatshade_first_line( struct draw_stage *stage, + struct prim_header *header ) +{ + flatshade_init_state( stage ); + stage->line( stage, header ); +} + + +static void flatshade_flush( struct draw_stage *stage, + unsigned flags ) +{ + stage->tri = flatshade_first_tri; + stage->line = flatshade_first_line; + stage->next->flush( stage->next, flags ); +} + + +static void flatshade_reset_stipple_counter( struct draw_stage *stage ) +{ + stage->next->reset_stipple_counter( stage->next ); +} + + +static void flatshade_destroy( struct draw_stage *stage ) +{ + draw_free_temp_verts( stage ); + FREE( stage ); +} + + +/** + * Create flatshading drawing stage. 
+ */ +struct draw_stage *draw_flatshade_stage( struct draw_context *draw ) +{ + struct flat_stage *flatshade = CALLOC_STRUCT(flat_stage); + if (flatshade == NULL) + goto fail; + + if (!draw_alloc_temp_verts( &flatshade->stage, 2 )) + goto fail; + + flatshade->stage.draw = draw; + flatshade->stage.next = NULL; + flatshade->stage.point = draw_pipe_passthrough_point; + flatshade->stage.line = flatshade_first_line; + flatshade->stage.tri = flatshade_first_tri; + flatshade->stage.flush = flatshade_flush; + flatshade->stage.reset_stipple_counter = flatshade_reset_stipple_counter; + flatshade->stage.destroy = flatshade_destroy; + + return &flatshade->stage; + + fail: + if (flatshade) + flatshade->stage.destroy( &flatshade->stage ); + + return NULL; +} + + diff --git a/src/gallium/auxiliary/draw/draw_pipe_offset.c b/src/gallium/auxiliary/draw/draw_pipe_offset.c new file mode 100644 index 0000000000..8ddc4ee617 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe_offset.c @@ -0,0 +1,184 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \brief polygon offset state + * + * \author Keith Whitwell <keith@tungstengraphics.com> + * \author Brian Paul + */ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "draw_pipe.h" + + + +struct offset_stage { + struct draw_stage stage; + + float scale; + float units; +}; + + + +static INLINE struct offset_stage *offset_stage( struct draw_stage *stage ) +{ + return (struct offset_stage *) stage; +} + + + + + +/** + * Offset tri Z. Some hardware can handle this, but not usually when + * doing unfilled rendering. 
+ */ +static void do_offset_tri( struct draw_stage *stage, + struct prim_header *header ) +{ + const unsigned pos = stage->draw->vs.position_output; + struct offset_stage *offset = offset_stage(stage); + float inv_det = 1.0f / header->det; + + /* Window coords: + */ + float *v0 = header->v[0]->data[pos]; + float *v1 = header->v[1]->data[pos]; + float *v2 = header->v[2]->data[pos]; + + /* edge vectors e = v0 - v2, f = v1 - v2 */ + float ex = v0[0] - v2[0]; + float ey = v0[1] - v2[1]; + float ez = v0[2] - v2[2]; + float fx = v1[0] - v2[0]; + float fy = v1[1] - v2[1]; + float fz = v1[2] - v2[2]; + + /* (a,b) = cross(e,f).xy */ + float a = ey*fz - ez*fy; + float b = ez*fx - ex*fz; + + float dzdx = fabsf(a * inv_det); + float dzdy = fabsf(b * inv_det); + + float zoffset = offset->units + MAX2(dzdx, dzdy) * offset->scale; + + /* + * Note: we're applying the offset and clamping per-vertex. + * Ideally, the offset is applied per-fragment prior to fragment shading. + */ + v0[2] = CLAMP(v0[2] + zoffset, 0.0f, 1.0f); + v1[2] = CLAMP(v1[2] + zoffset, 0.0f, 1.0f); + v2[2] = CLAMP(v2[2] + zoffset, 0.0f, 1.0f); + + stage->next->tri( stage->next, header ); +} + + +static void offset_tri( struct draw_stage *stage, + struct prim_header *header ) +{ + struct prim_header tmp; + + tmp.det = header->det; + tmp.flags = header->flags; + tmp.pad = header->pad; + tmp.v[0] = dup_vert(stage, header->v[0], 0); + tmp.v[1] = dup_vert(stage, header->v[1], 1); + tmp.v[2] = dup_vert(stage, header->v[2], 2); + + do_offset_tri( stage, &tmp ); +} + + +static void offset_first_tri( struct draw_stage *stage, + struct prim_header *header ) +{ + struct offset_stage *offset = offset_stage(stage); + + offset->units = stage->draw->rasterizer->offset_units * stage->draw->mrd; + offset->scale = stage->draw->rasterizer->offset_scale; + + stage->tri = offset_tri; + stage->tri( stage, header ); +} + + + + +static void offset_flush( struct draw_stage *stage, + unsigned flags ) +{ + stage->tri = offset_first_tri; + stage->next->flush( stage->next, flags ); +} + + +static void offset_reset_stipple_counter( struct draw_stage *stage ) +{ + stage->next->reset_stipple_counter( stage->next ); +} + + +static void offset_destroy( struct draw_stage *stage ) +{ + draw_free_temp_verts( stage ); + FREE( stage ); +} + + +/** + * Create polygon offset drawing stage. + */ +struct draw_stage *draw_offset_stage( struct draw_context *draw ) +{ + struct offset_stage *offset = CALLOC_STRUCT(offset_stage); + if (offset == NULL) + goto fail; + + draw_alloc_temp_verts( &offset->stage, 3 ); + + offset->stage.draw = draw; + offset->stage.next = NULL; + offset->stage.point = draw_pipe_passthrough_point; + offset->stage.line = draw_pipe_passthrough_line; + offset->stage.tri = offset_first_tri; + offset->stage.flush = offset_flush; + offset->stage.reset_stipple_counter = offset_reset_stipple_counter; + offset->stage.destroy = offset_destroy; + + return &offset->stage; + + fail: + if (offset) + offset->stage.destroy( &offset->stage ); + + return NULL; +} diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c new file mode 100644 index 0000000000..b764d9c518 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c @@ -0,0 +1,770 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Polygon stipple stage: implement polygon stipple with texture map and + * fragment program. The fragment program samples the texture and does + * a fragment kill for the stipple-failing fragments. + * + * Authors: Brian Paul + */ + + +#include "pipe/p_inlines.h" +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" + +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "tgsi/tgsi_transform.h" +#include "tgsi/tgsi_dump.h" + +#include "draw_context.h" +#include "draw_pipe.h" + + + +/** + * Subclass of pipe_shader_state to carry extra fragment shader info. + */ +struct pstip_fragment_shader +{ + struct pipe_shader_state state; + void *driver_fs; + void *pstip_fs; + uint sampler_unit; +}; + + +/** + * Subclass of draw_stage + */ +struct pstip_stage +{ + struct draw_stage stage; + + void *sampler_cso; + struct pipe_texture *texture; + uint num_samplers; + uint num_textures; + + /* + * Currently bound state + */ + struct pstip_fragment_shader *fs; + struct { + void *samplers[PIPE_MAX_SAMPLERS]; + struct pipe_texture *textures[PIPE_MAX_SAMPLERS]; + const struct pipe_poly_stipple *stipple; + } state; + + /* + * Driver interface/override functions + */ + void * (*driver_create_fs_state)(struct pipe_context *, + const struct pipe_shader_state *); + void (*driver_bind_fs_state)(struct pipe_context *, void *); + void (*driver_delete_fs_state)(struct pipe_context *, void *); + + void (*driver_bind_sampler_states)(struct pipe_context *, unsigned, void **); + + void (*driver_set_sampler_textures)(struct pipe_context *, unsigned, + struct pipe_texture **); + + void (*driver_set_polygon_stipple)(struct pipe_context *, + const struct pipe_poly_stipple *); + + struct pipe_context *pipe; +}; + + + +/** + * Subclass of tgsi_transform_context, used for transforming the + * user's fragment shader to add the special AA instructions. + */ +struct pstip_transform_context { + struct tgsi_transform_context base; + uint tempsUsed; /**< bitmask */ + int wincoordInput; + int maxInput; + uint samplersUsed; /**< bitfield of samplers used */ + int freeSampler; /** an available sampler for the pstipple */ + int texTemp; /**< temp registers */ + int numImmed; + boolean firstInstruction; +}; + + +/** + * TGSI declaration transform callback. 
+ * Look for a free sampler, a free input attrib, and two free temp regs. + */ +static void +pstip_transform_decl(struct tgsi_transform_context *ctx, + struct tgsi_full_declaration *decl) +{ + struct pstip_transform_context *pctx = (struct pstip_transform_context *) ctx; + + if (decl->Declaration.File == TGSI_FILE_SAMPLER) { + uint i; + for (i = decl->DeclarationRange.First; + i <= decl->DeclarationRange.Last; i++) { + pctx->samplersUsed |= 1 << i; + } + } + else if (decl->Declaration.File == TGSI_FILE_INPUT) { + pctx->maxInput = MAX2(pctx->maxInput, (int) decl->DeclarationRange.Last); + if (decl->Semantic.SemanticName == TGSI_SEMANTIC_POSITION) + pctx->wincoordInput = (int) decl->DeclarationRange.First; + } + else if (decl->Declaration.File == TGSI_FILE_TEMPORARY) { + uint i; + for (i = decl->DeclarationRange.First; + i <= decl->DeclarationRange.Last; i++) { + pctx->tempsUsed |= (1 << i); + } + } + + ctx->emit_declaration(ctx, decl); +} + + +static void +pstip_transform_immed(struct tgsi_transform_context *ctx, + struct tgsi_full_immediate *immed) +{ + struct pstip_transform_context *pctx = (struct pstip_transform_context *) ctx; + pctx->numImmed++; +} + + +/** + * Find the lowest zero bit in the given word, or -1 if bitfield is all ones. + */ +static int +free_bit(uint bitfield) +{ + int i; + for (i = 0; i < 32; i++) { + if ((bitfield & (1 << i)) == 0) + return i; + } + return -1; +} + + +/** + * TGSI instruction transform callback. + * Replace writes to result.color w/ a temp reg. + * Upon END instruction, insert texture sampling code for antialiasing. + */ +static void +pstip_transform_inst(struct tgsi_transform_context *ctx, + struct tgsi_full_instruction *inst) +{ + struct pstip_transform_context *pctx = (struct pstip_transform_context *) ctx; + + if (pctx->firstInstruction) { + /* emit our new declarations before the first instruction */ + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction newInst; + uint i; + int wincoordInput; + + /* find free sampler */ + pctx->freeSampler = free_bit(pctx->samplersUsed); + if (pctx->freeSampler >= PIPE_MAX_SAMPLERS) + pctx->freeSampler = PIPE_MAX_SAMPLERS - 1; + + if (pctx->wincoordInput < 0) + wincoordInput = pctx->maxInput + 1; + else + wincoordInput = pctx->wincoordInput; + + /* find one free temp reg */ + for (i = 0; i < 32; i++) { + if ((pctx->tempsUsed & (1 << i)) == 0) { + /* found a free temp */ + if (pctx->texTemp < 0) + pctx->texTemp = i; + else + break; + } + } + assert(pctx->texTemp >= 0); + + if (pctx->wincoordInput < 0) { + /* declare new position input reg */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_INPUT; + decl.Declaration.Interpolate = TGSI_INTERPOLATE_LINEAR; /* XXX? 
*/ + decl.Declaration.Semantic = 1; + decl.Semantic.SemanticName = TGSI_SEMANTIC_POSITION; + decl.Semantic.SemanticIndex = 0; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = wincoordInput; + ctx->emit_declaration(ctx, &decl); + } + + /* declare new sampler */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_SAMPLER; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = pctx->freeSampler; + ctx->emit_declaration(ctx, &decl); + + /* declare new temp regs */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_TEMPORARY; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = pctx->texTemp; + ctx->emit_declaration(ctx, &decl); + + /* emit immediate = {1/32, 1/32, 1, 1} + * The index/position of this immediate will be pctx->numImmed + */ + { + static const float value[4] = { 1.0/32, 1.0/32, 1.0, 1.0 }; + struct tgsi_full_immediate immed; + uint size = 4; + immed = tgsi_default_full_immediate(); + immed.Immediate.Size = 1 + size; /* one for the token itself */ + immed.u.Pointer = (void *) value; + ctx->emit_immediate(ctx, &immed); + } + + pctx->firstInstruction = FALSE; + + + /* + * Insert new MUL/TEX/KILP instructions at start of program + * Take gl_FragCoord, divide by 32 (stipple size), sample the + * texture and kill fragment if needed. + * + * We'd like to use non-normalized texcoords to index into a RECT + * texture, but we can only use GL_REPEAT wrap mode with normalized + * texcoords. Darn. + */ + + /* MUL texTemp, INPUT[wincoord], 1/32; */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_MUL; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullDstRegisters[0].DstRegister.Index = pctx->texTemp; + newInst.Instruction.NumSrcRegs = 2; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_INPUT; + newInst.FullSrcRegisters[0].SrcRegister.Index = wincoordInput; + newInst.FullSrcRegisters[1].SrcRegister.File = TGSI_FILE_IMMEDIATE; + newInst.FullSrcRegisters[1].SrcRegister.Index = pctx->numImmed; + ctx->emit_instruction(ctx, &newInst); + + /* TEX texTemp, texTemp, sampler; */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_TEX; + newInst.Instruction.NumDstRegs = 1; + newInst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullDstRegisters[0].DstRegister.Index = pctx->texTemp; + newInst.Instruction.NumSrcRegs = 2; + newInst.InstructionExtTexture.Texture = TGSI_TEXTURE_2D; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[0].SrcRegister.Index = pctx->texTemp; + newInst.FullSrcRegisters[1].SrcRegister.File = TGSI_FILE_SAMPLER; + newInst.FullSrcRegisters[1].SrcRegister.Index = pctx->freeSampler; + ctx->emit_instruction(ctx, &newInst); + + /* KIL -texTemp; # if -texTemp < 0, KILL fragment */ + newInst = tgsi_default_full_instruction(); + newInst.Instruction.Opcode = TGSI_OPCODE_KIL; + newInst.Instruction.NumDstRegs = 0; + newInst.Instruction.NumSrcRegs = 1; + newInst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_TEMPORARY; + newInst.FullSrcRegisters[0].SrcRegister.Index = pctx->texTemp; + newInst.FullSrcRegisters[0].SrcRegister.Negate = 1; + ctx->emit_instruction(ctx, &newInst); + } + + /* emit this instruction */ + ctx->emit_instruction(ctx, inst); +} + + +/** + * Generate the frag shader we'll use for doing polygon stipple. 
+ * This will be the user's shader prefixed with a TEX and KIL instruction. + */ +static boolean +generate_pstip_fs(struct pstip_stage *pstip) +{ + const struct pipe_shader_state *orig_fs = &pstip->fs->state; + /*struct draw_context *draw = pstip->stage.draw;*/ + struct pipe_shader_state pstip_fs; + struct pstip_transform_context transform; + +#define MAX 1000 + + pstip_fs = *orig_fs; /* copy to init */ + pstip_fs.tokens = MALLOC(sizeof(struct tgsi_token) * MAX); + if (pstip_fs.tokens == NULL) + return FALSE; + + memset(&transform, 0, sizeof(transform)); + transform.wincoordInput = -1; + transform.maxInput = -1; + transform.texTemp = -1; + transform.firstInstruction = TRUE; + transform.base.transform_instruction = pstip_transform_inst; + transform.base.transform_declaration = pstip_transform_decl; + transform.base.transform_immediate = pstip_transform_immed; + + tgsi_transform_shader(orig_fs->tokens, + (struct tgsi_token *) pstip_fs.tokens, + MAX, &transform.base); + +#if 0 /* DEBUG */ + tgsi_dump(orig_fs->tokens, 0); + tgsi_dump(pstip_fs.tokens, 0); +#endif + + pstip->fs->sampler_unit = transform.freeSampler; + assert(pstip->fs->sampler_unit < PIPE_MAX_SAMPLERS); + + pstip->fs->pstip_fs = pstip->driver_create_fs_state(pstip->pipe, &pstip_fs); + + return TRUE; +} + + +/** + * Load texture image with current stipple pattern. + */ +static void +pstip_update_texture(struct pstip_stage *pstip) +{ + static const uint bit31 = 1 << 31; + struct pipe_context *pipe = pstip->pipe; + struct pipe_screen *screen = pipe->screen; + struct pipe_surface *surface; + const uint *stipple = pstip->state.stipple->stipple; + uint i, j; + ubyte *data; + + /* XXX: want to avoid flushing just because we use stipple: + */ + pipe->flush( pipe, PIPE_FLUSH_TEXTURE_CACHE, NULL ); + + surface = screen->get_tex_surface(screen, pstip->texture, 0, 0, 0, + PIPE_BUFFER_USAGE_CPU_WRITE); + data = screen->surface_map(screen, surface, + PIPE_BUFFER_USAGE_CPU_WRITE); + + /* + * Load alpha texture. + * Note: 0 means keep the fragment, 255 means kill it. + * We'll negate the texel value and use KILP which kills if value + * is negative. + */ + for (i = 0; i < 32; i++) { + for (j = 0; j < 32; j++) { + if (stipple[i] & (bit31 >> j)) { + /* fragment "on" */ + data[i * surface->stride + j] = 0; + } + else { + /* fragment "off" */ + data[i * surface->stride + j] = 255; + } + } + } + + /* unmap */ + screen->surface_unmap(screen, surface); + screen->tex_surface_release(screen, &surface); +} + + +/** + * Create the texture map we'll use for stippling. + */ +static boolean +pstip_create_texture(struct pstip_stage *pstip) +{ + struct pipe_context *pipe = pstip->pipe; + struct pipe_screen *screen = pipe->screen; + struct pipe_texture texTemp; + + memset(&texTemp, 0, sizeof(texTemp)); + texTemp.target = PIPE_TEXTURE_2D; + texTemp.format = PIPE_FORMAT_A8_UNORM; /* XXX verify supported by driver! */ + texTemp.last_level = 0; + texTemp.width[0] = 32; + texTemp.height[0] = 32; + texTemp.depth[0] = 1; + pf_get_block(texTemp.format, &texTemp.block); + + pstip->texture = screen->texture_create(screen, &texTemp); + if (pstip->texture == NULL) + return FALSE; + + return TRUE; +} + + +/** + * Create the sampler CSO that'll be used for stippling. 
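All of the texture and shader machinery above boils down to a per-pixel lookup into a window-aligned 32x32 bit pattern: pstip_update_texture() writes 0 where a fragment should survive and 255 where it should be killed, and the injected TEX/KIL pair performs that lookup on the GPU. A CPU-side sketch of the equivalent test, with a hypothetical name, ignoring any flip of the window origin:

/* Would a fragment at window position (x, y) survive the 32x32 stipple?
 * 'stipple' holds one row per word with bit 31 as the leftmost pixel,
 * the same layout pstip_update_texture() above converts into texels. */
static int stipple_keeps_fragment(const unsigned stipple[32], unsigned x, unsigned y)
{
   const unsigned bit31 = 1u << 31;
   return (stipple[y % 32] & (bit31 >> (x % 32))) != 0;
}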
+ */ +static boolean +pstip_create_sampler(struct pstip_stage *pstip) +{ + struct pipe_sampler_state sampler; + struct pipe_context *pipe = pstip->pipe; + + memset(&sampler, 0, sizeof(sampler)); + sampler.wrap_s = PIPE_TEX_WRAP_REPEAT; + sampler.wrap_t = PIPE_TEX_WRAP_REPEAT; + sampler.wrap_r = PIPE_TEX_WRAP_REPEAT; + sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE; + sampler.min_img_filter = PIPE_TEX_FILTER_NEAREST; + sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST; + sampler.normalized_coords = 1; + sampler.min_lod = 0.0f; + sampler.max_lod = 0.0f; + + pstip->sampler_cso = pipe->create_sampler_state(pipe, &sampler); + if (pstip->sampler_cso == NULL) + return FALSE; + + return TRUE; +} + + +/** + * When we're about to draw our first stipple polygon in a batch, this function + * is called to tell the driver to bind our modified fragment shader. + */ +static boolean +bind_pstip_fragment_shader(struct pstip_stage *pstip) +{ + struct draw_context *draw = pstip->stage.draw; + if (!pstip->fs->pstip_fs && + !generate_pstip_fs(pstip)) + return FALSE; + + draw->suspend_flushing = TRUE; + pstip->driver_bind_fs_state(pstip->pipe, pstip->fs->pstip_fs); + draw->suspend_flushing = FALSE; + return TRUE; +} + + +static INLINE struct pstip_stage * +pstip_stage( struct draw_stage *stage ) +{ + return (struct pstip_stage *) stage; +} + + +static void +pstip_first_tri(struct draw_stage *stage, struct prim_header *header) +{ + struct pstip_stage *pstip = pstip_stage(stage); + struct pipe_context *pipe = pstip->pipe; + struct draw_context *draw = stage->draw; + uint num_samplers; + + assert(stage->draw->rasterizer->poly_stipple_enable); + + /* bind our fragprog */ + if (!bind_pstip_fragment_shader(pstip)) { + stage->tri = draw_pipe_passthrough_tri; + stage->tri(stage, header); + return; + } + + + /* how many samplers? 
*/ + /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */ + num_samplers = MAX2(pstip->num_textures, pstip->num_samplers); + num_samplers = MAX2(num_samplers, pstip->fs->sampler_unit + 1); + + /* plug in our sampler, texture */ + pstip->state.samplers[pstip->fs->sampler_unit] = pstip->sampler_cso; + pipe_texture_reference(&pstip->state.textures[pstip->fs->sampler_unit], + pstip->texture); + + assert(num_samplers <= PIPE_MAX_SAMPLERS); + + draw->suspend_flushing = TRUE; + pstip->driver_bind_sampler_states(pipe, num_samplers, pstip->state.samplers); + pstip->driver_set_sampler_textures(pipe, num_samplers, pstip->state.textures); + draw->suspend_flushing = FALSE; + + /* now really draw first triangle */ + stage->tri = draw_pipe_passthrough_tri; + stage->tri(stage, header); +} + + +static void +pstip_flush(struct draw_stage *stage, unsigned flags) +{ + struct draw_context *draw = stage->draw; + struct pstip_stage *pstip = pstip_stage(stage); + struct pipe_context *pipe = pstip->pipe; + + stage->tri = pstip_first_tri; + stage->next->flush( stage->next, flags ); + + /* restore original frag shader, texture, sampler state */ + draw->suspend_flushing = TRUE; + pstip->driver_bind_fs_state(pipe, pstip->fs->driver_fs); + pstip->driver_bind_sampler_states(pipe, pstip->num_samplers, + pstip->state.samplers); + pstip->driver_set_sampler_textures(pipe, pstip->num_textures, + pstip->state.textures); + draw->suspend_flushing = FALSE; +} + + +static void +pstip_reset_stipple_counter(struct draw_stage *stage) +{ + stage->next->reset_stipple_counter( stage->next ); +} + + +static void +pstip_destroy(struct draw_stage *stage) +{ + struct pstip_stage *pstip = pstip_stage(stage); + uint i; + + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { + pipe_texture_reference(&pstip->state.textures[i], NULL); + } + + pstip->pipe->delete_sampler_state(pstip->pipe, pstip->sampler_cso); + + pipe_texture_release(&pstip->texture); + + draw_free_temp_verts( stage ); + FREE( stage ); +} + + +static struct pstip_stage * +draw_pstip_stage(struct draw_context *draw) +{ + struct pstip_stage *pstip = CALLOC_STRUCT(pstip_stage); + + draw_alloc_temp_verts( &pstip->stage, 8 ); + + pstip->stage.draw = draw; + pstip->stage.next = NULL; + pstip->stage.point = draw_pipe_passthrough_point; + pstip->stage.line = draw_pipe_passthrough_line; + pstip->stage.tri = pstip_first_tri; + pstip->stage.flush = pstip_flush; + pstip->stage.reset_stipple_counter = pstip_reset_stipple_counter; + pstip->stage.destroy = pstip_destroy; + + return pstip; +} + + +static struct pstip_stage * +pstip_stage_from_pipe(struct pipe_context *pipe) +{ + struct draw_context *draw = (struct draw_context *) pipe->draw; + return pstip_stage(draw->pipeline.pstipple); +} + + +/** + * This function overrides the driver's create_fs_state() function and + * will typically be called by the state tracker. 
+ */ +static void * +pstip_create_fs_state(struct pipe_context *pipe, + const struct pipe_shader_state *fs) +{ + struct pstip_stage *pstip = pstip_stage_from_pipe(pipe); + struct pstip_fragment_shader *aafs = CALLOC_STRUCT(pstip_fragment_shader); + + if (aafs) { + aafs->state = *fs; + + /* pass-through */ + aafs->driver_fs = pstip->driver_create_fs_state(pstip->pipe, fs); + } + + return aafs; +} + + +static void +pstip_bind_fs_state(struct pipe_context *pipe, void *fs) +{ + struct pstip_stage *pstip = pstip_stage_from_pipe(pipe); + struct pstip_fragment_shader *aafs = (struct pstip_fragment_shader *) fs; + /* save current */ + pstip->fs = aafs; + /* pass-through */ + pstip->driver_bind_fs_state(pstip->pipe, + (aafs ? aafs->driver_fs : NULL)); +} + + +static void +pstip_delete_fs_state(struct pipe_context *pipe, void *fs) +{ + struct pstip_stage *pstip = pstip_stage_from_pipe(pipe); + struct pstip_fragment_shader *aafs = (struct pstip_fragment_shader *) fs; + /* pass-through */ + pstip->driver_delete_fs_state(pstip->pipe, aafs->driver_fs); + FREE(aafs); +} + + +static void +pstip_bind_sampler_states(struct pipe_context *pipe, + unsigned num, void **sampler) +{ + struct pstip_stage *pstip = pstip_stage_from_pipe(pipe); + uint i; + + /* save current */ + memcpy(pstip->state.samplers, sampler, num * sizeof(void *)); + for (i = num; i < PIPE_MAX_SAMPLERS; i++) { + pstip->state.samplers[i] = NULL; + } + + pstip->num_samplers = num; + /* pass-through */ + pstip->driver_bind_sampler_states(pstip->pipe, num, sampler); +} + + +static void +pstip_set_sampler_textures(struct pipe_context *pipe, + unsigned num, struct pipe_texture **texture) +{ + struct pstip_stage *pstip = pstip_stage_from_pipe(pipe); + uint i; + + /* save current */ + for (i = 0; i < num; i++) { + pipe_texture_reference(&pstip->state.textures[i], texture[i]); + } + for (; i < PIPE_MAX_SAMPLERS; i++) { + pipe_texture_reference(&pstip->state.textures[i], NULL); + } + + pstip->num_textures = num; + + /* pass-through */ + pstip->driver_set_sampler_textures(pstip->pipe, num, texture); +} + + +static void +pstip_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + struct pstip_stage *pstip = pstip_stage_from_pipe(pipe); + + /* save current */ + pstip->state.stipple = stipple; + + /* pass-through */ + pstip->driver_set_polygon_stipple(pstip->pipe, stipple); + + pstip_update_texture(pstip); +} + + +/** + * Called by drivers that want to install this polygon stipple stage + * into the draw module's pipeline. This will not be used if the + * hardware has native support for polygon stipple. 
+ */ +boolean +draw_install_pstipple_stage(struct draw_context *draw, + struct pipe_context *pipe) +{ + struct pstip_stage *pstip; + + pipe->draw = (void *) draw; + + /* + * Create / install pgon stipple drawing / prim stage + */ + pstip = draw_pstip_stage( draw ); + if (pstip == NULL) + goto fail; + + draw->pipeline.pstipple = &pstip->stage; + + pstip->pipe = pipe; + + /* create special texture, sampler state */ + if (!pstip_create_texture(pstip)) + goto fail; + + if (!pstip_create_sampler(pstip)) + goto fail; + + /* save original driver functions */ + pstip->driver_create_fs_state = pipe->create_fs_state; + pstip->driver_bind_fs_state = pipe->bind_fs_state; + pstip->driver_delete_fs_state = pipe->delete_fs_state; + + pstip->driver_bind_sampler_states = pipe->bind_sampler_states; + pstip->driver_set_sampler_textures = pipe->set_sampler_textures; + pstip->driver_set_polygon_stipple = pipe->set_polygon_stipple; + + /* override the driver's functions */ + pipe->create_fs_state = pstip_create_fs_state; + pipe->bind_fs_state = pstip_bind_fs_state; + pipe->delete_fs_state = pstip_delete_fs_state; + + pipe->bind_sampler_states = pstip_bind_sampler_states; + pipe->set_sampler_textures = pstip_set_sampler_textures; + pipe->set_polygon_stipple = pstip_set_polygon_stipple; + + return TRUE; + + fail: + if (pstip) + pstip->stage.destroy( &pstip->stage ); + + return FALSE; +} diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c new file mode 100644 index 0000000000..b65e2aa102 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c @@ -0,0 +1,250 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +/* Implement line stipple by cutting lines up into smaller lines. + * There are hundreds of ways to implement line stipple, this is one + * choice that should work in all situations, requires no state + * manipulations, but with a penalty in terms of large amounts of + * generated geometry. 
+ */ + + +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "draw/draw_pipe.h" + + +/** Subclass of draw_stage */ +struct stipple_stage { + struct draw_stage stage; + float counter; + uint pattern; + uint factor; +}; + + +static INLINE struct stipple_stage * +stipple_stage(struct draw_stage *stage) +{ + return (struct stipple_stage *) stage; +} + + +/** + * Compute interpolated vertex attributes for 'dst' at position 't' + * between 'v0' and 'v1'. + * XXX using linear interpolation for all attribs at this time. + */ +static void +screen_interp( struct draw_context *draw, + struct vertex_header *dst, + float t, + const struct vertex_header *v0, + const struct vertex_header *v1 ) +{ + uint attr; + for (attr = 0; attr < draw->vs.num_vs_outputs; attr++) { + const float *val0 = v0->data[attr]; + const float *val1 = v1->data[attr]; + float *newv = dst->data[attr]; + uint i; + for (i = 0; i < 4; i++) { + newv[i] = val0[i] + t * (val1[i] - val0[i]); + } + } +} + + +static void +emit_segment(struct draw_stage *stage, struct prim_header *header, + float t0, float t1) +{ + struct vertex_header *v0new = dup_vert(stage, header->v[0], 0); + struct vertex_header *v1new = dup_vert(stage, header->v[1], 1); + struct prim_header newprim = *header; + + if (t0 > 0.0) { + screen_interp( stage->draw, v0new, t0, header->v[0], header->v[1] ); + newprim.v[0] = v0new; + } + + if (t1 < 1.0) { + screen_interp( stage->draw, v1new, t1, header->v[0], header->v[1] ); + newprim.v[1] = v1new; + } + + stage->next->line( stage->next, &newprim ); +} + + +static INLINE unsigned +stipple_test(int counter, ushort pattern, int factor) +{ + int b = (counter / factor) & 0xf; + return (1 << b) & pattern; +} + + +static void +stipple_line(struct draw_stage *stage, struct prim_header *header) +{ + struct stipple_stage *stipple = stipple_stage(stage); + struct vertex_header *v0 = header->v[0]; + struct vertex_header *v1 = header->v[1]; + const unsigned pos = stage->draw->vs.position_output; + const float *pos0 = v0->data[pos]; + const float *pos1 = v1->data[pos]; + float start = 0; + int state = 0; + + float x0 = pos0[0]; + float x1 = pos1[0]; + float y0 = pos0[1]; + float y1 = pos1[1]; + + float dx = x0 > x1 ? x0 - x1 : x1 - x0; + float dy = y0 > y1 ? y0 - y1 : y1 - y0; + + float length = MAX2(dx, dy); + int i; + + if (header->flags & DRAW_PIPE_RESET_STIPPLE) + stipple->counter = 0; + + + /* XXX ToDo: intead of iterating pixel-by-pixel, use a look-up table. 
+ */ + for (i = 0; i < length; i++) { + int result = stipple_test( (int) stipple->counter+i, + (ushort) stipple->pattern, stipple->factor ); + if (result != state) { + /* changing from "off" to "on" or vice versa */ + if (state) { + if (start != i) { + /* finishing an "on" segment */ + emit_segment( stage, header, start / length, i / length ); + } + } + else { + /* starting an "on" segment */ + start = (float) i; + } + state = result; + } + } + + if (state && start < length) + emit_segment( stage, header, start / length, 1.0 ); + + stipple->counter += length; +} + + +static void +reset_stipple_counter(struct draw_stage *stage) +{ + struct stipple_stage *stipple = stipple_stage(stage); + stipple->counter = 0; + stage->next->reset_stipple_counter( stage->next ); +} + +static void +stipple_reset_point(struct draw_stage *stage, struct prim_header *header) +{ + struct stipple_stage *stipple = stipple_stage(stage); + stipple->counter = 0; + stage->next->point(stage->next, header); +} + +static void +stipple_reset_tri(struct draw_stage *stage, struct prim_header *header) +{ + struct stipple_stage *stipple = stipple_stage(stage); + stipple->counter = 0; + stage->next->tri(stage->next, header); +} + + +static void +stipple_first_line(struct draw_stage *stage, + struct prim_header *header) +{ + struct stipple_stage *stipple = stipple_stage(stage); + struct draw_context *draw = stage->draw; + + stipple->pattern = draw->rasterizer->line_stipple_pattern; + stipple->factor = draw->rasterizer->line_stipple_factor + 1; + + stage->line = stipple_line; + stage->line( stage, header ); +} + + +static void +stipple_flush(struct draw_stage *stage, unsigned flags) +{ + stage->line = stipple_first_line; + stage->next->flush( stage->next, flags ); +} + + + + +static void +stipple_destroy( struct draw_stage *stage ) +{ + draw_free_temp_verts( stage ); + FREE( stage ); +} + + +/** + * Create line stippler stage + */ +struct draw_stage *draw_stipple_stage( struct draw_context *draw ) +{ + struct stipple_stage *stipple = CALLOC_STRUCT(stipple_stage); + + draw_alloc_temp_verts( &stipple->stage, 2 ); + + stipple->stage.draw = draw; + stipple->stage.next = NULL; + stipple->stage.point = stipple_reset_point; + stipple->stage.line = stipple_first_line; + stipple->stage.tri = stipple_reset_tri; + stipple->stage.reset_stipple_counter = reset_stipple_counter; + stipple->stage.flush = stipple_flush; + stipple->stage.destroy = stipple_destroy; + + return &stipple->stage; +} diff --git a/src/gallium/auxiliary/draw/draw_pipe_twoside.c b/src/gallium/auxiliary/draw/draw_pipe_twoside.c new file mode 100644 index 0000000000..c329d92339 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe_twoside.c @@ -0,0 +1,199 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
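The pattern/factor semantics in stipple_test() above follow the GL line-stipple definition: the counter advances once per pixel along the line's major axis, and bit ((counter / factor) mod 16) of the 16-bit pattern decides whether that pixel is on; note that stipple_first_line() uses line_stipple_factor + 1 as the factor. A standalone sketch showing how a pattern expands into on/off runs, with hypothetical names:

#include <stdio.h>

/* Same test as stipple_test() above, written with a shift instead of a mask. */
static int stipple_on(int counter, unsigned short pattern, int factor)
{
   return (pattern >> ((counter / factor) & 0xf)) & 1;
}

int main(void)
{
   const unsigned short pattern = 0x00ff;  /* low 8 bits on, high 8 bits off */
   const int factor = 2;                   /* stretch each bit over two pixels */
   int i;

   /* prints sixteen '#' followed by sixteen '.' */
   for (i = 0; i < 32; i++)
      putchar(stipple_on(i, pattern, factor) ? '#' : '.');
   putchar('\n');
   return 0;
}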
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" +#include "draw_vs.h" +#include "draw_pipe.h" + +struct twoside_stage { + struct draw_stage stage; + float sign; /**< +1 or -1 */ + uint attrib_front0, attrib_back0; + uint attrib_front1, attrib_back1; +}; + + +static INLINE struct twoside_stage *twoside_stage( struct draw_stage *stage ) +{ + return (struct twoside_stage *)stage; +} + + + + +/** + * Copy back color(s) to front color(s). + */ +static INLINE struct vertex_header * +copy_bfc( struct twoside_stage *twoside, + const struct vertex_header *v, + unsigned idx ) +{ + struct vertex_header *tmp = dup_vert( &twoside->stage, v, idx ); + + if (twoside->attrib_back0) { + COPY_4FV(tmp->data[twoside->attrib_front0], + tmp->data[twoside->attrib_back0]); + } + if (twoside->attrib_back1) { + COPY_4FV(tmp->data[twoside->attrib_front1], + tmp->data[twoside->attrib_back1]); + } + + return tmp; +} + + +/* Twoside tri: + */ +static void twoside_tri( struct draw_stage *stage, + struct prim_header *header ) +{ + struct twoside_stage *twoside = twoside_stage(stage); + + if (header->det * twoside->sign < 0.0) { + /* this is a back-facing triangle */ + struct prim_header tmp; + + tmp.det = header->det; + tmp.flags = header->flags; + tmp.pad = header->pad; + /* copy back attribs to front attribs */ + tmp.v[0] = copy_bfc(twoside, header->v[0], 0); + tmp.v[1] = copy_bfc(twoside, header->v[1], 1); + tmp.v[2] = copy_bfc(twoside, header->v[2], 2); + + stage->next->tri( stage->next, &tmp ); + } + else { + stage->next->tri( stage->next, header ); + } +} + + + +static void twoside_first_tri( struct draw_stage *stage, + struct prim_header *header ) +{ + struct twoside_stage *twoside = twoside_stage(stage); + const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader; + uint i; + + twoside->attrib_front0 = 0; + twoside->attrib_front1 = 0; + twoside->attrib_back0 = 0; + twoside->attrib_back1 = 0; + + /* Find which vertex shader outputs are front/back colors */ + for (i = 0; i < vs->info.num_outputs; i++) { + if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_COLOR) { + if (vs->info.output_semantic_index[i] == 0) + twoside->attrib_front0 = i; + else + twoside->attrib_front1 = i; + } + if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) { + if (vs->info.output_semantic_index[i] == 0) + twoside->attrib_back0 = i; + else + twoside->attrib_back1 = i; + } + } + + if (!twoside->attrib_back0) + twoside->attrib_front0 = 0; + + if (!twoside->attrib_back1) + twoside->attrib_front1 = 0; + + /* + * We'll multiply the primitive's determinant by this sign to determine + * if the triangle is back-facing (negative). + * sign = -1 for CCW, +1 for CW + */ + twoside->sign = (stage->draw->rasterizer->front_winding == PIPE_WINDING_CCW) ? 
-1.0f : 1.0f; + + stage->tri = twoside_tri; + stage->tri( stage, header ); +} + + +static void twoside_flush( struct draw_stage *stage, unsigned flags ) +{ + stage->tri = twoside_first_tri; + stage->next->flush( stage->next, flags ); +} + + +static void twoside_reset_stipple_counter( struct draw_stage *stage ) +{ + stage->next->reset_stipple_counter( stage->next ); +} + + +static void twoside_destroy( struct draw_stage *stage ) +{ + draw_free_temp_verts( stage ); + FREE( stage ); +} + + +/** + * Create twoside pipeline stage. + */ +struct draw_stage *draw_twoside_stage( struct draw_context *draw ) +{ + struct twoside_stage *twoside = CALLOC_STRUCT(twoside_stage); + if (twoside == NULL) + goto fail; + + if (!draw_alloc_temp_verts( &twoside->stage, 3 )) + goto fail; + + twoside->stage.draw = draw; + twoside->stage.next = NULL; + twoside->stage.point = draw_pipe_passthrough_point; + twoside->stage.line = draw_pipe_passthrough_line; + twoside->stage.tri = twoside_first_tri; + twoside->stage.flush = twoside_flush; + twoside->stage.reset_stipple_counter = twoside_reset_stipple_counter; + twoside->stage.destroy = twoside_destroy; + + return &twoside->stage; + + fail: + if (twoside) + twoside->stage.destroy( &twoside->stage ); + + return NULL; +} diff --git a/src/gallium/auxiliary/draw/draw_pipe_unfilled.c b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c new file mode 100644 index 0000000000..68835fd1a5 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c @@ -0,0 +1,203 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \brief Drawing stage for handling glPolygonMode(line/point). + * Convert triangles to points or lines as needed. + */ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "util/u_memory.h" +#include "pipe/p_defines.h" +#include "draw_private.h" +#include "draw_pipe.h" + + +struct unfilled_stage { + struct draw_stage stage; + + /** [0] = front face, [1] = back face. 
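The twoside test above works because the determinant's sign already encodes winding (det < 0 is counter-clockwise, as established in the cull stage), so multiplying by -1 when the front face is CCW, or by +1 when it is CW, makes the product negative exactly for back-facing triangles; copy_bfc() then overwrites the front color slots of such triangles with the back-color outputs. A standalone sketch of the predicate, with hypothetical names:

/* Back-facing test matching twoside_first_tri()/twoside_tri() above.
 * 'front_ccw' is nonzero when counter-clockwise triangles are front-facing. */
static int is_back_facing(float det, int front_ccw)
{
   const float sign = front_ccw ? -1.0f : 1.0f;
   return det * sign < 0.0f;
}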
+ * legal values: PIPE_POLYGON_MODE_FILL, PIPE_POLYGON_MODE_LINE, + * and PIPE_POLYGON_MODE_POINT, + */ + unsigned mode[2]; +}; + + +static INLINE struct unfilled_stage *unfilled_stage( struct draw_stage *stage ) +{ + return (struct unfilled_stage *)stage; +} + + + +static void point( struct draw_stage *stage, + struct vertex_header *v0 ) +{ + struct prim_header tmp; + tmp.v[0] = v0; + stage->next->point( stage->next, &tmp ); +} + +static void line( struct draw_stage *stage, + struct vertex_header *v0, + struct vertex_header *v1 ) +{ + struct prim_header tmp; + tmp.v[0] = v0; + tmp.v[1] = v1; + stage->next->line( stage->next, &tmp ); +} + + +static void points( struct draw_stage *stage, + struct prim_header *header ) +{ + struct vertex_header *v0 = header->v[0]; + struct vertex_header *v1 = header->v[1]; + struct vertex_header *v2 = header->v[2]; + + if ((header->flags & DRAW_PIPE_EDGE_FLAG_0) && v0->edgeflag) point( stage, v0 ); + if ((header->flags & DRAW_PIPE_EDGE_FLAG_1) && v1->edgeflag) point( stage, v1 ); + if ((header->flags & DRAW_PIPE_EDGE_FLAG_2) && v2->edgeflag) point( stage, v2 ); +} + + +static void lines( struct draw_stage *stage, + struct prim_header *header ) +{ + struct vertex_header *v0 = header->v[0]; + struct vertex_header *v1 = header->v[1]; + struct vertex_header *v2 = header->v[2]; + + if (header->flags & DRAW_PIPE_RESET_STIPPLE) + stage->next->reset_stipple_counter( stage->next ); + + if ((header->flags & DRAW_PIPE_EDGE_FLAG_2) && v2->edgeflag) line( stage, v2, v0 ); + if ((header->flags & DRAW_PIPE_EDGE_FLAG_0) && v0->edgeflag) line( stage, v0, v1 ); + if ((header->flags & DRAW_PIPE_EDGE_FLAG_1) && v1->edgeflag) line( stage, v1, v2 ); +} + + +/* Unfilled tri: + * + * Note edgeflags in the vertex struct is not sufficient as we will + * need to manipulate them when decomposing primitives. + * + * We currently keep the vertex edgeflag and primitive edgeflag mask + * separate until the last possible moment. + */ +static void unfilled_tri( struct draw_stage *stage, + struct prim_header *header ) +{ + struct unfilled_stage *unfilled = unfilled_stage(stage); + unsigned mode = unfilled->mode[header->det >= 0.0]; + + switch (mode) { + case PIPE_POLYGON_MODE_FILL: + stage->next->tri( stage->next, header ); + break; + case PIPE_POLYGON_MODE_LINE: + lines( stage, header ); + break; + case PIPE_POLYGON_MODE_POINT: + points( stage, header ); + break; + default: + assert(0); + } +} + + +static void unfilled_first_tri( struct draw_stage *stage, + struct prim_header *header ) +{ + struct unfilled_stage *unfilled = unfilled_stage(stage); + + unfilled->mode[0] = stage->draw->rasterizer->fill_ccw; /* front */ + unfilled->mode[1] = stage->draw->rasterizer->fill_cw; /* back */ + + stage->tri = unfilled_tri; + stage->tri( stage, header ); +} + + + +static void unfilled_flush( struct draw_stage *stage, + unsigned flags ) +{ + stage->next->flush( stage->next, flags ); + + stage->tri = unfilled_first_tri; +} + + +static void unfilled_reset_stipple_counter( struct draw_stage *stage ) +{ + stage->next->reset_stipple_counter( stage->next ); +} + + +static void unfilled_destroy( struct draw_stage *stage ) +{ + draw_free_temp_verts( stage ); + FREE( stage ); +} + + +/** + * Create unfilled triangle stage. 
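In the decomposition above an edge is emitted only when two conditions agree: the per-primitive DRAW_PIPE_EDGE_FLAG_x bit, which is typically cleared for the interior edges introduced when larger polygons are split into triangles, and the per-vertex edgeflag carried in the vertex header. A tiny sketch of that predicate, with a hypothetical name:

/* Should edge i of a decomposed triangle be drawn in line/point mode?
 * 'prim_flags' carries the DRAW_PIPE_EDGE_FLAG_x bits from the primitive
 * header, 'vert_edgeflag' comes from the vertex that leads the edge. */
static int edge_is_visible(unsigned prim_flags, unsigned edge_flag_bit,
                           unsigned vert_edgeflag)
{
   return (prim_flags & edge_flag_bit) && vert_edgeflag;
}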
+ */ +struct draw_stage *draw_unfilled_stage( struct draw_context *draw ) +{ + struct unfilled_stage *unfilled = CALLOC_STRUCT(unfilled_stage); + if (unfilled == NULL) + goto fail; + + if (!draw_alloc_temp_verts( &unfilled->stage, 0 )) + goto fail; + + unfilled->stage.draw = draw; + unfilled->stage.next = NULL; + unfilled->stage.tmp = NULL; + unfilled->stage.point = draw_pipe_passthrough_point; + unfilled->stage.line = draw_pipe_passthrough_line; + unfilled->stage.tri = unfilled_first_tri; + unfilled->stage.flush = unfilled_flush; + unfilled->stage.reset_stipple_counter = unfilled_reset_stipple_counter; + unfilled->stage.destroy = unfilled_destroy; + + return &unfilled->stage; + + fail: + if (unfilled) + unfilled->stage.destroy( &unfilled->stage ); + + return NULL; +} diff --git a/src/gallium/auxiliary/draw/draw_pipe_util.c b/src/gallium/auxiliary/draw/draw_pipe_util.c new file mode 100644 index 0000000000..e22e5fed0c --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe_util.c @@ -0,0 +1,137 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "util/u_memory.h" +#include "draw/draw_private.h" +#include "draw/draw_pipe.h" + + + +void +draw_pipe_passthrough_point(struct draw_stage *stage, struct prim_header *header) +{ + stage->next->point(stage->next, header); +} + +void +draw_pipe_passthrough_line(struct draw_stage *stage, struct prim_header *header) +{ + stage->next->line(stage->next, header); +} + +void +draw_pipe_passthrough_tri(struct draw_stage *stage, struct prim_header *header) +{ + stage->next->tri(stage->next, header); +} + + + + + +/* This is only used for temporary verts. + */ +#define MAX_VERTEX_SIZE ((2 + PIPE_MAX_SHADER_OUTPUTS) * 4 * sizeof(float)) + + +/** + * Allocate space for temporary post-transform vertices, such as for clipping. 
+ */ +boolean draw_alloc_temp_verts( struct draw_stage *stage, unsigned nr ) +{ + assert(!stage->tmp); + + stage->tmp = NULL; + stage->nr_tmps = nr; + + if (nr != 0) + { + unsigned i; + ubyte *store = (ubyte *) MALLOC( MAX_VERTEX_SIZE * nr ); + + if (store == NULL) + return FALSE; + + stage->tmp = (struct vertex_header **) MALLOC( sizeof(struct vertex_header *) * nr ); + if (stage->tmp == NULL) { + FREE(store); + return FALSE; + } + + for (i = 0; i < nr; i++) + stage->tmp[i] = (struct vertex_header *)(store + i * MAX_VERTEX_SIZE); + } + + return TRUE; +} + + +void draw_free_temp_verts( struct draw_stage *stage ) +{ + if (stage->tmp) { + FREE( stage->tmp[0] ); + FREE( stage->tmp ); + stage->tmp = NULL; + } +} + + +/* Reset vertex ids. This is basically a type of flush. + * + * Called only from draw_pipe_vbuf.c + */ +void draw_reset_vertex_ids(struct draw_context *draw) +{ + struct draw_stage *stage = draw->pipeline.first; + + while (stage) { + unsigned i; + + for (i = 0; i < stage->nr_tmps; i++) + stage->tmp[i]->vertex_id = UNDEFINED_VERTEX_ID; + + stage = stage->next; + } + + if (draw->pipeline.verts) + { + unsigned i; + char *verts = draw->pipeline.verts; + unsigned stride = draw->pipeline.vertex_stride; + + for (i = 0; i < draw->pipeline.vertex_count; i++) { + ((struct vertex_header *)verts)->vertex_id = UNDEFINED_VERTEX_ID; + verts += stride; + } + } +} + diff --git a/src/gallium/auxiliary/draw/draw_pipe_validate.c b/src/gallium/auxiliary/draw/draw_pipe_validate.c new file mode 100644 index 0000000000..f34c68728e --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe_validate.c @@ -0,0 +1,317 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
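draw_alloc_temp_verts() above uses a single worst-case-sized backing allocation plus an array of pointers carved out of it at a fixed stride, so stages such as the clipper can hand out scratch vertices without any per-primitive allocation, and draw_free_temp_verts() only has to free tmp[0] (the backing store) and the pointer array. A generic standalone sketch of the same idea, with hypothetical names, assuming count > 0:

#include <stdlib.h>

/* Carve 'count' fixed-size slots out of one backing allocation; returns an
 * array of pointers into it. Free slots[0] first and then 'slots' itself,
 * just as draw_free_temp_verts() does above. */
static void **alloc_slots(size_t slot_size, unsigned count)
{
   unsigned char *store = malloc(slot_size * count);
   void **slots = store ? malloc(sizeof(void *) * count) : NULL;
   unsigned i;

   if (slots == NULL) {
      free(store);
      return NULL;
   }
   for (i = 0; i < count; i++)
      slots[i] = store + i * slot_size;
   return slots;
}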
+ * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "util/u_memory.h" +#include "pipe/p_defines.h" +#include "draw_private.h" +#include "draw_pipe.h" +#include "draw_context.h" + +static boolean points( unsigned prim ) +{ + return (prim == PIPE_PRIM_POINTS); +} + +static boolean lines( unsigned prim ) +{ + return (prim == PIPE_PRIM_LINES || + prim == PIPE_PRIM_LINE_STRIP || + prim == PIPE_PRIM_LINE_LOOP); +} + +static boolean triangles( unsigned prim ) +{ + return prim >= PIPE_PRIM_TRIANGLES; +} + +/** + * Check if we need any special pipeline stages, or whether + * prims/verts can go through untouched. Don't test for bypass + * clipping or vs modes, this function is just about the primitive + * pipeline stages. + */ +boolean +draw_need_pipeline(const struct draw_context *draw, + const struct pipe_rasterizer_state *rasterizer, + unsigned int prim ) +{ + /* Don't have to worry about triangles turning into lines/points + * and triggering the pipeline, because we have to trigger the + * pipeline *anyway* if unfilled mode is active. + */ + if (lines(prim)) + { + /* line stipple */ + if (rasterizer->line_stipple_enable && draw->pipeline.line_stipple) + return TRUE; + + /* wide lines */ + if (rasterizer->line_width > draw->pipeline.wide_line_threshold) + return TRUE; + + /* AA lines */ + if (rasterizer->line_smooth && draw->pipeline.aaline) + return TRUE; + } + + if (points(prim)) + { + /* large points */ + if (rasterizer->point_size > draw->pipeline.wide_point_threshold) + return TRUE; + + /* AA points */ + if (rasterizer->point_smooth && draw->pipeline.aapoint) + return TRUE; + + /* point sprites */ + if (rasterizer->point_sprite && draw->pipeline.point_sprite) + return TRUE; + } + + + if (triangles(prim)) + { + /* polygon stipple */ + if (rasterizer->poly_stipple_enable && draw->pipeline.pstipple) + return TRUE; + + /* unfilled polygons */ + if (rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL || + rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL) + return TRUE; + + /* polygon offset */ + if (rasterizer->offset_cw || rasterizer->offset_ccw) + return TRUE; + + /* two-side lighting */ + if (rasterizer->light_twoside) + return TRUE; + } + + /* polygon cull - this is difficult - hardware can cull just fine + * most of the time (though sometimes CULL_NEITHER is unsupported. + * + * Generally this isn't a reason to require the pipeline, though. + * + if (rasterizer->cull_mode) + return TRUE; + */ + + return FALSE; +} + + + +/** + * Rebuild the rendering pipeline. + */ +static struct draw_stage *validate_pipeline( struct draw_stage *stage ) +{ + struct draw_context *draw = stage->draw; + struct draw_stage *next = draw->pipeline.rasterize; + int need_det = 0; + int precalc_flat = 0; + boolean wide_lines, wide_points; + + /* Set the validate's next stage to the rasterize stage, so that it + * can be found later if needed for flushing. + */ + stage->next = next; + + /* drawing wide lines? */ + wide_lines = (draw->rasterizer->line_width > draw->pipeline.wide_line_threshold + && !draw->rasterizer->line_smooth); + + /* drawing large points? 
*/ + if (draw->rasterizer->point_sprite && draw->pipeline.point_sprite) + wide_points = TRUE; + else if (draw->rasterizer->point_smooth && draw->pipeline.aapoint) + wide_points = FALSE; + else if (draw->rasterizer->point_size > draw->pipeline.wide_point_threshold) + wide_points = TRUE; + else + wide_points = FALSE; + + /* + * NOTE: we build up the pipeline in end-to-start order. + * + * TODO: make the current primitive part of the state and build + * shorter pipelines for lines & points. + */ + + if (draw->rasterizer->line_smooth && draw->pipeline.aaline) { + draw->pipeline.aaline->next = next; + next = draw->pipeline.aaline; + } + + if (draw->rasterizer->point_smooth && draw->pipeline.aapoint) { + draw->pipeline.aapoint->next = next; + next = draw->pipeline.aapoint; + } + + if (wide_lines) { + draw->pipeline.wide_line->next = next; + next = draw->pipeline.wide_line; + precalc_flat = 1; + } + + if (wide_points || draw->rasterizer->point_sprite) { + draw->pipeline.wide_point->next = next; + next = draw->pipeline.wide_point; + } + + if (draw->rasterizer->line_stipple_enable && draw->pipeline.line_stipple) { + draw->pipeline.stipple->next = next; + next = draw->pipeline.stipple; + precalc_flat = 1; /* only needed for lines really */ + } + + if (draw->rasterizer->poly_stipple_enable + && draw->pipeline.pstipple) { + draw->pipeline.pstipple->next = next; + next = draw->pipeline.pstipple; + } + + if (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL || + draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL) { + draw->pipeline.unfilled->next = next; + next = draw->pipeline.unfilled; + precalc_flat = 1; /* only needed for triangles really */ + need_det = 1; + } + + if (draw->rasterizer->flatshade && precalc_flat) { + draw->pipeline.flatshade->next = next; + next = draw->pipeline.flatshade; + } + + if (draw->rasterizer->offset_cw || + draw->rasterizer->offset_ccw) { + draw->pipeline.offset->next = next; + next = draw->pipeline.offset; + need_det = 1; + } + + if (draw->rasterizer->light_twoside) { + draw->pipeline.twoside->next = next; + next = draw->pipeline.twoside; + need_det = 1; + } + + /* Always run the cull stage as we calculate determinant there + * also. + * + * This can actually be a win as culling out the triangles can lead + * to less work emitting vertices, smaller vertex buffers, etc. + * It's difficult to say whether this will be true in general. 
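+ *
+ * need_det is set above whenever the unfilled, offset or twoside stages are
+ * active, since those stages consume the determinant as well.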
+ */ + if (need_det || draw->rasterizer->cull_mode) { + draw->pipeline.cull->next = next; + next = draw->pipeline.cull; + } + + /* Clip stage + */ + if (!draw->bypass_clipping) + { + draw->pipeline.clip->next = next; + next = draw->pipeline.clip; + } + + + draw->pipeline.first = next; + return next; +} + +static void validate_tri( struct draw_stage *stage, + struct prim_header *header ) +{ + struct draw_stage *pipeline = validate_pipeline( stage ); + pipeline->tri( pipeline, header ); +} + +static void validate_line( struct draw_stage *stage, + struct prim_header *header ) +{ + struct draw_stage *pipeline = validate_pipeline( stage ); + pipeline->line( pipeline, header ); +} + +static void validate_point( struct draw_stage *stage, + struct prim_header *header ) +{ + struct draw_stage *pipeline = validate_pipeline( stage ); + pipeline->point( pipeline, header ); +} + +static void validate_reset_stipple_counter( struct draw_stage *stage ) +{ + struct draw_stage *pipeline = validate_pipeline( stage ); + pipeline->reset_stipple_counter( pipeline ); +} + +static void validate_flush( struct draw_stage *stage, + unsigned flags ) +{ + /* May need to pass a backend flush on to the rasterize stage. + */ + if (stage->next) + stage->next->flush( stage->next, flags ); +} + + +static void validate_destroy( struct draw_stage *stage ) +{ + FREE( stage ); +} + + +/** + * Create validate pipeline stage. + */ +struct draw_stage *draw_validate_stage( struct draw_context *draw ) +{ + struct draw_stage *stage = CALLOC_STRUCT(draw_stage); + if (stage == NULL) + return NULL; + + stage->draw = draw; + stage->next = NULL; + stage->point = validate_point; + stage->line = validate_line; + stage->tri = validate_tri; + stage->flush = validate_flush; + stage->reset_stipple_counter = validate_reset_stipple_counter; + stage->destroy = validate_destroy; + + return stage; +} diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c new file mode 100644 index 0000000000..5ead25efff --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c @@ -0,0 +1,496 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Vertex buffer drawing stage. 
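+ * Buffers up incoming prims, converts vertices to the hardware layout via
+ * the translate module and hands indexed batches to the driver's
+ * vbuf_render backend.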
+ * + * \author José Fonseca <jrfonsec@tungstengraphics.com> + * \author Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "pipe/p_debug.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "draw_vbuf.h" +#include "draw_private.h" +#include "draw_vertex.h" +#include "draw_pipe.h" +#include "translate/translate.h" +#include "translate/translate_cache.h" + + +/** + * Vertex buffer emit stage. + */ +struct vbuf_stage { + struct draw_stage stage; /**< This must be first (base class) */ + + struct vbuf_render *render; + + const struct vertex_info *vinfo; + + /** Vertex size in bytes */ + unsigned vertex_size; + + struct translate *translate; + + /* FIXME: we have no guarantee that 'unsigned' is 32bit */ + + /** Vertices in hardware format */ + unsigned *vertices; + unsigned *vertex_ptr; + unsigned max_vertices; + unsigned nr_vertices; + + /** Indices */ + ushort *indices; + unsigned max_indices; + unsigned nr_indices; + + /* Cache point size somewhere it's address won't change: + */ + float point_size; + + struct translate_cache *cache; +}; + + +/** + * Basically a cast wrapper. + */ +static INLINE struct vbuf_stage * +vbuf_stage( struct draw_stage *stage ) +{ + assert(stage); + return (struct vbuf_stage *)stage; +} + + +static void vbuf_flush_indices( struct vbuf_stage *vbuf ); +static void vbuf_flush_vertices( struct vbuf_stage *vbuf ); +static void vbuf_alloc_vertices( struct vbuf_stage *vbuf ); + + +static INLINE boolean +overflow( void *map, void *ptr, unsigned bytes, unsigned bufsz ) +{ + unsigned long used = (unsigned long) ((char *)ptr - (char *)map); + return (used + bytes) > bufsz; +} + + +static INLINE void +check_space( struct vbuf_stage *vbuf, unsigned nr ) +{ + if (vbuf->nr_vertices + nr > vbuf->max_vertices ) { + vbuf_flush_vertices(vbuf); + vbuf_alloc_vertices(vbuf); + } + + if (vbuf->nr_indices + nr > vbuf->max_indices ) + vbuf_flush_indices(vbuf); +} + + + + +/** + * Extract the needed fields from post-transformed vertex and emit + * a hardware(driver) vertex. + * Recall that the vertices are constructed by the 'draw' module and + * have a couple of slots at the beginning (1-dword header, 4-dword + * clip pos) that we ignore here. We only use the vertex->data[] fields. + */ +static INLINE ushort +emit_vertex( struct vbuf_stage *vbuf, + struct vertex_header *vertex ) +{ + if(vertex->vertex_id == UNDEFINED_VERTEX_ID) { + /* Hmm - vertices are emitted one at a time - better make sure + * set_buffer is efficient. Consider a special one-shot mode for + * translate. 
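+ *
+ * Each call converts a single post-transform vertex into the hardware
+ * layout selected in vbuf_set_prim() and assigns it the next vertex id.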
+ */ + /* Note: we really do want data[0] here, not data[pos]: + */ + vbuf->translate->set_buffer(vbuf->translate, 0, vertex->data[0], 0); + vbuf->translate->run(vbuf->translate, 0, 1, vbuf->vertex_ptr); + + if (0) draw_dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr); + + vbuf->vertex_ptr += vbuf->vertex_size/4; + vertex->vertex_id = vbuf->nr_vertices++; + } + + return (ushort)vertex->vertex_id; +} + + +static void +vbuf_tri( struct draw_stage *stage, + struct prim_header *prim ) +{ + struct vbuf_stage *vbuf = vbuf_stage( stage ); + unsigned i; + + check_space( vbuf, 3 ); + + for (i = 0; i < 3; i++) { + vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[i] ); + } +} + + +static void +vbuf_line( struct draw_stage *stage, + struct prim_header *prim ) +{ + struct vbuf_stage *vbuf = vbuf_stage( stage ); + unsigned i; + + check_space( vbuf, 2 ); + + for (i = 0; i < 2; i++) { + vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[i] ); + } +} + + +static void +vbuf_point( struct draw_stage *stage, + struct prim_header *prim ) +{ + struct vbuf_stage *vbuf = vbuf_stage( stage ); + + check_space( vbuf, 1 ); + + vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[0] ); +} + + + + +/** + * Set the prim type for subsequent vertices. + * This may result in a new vertex size. The existing vbuffer (if any) + * will be flushed if needed and a new one allocated. + */ +static void +vbuf_set_prim( struct vbuf_stage *vbuf, uint prim ) +{ + struct translate_key hw_key; + unsigned dst_offset; + unsigned i; + + vbuf->render->set_primitive(vbuf->render, prim); + + /* Must do this after set_primitive() above: + * + * XXX: need some state managment to track when this needs to be + * recalculated. The driver should tell us whether there was a + * state change. + */ + vbuf->vinfo = vbuf->render->get_vertex_info(vbuf->render); + + if (vbuf->vertex_size != vbuf->vinfo->size * sizeof(float)) { + vbuf_flush_vertices(vbuf); + vbuf->vertex_size = vbuf->vinfo->size * sizeof(float); + } + + /* Translate from pipeline vertices to hw vertices. 
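+ *
+ * Build a translate_key mapping each vinfo attribute from its float[4]
+ * slot in the pipeline vertex to a packed output format and offset; the
+ * point size attribute is sourced from a second buffer holding
+ * vbuf->point_size.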
+ */ + dst_offset = 0; + + for (i = 0; i < vbuf->vinfo->num_attribs; i++) { + unsigned emit_sz = 0; + unsigned src_buffer = 0; + unsigned output_format; + unsigned src_offset = (vbuf->vinfo->attrib[i].src_index * 4 * sizeof(float) ); + + switch (vbuf->vinfo->attrib[i].emit) { + case EMIT_4F: + output_format = PIPE_FORMAT_R32G32B32A32_FLOAT; + emit_sz = 4 * sizeof(float); + break; + case EMIT_3F: + output_format = PIPE_FORMAT_R32G32B32_FLOAT; + emit_sz = 3 * sizeof(float); + break; + case EMIT_2F: + output_format = PIPE_FORMAT_R32G32_FLOAT; + emit_sz = 2 * sizeof(float); + break; + case EMIT_1F: + output_format = PIPE_FORMAT_R32_FLOAT; + emit_sz = 1 * sizeof(float); + break; + case EMIT_1F_PSIZE: + output_format = PIPE_FORMAT_R32_FLOAT; + emit_sz = 1 * sizeof(float); + src_buffer = 1; + src_offset = 0; + break; + case EMIT_4UB: + output_format = PIPE_FORMAT_B8G8R8A8_UNORM; + emit_sz = 4 * sizeof(ubyte); + break; + default: + assert(0); + output_format = PIPE_FORMAT_NONE; + emit_sz = 0; + break; + } + + hw_key.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT; + hw_key.element[i].input_buffer = src_buffer; + hw_key.element[i].input_offset = src_offset; + hw_key.element[i].output_format = output_format; + hw_key.element[i].output_offset = dst_offset; + + dst_offset += emit_sz; + } + + hw_key.nr_elements = vbuf->vinfo->num_attribs; + hw_key.output_stride = vbuf->vinfo->size * 4; + + /* Don't bother with caching at this stage: + */ + if (!vbuf->translate || + translate_key_compare(&vbuf->translate->key, &hw_key) != 0) + { + translate_key_sanitize(&hw_key); + vbuf->translate = translate_cache_find(vbuf->cache, &hw_key); + + vbuf->translate->set_buffer(vbuf->translate, 1, &vbuf->point_size, 0); + } + + vbuf->point_size = vbuf->stage.draw->rasterizer->point_size; + + /* Allocate new buffer? + */ + if (!vbuf->vertices) + vbuf_alloc_vertices(vbuf); +} + + +static void +vbuf_first_tri( struct draw_stage *stage, + struct prim_header *prim ) +{ + struct vbuf_stage *vbuf = vbuf_stage( stage ); + + vbuf_flush_indices( vbuf ); + stage->tri = vbuf_tri; + vbuf_set_prim(vbuf, PIPE_PRIM_TRIANGLES); + stage->tri( stage, prim ); +} + + +static void +vbuf_first_line( struct draw_stage *stage, + struct prim_header *prim ) +{ + struct vbuf_stage *vbuf = vbuf_stage( stage ); + + vbuf_flush_indices( vbuf ); + stage->line = vbuf_line; + vbuf_set_prim(vbuf, PIPE_PRIM_LINES); + stage->line( stage, prim ); +} + + +static void +vbuf_first_point( struct draw_stage *stage, + struct prim_header *prim ) +{ + struct vbuf_stage *vbuf = vbuf_stage( stage ); + + vbuf_flush_indices( vbuf ); + stage->point = vbuf_point; + vbuf_set_prim(vbuf, PIPE_PRIM_POINTS); + stage->point( stage, prim ); +} + + +static void +vbuf_flush_indices( struct vbuf_stage *vbuf ) +{ + if(!vbuf->nr_indices) + return; + + assert((uint) (vbuf->vertex_ptr - vbuf->vertices) == + vbuf->nr_vertices * vbuf->vertex_size / sizeof(unsigned)); + + vbuf->render->draw(vbuf->render, vbuf->indices, vbuf->nr_indices); + + vbuf->nr_indices = 0; +} + + +/** + * Flush existing vertex buffer and allocate a new one. + * + * XXX: We separate flush-on-index-full and flush-on-vb-full, but may + * raise issues uploading vertices if the hardware wants to flush when + * we flush. 
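+ *
+ * Flushing vertices first flushes any pending indices, resets the
+ * temporary vertex ids and returns the buffer to the renderer via
+ * release_vertices().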
+ */ +static void +vbuf_flush_vertices( struct vbuf_stage *vbuf ) +{ + if(vbuf->vertices) { + vbuf_flush_indices(vbuf); + + /* Reset temporary vertices ids */ + if(vbuf->nr_vertices) + draw_reset_vertex_ids( vbuf->stage.draw ); + + /* Free the vertex buffer */ + vbuf->render->release_vertices(vbuf->render, + vbuf->vertices, + vbuf->vertex_size, + vbuf->nr_vertices); + vbuf->max_vertices = vbuf->nr_vertices = 0; + vbuf->vertex_ptr = vbuf->vertices = NULL; + + } +} + + +static void +vbuf_alloc_vertices( struct vbuf_stage *vbuf ) +{ + assert(!vbuf->nr_indices); + assert(!vbuf->vertices); + + /* Allocate a new vertex buffer */ + vbuf->max_vertices = vbuf->render->max_vertex_buffer_bytes / vbuf->vertex_size; + + /* even number */ + vbuf->max_vertices = vbuf->max_vertices & ~1; + + /* Must always succeed -- driver gives us a + * 'max_vertex_buffer_bytes' which it guarantees it can allocate, + * and it will flush itself if necessary to do so. If this does + * fail, we are basically without usable hardware. + */ + assert(vbuf->max_vertices < UNDEFINED_VERTEX_ID); + + vbuf->vertices = (uint *) vbuf->render->allocate_vertices(vbuf->render, + (ushort) vbuf->vertex_size, + (ushort) vbuf->max_vertices); + vbuf->vertex_ptr = vbuf->vertices; +} + + + +static void +vbuf_flush( struct draw_stage *stage, unsigned flags ) +{ + struct vbuf_stage *vbuf = vbuf_stage( stage ); + + vbuf_flush_indices( vbuf ); + + stage->point = vbuf_first_point; + stage->line = vbuf_first_line; + stage->tri = vbuf_first_tri; + + if (flags & DRAW_FLUSH_BACKEND) + vbuf_flush_vertices( vbuf ); +} + + +static void +vbuf_reset_stipple_counter( struct draw_stage *stage ) +{ + /* XXX: Need to do something here for hardware with linestipple. + */ + (void) stage; +} + + +static void vbuf_destroy( struct draw_stage *stage ) +{ + struct vbuf_stage *vbuf = vbuf_stage( stage ); + + if(vbuf->indices) + align_free( vbuf->indices ); + + if (vbuf->render) + vbuf->render->destroy( vbuf->render ); + + if (vbuf->cache) + translate_cache_destroy(vbuf->cache); + + FREE( stage ); +} + + +/** + * Create a new primitive vbuf/render stage. + */ +struct draw_stage *draw_vbuf_stage( struct draw_context *draw, + struct vbuf_render *render ) +{ + struct vbuf_stage *vbuf = CALLOC_STRUCT(vbuf_stage); + if (vbuf == NULL) + goto fail; + + vbuf->stage.draw = draw; + vbuf->stage.point = vbuf_first_point; + vbuf->stage.line = vbuf_first_line; + vbuf->stage.tri = vbuf_first_tri; + vbuf->stage.flush = vbuf_flush; + vbuf->stage.reset_stipple_counter = vbuf_reset_stipple_counter; + vbuf->stage.destroy = vbuf_destroy; + + vbuf->render = render; + vbuf->max_indices = MAX2(render->max_indices, UNDEFINED_VERTEX_ID-1); + + vbuf->indices = (ushort *) align_malloc( vbuf->max_indices * + sizeof(vbuf->indices[0]), + 16 ); + if (!vbuf->indices) + goto fail; + + vbuf->cache = translate_cache_create(); + if (!vbuf->cache) + goto fail; + + + vbuf->vertices = NULL; + vbuf->vertex_ptr = vbuf->vertices; + + return &vbuf->stage; + + fail: + if (vbuf) + vbuf_destroy(&vbuf->stage); + + return NULL; +} diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_line.c b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c new file mode 100644 index 0000000000..184e363594 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c @@ -0,0 +1,180 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "draw_private.h" +#include "draw_pipe.h" + + +struct wideline_stage { + struct draw_stage stage; + + float half_line_width; +}; + + + +static INLINE struct wideline_stage *wideline_stage( struct draw_stage *stage ) +{ + return (struct wideline_stage *)stage; +} + + + +/** + * Draw a wide line by drawing a quad (two triangles). + * XXX need to disable polygon stipple. + */ +static void wideline_line( struct draw_stage *stage, + struct prim_header *header ) +{ + /*const struct wideline_stage *wide = wideline_stage(stage);*/ + const unsigned pos = stage->draw->vs.position_output; + const float half_width = 0.5f * stage->draw->rasterizer->line_width; + + struct prim_header tri; + + struct vertex_header *v0 = dup_vert(stage, header->v[0], 0); + struct vertex_header *v1 = dup_vert(stage, header->v[0], 1); + struct vertex_header *v2 = dup_vert(stage, header->v[1], 2); + struct vertex_header *v3 = dup_vert(stage, header->v[1], 3); + + float *pos0 = v0->data[pos]; + float *pos1 = v1->data[pos]; + float *pos2 = v2->data[pos]; + float *pos3 = v3->data[pos]; + + const float dx = fabsf(pos0[0] - pos2[0]); + const float dy = fabsf(pos0[1] - pos2[1]); + + /* small tweak to meet GL specification */ + const float bias = 0.125f; + + /* + * Draw wide line as a quad (two tris) by "stretching" the line along + * X or Y. + * We need to tweak coords in several ways to be conformant here. 
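+ *
+ * The tweaks are the 0.125 bias and the half-pixel shifts applied below,
+ * which position the quad to satisfy GL's line rasterization rules.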
+ */ + + if (dx > dy) { + /* x-major line */ + pos0[1] = pos0[1] - half_width - bias; + pos1[1] = pos1[1] + half_width - bias; + pos2[1] = pos2[1] - half_width - bias; + pos3[1] = pos3[1] + half_width - bias; + if (pos0[0] < pos2[0]) { + /* left to right line */ + pos0[0] -= 0.5f; + pos1[0] -= 0.5f; + pos2[0] -= 0.5f; + pos3[0] -= 0.5f; + } + else { + /* right to left line */ + pos0[0] += 0.5f; + pos1[0] += 0.5f; + pos2[0] += 0.5f; + pos3[0] += 0.5f; + } + } + else { + /* y-major line */ + pos0[0] = pos0[0] - half_width + bias; + pos1[0] = pos1[0] + half_width + bias; + pos2[0] = pos2[0] - half_width + bias; + pos3[0] = pos3[0] + half_width + bias; + if (pos0[1] < pos2[1]) { + /* top to bottom line */ + pos0[1] -= 0.5f; + pos1[1] -= 0.5f; + pos2[1] -= 0.5f; + pos3[1] -= 0.5f; + } + else { + /* bottom to top line */ + pos0[1] += 0.5f; + pos1[1] += 0.5f; + pos2[1] += 0.5f; + pos3[1] += 0.5f; + } + } + + tri.det = header->det; /* only the sign matters */ + tri.v[0] = v0; + tri.v[1] = v2; + tri.v[2] = v3; + stage->next->tri( stage->next, &tri ); + + tri.v[0] = v0; + tri.v[1] = v3; + tri.v[2] = v1; + stage->next->tri( stage->next, &tri ); +} + + +static void wideline_flush( struct draw_stage *stage, unsigned flags ) +{ + stage->next->flush( stage->next, flags ); +} + + +static void wideline_reset_stipple_counter( struct draw_stage *stage ) +{ + stage->next->reset_stipple_counter( stage->next ); +} + + +static void wideline_destroy( struct draw_stage *stage ) +{ + draw_free_temp_verts( stage ); + FREE( stage ); +} + + +struct draw_stage *draw_wide_line_stage( struct draw_context *draw ) +{ + struct wideline_stage *wide = CALLOC_STRUCT(wideline_stage); + + draw_alloc_temp_verts( &wide->stage, 4 ); + + wide->stage.draw = draw; + wide->stage.next = NULL; + wide->stage.point = draw_pipe_passthrough_point; + wide->stage.line = wideline_line; + wide->stage.tri = draw_pipe_passthrough_tri; + wide->stage.flush = wideline_flush; + wide->stage.reset_stipple_counter = wideline_reset_stipple_counter; + wide->stage.destroy = wideline_destroy; + + return &wide->stage; +} diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c new file mode 100644 index 0000000000..e1af9e56a2 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c @@ -0,0 +1,297 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" +#include "draw_vs.h" +#include "draw_pipe.h" + + +struct widepoint_stage { + struct draw_stage stage; + + float half_point_size; + float point_size_min; + float point_size_max; + + float xbias; + float ybias; + + uint texcoord_slot[PIPE_MAX_SHADER_OUTPUTS]; + uint texcoord_mode[PIPE_MAX_SHADER_OUTPUTS]; + uint num_texcoords; + + int psize_slot; + + int point_coord_fs_input; /**< input for pointcoord (and fog) */ +}; + + + +static INLINE struct widepoint_stage * +widepoint_stage( struct draw_stage *stage ) +{ + return (struct widepoint_stage *)stage; +} + + + + +/** + * Set the vertex texcoords for sprite mode. + * Coords may be left untouched or set to a right-side-up or upside-down + * orientation. + */ +static void set_texcoords(const struct widepoint_stage *wide, + struct vertex_header *v, const float tc[4]) +{ + uint i; + for (i = 0; i < wide->num_texcoords; i++) { + if (wide->texcoord_mode[i] != PIPE_SPRITE_COORD_NONE) { + uint j = wide->texcoord_slot[i]; + v->data[j][0] = tc[0]; + if (wide->texcoord_mode[i] == PIPE_SPRITE_COORD_LOWER_LEFT) + v->data[j][1] = 1.0f - tc[1]; + else + v->data[j][1] = tc[1]; + v->data[j][2] = tc[2]; + v->data[j][3] = tc[3]; + } + } + + if (wide->point_coord_fs_input >= 0) { + /* put gl_PointCoord into extra vertex output's zw components */ + uint k = wide->stage.draw->extra_vp_outputs.slot; + v->data[k][2] = tc[0]; + v->data[k][3] = tc[1]; + } +} + + +/* If there are lots of sprite points (and why wouldn't there be?) it + * would probably be more sensible to change hardware setup to + * optimize this rather than doing the whole thing in software like + * this. + */ +static void widepoint_point( struct draw_stage *stage, + struct prim_header *header ) +{ + const struct widepoint_stage *wide = widepoint_stage(stage); + const unsigned pos = stage->draw->vs.position_output; + const boolean sprite = (boolean) stage->draw->rasterizer->point_sprite; + float half_size; + float left_adj, right_adj, bot_adj, top_adj; + + struct prim_header tri; + + /* four dups of original vertex */ + struct vertex_header *v0 = dup_vert(stage, header->v[0], 0); + struct vertex_header *v1 = dup_vert(stage, header->v[0], 1); + struct vertex_header *v2 = dup_vert(stage, header->v[0], 2); + struct vertex_header *v3 = dup_vert(stage, header->v[0], 3); + + float *pos0 = v0->data[pos]; + float *pos1 = v1->data[pos]; + float *pos2 = v2->data[pos]; + float *pos3 = v3->data[pos]; + + /* point size is either per-vertex or fixed size */ + if (wide->psize_slot >= 0) { + half_size = header->v[0]->data[wide->psize_slot][0]; + + /* XXX: temporary -- do this in the vertex shader?? 
+ */ + half_size = CLAMP(half_size, + wide->point_size_min, + wide->point_size_max); + + half_size *= 0.5f; + } + else { + half_size = wide->half_point_size; + } + + left_adj = -half_size + wide->xbias; + right_adj = half_size + wide->xbias; + bot_adj = half_size + wide->ybias; + top_adj = -half_size + wide->ybias; + + pos0[0] += left_adj; + pos0[1] += top_adj; + + pos1[0] += left_adj; + pos1[1] += bot_adj; + + pos2[0] += right_adj; + pos2[1] += top_adj; + + pos3[0] += right_adj; + pos3[1] += bot_adj; + + if (sprite) { + static const float tex00[4] = { 0, 0, 0, 1 }; + static const float tex01[4] = { 0, 1, 0, 1 }; + static const float tex11[4] = { 1, 1, 0, 1 }; + static const float tex10[4] = { 1, 0, 0, 1 }; + set_texcoords( wide, v0, tex00 ); + set_texcoords( wide, v1, tex01 ); + set_texcoords( wide, v2, tex10 ); + set_texcoords( wide, v3, tex11 ); + } + + tri.det = header->det; /* only the sign matters */ + tri.v[0] = v0; + tri.v[1] = v2; + tri.v[2] = v3; + stage->next->tri( stage->next, &tri ); + + tri.v[0] = v0; + tri.v[1] = v3; + tri.v[2] = v1; + stage->next->tri( stage->next, &tri ); +} + + +static void widepoint_first_point( struct draw_stage *stage, + struct prim_header *header ) +{ + struct widepoint_stage *wide = widepoint_stage(stage); + struct draw_context *draw = stage->draw; + + wide->half_point_size = 0.5f * draw->rasterizer->point_size; + wide->point_size_min = draw->rasterizer->point_size_min; + wide->point_size_max = draw->rasterizer->point_size_max; + wide->xbias = 0.0; + wide->ybias = 0.0; + + if (draw->rasterizer->gl_rasterization_rules) { + wide->xbias = 0.125; + } + + /* XXX we won't know the real size if it's computed by the vertex shader! */ + if ((draw->rasterizer->point_size > draw->pipeline.wide_point_threshold) || + (draw->rasterizer->point_sprite && draw->pipeline.point_sprite)) { + stage->point = widepoint_point; + } + else { + stage->point = draw_pipe_passthrough_point; + } + + if (draw->rasterizer->point_sprite) { + /* find vertex shader texcoord outputs */ + const struct draw_vertex_shader *vs = draw->vs.vertex_shader; + uint i, j = 0; + for (i = 0; i < vs->info.num_outputs; i++) { + if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_GENERIC) { + wide->texcoord_slot[j] = i; + wide->texcoord_mode[j] = draw->rasterizer->sprite_coord_mode[j]; + j++; + } + } + wide->num_texcoords = j; + + /* find fragment shader PointCoord/Fog input */ + wide->point_coord_fs_input = 0; /* XXX fix this! 
*/ + + /* setup extra vp output (point coord implemented as a texcoord) */ + draw->extra_vp_outputs.semantic_name = TGSI_SEMANTIC_GENERIC; + draw->extra_vp_outputs.semantic_index = 0; + draw->extra_vp_outputs.slot = draw->vs.num_vs_outputs; + } + else { + wide->point_coord_fs_input = -1; + draw->extra_vp_outputs.slot = 0; + } + + wide->psize_slot = -1; + if (draw->rasterizer->point_size_per_vertex) { + /* find PSIZ vertex output */ + const struct draw_vertex_shader *vs = draw->vs.vertex_shader; + uint i; + for (i = 0; i < vs->info.num_outputs; i++) { + if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_PSIZE) { + wide->psize_slot = i; + break; + } + } + } + + stage->point( stage, header ); +} + + +static void widepoint_flush( struct draw_stage *stage, unsigned flags ) +{ + stage->point = widepoint_first_point; + stage->next->flush( stage->next, flags ); +} + + +static void widepoint_reset_stipple_counter( struct draw_stage *stage ) +{ + stage->next->reset_stipple_counter( stage->next ); +} + + +static void widepoint_destroy( struct draw_stage *stage ) +{ + draw_free_temp_verts( stage ); + FREE( stage ); +} + + +struct draw_stage *draw_wide_point_stage( struct draw_context *draw ) +{ + struct widepoint_stage *wide = CALLOC_STRUCT(widepoint_stage); + if (wide == NULL) + goto fail; + + if (!draw_alloc_temp_verts( &wide->stage, 4 )) + goto fail; + + wide->stage.draw = draw; + wide->stage.next = NULL; + wide->stage.point = widepoint_first_point; + wide->stage.line = draw_pipe_passthrough_line; + wide->stage.tri = draw_pipe_passthrough_tri; + wide->stage.flush = widepoint_flush; + wide->stage.reset_stipple_counter = widepoint_reset_stipple_counter; + wide->stage.destroy = widepoint_destroy; + + return &wide->stage; + + fail: + if (wide) + wide->stage.destroy( &wide->stage ); + + return NULL; +} diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h new file mode 100644 index 0000000000..81e4eae401 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -0,0 +1,318 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Private data structures, etc for the draw module. 
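+ * Defines the vertex_header layout, the draw_context structure shared by
+ * the vertex and primitive paths, and the DRAW_PIPE_* element flags.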
+ */ + + +/** + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + * Brian Paul + */ + + +#ifndef DRAW_PRIVATE_H +#define DRAW_PRIVATE_H + + +#include "pipe/p_state.h" +#include "pipe/p_defines.h" + +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_scan.h" + + +struct pipe_context; +struct gallivm_prog; +struct gallivm_cpu_engine; +struct draw_vertex_shader; +struct draw_context; +struct draw_stage; +struct vbuf_render; + + +/** + * Basic vertex info. + * Carry some useful information around with the vertices in the prim pipe. + */ +struct vertex_header { + unsigned clipmask:12; + unsigned edgeflag:1; + unsigned pad:3; + unsigned vertex_id:16; + + float clip[4]; + + /* This will probably become float (*data)[4] soon: + */ + float data[][4]; +}; + +/* NOTE: It should match vertex_id size above */ +#define UNDEFINED_VERTEX_ID 0xffff + + +/** + * Private context for the drawing module. + */ +struct draw_context +{ + /** Drawing/primitive pipeline stages */ + struct { + struct draw_stage *first; /**< one of the following */ + + struct draw_stage *validate; + + /* stages (in logical order) */ + struct draw_stage *flatshade; + struct draw_stage *clip; + struct draw_stage *cull; + struct draw_stage *twoside; + struct draw_stage *offset; + struct draw_stage *unfilled; + struct draw_stage *stipple; + struct draw_stage *aapoint; + struct draw_stage *aaline; + struct draw_stage *pstipple; + struct draw_stage *wide_line; + struct draw_stage *wide_point; + struct draw_stage *rasterize; + + float wide_point_threshold; /**< convert pnts to tris if larger than this */ + float wide_line_threshold; /**< convert lines to tris if wider than this */ + boolean line_stipple; /**< do line stipple? */ + boolean point_sprite; /**< convert points to quads for sprites? 
*/ + + /* Temporary storage while the pipeline is being run: + */ + char *verts; + unsigned vertex_stride; + unsigned vertex_count; + } pipeline; + + + struct vbuf_render *render; + + /* Support prototype passthrough path: + */ + struct { + struct { + struct draw_pt_middle_end *fetch_emit; + struct draw_pt_middle_end *fetch_shade_emit; + struct draw_pt_middle_end *general; + } middle; + + struct { + struct draw_pt_front_end *vcache; + struct draw_pt_front_end *varray; + } front; + + struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; + unsigned nr_vertex_buffers; + + struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS]; + unsigned nr_vertex_elements; + + /* user-space vertex data, buffers */ + struct { + const unsigned *edgeflag; + + /** vertex element/index buffer (ex: glDrawElements) */ + const void *elts; + /** bytes per index (0, 1, 2 or 4) */ + unsigned eltSize; + unsigned min_index; + unsigned max_index; + + /** vertex arrays */ + const void *vbuffer[PIPE_MAX_ATTRIBS]; + + /** constant buffer (for vertex shader) */ + const void *constants; + } user; + + boolean test_fse; /* enable FSE even though its not correct (eg for softpipe) */ + boolean no_fse; /* disable FSE even when it is correct */ + } pt; + + struct { + boolean bypass_clipping; + boolean bypass_vs; + } driver; + + boolean flushing; /**< debugging/sanity */ + boolean suspend_flushing; /**< internally set */ + boolean bypass_clipping; /**< set if either api or driver bypass_clipping true */ + + boolean force_passthrough; /**< never clip or shade */ + + double mrd; /**< minimum resolvable depth value, for polygon offset */ + + /* pipe state that we need: */ + const struct pipe_rasterizer_state *rasterizer; + struct pipe_viewport_state viewport; + boolean identity_viewport; + + struct { + struct draw_vertex_shader *vertex_shader; + uint num_vs_outputs; /**< convenience, from vertex_shader */ + uint position_output; + + /** TGSI program interpreter runtime state */ + struct tgsi_exec_machine machine; + + uint num_samplers; + struct tgsi_sampler **samplers; + + /* This (and the tgsi_exec_machine struct) probably need to be moved somewhere private. 
+ */ + struct gallivm_cpu_engine *engine; + + /* Here's another one: + */ + struct aos_machine *aos_machine; + + + const float (*aligned_constants)[4]; + + const float (*aligned_constant_storage)[4]; + unsigned const_storage_size; + + + struct translate *fetch; + struct translate_cache *fetch_cache; + struct translate *emit; + struct translate_cache *emit_cache; + } vs; + + /* Clip derived state: + */ + float plane[12][4]; + unsigned nr_planes; + + /* If a prim stage introduces new vertex attributes, they'll be stored here + */ + struct { + uint semantic_name; + uint semantic_index; + int slot; + } extra_vp_outputs; + + unsigned reduced_prim; + + void *driver_private; +}; + + +/******************************************************************************* + * Vertex shader code: + */ +boolean draw_vs_init( struct draw_context *draw ); +void draw_vs_destroy( struct draw_context *draw ); + +void draw_vs_set_viewport( struct draw_context *, + const struct pipe_viewport_state * ); + +void draw_vs_set_constants( struct draw_context *, + const float (*constants)[4], + unsigned size ); + + + + +/******************************************************************************* + * Vertex processing (was passthrough) code: + */ +boolean draw_pt_init( struct draw_context *draw ); +void draw_pt_destroy( struct draw_context *draw ); +void draw_pt_reset_vertex_ids( struct draw_context *draw ); + + +/******************************************************************************* + * Primitive processing (pipeline) code: + */ + +boolean draw_pipeline_init( struct draw_context *draw ); +void draw_pipeline_destroy( struct draw_context *draw ); + + + + + +/* We use the top few bits in the elts[] parameter to convey a little + * API information. This limits the number of vertices we can address + * to only 4096 -- if that becomes a problem, we can switch to 32-bit + * draw indices. + * + * These flags expected at first vertex of lines & triangles when + * unfilled and/or line stipple modes are operational. + */ +#define DRAW_PIPE_MAX_VERTICES (0x1<<12) +#define DRAW_PIPE_EDGE_FLAG_0 (0x1<<12) +#define DRAW_PIPE_EDGE_FLAG_1 (0x2<<12) +#define DRAW_PIPE_EDGE_FLAG_2 (0x4<<12) +#define DRAW_PIPE_EDGE_FLAG_ALL (0x7<<12) +#define DRAW_PIPE_RESET_STIPPLE (0x8<<12) +#define DRAW_PIPE_FLAG_MASK (0xf<<12) + +void draw_pipeline_run( struct draw_context *draw, + unsigned prim, + struct vertex_header *vertices, + unsigned vertex_count, + unsigned stride, + const ushort *elts, + unsigned count ); + +void draw_pipeline_run_linear( struct draw_context *draw, + unsigned prim, + struct vertex_header *vertices, + unsigned count, + unsigned stride ); + + + +void draw_pipeline_flush( struct draw_context *draw, + unsigned flags ); + + + +/******************************************************************************* + * Flushing + */ + +#define DRAW_FLUSH_STATE_CHANGE 0x8 +#define DRAW_FLUSH_BACKEND 0x10 + + +void draw_do_flush( struct draw_context *draw, unsigned flags ); + + + + +#endif /* DRAW_PRIVATE_H */ diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c new file mode 100644 index 0000000000..18f24e5980 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt.c @@ -0,0 +1,323 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_pt.h" +#include "draw/draw_vs.h" +#include "tgsi/tgsi_dump.h" +#include "util/u_math.h" + +static unsigned trim( unsigned count, unsigned first, unsigned incr ) +{ + if (count < first) + return 0; + return count - (count - first) % incr; +} + + + +/* Overall we split things into: + * - frontend -- prepare fetch_elts, draw_elts - eg vcache + * - middle -- fetch, shade, cliptest, viewport + * - pipeline -- the prim pipeline: clipping, wide lines, etc + * - backend -- the vbuf_render provided by the driver. 
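+ *
+ * draw_pt_arrays() below computes the PT_* flags for the current state and
+ * uses them (plus whether the draw is indexed) to pick a frontend and a
+ * middle end.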
+ */ +static boolean +draw_pt_arrays(struct draw_context *draw, + unsigned prim, + unsigned start, + unsigned count) +{ + struct draw_pt_front_end *frontend = NULL; + struct draw_pt_middle_end *middle = NULL; + unsigned opt = 0; + + /* Sanitize primitive length: + */ + { + unsigned first, incr; + draw_pt_split_prim(prim, &first, &incr); + count = trim(count, first, incr); + if (count < first) + return TRUE; + } + + if (!draw->force_passthrough) { + if (!draw->render) { + opt |= PT_PIPELINE; + } + + if (draw_need_pipeline(draw, + draw->rasterizer, + prim)) { + opt |= PT_PIPELINE; + } + + if (!draw->bypass_clipping && !draw->pt.test_fse) { + opt |= PT_CLIPTEST; + } + + if (!draw->rasterizer->bypass_vs) { + opt |= PT_SHADE; + } + } + + if (opt == 0) + middle = draw->pt.middle.fetch_emit; + else if (opt == PT_SHADE && !draw->pt.no_fse) + middle = draw->pt.middle.fetch_shade_emit; + else + middle = draw->pt.middle.general; + + + /* Pick the right frontend + */ + if (draw->pt.user.elts || (opt & PT_PIPELINE)) { + frontend = draw->pt.front.vcache; + } else { + frontend = draw->pt.front.varray; + } + + frontend->prepare( frontend, prim, middle, opt ); + + frontend->run(frontend, + draw_pt_elt_func(draw), + draw_pt_elt_ptr(draw, start), + count); + + frontend->finish( frontend ); + + return TRUE; +} + + +boolean draw_pt_init( struct draw_context *draw ) +{ + draw->pt.test_fse = debug_get_bool_option("DRAW_FSE", FALSE); + draw->pt.no_fse = debug_get_bool_option("DRAW_NO_FSE", FALSE); + + draw->pt.front.vcache = draw_pt_vcache( draw ); + if (!draw->pt.front.vcache) + return FALSE; + + draw->pt.front.varray = draw_pt_varray(draw); + if (!draw->pt.front.varray) + return FALSE; + + draw->pt.middle.fetch_emit = draw_pt_fetch_emit( draw ); + if (!draw->pt.middle.fetch_emit) + return FALSE; + + draw->pt.middle.fetch_shade_emit = draw_pt_middle_fse( draw ); + if (!draw->pt.middle.fetch_shade_emit) + return FALSE; + + draw->pt.middle.general = draw_pt_fetch_pipeline_or_emit( draw ); + if (!draw->pt.middle.general) + return FALSE; + + return TRUE; +} + + +void draw_pt_destroy( struct draw_context *draw ) +{ + if (draw->pt.middle.general) { + draw->pt.middle.general->destroy( draw->pt.middle.general ); + draw->pt.middle.general = NULL; + } + + if (draw->pt.middle.fetch_emit) { + draw->pt.middle.fetch_emit->destroy( draw->pt.middle.fetch_emit ); + draw->pt.middle.fetch_emit = NULL; + } + + if (draw->pt.middle.fetch_shade_emit) { + draw->pt.middle.fetch_shade_emit->destroy( draw->pt.middle.fetch_shade_emit ); + draw->pt.middle.fetch_shade_emit = NULL; + } + + if (draw->pt.front.vcache) { + draw->pt.front.vcache->destroy( draw->pt.front.vcache ); + draw->pt.front.vcache = NULL; + } + + if (draw->pt.front.varray) { + draw->pt.front.varray->destroy( draw->pt.front.varray ); + draw->pt.front.varray = NULL; + } +} + + +/** + * Debug- print the first 'count' vertices. 
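+ * For indexed draws the element is decoded first; each vertex element's
+ * data is then dumped with debug_printf() (only float formats are decoded).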
+ */ +static void +draw_print_arrays(struct draw_context *draw, uint prim, int start, uint count) +{ + uint i; + + debug_printf("Draw arrays(prim = %u, start = %u, count = %u)\n", + prim, start, count); + + for (i = 0; i < count; i++) { + uint ii, j; + + if (draw->pt.user.elts) { + /* indexed arrays */ + switch (draw->pt.user.eltSize) { + case 1: + { + const ubyte *elem = (const ubyte *) draw->pt.user.elts; + ii = elem[start + i]; + } + break; + case 2: + { + const ushort *elem = (const ushort *) draw->pt.user.elts; + ii = elem[start + i]; + } + break; + case 4: + { + const uint *elem = (const uint *) draw->pt.user.elts; + ii = elem[start + i]; + } + break; + default: + assert(0); + } + debug_printf("Element[%u + %u] -> Vertex %u:\n", start, i, ii); + } + else { + /* non-indexed arrays */ + ii = start + i; + debug_printf("Vertex %u:\n", ii); + } + + for (j = 0; j < draw->pt.nr_vertex_elements; j++) { + uint buf = draw->pt.vertex_element[j].vertex_buffer_index; + ubyte *ptr = (ubyte *) draw->pt.user.vbuffer[buf]; + ptr += draw->pt.vertex_buffer[buf].pitch * ii; + ptr += draw->pt.vertex_element[j].src_offset; + + debug_printf(" Attr %u: ", j); + switch (draw->pt.vertex_element[j].src_format) { + case PIPE_FORMAT_R32_FLOAT: + { + float *v = (float *) ptr; + debug_printf("%f @ %p\n", v[0], (void *) v); + } + break; + case PIPE_FORMAT_R32G32_FLOAT: + { + float *v = (float *) ptr; + debug_printf("%f %f @ %p\n", v[0], v[1], (void *) v); + } + break; + case PIPE_FORMAT_R32G32B32_FLOAT: + { + float *v = (float *) ptr; + debug_printf("%f %f %f @ %p\n", v[0], v[1], v[2], (void *) v); + } + break; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + { + float *v = (float *) ptr; + debug_printf("%f %f %f %f @ %p\n", v[0], v[1], v[2], v[3], + (void *) v); + } + break; + default: + debug_printf("other format (fix me)\n"); + ; + } + } + } +} + + +/** + * Draw vertex arrays + * This is the main entrypoint into the drawing module. 
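+ * Flushes the pipeline when the reduced primitive changes, then passes the
+ * range on to draw_pt_arrays().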
+ * \param prim one of PIPE_PRIM_x + * \param start index of first vertex to draw + * \param count number of vertices to draw + */ +void +draw_arrays(struct draw_context *draw, unsigned prim, + unsigned start, unsigned count) +{ + unsigned reduced_prim = draw_pt_reduced_prim(prim); + if (reduced_prim != draw->reduced_prim) { + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + draw->reduced_prim = reduced_prim; + } + + if (0) + draw_print_arrays(draw, prim, start, MIN2(count, 20)); + +#if 0 + { + int i; + debug_printf("draw_arrays(prim=%u start=%u count=%u):\n", + prim, start, count); + tgsi_dump(draw->vs.vertex_shader->state.tokens, 0); + debug_printf("Elements:\n"); + for (i = 0; i < draw->pt.nr_vertex_elements; i++) { + debug_printf(" format=%s comps=%u\n", + pf_name(draw->pt.vertex_element[i].src_format), + draw->pt.vertex_element[i].nr_components); + } + debug_printf("Buffers:\n"); + for (i = 0; i < draw->pt.nr_vertex_buffers; i++) { + debug_printf(" pitch=%u offset=%u ptr=%p\n", + draw->pt.vertex_buffer[i].pitch, + draw->pt.vertex_buffer[i].buffer_offset, + draw->pt.user.vbuffer[i]); + } + } +#endif + + /* drawing done here: */ + draw_pt_arrays(draw, prim, start, count); +} + +boolean draw_pt_get_edgeflag( struct draw_context *draw, + unsigned idx ) +{ + if (draw->pt.user.edgeflag) + return (draw->pt.user.edgeflag[idx/32] & (1 << (idx%32))) != 0; + else + return 1; +} diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h new file mode 100644 index 0000000000..c02f229110 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt.h @@ -0,0 +1,235 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef DRAW_PT_H +#define DRAW_PT_H + +#include "pipe/p_compiler.h" + +typedef unsigned (*pt_elt_func)( const void *elts, unsigned idx ); + +struct draw_pt_middle_end; +struct draw_context; + + +#define PT_SHADE 0x1 +#define PT_CLIPTEST 0x2 +#define PT_PIPELINE 0x4 +#define PT_MAX_MIDDLE 0x8 + + +/* The "front end" - prepare sets of fetch, draw elements for the + * middle end. 
+ * + * Currenly one version of this: + * - vcache - catchall implementation, decomposes to TRI/LINE/POINT prims + * Later: + * - varray, varray_split + * - velement, velement_split + * + * Currenly only using the vcache version. + */ +struct draw_pt_front_end { + void (*prepare)( struct draw_pt_front_end *, + unsigned prim, + struct draw_pt_middle_end *, + unsigned opt ); + + void (*run)( struct draw_pt_front_end *, + pt_elt_func elt_func, + const void *elt_ptr, + unsigned count ); + + void (*finish)( struct draw_pt_front_end * ); + void (*destroy)( struct draw_pt_front_end * ); +}; + + +/* The "middle end" - prepares actual hardware vertices for the + * hardware backend. + * + * Currently two versions of this: + * - fetch, vertex shade, cliptest, prim-pipeline + * - fetch, emit (ie passthrough) + */ +struct draw_pt_middle_end { + void (*prepare)( struct draw_pt_middle_end *, + unsigned prim, + unsigned opt, + unsigned *max_vertices ); + + void (*run)( struct draw_pt_middle_end *, + const unsigned *fetch_elts, + unsigned fetch_count, + const ushort *draw_elts, + unsigned draw_count ); + + void (*run_linear)(struct draw_pt_middle_end *, + unsigned start, + unsigned count); + + /* Transform all vertices in a linear range and then draw them with + * the supplied element list. May fail and return FALSE. + */ + boolean (*run_linear_elts)( struct draw_pt_middle_end *, + unsigned fetch_start, + unsigned fetch_count, + const ushort *draw_elts, + unsigned draw_count ); + + int (*get_max_vertex_count)( struct draw_pt_middle_end * ); + + void (*finish)( struct draw_pt_middle_end * ); + void (*destroy)( struct draw_pt_middle_end * ); +}; + + +/* The "back end" - supplied by the driver, defined in draw_vbuf.h. + */ +struct vbuf_render; +struct vertex_header; + + +/* Helper functions. + */ +pt_elt_func draw_pt_elt_func( struct draw_context *draw ); +const void *draw_pt_elt_ptr( struct draw_context *draw, + unsigned start ); + +/* Frontends: + * + * Currently only the general-purpose vcache implementation, could add + * a special case for tiny vertex buffers. + */ +struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw ); +struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw); + + +/* Middle-ends: + * + * Currently one general-purpose case which can do all possibilities, + * at the slight expense of creating a vertex_header in some cases + * unecessarily. + * + * The special case fetch_emit code avoids pipeline vertices + * altogether and builds hardware vertices directly from API + * vertex_elements. 
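+ *
+ * There is also a fetch_shade_emit (fse) middle end which runs the vertex
+ * shader but still emits hardware vertices directly; it is picked when
+ * shading is the only step required (see draw_pt_arrays()).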
+ */ +struct draw_pt_middle_end *draw_pt_fetch_emit( struct draw_context *draw ); +struct draw_pt_middle_end *draw_pt_middle_fse( struct draw_context *draw ); +struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit(struct draw_context *draw); + + +/* More helpers: + */ +boolean draw_pt_get_edgeflag( struct draw_context *draw, + unsigned idx ); + + +/******************************************************************************* + * HW vertex emit: + */ +struct pt_emit; + +void draw_pt_emit_prepare( struct pt_emit *emit, + unsigned prim, + unsigned *max_vertices ); + +void draw_pt_emit( struct pt_emit *emit, + const float (*vertex_data)[4], + unsigned vertex_count, + unsigned stride, + const ushort *elts, + unsigned count ); + +void draw_pt_emit_linear( struct pt_emit *emit, + const float (*vertex_data)[4], + unsigned vertex_count, + unsigned stride, + unsigned start, + unsigned count ); + +void draw_pt_emit_destroy( struct pt_emit *emit ); + +struct pt_emit *draw_pt_emit_create( struct draw_context *draw ); + + +/******************************************************************************* + * API vertex fetch: + */ + +struct pt_fetch; +void draw_pt_fetch_prepare( struct pt_fetch *fetch, + unsigned vertex_size ); + +void draw_pt_fetch_run( struct pt_fetch *fetch, + const unsigned *elts, + unsigned count, + char *verts ); + +void draw_pt_fetch_run_linear( struct pt_fetch *fetch, + unsigned start, + unsigned count, + char *verts ); + +void draw_pt_fetch_destroy( struct pt_fetch *fetch ); + +struct pt_fetch *draw_pt_fetch_create( struct draw_context *draw ); + +/******************************************************************************* + * Post-VS: cliptest, rhw, viewport + */ +struct pt_post_vs; + +boolean draw_pt_post_vs_run( struct pt_post_vs *pvs, + struct vertex_header *pipeline_verts, + unsigned stride, + unsigned count ); + +void draw_pt_post_vs_prepare( struct pt_post_vs *pvs, + boolean bypass_clipping, + boolean identity_viewport, + boolean opengl ); + +struct pt_post_vs *draw_pt_post_vs_create( struct draw_context *draw ); + +void draw_pt_post_vs_destroy( struct pt_post_vs *pvs ); + + +/******************************************************************************* + * Utils: + */ +void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr); +unsigned draw_pt_reduced_prim(unsigned prim); + + +#endif diff --git a/src/gallium/auxiliary/draw/draw_pt_decompose.h b/src/gallium/auxiliary/draw/draw_pt_decompose.h new file mode 100644 index 0000000000..3fb0695687 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_decompose.h @@ -0,0 +1,153 @@ + + +static void FUNC( ARGS, + unsigned count ) +{ + LOCAL_VARS; + + switch (prim) { + case PIPE_PRIM_POINTS: + for (i = 0; i < count; i ++) { + POINT( (i + 0) ); + } + break; + + case PIPE_PRIM_LINES: + for (i = 0; i+1 < count; i += 2) { + LINE( DRAW_PIPE_RESET_STIPPLE, + (i + 0), + (i + 1)); + } + break; + + case PIPE_PRIM_LINE_LOOP: + if (count >= 2) { + flags = DRAW_PIPE_RESET_STIPPLE; + + for (i = 1; i < count; i++, flags = 0) { + LINE( flags, + (i - 1), + (i )); + } + + LINE( flags, + (i - 1), + (0 )); + } + break; + + case PIPE_PRIM_LINE_STRIP: + flags = DRAW_PIPE_RESET_STIPPLE; + for (i = 1; i < count; i++, flags = 0) { + LINE( flags, + (i - 1), + (i )); + } + break; + + case PIPE_PRIM_TRIANGLES: + for (i = 0; i+2 < count; i += 3) { + TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, + (i + 0), + (i + 1), + (i + 2 )); + } + break; + + case PIPE_PRIM_TRIANGLE_STRIP: + if (flatfirst) { + for (i = 0; i+2 < 
count; i++) { + TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, + (i + 0), + (i + 1 + (i&1)), + (i + 2 - (i&1))); + } + } + else { + for (i = 0; i+2 < count; i++) { + TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, + (i + 0 + (i&1)), + (i + 1 - (i&1)), + (i + 2 )); + } + } + break; + + case PIPE_PRIM_TRIANGLE_FAN: + if (count >= 3) { + if (flatfirst) { + for (i = 0; i+2 < count; i++) { + TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, + (i + 1), + (i + 2), + (0 )); + } + } + else { + for (i = 0; i+2 < count; i++) { + TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, + (0), + (i + 1), + (i + 2 )); + } + } + } + break; + + + case PIPE_PRIM_QUADS: + for (i = 0; i+3 < count; i += 4) { + QUAD( (i + 0), + (i + 1), + (i + 2), + (i + 3)); + } + break; + + case PIPE_PRIM_QUAD_STRIP: + for (i = 0; i+3 < count; i += 2) { + QUAD( (i + 2), + (i + 0), + (i + 1), + (i + 3)); + } + break; + + case PIPE_PRIM_POLYGON: + { + /* These bitflags look a little odd because we submit the + * vertices as (1,2,0) to satisfy flatshade requirements. + */ + const ushort edge_first = DRAW_PIPE_EDGE_FLAG_2; + const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0; + const ushort edge_last = DRAW_PIPE_EDGE_FLAG_1; + + flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle; + + for (i = 0; i+2 < count; i++, flags = edge_middle) { + + if (i + 3 == count) + flags |= edge_last; + + TRIANGLE( flags, + (i + 1), + (i + 2), + (0)); + } + } + break; + + default: + assert(0); + break; + } + + FLUSH; +} + + +#undef TRIANGLE +#undef QUAD +#undef POINT +#undef LINE +#undef FUNC diff --git a/src/gallium/auxiliary/draw/draw_pt_elts.c b/src/gallium/auxiliary/draw/draw_pt_elts.c new file mode 100644 index 0000000000..b7780fb507 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_elts.c @@ -0,0 +1,88 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "draw/draw_pt.h" +#include "draw/draw_private.h" + +/* Neat get_elt func that also works for varrays drawing by encoding + * the start value into a pointer. 
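+ *
+ * For non-indexed (varray) drawing there is no real element list:
+ * draw_pt_elt_ptr() encodes the start index as a fake pointer and
+ * elt_vert() decodes it again, roughly
+ *
+ *    elts = (const ubyte *)NULL + start;                      -- encode
+ *    idx  = ((const ubyte *)elts - (const ubyte *)NULL) + i;  -- decode
+ *
+ * so one pt_elt_func signature covers ubyte/ushort/uint index buffers
+ * as well as plain linear ranges.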
+ */ + +static unsigned elt_uint( const void *elts, unsigned idx ) +{ + return *(((const uint *)elts) + idx); +} + +static unsigned elt_ushort( const void *elts, unsigned idx ) +{ + return *(((const ushort *)elts) + idx); +} + +static unsigned elt_ubyte( const void *elts, unsigned idx ) +{ + return *(((const ubyte *)elts) + idx); +} + +static unsigned elt_vert( const void *elts, unsigned idx ) +{ + return (const ubyte *)elts - (const ubyte *)NULL + idx; +} + +pt_elt_func draw_pt_elt_func( struct draw_context *draw ) +{ + switch (draw->pt.user.eltSize) { + case 0: return &elt_vert; + case 1: return &elt_ubyte; + case 2: return &elt_ushort; + case 4: return &elt_uint; + default: return NULL; + } +} + +const void *draw_pt_elt_ptr( struct draw_context *draw, + unsigned start ) +{ + const char *elts = draw->pt.user.elts; + + switch (draw->pt.user.eltSize) { + case 0: + return (const void *)(((const ubyte *)NULL) + start); + case 1: + return (const void *)(((const ubyte *)elts) + start); + case 2: + return (const void *)(((const ushort *)elts) + start); + case 4: + return (const void *)(((const uint *)elts) + start); + default: + return NULL; + } +} diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c new file mode 100644 index 0000000000..232dfdaed2 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_emit.c @@ -0,0 +1,308 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_memory.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_vbuf.h" +#include "draw/draw_vertex.h" +#include "draw/draw_pt.h" +#include "translate/translate.h" +#include "translate/translate_cache.h" + +struct pt_emit { + struct draw_context *draw; + + struct translate *translate; + + struct translate_cache *cache; + unsigned prim; + + const struct vertex_info *vinfo; +}; + +void draw_pt_emit_prepare( struct pt_emit *emit, + unsigned prim, + unsigned *max_vertices ) +{ + struct draw_context *draw = emit->draw; + const struct vertex_info *vinfo; + unsigned dst_offset; + struct translate_key hw_key; + unsigned i; + boolean ok; + + /* XXX: need to flush to get prim_vbuf.c to release its allocation?? 
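+    *
+    * Roughly, the rest of this prepare step builds a translate key that
+    * maps each float[4] vertex-shader output (buffer 0) plus the constant
+    * point size (buffer 1, for EMIT_1F_PSIZE) onto the hardware vertex
+    * layout described by the backend's vertex_info.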
+ */ + draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + + + /* XXX: may need to defensively reset this later on as clipping can + * clobber this state in the render backend. + */ + emit->prim = prim; + + ok = draw->render->set_primitive(draw->render, emit->prim); + if (!ok) { + assert(0); + return; + } + + /* Must do this after set_primitive() above: + */ + emit->vinfo = vinfo = draw->render->get_vertex_info(draw->render); + + + /* Translate from pipeline vertices to hw vertices. + */ + dst_offset = 0; + for (i = 0; i < vinfo->num_attribs; i++) { + unsigned emit_sz = 0; + unsigned src_buffer = 0; + unsigned output_format; + unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float) ); + + + + switch (vinfo->attrib[i].emit) { + case EMIT_4F: + output_format = PIPE_FORMAT_R32G32B32A32_FLOAT; + emit_sz = 4 * sizeof(float); + break; + case EMIT_3F: + output_format = PIPE_FORMAT_R32G32B32_FLOAT; + emit_sz = 3 * sizeof(float); + break; + case EMIT_2F: + output_format = PIPE_FORMAT_R32G32_FLOAT; + emit_sz = 2 * sizeof(float); + break; + case EMIT_1F: + output_format = PIPE_FORMAT_R32_FLOAT; + emit_sz = 1 * sizeof(float); + break; + case EMIT_1F_PSIZE: + output_format = PIPE_FORMAT_R32_FLOAT; + emit_sz = 1 * sizeof(float); + src_buffer = 1; + src_offset = 0; + break; + case EMIT_4UB: + output_format = PIPE_FORMAT_B8G8R8A8_UNORM; + emit_sz = 4 * sizeof(ubyte); + break; + default: + assert(0); + output_format = PIPE_FORMAT_NONE; + emit_sz = 0; + break; + } + + hw_key.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT; + hw_key.element[i].input_buffer = src_buffer; + hw_key.element[i].input_offset = src_offset; + hw_key.element[i].output_format = output_format; + hw_key.element[i].output_offset = dst_offset; + + dst_offset += emit_sz; + } + + hw_key.nr_elements = vinfo->num_attribs; + hw_key.output_stride = vinfo->size * 4; + + if (!emit->translate || + translate_key_compare(&emit->translate->key, &hw_key) != 0) + { + translate_key_sanitize(&hw_key); + emit->translate = translate_cache_find(emit->cache, &hw_key); + } + + *max_vertices = (draw->render->max_vertex_buffer_bytes / + (vinfo->size * 4)); + + /* even number */ + *max_vertices = *max_vertices & ~1; +} + + +void draw_pt_emit( struct pt_emit *emit, + const float (*vertex_data)[4], + unsigned vertex_count, + unsigned stride, + const ushort *elts, + unsigned count ) +{ + struct draw_context *draw = emit->draw; + struct translate *translate = emit->translate; + struct vbuf_render *render = draw->render; + void *hw_verts; + + /* XXX: need to flush to get prim_vbuf.c to release its allocation?? + */ + draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + + if (vertex_count >= UNDEFINED_VERTEX_ID) { + assert(0); + return; + } + + /* XXX: and work out some way to coordinate the render primitive + * between vbuf.c and here... 
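+    * For now emit->prim (saved off in draw_pt_emit_prepare()) is simply
+    * re-sent on every run, since the backend's idea of the current
+    * primitive may have been clobbered in the meantime.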
+ */ + if (!draw->render->set_primitive(draw->render, emit->prim)) { + assert(0); + return; + } + + hw_verts = render->allocate_vertices(render, + (ushort)translate->key.output_stride, + (ushort)vertex_count); + if (!hw_verts) { + assert(0); + return; + } + + translate->set_buffer(translate, + 0, + vertex_data, + stride ); + + translate->set_buffer(translate, + 1, + &draw->rasterizer->point_size, + 0); + + translate->run( translate, + 0, + vertex_count, + hw_verts ); + + render->draw(render, + elts, + count); + + render->release_vertices(render, + hw_verts, + translate->key.output_stride, + vertex_count); +} + + +void draw_pt_emit_linear(struct pt_emit *emit, + const float (*vertex_data)[4], + unsigned vertex_count, + unsigned stride, + unsigned start, + unsigned count) +{ + struct draw_context *draw = emit->draw; + struct translate *translate = emit->translate; + struct vbuf_render *render = draw->render; + void *hw_verts; + +#if 0 + debug_printf("Linear emit\n"); +#endif + /* XXX: need to flush to get prim_vbuf.c to release its allocation?? + */ + draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + + if (count >= UNDEFINED_VERTEX_ID) { + assert(0); + return; + } + + /* XXX: and work out some way to coordinate the render primitive + * between vbuf.c and here... + */ + if (!draw->render->set_primitive(draw->render, emit->prim)) { + assert(0); + return; + } + + hw_verts = render->allocate_vertices(render, + (ushort)translate->key.output_stride, + (ushort)count); + if (!hw_verts) { + assert(0); + return; + } + + translate->set_buffer(translate, 0, + vertex_data, stride); + + translate->set_buffer(translate, 1, + &draw->rasterizer->point_size, + 0); + + translate->run(translate, + 0, + vertex_count, + hw_verts); + + if (0) { + unsigned i; + for (i = 0; i < vertex_count; i++) { + debug_printf("\n\n%s vertex %d:\n", __FUNCTION__, i); + draw_dump_emitted_vertex( emit->vinfo, + (const uint8_t *)hw_verts + + translate->key.output_stride * i ); + } + } + + + render->draw_arrays(render, start, count); + + render->release_vertices(render, + hw_verts, + translate->key.output_stride, + vertex_count); +} + +struct pt_emit *draw_pt_emit_create( struct draw_context *draw ) +{ + struct pt_emit *emit = CALLOC_STRUCT(pt_emit); + if (!emit) + return NULL; + + emit->draw = draw; + emit->cache = translate_cache_create(); + if (!emit->cache) { + FREE(emit); + return NULL; + } + + return emit; +} + +void draw_pt_emit_destroy( struct pt_emit *emit ) +{ + if (emit->cache) + translate_cache_destroy(emit->cache); + + FREE(emit); +} diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c new file mode 100644 index 0000000000..6377f896fb --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c @@ -0,0 +1,228 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_memory.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_vbuf.h" +#include "draw/draw_vertex.h" +#include "draw/draw_pt.h" +#include "translate/translate.h" +#include "translate/translate_cache.h" + + +struct pt_fetch { + struct draw_context *draw; + + struct translate *translate; + + unsigned vertex_size; + boolean need_edgeflags; + + struct translate_cache *cache; +}; + +/* Perform the fetch from API vertex elements & vertex buffers, to a + * contiguous set of float[4] attributes as required for the + * vertex_shader->run_linear() method. + * + * This is used in all cases except pure passthrough + * (draw_pt_fetch_emit.c) which has its own version to translate + * directly to hw vertices. + * + */ +void draw_pt_fetch_prepare( struct pt_fetch *fetch, + unsigned vertex_size ) +{ + struct draw_context *draw = fetch->draw; + unsigned i, nr = 0; + unsigned dst_offset = 0; + struct translate_key key; + + fetch->vertex_size = vertex_size; + + /* Always emit/leave space for a vertex header. + * + * It's worth considering whether the vertex headers should contain + * a pointer to the 'data', rather than having it inline. + * Something to look at after we've fully switched over to the pt + * paths. + */ + { + /* Need to set header->vertex_id = 0xffff somehow. + */ + key.element[nr].input_format = PIPE_FORMAT_R32_FLOAT; + key.element[nr].input_buffer = draw->pt.nr_vertex_buffers; + key.element[nr].input_offset = 0; + key.element[nr].output_format = PIPE_FORMAT_R32_FLOAT; + key.element[nr].output_offset = dst_offset; + dst_offset += 1 * sizeof(float); + nr++; + + + /* Just leave the clip[] array untouched. 
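+       *
+       * The resulting pipeline vertex thus looks roughly like:
+       *
+       *    float header;      -- stamped so vertex_id == 0xffff, edgeflag == 1
+       *    float clip[4];     -- filled in later by the post-vs stage
+       *    float data[n][4];  -- one float[4] per API vertex element
+       *
+       * and the offsets accumulated here must fit within vertex_size.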
+ */ + dst_offset += 4 * sizeof(float); + } + + + for (i = 0; i < draw->pt.nr_vertex_elements; i++) { + key.element[nr].input_format = draw->pt.vertex_element[i].src_format; + key.element[nr].input_buffer = draw->pt.vertex_element[i].vertex_buffer_index; + key.element[nr].input_offset = draw->pt.vertex_element[i].src_offset; + key.element[nr].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT; + key.element[nr].output_offset = dst_offset; + + dst_offset += 4 * sizeof(float); + nr++; + } + + assert(dst_offset <= vertex_size); + + key.nr_elements = nr; + key.output_stride = vertex_size; + + + if (!fetch->translate || + translate_key_compare(&fetch->translate->key, &key) != 0) + { + translate_key_sanitize(&key); + fetch->translate = translate_cache_find(fetch->cache, &key); + + { + static struct vertex_header vh = { 0, 1, 0, 0xffff }; + fetch->translate->set_buffer(fetch->translate, + draw->pt.nr_vertex_buffers, + &vh, + 0); + } + } + + fetch->need_edgeflags = ((draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL || + draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL) && + draw->pt.user.edgeflag); +} + + + + +void draw_pt_fetch_run( struct pt_fetch *fetch, + const unsigned *elts, + unsigned count, + char *verts ) +{ + struct draw_context *draw = fetch->draw; + struct translate *translate = fetch->translate; + unsigned i; + + for (i = 0; i < draw->pt.nr_vertex_buffers; i++) { + translate->set_buffer(translate, + i, + ((char *)draw->pt.user.vbuffer[i] + + draw->pt.vertex_buffer[i].buffer_offset), + draw->pt.vertex_buffer[i].pitch ); + } + + translate->run_elts( translate, + elts, + count, + verts ); + + /* Edgeflags are hard to fit into a translate program, populate + * them separately if required. In the setup above they are + * defaulted to one, so only need this if there is reason to change + * that default: + */ + if (fetch->need_edgeflags) { + for (i = 0; i < count; i++) { + struct vertex_header *vh = (struct vertex_header *)(verts + i * fetch->vertex_size); + vh->edgeflag = draw_pt_get_edgeflag( draw, elts[i] ); + } + } +} + + +void draw_pt_fetch_run_linear( struct pt_fetch *fetch, + unsigned start, + unsigned count, + char *verts ) +{ + struct draw_context *draw = fetch->draw; + struct translate *translate = fetch->translate; + unsigned i; + + for (i = 0; i < draw->pt.nr_vertex_buffers; i++) { + translate->set_buffer(translate, + i, + ((char *)draw->pt.user.vbuffer[i] + + draw->pt.vertex_buffer[i].buffer_offset), + draw->pt.vertex_buffer[i].pitch ); + } + + translate->run( translate, + start, + count, + verts ); + + /* Edgeflags are hard to fit into a translate program, populate + * them separately if required. 
In the setup above they are + * defaulted to one, so only need this if there is reason to change + * that default: + */ + if (fetch->need_edgeflags) { + for (i = 0; i < count; i++) { + struct vertex_header *vh = (struct vertex_header *)(verts + i * fetch->vertex_size); + vh->edgeflag = draw_pt_get_edgeflag( draw, start + i ); + } + } +} + + +struct pt_fetch *draw_pt_fetch_create( struct draw_context *draw ) +{ + struct pt_fetch *fetch = CALLOC_STRUCT(pt_fetch); + if (!fetch) + return NULL; + + fetch->draw = draw; + fetch->cache = translate_cache_create(); + if (!fetch->cache) { + FREE(fetch); + return NULL; + } + + return fetch; +} + +void draw_pt_fetch_destroy( struct pt_fetch *fetch ) +{ + if (fetch->cache) + translate_cache_destroy(fetch->cache); + + FREE(fetch); +} + diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c new file mode 100644 index 0000000000..0227652632 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c @@ -0,0 +1,426 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "util/u_memory.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_vbuf.h" +#include "draw/draw_vertex.h" +#include "draw/draw_pt.h" +#include "translate/translate.h" +#include "translate/translate_cache.h" + +/* The simplest 'middle end' in the new vertex code. + * + * The responsibilities of a middle end are to: + * - perform vertex fetch using + * - draw vertex element/buffer state + * - a list of fetch indices we received as an input + * - run the vertex shader + * - cliptest, + * - clip coord calculation + * - viewport transformation + * - if necessary, run the primitive pipeline, passing it: + * - a linear array of vertex_header vertices constructed here + * - a set of draw indices we received as an input + * - otherwise, drive the hw backend, + * - allocate space for hardware format vertices + * - translate the vertex-shader output vertices to hw format + * - calling the backend draw functions. 
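+ *
+ * This file implements that last "otherwise" branch; the core of run()
+ * is roughly
+ *
+ *    hw_verts = render->allocate_vertices( render, stride, fetch_count );
+ *    translate->run_elts( translate, fetch_elts, fetch_count, hw_verts );
+ *    render->draw( render, draw_elts, draw_count );
+ *    render->release_vertices( render, hw_verts, stride, fetch_count );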
+ * + * For convenience, we provide a helper function to drive the hardware + * backend given similar inputs to those required to run the pipeline. + * + * In the case of passthrough mode, many of these actions are disabled + * or noops, so we end up doing: + * + * - perform vertex fetch + * - drive the hw backend + * + * IE, basically just vertex fetch to post-vs-format vertices, + * followed by a call to the backend helper function. + */ + + +struct fetch_emit_middle_end { + struct draw_pt_middle_end base; + struct draw_context *draw; + + struct translate *translate; + const struct vertex_info *vinfo; + + /* Cache point size somewhere it's address won't change: + */ + float point_size; + + struct translate_cache *cache; +}; + + + + +static void fetch_emit_prepare( struct draw_pt_middle_end *middle, + unsigned prim, + unsigned opt, + unsigned *max_vertices ) +{ + struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; + struct draw_context *draw = feme->draw; + const struct vertex_info *vinfo; + unsigned i, dst_offset; + boolean ok; + struct translate_key key; + + + ok = draw->render->set_primitive( draw->render, + prim ); + if (!ok) { + assert(0); + return; + } + + /* Must do this after set_primitive() above: + */ + vinfo = feme->vinfo = draw->render->get_vertex_info(draw->render); + + + + /* Transform from API vertices to HW vertices, skipping the + * pipeline_vertex intermediate step. + */ + dst_offset = 0; + memset(&key, 0, sizeof(key)); + + for (i = 0; i < vinfo->num_attribs; i++) { + const struct pipe_vertex_element *src = &draw->pt.vertex_element[vinfo->attrib[i].src_index]; + + unsigned emit_sz = 0; + unsigned input_format = src->src_format; + unsigned input_buffer = src->vertex_buffer_index; + unsigned input_offset = src->src_offset; + unsigned output_format; + + switch (vinfo->attrib[i].emit) { + case EMIT_4F: + output_format = PIPE_FORMAT_R32G32B32A32_FLOAT; + emit_sz = 4 * sizeof(float); + break; + case EMIT_3F: + output_format = PIPE_FORMAT_R32G32B32_FLOAT; + emit_sz = 3 * sizeof(float); + break; + case EMIT_2F: + output_format = PIPE_FORMAT_R32G32_FLOAT; + emit_sz = 2 * sizeof(float); + break; + case EMIT_1F: + output_format = PIPE_FORMAT_R32_FLOAT; + emit_sz = 1 * sizeof(float); + break; + case EMIT_1F_PSIZE: + input_format = PIPE_FORMAT_R32_FLOAT; + input_buffer = draw->pt.nr_vertex_buffers; + input_offset = 0; + output_format = PIPE_FORMAT_R32_FLOAT; + emit_sz = 1 * sizeof(float); + break; + default: + assert(0); + output_format = PIPE_FORMAT_NONE; + emit_sz = 0; + continue; + } + + key.element[i].input_format = input_format; + key.element[i].input_buffer = input_buffer; + key.element[i].input_offset = input_offset; + key.element[i].output_format = output_format; + key.element[i].output_offset = dst_offset; + + dst_offset += emit_sz; + } + + key.nr_elements = vinfo->num_attribs; + key.output_stride = vinfo->size * 4; + + /* Don't bother with caching at this stage: + */ + if (!feme->translate || + translate_key_compare(&feme->translate->key, &key) != 0) + { + translate_key_sanitize(&key); + feme->translate = translate_cache_find(feme->cache, + &key); + + + feme->translate->set_buffer(feme->translate, + draw->pt.nr_vertex_buffers, + &feme->point_size, + 0); + } + + feme->point_size = draw->rasterizer->point_size; + + for (i = 0; i < draw->pt.nr_vertex_buffers; i++) { + feme->translate->set_buffer(feme->translate, + i, + ((char *)draw->pt.user.vbuffer[i] + + draw->pt.vertex_buffer[i].buffer_offset), + draw->pt.vertex_buffer[i].pitch ); + } + + 
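+   /* The backend bounds how many hardware vertices fit in one batch, so
+    * report that limit back to the frontend.  E.g. (illustrative numbers
+    * only) a 512 kb vertex buffer and a 32 byte hw vertex allow at most
+    * 16384 vertices per run.
+    */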
*max_vertices = (draw->render->max_vertex_buffer_bytes / + (vinfo->size * 4)); + + /* Return an even number of verts. + * This prevents "parity" errors when splitting long triangle strips which + * can lead to front/back culling mix-ups. + * Every other triangle in a strip has an alternate front/back orientation + * so splitting at an odd position can cause the orientation of subsequent + * triangles to get reversed. + */ + *max_vertices = *max_vertices & ~1; +} + + + + + +static void fetch_emit_run( struct draw_pt_middle_end *middle, + const unsigned *fetch_elts, + unsigned fetch_count, + const ushort *draw_elts, + unsigned draw_count ) +{ + struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; + struct draw_context *draw = feme->draw; + void *hw_verts; + + /* XXX: need to flush to get prim_vbuf.c to release its allocation?? + */ + draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + + if (fetch_count >= UNDEFINED_VERTEX_ID) { + assert(0); + return; + } + + hw_verts = draw->render->allocate_vertices( draw->render, + (ushort)feme->translate->key.output_stride, + (ushort)fetch_count ); + if (!hw_verts) { + assert(0); + return; + } + + + /* Single routine to fetch vertices and emit HW verts. + */ + feme->translate->run_elts( feme->translate, + fetch_elts, + fetch_count, + hw_verts ); + + if (0) { + unsigned i; + for (i = 0; i < fetch_count; i++) { + debug_printf("\n\nvertex %d:\n", i); + draw_dump_emitted_vertex( feme->vinfo, + (const uint8_t *)hw_verts + feme->vinfo->size * 4 * i ); + } + } + + /* XXX: Draw arrays path to avoid re-emitting index list again and + * again. + */ + draw->render->draw( draw->render, + draw_elts, + draw_count ); + + /* Done -- that was easy, wasn't it: + */ + draw->render->release_vertices( draw->render, + hw_verts, + feme->translate->key.output_stride, + fetch_count ); + +} + + +static void fetch_emit_run_linear( struct draw_pt_middle_end *middle, + unsigned start, + unsigned count ) +{ + struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; + struct draw_context *draw = feme->draw; + void *hw_verts; + + /* XXX: need to flush to get prim_vbuf.c to release its allocation?? + */ + draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + + if (count >= UNDEFINED_VERTEX_ID) { + assert(0); + return; + } + + hw_verts = draw->render->allocate_vertices( draw->render, + (ushort)feme->translate->key.output_stride, + (ushort)count ); + if (!hw_verts) { + assert(0); + return; + } + + /* Single routine to fetch vertices and emit HW verts. + */ + feme->translate->run( feme->translate, + start, + count, + hw_verts ); + + if (0) { + unsigned i; + for (i = 0; i < count; i++) { + debug_printf("\n\nvertex %d:\n", i); + draw_dump_emitted_vertex( feme->vinfo, + (const uint8_t *)hw_verts + feme->vinfo->size * 4 * i ); + } + } + + /* XXX: Draw arrays path to avoid re-emitting index list again and + * again. + */ + draw->render->draw_arrays( draw->render, + 0, /*start*/ + count ); + + /* Done -- that was easy, wasn't it: + */ + draw->render->release_vertices( draw->render, + hw_verts, + feme->translate->key.output_stride, + count ); + +} + + +static boolean fetch_emit_run_linear_elts( struct draw_pt_middle_end *middle, + unsigned start, + unsigned count, + const ushort *draw_elts, + unsigned draw_count ) +{ + struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; + struct draw_context *draw = feme->draw; + void *hw_verts; + + /* XXX: need to flush to get prim_vbuf.c to release its allocation?? 
+ */ + draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + + if (count >= UNDEFINED_VERTEX_ID) + return FALSE; + + hw_verts = draw->render->allocate_vertices( draw->render, + (ushort)feme->translate->key.output_stride, + (ushort)count ); + if (!hw_verts) + return FALSE; + + /* Single routine to fetch vertices and emit HW verts. + */ + feme->translate->run( feme->translate, + start, + count, + hw_verts ); + + /* XXX: Draw arrays path to avoid re-emitting index list again and + * again. + */ + draw->render->draw( draw->render, + draw_elts, + draw_count ); + + /* Done -- that was easy, wasn't it: + */ + draw->render->release_vertices( draw->render, + hw_verts, + feme->translate->key.output_stride, + count ); + + return TRUE; +} + + + + +static void fetch_emit_finish( struct draw_pt_middle_end *middle ) +{ + /* nothing to do */ +} + +static void fetch_emit_destroy( struct draw_pt_middle_end *middle ) +{ + struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle; + + if (feme->cache) + translate_cache_destroy(feme->cache); + + FREE(middle); +} + + +struct draw_pt_middle_end *draw_pt_fetch_emit( struct draw_context *draw ) +{ + struct fetch_emit_middle_end *fetch_emit = CALLOC_STRUCT( fetch_emit_middle_end ); + if (fetch_emit == NULL) + return NULL; + + fetch_emit->cache = translate_cache_create(); + if (!fetch_emit->cache) { + FREE(fetch_emit); + return NULL; + } + + fetch_emit->base.prepare = fetch_emit_prepare; + fetch_emit->base.run = fetch_emit_run; + fetch_emit->base.run_linear = fetch_emit_run_linear; + fetch_emit->base.run_linear_elts = fetch_emit_run_linear_elts; + fetch_emit->base.finish = fetch_emit_finish; + fetch_emit->base.destroy = fetch_emit_destroy; + + fetch_emit->draw = draw; + + return &fetch_emit->base; +} + diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c new file mode 100644 index 0000000000..1649cdc6cd --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c @@ -0,0 +1,422 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_vbuf.h" +#include "draw/draw_vertex.h" +#include "draw/draw_pt.h" +#include "draw/draw_vs.h" + +#include "translate/translate.h" + +struct fetch_shade_emit; + + +/* Prototype fetch, shade, emit-hw-verts all in one go. + */ +struct fetch_shade_emit { + struct draw_pt_middle_end base; + struct draw_context *draw; + + + /* Temporaries: + */ + const float *constants; + unsigned pitch[PIPE_MAX_ATTRIBS]; + const ubyte *src[PIPE_MAX_ATTRIBS]; + unsigned prim; + + struct draw_vs_varient_key key; + struct draw_vs_varient *active; + + + const struct vertex_info *vinfo; +}; + + + + +static void fse_prepare( struct draw_pt_middle_end *middle, + unsigned prim, + unsigned opt, + unsigned *max_vertices ) +{ + struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; + struct draw_context *draw = fse->draw; + unsigned num_vs_inputs = draw->vs.vertex_shader->info.num_inputs; + const struct vertex_info *vinfo; + unsigned i; + unsigned nr_vbs = 0; + + + if (!draw->render->set_primitive( draw->render, + prim )) { + assert(0); + return; + } + + /* Must do this after set_primitive() above: + */ + fse->vinfo = vinfo = draw->render->get_vertex_info(draw->render); + + + + fse->key.output_stride = vinfo->size * 4; + fse->key.nr_outputs = vinfo->num_attribs; + fse->key.nr_inputs = num_vs_inputs; + + fse->key.nr_elements = MAX2(fse->key.nr_outputs, /* outputs - translate to hw format */ + fse->key.nr_inputs); /* inputs - fetch from api format */ + + fse->key.viewport = !draw->identity_viewport; + fse->key.clip = !draw->bypass_clipping; + fse->key.const_vbuffers = 0; + + memset(fse->key.element, 0, + fse->key.nr_elements * sizeof(fse->key.element[0])); + + for (i = 0; i < num_vs_inputs; i++) { + const struct pipe_vertex_element *src = &draw->pt.vertex_element[i]; + fse->key.element[i].in.format = src->src_format; + + /* Consider ignoring these, ie make generated programs + * independent of this state: + */ + fse->key.element[i].in.buffer = src->vertex_buffer_index; + fse->key.element[i].in.offset = src->src_offset; + nr_vbs = MAX2(nr_vbs, src->vertex_buffer_index + 1); + } + + for (i = 0; i < 5 && i < nr_vbs; i++) { + if (draw->pt.vertex_buffer[i].pitch == 0) + fse->key.const_vbuffers |= (1<<i); + } + + if (0) debug_printf("%s: lookup const_vbuffers: %x\n", __FUNCTION__, fse->key.const_vbuffers); + + { + unsigned dst_offset = 0; + + for (i = 0; i < vinfo->num_attribs; i++) { + unsigned emit_sz = 0; + + switch (vinfo->attrib[i].emit) { + case EMIT_4F: + emit_sz = 4 * sizeof(float); + break; + case EMIT_3F: + emit_sz = 3 * sizeof(float); + break; + case EMIT_2F: + emit_sz = 2 * sizeof(float); + break; + case EMIT_1F: + emit_sz = 1 * sizeof(float); + break; + case EMIT_1F_PSIZE: + emit_sz = 1 * sizeof(float); + break; + case EMIT_4UB: + emit_sz = 4 * sizeof(ubyte); + break; + default: + assert(0); + break; + } + + /* The elements in the key correspond to vertex shader output + * numbers, not to positions in the hw vertex description -- + * that's handled by the output_offset field. 
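+       *
+       * For example (purely illustrative), a two-attribute hw vertex of
+       * {position EMIT_4F, color EMIT_4UB} would end up as:
+       *
+       *    element[0].out = { format=EMIT_4F,  vs_output=0, offset=0  }
+       *    element[1].out = { format=EMIT_4UB, vs_output=1, offset=16 }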
+ */ + fse->key.element[i].out.format = vinfo->attrib[i].emit; + fse->key.element[i].out.vs_output = vinfo->attrib[i].src_index; + fse->key.element[i].out.offset = dst_offset; + + dst_offset += emit_sz; + assert(fse->key.output_stride >= dst_offset); + } + } + + + fse->active = draw_vs_lookup_varient( draw->vs.vertex_shader, + &fse->key ); + + if (!fse->active) { + assert(0); + return ; + } + + if (0) debug_printf("%s: found const_vbuffers: %x\n", __FUNCTION__, + fse->active->key.const_vbuffers); + + /* Now set buffer pointers: + */ + for (i = 0; i < draw->pt.nr_vertex_buffers; i++) { + fse->active->set_buffer( fse->active, + i, + ((const ubyte *) draw->pt.user.vbuffer[i] + + draw->pt.vertex_buffer[i].buffer_offset), + draw->pt.vertex_buffer[i].pitch ); + } + + *max_vertices = (draw->render->max_vertex_buffer_bytes / + (vinfo->size * 4)); + + /* Return an even number of verts. + * This prevents "parity" errors when splitting long triangle strips which + * can lead to front/back culling mix-ups. + * Every other triangle in a strip has an alternate front/back orientation + * so splitting at an odd position can cause the orientation of subsequent + * triangles to get reversed. + */ + *max_vertices = *max_vertices & ~1; + + /* Probably need to do this somewhere (or fix exec shader not to + * need it): + */ + if (1) { + struct draw_vertex_shader *vs = draw->vs.vertex_shader; + vs->prepare(vs, draw); + } + + + //return TRUE; +} + + + + + + + +static void fse_run_linear( struct draw_pt_middle_end *middle, + unsigned start, + unsigned count ) +{ + struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; + struct draw_context *draw = fse->draw; + char *hw_verts; + + /* XXX: need to flush to get prim_vbuf.c to release its allocation?? + */ + draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + + if (count >= UNDEFINED_VERTEX_ID) { + assert(0); + return; + } + + hw_verts = draw->render->allocate_vertices( draw->render, + (ushort)fse->key.output_stride, + (ushort)count ); + + if (!hw_verts) { + assert(0); + return; + } + + /* Single routine to fetch vertices, run shader and emit HW verts. + * Clipping is done elsewhere -- either by the API or on hardware, + * or for some other reason not required... + */ + fse->active->run_linear( fse->active, + start, count, + hw_verts ); + + /* Draw arrays path to avoid re-emitting index list again and + * again. + */ + draw->render->draw_arrays( draw->render, + 0, + count ); + + if (0) { + unsigned i; + for (i = 0; i < count; i++) { + debug_printf("\n\n%s vertex %d: (stride %d, offset %d)\n", __FUNCTION__, i, + fse->key.output_stride, + fse->key.output_stride * i); + + draw_dump_emitted_vertex( fse->vinfo, + (const uint8_t *)hw_verts + fse->key.output_stride * i ); + } + } + + + draw->render->release_vertices( draw->render, + hw_verts, + fse->key.output_stride, + count ); +} + + +static void +fse_run(struct draw_pt_middle_end *middle, + const unsigned *fetch_elts, + unsigned fetch_count, + const ushort *draw_elts, + unsigned draw_count ) +{ + struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; + struct draw_context *draw = fse->draw; + void *hw_verts; + + /* XXX: need to flush to get prim_vbuf.c to release its allocation?? 
+ */ + draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + + if (fetch_count >= UNDEFINED_VERTEX_ID) { + assert(0); + return; + } + + hw_verts = draw->render->allocate_vertices( draw->render, + (ushort)fse->key.output_stride, + (ushort)fetch_count ); + if (!hw_verts) { + assert(0); + return; + } + + + /* Single routine to fetch vertices, run shader and emit HW verts. + */ + fse->active->run_elts( fse->active, + fetch_elts, + fetch_count, + hw_verts ); + + draw->render->draw( draw->render, + draw_elts, + draw_count ); + + if (0) { + unsigned i; + for (i = 0; i < fetch_count; i++) { + debug_printf("\n\n%s vertex %d:\n", __FUNCTION__, i); + draw_dump_emitted_vertex( fse->vinfo, + (const uint8_t *)hw_verts + + fse->key.output_stride * i ); + } + } + + + draw->render->release_vertices( draw->render, + hw_verts, + fse->key.output_stride, + fetch_count ); + +} + + + +static boolean fse_run_linear_elts( struct draw_pt_middle_end *middle, + unsigned start, + unsigned count, + const ushort *draw_elts, + unsigned draw_count ) +{ + struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle; + struct draw_context *draw = fse->draw; + char *hw_verts; + + /* XXX: need to flush to get prim_vbuf.c to release its allocation?? + */ + draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + + if (count >= UNDEFINED_VERTEX_ID) + return FALSE; + + hw_verts = draw->render->allocate_vertices( draw->render, + (ushort)fse->key.output_stride, + (ushort)count ); + + if (!hw_verts) { + return FALSE; + } + + /* Single routine to fetch vertices, run shader and emit HW verts. + * Clipping is done elsewhere -- either by the API or on hardware, + * or for some other reason not required... + */ + fse->active->run_linear( fse->active, + start, count, + hw_verts ); + + + draw->render->draw( draw->render, + draw_elts, + draw_count ); + + + + draw->render->release_vertices( draw->render, + hw_verts, + fse->key.output_stride, + count ); + + return TRUE; +} + + + +static void fse_finish( struct draw_pt_middle_end *middle ) +{ +} + + +static void +fse_destroy( struct draw_pt_middle_end *middle ) +{ + FREE(middle); +} + +struct draw_pt_middle_end *draw_pt_middle_fse( struct draw_context *draw ) +{ + struct fetch_shade_emit *fse = CALLOC_STRUCT(fetch_shade_emit); + if (!fse) + return NULL; + + fse->base.prepare = fse_prepare; + fse->base.run = fse_run; + fse->base.run_linear = fse_run_linear; + fse->base.run_linear_elts = fse_run_linear_elts; + fse->base.finish = fse_finish; + fse->base.destroy = fse_destroy; + fse->draw = draw; + + return &fse->base; +} diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c new file mode 100644 index 0000000000..ec3b41c320 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c @@ -0,0 +1,396 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "draw/draw_context.h" +#include "draw/draw_vbuf.h" +#include "draw/draw_vertex.h" +#include "draw/draw_pt.h" +#include "draw/draw_vs.h" +#include "translate/translate.h" + + +struct fetch_pipeline_middle_end { + struct draw_pt_middle_end base; + struct draw_context *draw; + + struct pt_emit *emit; + struct pt_fetch *fetch; + struct pt_post_vs *post_vs; + + unsigned vertex_data_offset; + unsigned vertex_size; + unsigned prim; + unsigned opt; +}; + + +static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle, + unsigned prim, + unsigned opt, + unsigned *max_vertices ) +{ + struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; + struct draw_context *draw = fpme->draw; + struct draw_vertex_shader *vs = draw->vs.vertex_shader; + + /* Add one to num_outputs because the pipeline occasionally tags on + * an additional texcoord, eg for AA lines. + */ + unsigned nr = MAX2( vs->info.num_inputs, + vs->info.num_outputs + 1 ); + + fpme->prim = prim; + fpme->opt = opt; + + /* Always leave room for the vertex header whether we need it or + * not. It's hard to get rid of it in particular because of the + * viewport code in draw_pt_post_vs.c. + */ + fpme->vertex_size = sizeof(struct vertex_header) + nr * 4 * sizeof(float); + + + + draw_pt_fetch_prepare( fpme->fetch, + fpme->vertex_size ); + + /* XXX: it's not really gl rasterization rules we care about here, + * but gl vs dx9 clip spaces. + */ + draw_pt_post_vs_prepare( fpme->post_vs, + (boolean)draw->bypass_clipping, + (boolean)draw->identity_viewport, + (boolean)draw->rasterizer->gl_rasterization_rules ); + + + if (!(opt & PT_PIPELINE)) { + draw_pt_emit_prepare( fpme->emit, + prim, + max_vertices ); + + *max_vertices = MAX2( *max_vertices, + DRAW_PIPE_MAX_VERTICES ); + } + else { + *max_vertices = DRAW_PIPE_MAX_VERTICES; + } + + /* return even number */ + *max_vertices = *max_vertices & ~1; + + /* No need to prepare the shader. 
+ */ + vs->prepare(vs, draw); +} + + + +static void fetch_pipeline_run( struct draw_pt_middle_end *middle, + const unsigned *fetch_elts, + unsigned fetch_count, + const ushort *draw_elts, + unsigned draw_count ) +{ + struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; + struct draw_context *draw = fpme->draw; + struct draw_vertex_shader *shader = draw->vs.vertex_shader; + unsigned opt = fpme->opt; + unsigned alloc_count = align( fetch_count, 4 ); + + struct vertex_header *pipeline_verts = + (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count); + + if (!pipeline_verts) { + /* Not much we can do here - just skip the rendering. + */ + assert(0); + return; + } + + /* Fetch into our vertex buffer + */ + draw_pt_fetch_run( fpme->fetch, + fetch_elts, + fetch_count, + (char *)pipeline_verts ); + + /* Run the shader, note that this overwrites the data[] parts of + * the pipeline verts. If there is no shader, ie a bypass shader, + * then the inputs == outputs, and are already in the correct + * place. + */ + if (opt & PT_SHADE) + { + shader->run_linear(shader, + (const float (*)[4])pipeline_verts->data, + ( float (*)[4])pipeline_verts->data, + (const float (*)[4])draw->pt.user.constants, + fetch_count, + fpme->vertex_size, + fpme->vertex_size); + } + + if (draw_pt_post_vs_run( fpme->post_vs, + pipeline_verts, + fetch_count, + fpme->vertex_size )) + { + opt |= PT_PIPELINE; + } + + /* Do we need to run the pipeline? + */ + if (opt & PT_PIPELINE) { + draw_pipeline_run( fpme->draw, + fpme->prim, + pipeline_verts, + fetch_count, + fpme->vertex_size, + draw_elts, + draw_count ); + } + else { + draw_pt_emit( fpme->emit, + (const float (*)[4])pipeline_verts->data, + fetch_count, + fpme->vertex_size, + draw_elts, + draw_count ); + } + + + FREE(pipeline_verts); +} + + +static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle, + unsigned start, + unsigned count) +{ + struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; + struct draw_context *draw = fpme->draw; + struct draw_vertex_shader *shader = draw->vs.vertex_shader; + unsigned opt = fpme->opt; + unsigned alloc_count = align( count, 4 ); + + struct vertex_header *pipeline_verts = + (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count); + + if (!pipeline_verts) { + /* Not much we can do here - just skip the rendering. + */ + assert(0); + return; + } + + /* Fetch into our vertex buffer + */ + draw_pt_fetch_run_linear( fpme->fetch, + start, + count, + (char *)pipeline_verts ); + + /* Run the shader, note that this overwrites the data[] parts of + * the pipeline verts. If there is no shader, ie a bypass shader, + * then the inputs == outputs, and are already in the correct + * place. + */ + if (opt & PT_SHADE) + { + shader->run_linear(shader, + (const float (*)[4])pipeline_verts->data, + ( float (*)[4])pipeline_verts->data, + (const float (*)[4])draw->pt.user.constants, + count, + fpme->vertex_size, + fpme->vertex_size); + } + + if (draw_pt_post_vs_run( fpme->post_vs, + pipeline_verts, + count, + fpme->vertex_size )) + { + opt |= PT_PIPELINE; + } + + /* Do we need to run the pipeline? 
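+    *
+    * Note PT_PIPELINE may also have been OR'd in just above when
+    * draw_pt_post_vs_run() reported clipped vertices, so even a
+    * nominally backend-only path can drop back to the software
+    * pipeline here.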
+ */ + if (opt & PT_PIPELINE) { + draw_pipeline_run_linear( fpme->draw, + fpme->prim, + pipeline_verts, + count, + fpme->vertex_size); + } + else { + draw_pt_emit_linear( fpme->emit, + (const float (*)[4])pipeline_verts->data, + count, + fpme->vertex_size, + 0, /*start*/ + count ); + } + + FREE(pipeline_verts); +} + + + +static boolean fetch_pipeline_linear_run_elts( struct draw_pt_middle_end *middle, + unsigned start, + unsigned count, + const ushort *draw_elts, + unsigned draw_count ) +{ + struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; + struct draw_context *draw = fpme->draw; + struct draw_vertex_shader *shader = draw->vs.vertex_shader; + unsigned opt = fpme->opt; + unsigned alloc_count = align( count, 4 ); + + struct vertex_header *pipeline_verts = + (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count); + + if (!pipeline_verts) + return FALSE; + + /* Fetch into our vertex buffer + */ + draw_pt_fetch_run_linear( fpme->fetch, + start, + count, + (char *)pipeline_verts ); + + /* Run the shader, note that this overwrites the data[] parts of + * the pipeline verts. If there is no shader, ie a bypass shader, + * then the inputs == outputs, and are already in the correct + * place. + */ + if (opt & PT_SHADE) + { + shader->run_linear(shader, + (const float (*)[4])pipeline_verts->data, + ( float (*)[4])pipeline_verts->data, + (const float (*)[4])draw->pt.user.constants, + count, + fpme->vertex_size, + fpme->vertex_size); + } + + if (draw_pt_post_vs_run( fpme->post_vs, + pipeline_verts, + count, + fpme->vertex_size )) + { + opt |= PT_PIPELINE; + } + + /* Do we need to run the pipeline? + */ + if (opt & PT_PIPELINE) { + draw_pipeline_run( fpme->draw, + fpme->prim, + pipeline_verts, + count, + fpme->vertex_size, + draw_elts, + draw_count ); + } + else { + draw_pt_emit( fpme->emit, + (const float (*)[4])pipeline_verts->data, + count, + fpme->vertex_size, + draw_elts, + draw_count ); + } + + FREE(pipeline_verts); + return TRUE; +} + + + +static void fetch_pipeline_finish( struct draw_pt_middle_end *middle ) +{ + /* nothing to do */ +} + +static void fetch_pipeline_destroy( struct draw_pt_middle_end *middle ) +{ + struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle; + + if (fpme->fetch) + draw_pt_fetch_destroy( fpme->fetch ); + + if (fpme->emit) + draw_pt_emit_destroy( fpme->emit ); + + if (fpme->post_vs) + draw_pt_post_vs_destroy( fpme->post_vs ); + + FREE(middle); +} + + +struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit( struct draw_context *draw ) +{ + struct fetch_pipeline_middle_end *fpme = CALLOC_STRUCT( fetch_pipeline_middle_end ); + if (!fpme) + goto fail; + + fpme->base.prepare = fetch_pipeline_prepare; + fpme->base.run = fetch_pipeline_run; + fpme->base.run_linear = fetch_pipeline_linear_run; + fpme->base.run_linear_elts = fetch_pipeline_linear_run_elts; + fpme->base.finish = fetch_pipeline_finish; + fpme->base.destroy = fetch_pipeline_destroy; + + fpme->draw = draw; + + fpme->fetch = draw_pt_fetch_create( draw ); + if (!fpme->fetch) + goto fail; + + fpme->post_vs = draw_pt_post_vs_create( draw ); + if (!fpme->post_vs) + goto fail; + + fpme->emit = draw_pt_emit_create( draw ); + if (!fpme->emit) + goto fail; + + return &fpme->base; + + fail: + if (fpme) + fetch_pipeline_destroy( &fpme->base ); + + return NULL; +} diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c new file mode 100644 index 0000000000..96dc706b99 --- /dev/null +++ 
b/src/gallium/auxiliary/draw/draw_pt_post_vs.c @@ -0,0 +1,233 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_memory.h" +#include "pipe/p_context.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_vbuf.h" +#include "draw/draw_vertex.h" +#include "draw/draw_pt.h" + +struct pt_post_vs { + struct draw_context *draw; + + boolean (*run)( struct pt_post_vs *pvs, + struct vertex_header *vertices, + unsigned count, + unsigned stride ); +}; + + + +static INLINE float +dot4(const float *a, const float *b) +{ + return (a[0]*b[0] + + a[1]*b[1] + + a[2]*b[2] + + a[3]*b[3]); +} + + + +static INLINE unsigned +compute_clipmask_gl(const float *clip, /*const*/ float plane[][4], unsigned nr) +{ + unsigned mask = 0x0; + unsigned i; + +#if 0 + debug_printf("compute clipmask %f %f %f %f\n", + clip[0], clip[1], clip[2], clip[3]); + assert(clip[3] != 0.0); +#endif + + /* Do the hardwired planes first: + */ + if (-clip[0] + clip[3] < 0) mask |= (1<<0); + if ( clip[0] + clip[3] < 0) mask |= (1<<1); + if (-clip[1] + clip[3] < 0) mask |= (1<<2); + if ( clip[1] + clip[3] < 0) mask |= (1<<3); + if ( clip[2] + clip[3] < 0) mask |= (1<<4); /* match mesa clipplane numbering - for now */ + if (-clip[2] + clip[3] < 0) mask |= (1<<5); /* match mesa clipplane numbering - for now */ + + /* Followed by any remaining ones: + */ + for (i = 6; i < nr; i++) { + if (dot4(clip, plane[i]) < 0) + mask |= (1<<i); + } + + return mask; +} + + +/* The normal case - cliptest, rhw divide, viewport transform. 
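+ *
+ * Per vertex this does roughly:
+ *
+ *    mask = clip-plane bits, set when x, y or z falls outside [-w, w]
+ *    if (mask == 0) {
+ *       screen.xyz = clip.xyz / clip.w * viewport.scale + viewport.translate
+ *       pos.w      = 1 / clip.w
+ *    }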
+ * + * Also handle identity viewport here at the expense of a few wasted + * instructions + */ +static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs, + struct vertex_header *vertices, + unsigned count, + unsigned stride ) +{ + struct vertex_header *out = vertices; + const float *scale = pvs->draw->viewport.scale; + const float *trans = pvs->draw->viewport.translate; + const unsigned pos = pvs->draw->vs.position_output; + unsigned clipped = 0; + unsigned j; + + if (0) debug_printf("%s\n"); + + for (j = 0; j < count; j++) { + float *position = out->data[pos]; + + out->clip[0] = position[0]; + out->clip[1] = position[1]; + out->clip[2] = position[2]; + out->clip[3] = position[3]; + + out->vertex_id = 0xffff; + out->clipmask = compute_clipmask_gl(out->clip, + pvs->draw->plane, + pvs->draw->nr_planes); + clipped += out->clipmask; + + if (out->clipmask == 0) + { + /* divide by w */ + float w = 1.0f / position[3]; + + /* Viewport mapping */ + position[0] = position[0] * w * scale[0] + trans[0]; + position[1] = position[1] * w * scale[1] + trans[1]; + position[2] = position[2] * w * scale[2] + trans[2]; + position[3] = w; +#if 0 + debug_printf("post viewport: %f %f %f %f\n", + position[0], + position[1], + position[2], + position[3]); +#endif + } + + out = (struct vertex_header *)( (char *)out + stride ); + } + + return clipped != 0; +} + + + +/* If bypass_clipping is set, skip cliptest and rhw divide. + */ +static boolean post_vs_viewport( struct pt_post_vs *pvs, + struct vertex_header *vertices, + unsigned count, + unsigned stride ) +{ + struct vertex_header *out = vertices; + const float *scale = pvs->draw->viewport.scale; + const float *trans = pvs->draw->viewport.translate; + const unsigned pos = pvs->draw->vs.position_output; + unsigned j; + + if (0) debug_printf("%s\n", __FUNCTION__); + for (j = 0; j < count; j++) { + float *position = out->data[pos]; + + /* Viewport mapping only, no cliptest/rhw divide + */ + position[0] = position[0] * scale[0] + trans[0]; + position[1] = position[1] * scale[1] + trans[1]; + position[2] = position[2] * scale[2] + trans[2]; + + out = (struct vertex_header *)((char *)out + stride); + } + + return FALSE; +} + + +/* If bypass_clipping is set and we have an identity viewport, nothing + * to do. 
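+ *
+ * draw_pt_post_vs_prepare() below chooses between the three variants
+ * (cliptest + viewport, viewport only, or this no-op) based on the
+ * bypass_clipping and identity_viewport flags.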
+ */ +static boolean post_vs_none( struct pt_post_vs *pvs, + struct vertex_header *vertices, + unsigned count, + unsigned stride ) +{ + if (0) debug_printf("%s\n", __FUNCTION__); + return FALSE; +} + +boolean draw_pt_post_vs_run( struct pt_post_vs *pvs, + struct vertex_header *pipeline_verts, + unsigned count, + unsigned stride ) +{ + return pvs->run( pvs, pipeline_verts, count, stride ); +} + + +void draw_pt_post_vs_prepare( struct pt_post_vs *pvs, + boolean bypass_clipping, + boolean identity_viewport, + boolean opengl ) +{ + if (bypass_clipping) { + if (identity_viewport) + pvs->run = post_vs_none; + else + pvs->run = post_vs_viewport; + } + else { + //if (opengl) + pvs->run = post_vs_cliptest_viewport_gl; + } +} + + +struct pt_post_vs *draw_pt_post_vs_create( struct draw_context *draw ) +{ + struct pt_post_vs *pvs = CALLOC_STRUCT( pt_post_vs ); + if (!pvs) + return NULL; + + pvs->draw = draw; + + return pvs; +} + +void draw_pt_post_vs_destroy( struct pt_post_vs *pvs ) +{ + FREE(pvs); +} diff --git a/src/gallium/auxiliary/draw/draw_pt_util.c b/src/gallium/auxiliary/draw/draw_pt_util.c new file mode 100644 index 0000000000..3bc7939c55 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_util.c @@ -0,0 +1,102 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_pt.h" + +void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr) +{ + switch (prim) { + case PIPE_PRIM_POINTS: + *first = 1; + *incr = 1; + break; + case PIPE_PRIM_LINES: + *first = 2; + *incr = 2; + break; + case PIPE_PRIM_LINE_STRIP: + case PIPE_PRIM_LINE_LOOP: + *first = 2; + *incr = 1; + break; + case PIPE_PRIM_TRIANGLES: + *first = 3; + *incr = 3; + break; + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_TRIANGLE_FAN: + case PIPE_PRIM_POLYGON: + *first = 3; + *incr = 1; + break; + case PIPE_PRIM_QUADS: + *first = 4; + *incr = 4; + break; + case PIPE_PRIM_QUAD_STRIP: + *first = 4; + *incr = 2; + break; + default: + assert(0); + *first = 0; + *incr = 1; /* set to one so that count % incr works */ + break; + } +} + + +unsigned draw_pt_reduced_prim(unsigned prim) +{ + switch (prim) { + case PIPE_PRIM_POINTS: + return PIPE_PRIM_POINTS; + case PIPE_PRIM_LINES: + case PIPE_PRIM_LINE_STRIP: + case PIPE_PRIM_LINE_LOOP: + return PIPE_PRIM_LINES; + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_TRIANGLE_FAN: + case PIPE_PRIM_POLYGON: + case PIPE_PRIM_QUADS: + case PIPE_PRIM_QUAD_STRIP: + return PIPE_PRIM_TRIANGLES; + default: + assert(0); + return PIPE_PRIM_POINTS; + } +} + + diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c new file mode 100644 index 0000000000..c15afe65f1 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_varray.c @@ -0,0 +1,193 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
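/*
 * Illustrative sketch (not part of the patch): how the first/incr pair
 * returned by draw_pt_split_prim() above is typically used.  'first' is
 * the vertex count of the smallest legal primitive and 'incr' is how many
 * extra vertices each additional primitive costs.  So a TRIANGLE_STRIP
 * (first = 3, incr = 1) with count = 10 holds 1 + (10 - 3) / 1 = 8
 * triangles, while LINES (first = 2, incr = 2) with count = 7 trims down
 * to 3 lines, i.e. 6 usable vertices.
 */
static unsigned
prims_for_count_example(unsigned count, unsigned first, unsigned incr)
{
   if (count < first)
      return 0;
   return 1 + (count - first) / incr;
}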
+ * + **************************************************************************/ + +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_pt.h" + +#define FETCH_MAX 256 +#define DRAW_MAX (FETCH_MAX+8) + +struct varray_frontend { + struct draw_pt_front_end base; + struct draw_context *draw; + + ushort draw_elts[DRAW_MAX]; + unsigned fetch_elts[FETCH_MAX]; + + unsigned driver_fetch_max; + unsigned fetch_max; + + struct draw_pt_middle_end *middle; + + unsigned input_prim; + unsigned output_prim; +}; + + +static void varray_flush_linear(struct varray_frontend *varray, + unsigned start, unsigned count) +{ + if (count) { + assert(varray->middle->run_linear); + varray->middle->run_linear(varray->middle, start, count); + } +} + +static void varray_line_loop_segment(struct varray_frontend *varray, + unsigned start, + unsigned segment_start, + unsigned segment_count, + boolean end ) +{ + assert(segment_count+1 < varray->fetch_max); + if (segment_count >= 1) { + unsigned nr = 0, i; + + for (i = 0; i < segment_count; i++) + varray->fetch_elts[nr++] = start + segment_start + i; + + if (end) + varray->fetch_elts[nr++] = start; + + assert(nr < FETCH_MAX); + + varray->middle->run(varray->middle, + varray->fetch_elts, + nr, + varray->draw_elts, /* ie. linear */ + nr); + } +} + + + +static void varray_fan_segment(struct varray_frontend *varray, + unsigned start, + unsigned segment_start, + unsigned segment_count ) +{ + assert(segment_count+1 < varray->fetch_max); + if (segment_count >= 2) { + unsigned nr = 0, i; + + if (segment_start != 0) + varray->fetch_elts[nr++] = start; + + for (i = 0 ; i < segment_count; i++) + varray->fetch_elts[nr++] = start + segment_start + i; + + assert(nr < FETCH_MAX); + + varray->middle->run(varray->middle, + varray->fetch_elts, + nr, + varray->draw_elts, /* ie. 
linear */ + nr); + } +} + + + + +#define FUNC varray_run +#include "draw_pt_varray_tmp_linear.h" + +static unsigned decompose_prim[PIPE_PRIM_POLYGON + 1] = { + PIPE_PRIM_POINTS, + PIPE_PRIM_LINES, + PIPE_PRIM_LINE_STRIP, /* decomposed LINELOOP */ + PIPE_PRIM_LINE_STRIP, + PIPE_PRIM_TRIANGLES, + PIPE_PRIM_TRIANGLE_STRIP, + PIPE_PRIM_TRIANGLE_FAN, + PIPE_PRIM_QUADS, + PIPE_PRIM_QUAD_STRIP, + PIPE_PRIM_POLYGON +}; + + + +static void varray_prepare(struct draw_pt_front_end *frontend, + unsigned prim, + struct draw_pt_middle_end *middle, + unsigned opt) +{ + struct varray_frontend *varray = (struct varray_frontend *)frontend; + + varray->base.run = varray_run; + + varray->input_prim = prim; + varray->output_prim = decompose_prim[prim]; + + varray->middle = middle; + middle->prepare(middle, varray->output_prim, opt, &varray->driver_fetch_max ); + + /* check that the max is even */ + assert((varray->driver_fetch_max & 1) == 0); + + varray->fetch_max = MIN2(FETCH_MAX, varray->driver_fetch_max); +} + + + + +static void varray_finish(struct draw_pt_front_end *frontend) +{ + struct varray_frontend *varray = (struct varray_frontend *)frontend; + varray->middle->finish(varray->middle); + varray->middle = NULL; +} + +static void varray_destroy(struct draw_pt_front_end *frontend) +{ + FREE(frontend); +} + + +struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw) +{ + ushort i; + struct varray_frontend *varray = CALLOC_STRUCT(varray_frontend); + if (varray == NULL) + return NULL; + + varray->base.prepare = varray_prepare; + varray->base.run = NULL; + varray->base.finish = varray_finish; + varray->base.destroy = varray_destroy; + varray->draw = draw; + + for (i = 0; i < DRAW_MAX; i++) { + varray->draw_elts[i] = i; + } + + return &varray->base; +} diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h new file mode 100644 index 0000000000..7c722457c3 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h @@ -0,0 +1,238 @@ + +static void FUNC(struct draw_pt_front_end *frontend, + pt_elt_func get_elt, + const void *elts, + unsigned count) +{ + struct varray_frontend *varray = (struct varray_frontend *)frontend; + struct draw_context *draw = varray->draw; + unsigned start = (unsigned)elts; + + boolean flatfirst = (draw->rasterizer->flatshade && + draw->rasterizer->flatshade_first); + unsigned i, j; + ushort flags; + unsigned first, incr; + + varray->fetch_start = start; + + draw_pt_split_prim(varray->input_prim, &first, &incr); + +#if 0 + debug_printf("%s (%d) %d/%d\n", __FUNCTION__, + varray->input_prim, + start, count); +#endif + + switch (varray->input_prim) { + case PIPE_PRIM_POINTS: + for (j = 0; j + first <= count; j += i) { + unsigned end = MIN2(FETCH_MAX, count - j); + end -= (end % incr); + for (i = 0; i < end; i++) { + POINT(varray, i + 0); + } + i = end; + fetch_init(varray, end); + varray_flush(varray); + } + break; + + case PIPE_PRIM_LINES: + for (j = 0; j + first <= count; j += i) { + unsigned end = MIN2(FETCH_MAX, count - j); + end -= (end % incr); + for (i = 0; i+1 < end; i += 2) { + LINE(varray, DRAW_PIPE_RESET_STIPPLE, + i + 0, i + 1); + } + i = end; + fetch_init(varray, end); + varray_flush(varray); + } + break; + + case PIPE_PRIM_LINE_LOOP: + if (count >= 2) { + flags = DRAW_PIPE_RESET_STIPPLE; + + for (j = 0; j + first <= count; j += i) { + unsigned end = MIN2(FETCH_MAX, count - j); + end -= (end % incr); + for (i = 1; i < end; i++, flags = 0) { + LINE(varray, flags, i - 1, i); + } + LINE(varray, flags, i - 
1, 0); + i = end; + fetch_init(varray, end); + varray_flush(varray); + } + } + break; + + case PIPE_PRIM_LINE_STRIP: + flags = DRAW_PIPE_RESET_STIPPLE; + for (j = 0; j + first <= count; j += i) { + unsigned end = MIN2(FETCH_MAX, count - j); + end -= (end % incr); + for (i = 1; i < end; i++, flags = 0) { + LINE(varray, flags, i - 1, i); + } + i = end; + fetch_init(varray, end); + varray_flush(varray); + } + break; + + case PIPE_PRIM_TRIANGLES: + for (j = 0; j + first <= count; j += i) { + unsigned end = MIN2(FETCH_MAX, count - j); + end -= (end % incr); + for (i = 0; i+2 < end; i += 3) { + TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, + i + 0, i + 1, i + 2); + } + i = end; + fetch_init(varray, end); + varray_flush(varray); + } + break; + + case PIPE_PRIM_TRIANGLE_STRIP: + if (flatfirst) { + for (j = 0; j + first <= count; j += i) { + unsigned end = MIN2(FETCH_MAX, count - j); + end -= (end % incr); + for (i = 0; i+2 < end; i++) { + TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, + i + 0, i + 1 + (i&1), i + 2 - (i&1)); + } + i = end; + fetch_init(varray, end); + varray_flush(varray); + if (j + first + i <= count) { + varray->fetch_start -= 2; + i -= 2; + } + } + } + else { + for (j = 0; j + first <= count; j += i) { + unsigned end = MIN2(FETCH_MAX, count - j); + end -= (end % incr); + for (i = 0; i + 2 < end; i++) { + TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, + i + 0 + (i&1), i + 1 - (i&1), i + 2); + } + i = end; + fetch_init(varray, end); + varray_flush(varray); + if (j + first + i <= count) { + varray->fetch_start -= 2; + i -= 2; + } + } + } + break; + + case PIPE_PRIM_TRIANGLE_FAN: + if (count >= 3) { + if (flatfirst) { + flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; + for (j = 0; j + first <= count; j += i) { + unsigned end = MIN2(FETCH_MAX, count - j); + end -= (end % incr); + for (i = 0; i+2 < end; i++) { + TRIANGLE(varray, flags, i + 1, i + 2, 0); + } + i = end; + fetch_init(varray, end); + varray_flush(varray); + } + } + else { + flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL; + for (j = 0; j + first <= count; j += i) { + unsigned end = MIN2(FETCH_MAX, count - j); + end -= (end % incr); + for (i = 0; i+2 < end; i++) { + TRIANGLE(varray, flags, 0, i + 1, i + 2); + } + i = end; + fetch_init(varray, end); + varray_flush(varray); + } + } + } + break; + + case PIPE_PRIM_QUADS: + for (j = 0; j + first <= count; j += i) { + unsigned end = MIN2(FETCH_MAX, count - j); + end -= (end % incr); + for (i = 0; i+3 < end; i += 4) { + QUAD(varray, i + 0, i + 1, i + 2, i + 3); + } + i = end; + fetch_init(varray, end); + varray_flush(varray); + } + break; + + case PIPE_PRIM_QUAD_STRIP: + for (j = 0; j + first <= count; j += i) { + unsigned end = MIN2(FETCH_MAX, count - j); + end -= (end % incr); + for (i = 0; i+3 < end; i += 2) { + QUAD(varray, i + 2, i + 0, i + 1, i + 3); + } + i = end; + fetch_init(varray, end); + varray_flush(varray); + if (j + first + i <= count) { + varray->fetch_start -= 2; + i -= 2; + } + } + break; + + case PIPE_PRIM_POLYGON: + { + /* These bitflags look a little odd because we submit the + * vertices as (1,2,0) to satisfy flatshade requirements. 
+ */ + const ushort edge_first = DRAW_PIPE_EDGE_FLAG_2; + const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0; + const ushort edge_last = DRAW_PIPE_EDGE_FLAG_1; + + flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle; + for (j = 0; j + first <= count; j += i) { + unsigned end = MIN2(FETCH_MAX, count - j); + end -= (end % incr); + for (i = 0; i+2 < end; i++, flags = edge_middle) { + + if (i + 3 == count) + flags |= edge_last; + + TRIANGLE(varray, flags, i + 1, i + 2, 0); + } + i = end; + fetch_init(varray, end); + varray_flush(varray); + } + } + break; + + default: + assert(0); + break; + } + + varray_flush(varray); +} + +#undef TRIANGLE +#undef QUAD +#undef POINT +#undef LINE +#undef FUNC diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h new file mode 100644 index 0000000000..55a8e6521d --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h @@ -0,0 +1,91 @@ +static unsigned trim( unsigned count, unsigned first, unsigned incr ) +{ + return count - (count - first) % incr; +} + +static void FUNC(struct draw_pt_front_end *frontend, + pt_elt_func get_elt, + const void *elts, + unsigned count) +{ + struct varray_frontend *varray = (struct varray_frontend *)frontend; + unsigned start = (unsigned)elts; + + unsigned j; + unsigned first, incr; + + draw_pt_split_prim(varray->input_prim, &first, &incr); + + /* Sanitize primitive length: + */ + count = trim(count, first, incr); + if (count < first) + return; + +#if 0 + debug_printf("%s (%d) %d/%d\n", __FUNCTION__, + varray->input_prim, + start, count); +#endif + + switch (varray->input_prim) { + case PIPE_PRIM_POINTS: + case PIPE_PRIM_LINES: + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_LINE_STRIP: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_QUADS: + case PIPE_PRIM_QUAD_STRIP: + for (j = 0; j < count;) { + unsigned remaining = count - j; + unsigned nr = trim( MIN2(varray->driver_fetch_max, remaining), first, incr ); + varray_flush_linear(varray, start + j, nr); + j += nr; + if (nr != remaining) + j -= (first - incr); + } + break; + + case PIPE_PRIM_LINE_LOOP: + /* Always have to decompose as we've stated that this will be + * emitted as a line-strip. + */ + for (j = 0; j < count;) { + unsigned remaining = count - j; + unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr ); + varray_line_loop_segment(varray, start, j, nr, nr == remaining); + j += nr; + if (nr != remaining) + j -= (first - incr); + } + break; + + + case PIPE_PRIM_POLYGON: + case PIPE_PRIM_TRIANGLE_FAN: + if (count < varray->driver_fetch_max) { + varray_flush_linear(varray, start, count); + } + else { + for ( j = 0; j < count;) { + unsigned remaining = count - j; + unsigned nr = trim( MIN2(varray->fetch_max-1, remaining), first, incr ); + varray_fan_segment(varray, start, j, nr); + j += nr; + if (nr != remaining) + j -= (first - incr); + } + } + break; + + default: + assert(0); + break; + } +} + +#undef TRIANGLE +#undef QUAD +#undef POINT +#undef LINE +#undef FUNC diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c new file mode 100644 index 0000000000..5d268a2226 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_vcache.c @@ -0,0 +1,501 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
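/*
 * Illustrative sketch (not part of the patch): the shape of the splitting
 * loops used by the linear varray path above.  Each chunk is trimmed to
 * whole primitives, and whenever a strip or fan has to continue into the
 * next chunk the cursor steps back by (first - incr) vertices so the
 * shared vertices are fetched again.  For a TRIANGLE_STRIP (first = 3,
 * incr = 1) split at 256 vertices, chunk 0 covers vertices 0..255 and
 * chunk 1 restarts at vertex 254, so no triangle is dropped at the seam.
 * 'emit' and 'max' are assumptions for the sketch, and max is assumed to
 * be at least 'first'.
 */
static void
split_linear_example(unsigned count, unsigned first, unsigned incr,
                     unsigned max,
                     void (*emit)(unsigned start, unsigned nr))
{
   unsigned j;

   if (count < first)
      return;
   count -= (count - first) % incr;            /* cf. trim() above */

   for (j = 0; j < count;) {
      unsigned remaining = count - j;
      unsigned nr = remaining < max ? remaining : max;

      nr -= (nr - first) % incr;               /* whole primitives only */
      emit(j, nr);

      j += nr;
      if (nr != remaining)
         j -= (first - incr);                  /* overlap into next chunk */
   }
}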
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "util/u_memory.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_pt.h" + + +#define CACHE_MAX 256 +#define FETCH_MAX 256 +#define DRAW_MAX (16*1024) + +struct vcache_frontend { + struct draw_pt_front_end base; + struct draw_context *draw; + + unsigned in[CACHE_MAX]; + ushort out[CACHE_MAX]; + + ushort draw_elts[DRAW_MAX]; + unsigned fetch_elts[FETCH_MAX]; + + unsigned draw_count; + unsigned fetch_count; + unsigned fetch_max; + + struct draw_pt_middle_end *middle; + + unsigned input_prim; + unsigned output_prim; + + unsigned middle_prim; + unsigned opt; +}; + +static INLINE void +vcache_flush( struct vcache_frontend *vcache ) +{ + if (vcache->middle_prim != vcache->output_prim) { + vcache->middle_prim = vcache->output_prim; + vcache->middle->prepare( vcache->middle, + vcache->middle_prim, + vcache->opt, + &vcache->fetch_max ); + } + + if (vcache->draw_count) { + vcache->middle->run( vcache->middle, + vcache->fetch_elts, + vcache->fetch_count, + vcache->draw_elts, + vcache->draw_count ); + } + + memset(vcache->in, ~0, sizeof(vcache->in)); + vcache->fetch_count = 0; + vcache->draw_count = 0; +} + +static INLINE void +vcache_check_flush( struct vcache_frontend *vcache ) +{ + if ( vcache->draw_count + 6 >= DRAW_MAX || + vcache->fetch_count + 4 >= FETCH_MAX ) + { + vcache_flush( vcache ); + } +} + + +static INLINE void +vcache_elt( struct vcache_frontend *vcache, + unsigned felt, + ushort flags ) +{ + unsigned idx = felt % CACHE_MAX; + + if (vcache->in[idx] != felt) { + assert(vcache->fetch_count < FETCH_MAX); + + vcache->in[idx] = felt; + vcache->out[idx] = (ushort)vcache->fetch_count; + vcache->fetch_elts[vcache->fetch_count++] = felt; + } + + vcache->draw_elts[vcache->draw_count++] = vcache->out[idx] | flags; +} + + + +static INLINE void +vcache_triangle( struct vcache_frontend *vcache, + unsigned i0, + unsigned i1, + unsigned i2 ) +{ + vcache_elt(vcache, i0, 0); + vcache_elt(vcache, i1, 0); + vcache_elt(vcache, i2, 0); + vcache_check_flush(vcache); +} + + +static INLINE void +vcache_triangle_flags( struct vcache_frontend *vcache, + ushort flags, + unsigned i0, + unsigned i1, + unsigned i2 ) +{ + vcache_elt(vcache, i0, flags); + vcache_elt(vcache, i1, 0); + 
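/*
 * Illustrative sketch (not part of the patch): the cache consulted by the
 * vcache_elt() calls in these helpers is direct-mapped on felt % CACHE_MAX.
 * A hit reuses the ushort slot already handed out for that vertex, a miss
 * appends the vertex to fetch_elts[] and records its slot, so indices
 * shared between primitives are fetched and transformed only once.  The
 * stand-alone routine below (flags omitted for brevity, names ending in
 * _example are assumptions) replays the same policy over an index list.
 */
static unsigned
vcache_replay_example(const unsigned *elts, unsigned count,
                      unsigned in[256], ushort out[256],
                      unsigned *fetch_elts, ushort *draw_elts)
{
   unsigned fetch_count = 0, i;

   for (i = 0; i < 256; i++)
      in[i] = ~0u;                          /* empty cache, cf. memset(~0) */

   for (i = 0; i < count; i++) {
      unsigned idx = elts[i] % 256;

      if (in[idx] != elts[i]) {             /* miss: fetch this vertex */
         in[idx] = elts[i];
         out[idx] = (ushort)fetch_count;
         fetch_elts[fetch_count++] = elts[i];
      }
      draw_elts[i] = out[idx];              /* hit or miss: reuse the slot */
   }

   /* e.g. the two triangles {0,1,2, 1,2,3} yield fetch_elts = {0,1,2,3},
    * four fetches instead of six, and draw_elts = {0,1,2,1,2,3}.
    */
   return fetch_count;
}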
vcache_elt(vcache, i2, 0); + vcache_check_flush(vcache); +} + +static INLINE void +vcache_line( struct vcache_frontend *vcache, + unsigned i0, + unsigned i1 ) +{ + vcache_elt(vcache, i0, 0); + vcache_elt(vcache, i1, 0); + vcache_check_flush(vcache); +} + + +static INLINE void +vcache_line_flags( struct vcache_frontend *vcache, + ushort flags, + unsigned i0, + unsigned i1 ) +{ + vcache_elt(vcache, i0, flags); + vcache_elt(vcache, i1, 0); + vcache_check_flush(vcache); +} + + +static INLINE void +vcache_point( struct vcache_frontend *vcache, + unsigned i0 ) +{ + vcache_elt(vcache, i0, 0); + vcache_check_flush(vcache); +} + +static INLINE void +vcache_quad( struct vcache_frontend *vcache, + unsigned i0, + unsigned i1, + unsigned i2, + unsigned i3 ) +{ + vcache_triangle( vcache, i0, i1, i3 ); + vcache_triangle( vcache, i1, i2, i3 ); +} + +static INLINE void +vcache_ef_quad( struct vcache_frontend *vcache, + unsigned i0, + unsigned i1, + unsigned i2, + unsigned i3 ) +{ + vcache_triangle_flags( vcache, + ( DRAW_PIPE_RESET_STIPPLE | + DRAW_PIPE_EDGE_FLAG_0 | + DRAW_PIPE_EDGE_FLAG_2 ), + i0, i1, i3 ); + + vcache_triangle_flags( vcache, + ( DRAW_PIPE_EDGE_FLAG_0 | + DRAW_PIPE_EDGE_FLAG_1 ), + i1, i2, i3 ); +} + +/* At least for now, we're back to using a template include file for + * this. The two paths aren't too different though - it may be + * possible to reunify them. + */ +#define TRIANGLE(vc,flags,i0,i1,i2) vcache_triangle_flags(vc,flags,i0,i1,i2) +#define QUAD(vc,i0,i1,i2,i3) vcache_ef_quad(vc,i0,i1,i2,i3) +#define LINE(vc,flags,i0,i1) vcache_line_flags(vc,flags,i0,i1) +#define POINT(vc,i0) vcache_point(vc,i0) +#define FUNC vcache_run_extras +#include "draw_pt_vcache_tmp.h" + +#define TRIANGLE(vc,flags,i0,i1,i2) vcache_triangle(vc,i0,i1,i2) +#define QUAD(vc,i0,i1,i2,i3) vcache_quad(vc,i0,i1,i2,i3) +#define LINE(vc,flags,i0,i1) vcache_line(vc,i0,i1) +#define POINT(vc,i0) vcache_point(vc,i0) +#define FUNC vcache_run +#include "draw_pt_vcache_tmp.h" + +static INLINE void +rebase_uint_elts( const unsigned *src, + unsigned count, + int delta, + ushort *dest ) +{ + unsigned i; + + for (i = 0; i < count; i++) + dest[i] = (ushort)(src[i] + delta); +} + +static INLINE void +rebase_ushort_elts( const ushort *src, + unsigned count, + int delta, + ushort *dest ) +{ + unsigned i; + + for (i = 0; i < count; i++) + dest[i] = (ushort)(src[i] + delta); +} + +static INLINE void +rebase_ubyte_elts( const ubyte *src, + unsigned count, + int delta, + ushort *dest ) +{ + unsigned i; + + for (i = 0; i < count; i++) + dest[i] = (ushort)(src[i] + delta); +} + + + +static INLINE void +translate_uint_elts( const unsigned *src, + unsigned count, + ushort *dest ) +{ + unsigned i; + + for (i = 0; i < count; i++) + dest[i] = (ushort)(src[i]); +} + +static INLINE void +translate_ushort_elts( const ushort *src, + unsigned count, + ushort *dest ) +{ + unsigned i; + + for (i = 0; i < count; i++) + dest[i] = (ushort)(src[i]); +} + +static INLINE void +translate_ubyte_elts( const ubyte *src, + unsigned count, + ushort *dest ) +{ + unsigned i; + + for (i = 0; i < count; i++) + dest[i] = (ushort)(src[i]); +} + + + + +#if 0 +static INLINE enum pipe_format +format_from_get_elt( pt_elt_func get_elt ) +{ + switch (draw->pt.user.eltSize) { + case 1: return PIPE_FORMAT_R8_UNORM; + case 2: return PIPE_FORMAT_R16_UNORM; + case 4: return PIPE_FORMAT_R32_UNORM; + default: return PIPE_FORMAT_NONE; + } +} +#endif + +static INLINE void +vcache_check_run( struct draw_pt_front_end *frontend, + pt_elt_func get_elt, + const void *elts, + unsigned 
draw_count ) +{ + struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; + struct draw_context *draw = vcache->draw; + unsigned min_index = draw->pt.user.min_index; + unsigned max_index = draw->pt.user.max_index; + unsigned index_size = draw->pt.user.eltSize; + unsigned fetch_count = max_index + 1 - min_index; + const ushort *transformed_elts; + ushort *storage = NULL; + boolean ok = FALSE; + + + if (0) debug_printf("fetch_count %d fetch_max %d draw_count %d\n", fetch_count, + vcache->fetch_max, + draw_count); + + if (max_index == 0xffffffff || + fetch_count > draw_count) { + if (0) debug_printf("fail\n"); + goto fail; + } + + if (vcache->middle_prim != vcache->input_prim) { + vcache->middle_prim = vcache->input_prim; + vcache->middle->prepare( vcache->middle, + vcache->middle_prim, + vcache->opt, + &vcache->fetch_max ); + } + + + if (min_index == 0 && + index_size == 2) + { + transformed_elts = (const ushort *)elts; + } + else + { + storage = MALLOC( draw_count * sizeof(ushort) ); + if (!storage) + goto fail; + + if (min_index == 0) { + switch(index_size) { + case 1: + translate_ubyte_elts( (const ubyte *)elts, + draw_count, + storage ); + break; + + case 2: + translate_ushort_elts( (const ushort *)elts, + draw_count, + storage ); + break; + + case 4: + translate_uint_elts( (const uint *)elts, + draw_count, + storage ); + break; + + default: + assert(0); + return; + } + } + else { + switch(index_size) { + case 1: + rebase_ubyte_elts( (const ubyte *)elts, + draw_count, + 0 - (int)min_index, + storage ); + break; + + case 2: + rebase_ushort_elts( (const ushort *)elts, + draw_count, + 0 - (int)min_index, + storage ); + break; + + case 4: + rebase_uint_elts( (const uint *)elts, + draw_count, + 0 - (int)min_index, + storage ); + break; + + default: + assert(0); + return; + } + } + transformed_elts = storage; + } + + if (fetch_count < UNDEFINED_VERTEX_ID) + ok = vcache->middle->run_linear_elts( vcache->middle, + min_index, /* start */ + fetch_count, + transformed_elts, + draw_count ); + + FREE(storage); + + if (ok) + return; + + debug_printf("failed to execute atomic draw elts for %d/%d, splitting up\n", + fetch_count, draw_count); + + fail: + vcache_run( frontend, get_elt, elts, draw_count ); +} + + + + +static void +vcache_prepare( struct draw_pt_front_end *frontend, + unsigned prim, + struct draw_pt_middle_end *middle, + unsigned opt ) +{ + struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; + + if (opt & PT_PIPELINE) + { + vcache->base.run = vcache_run_extras; + } + else + { + vcache->base.run = vcache_check_run; + } + + vcache->input_prim = prim; + vcache->output_prim = draw_pt_reduced_prim(prim); + + vcache->middle = middle; + vcache->opt = opt; + + /* Have to run prepare here, but try and guess a good prim for + * doing so: + */ + vcache->middle_prim = (opt & PT_PIPELINE) ? 
vcache->output_prim : vcache->input_prim; + middle->prepare( middle, vcache->middle_prim, opt, &vcache->fetch_max ); +} + + + + +static void +vcache_finish( struct draw_pt_front_end *frontend ) +{ + struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; + vcache->middle->finish( vcache->middle ); + vcache->middle = NULL; +} + +static void +vcache_destroy( struct draw_pt_front_end *frontend ) +{ + FREE(frontend); +} + + +struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw ) +{ + struct vcache_frontend *vcache = CALLOC_STRUCT( vcache_frontend ); + if (vcache == NULL) + return NULL; + + vcache->base.prepare = vcache_prepare; + vcache->base.run = NULL; + vcache->base.finish = vcache_finish; + vcache->base.destroy = vcache_destroy; + vcache->draw = draw; + + memset(vcache->in, ~0, sizeof(vcache->in)); + + return &vcache->base; +} diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h b/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h new file mode 100644 index 0000000000..ec05bbeab4 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_pt_vcache_tmp.h @@ -0,0 +1,177 @@ + + +static void FUNC( struct draw_pt_front_end *frontend, + pt_elt_func get_elt, + const void *elts, + unsigned count ) +{ + struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; + struct draw_context *draw = vcache->draw; + + boolean flatfirst = (draw->rasterizer->flatshade && + draw->rasterizer->flatshade_first); + unsigned i; + ushort flags; + + if (0) debug_printf("%s %d\n", __FUNCTION__, count); + + + switch (vcache->input_prim) { + case PIPE_PRIM_POINTS: + for (i = 0; i < count; i ++) { + POINT( vcache, + get_elt(elts, i + 0) ); + } + break; + + case PIPE_PRIM_LINES: + for (i = 0; i+1 < count; i += 2) { + LINE( vcache, + DRAW_PIPE_RESET_STIPPLE, + get_elt(elts, i + 0), + get_elt(elts, i + 1)); + } + break; + + case PIPE_PRIM_LINE_LOOP: + if (count >= 2) { + flags = DRAW_PIPE_RESET_STIPPLE; + + for (i = 1; i < count; i++, flags = 0) { + LINE( vcache, + flags, + get_elt(elts, i - 1), + get_elt(elts, i )); + } + + LINE( vcache, + flags, + get_elt(elts, i - 1), + get_elt(elts, 0 )); + } + break; + + case PIPE_PRIM_LINE_STRIP: + flags = DRAW_PIPE_RESET_STIPPLE; + for (i = 1; i < count; i++, flags = 0) { + LINE( vcache, + flags, + get_elt(elts, i - 1), + get_elt(elts, i )); + } + break; + + case PIPE_PRIM_TRIANGLES: + for (i = 0; i+2 < count; i += 3) { + TRIANGLE( vcache, + DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, + get_elt(elts, i + 0), + get_elt(elts, i + 1), + get_elt(elts, i + 2 )); + } + break; + + case PIPE_PRIM_TRIANGLE_STRIP: + if (flatfirst) { + for (i = 0; i+2 < count; i++) { + TRIANGLE( vcache, + DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, + get_elt(elts, i + 0), + get_elt(elts, i + 1 + (i&1)), + get_elt(elts, i + 2 - (i&1))); + } + } + else { + for (i = 0; i+2 < count; i++) { + TRIANGLE( vcache, + DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, + get_elt(elts, i + 0 + (i&1)), + get_elt(elts, i + 1 - (i&1)), + get_elt(elts, i + 2 )); + } + } + break; + + case PIPE_PRIM_TRIANGLE_FAN: + if (count >= 3) { + if (flatfirst) { + for (i = 0; i+2 < count; i++) { + TRIANGLE( vcache, + DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, + get_elt(elts, i + 1), + get_elt(elts, i + 2), + get_elt(elts, 0 )); + } + } + else { + for (i = 0; i+2 < count; i++) { + TRIANGLE( vcache, + DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL, + get_elt(elts, 0), + get_elt(elts, i + 1), + get_elt(elts, i + 2 )); + } + } + } + break; + + + case PIPE_PRIM_QUADS: + for (i = 0; i+3 
< count; i += 4) { + QUAD( vcache, + get_elt(elts, i + 0), + get_elt(elts, i + 1), + get_elt(elts, i + 2), + get_elt(elts, i + 3)); + } + break; + + case PIPE_PRIM_QUAD_STRIP: + for (i = 0; i+3 < count; i += 2) { + QUAD( vcache, + get_elt(elts, i + 2), + get_elt(elts, i + 0), + get_elt(elts, i + 1), + get_elt(elts, i + 3)); + } + break; + + case PIPE_PRIM_POLYGON: + { + /* These bitflags look a little odd because we submit the + * vertices as (1,2,0) to satisfy flatshade requirements. + */ + const ushort edge_first = DRAW_PIPE_EDGE_FLAG_2; + const ushort edge_middle = DRAW_PIPE_EDGE_FLAG_0; + const ushort edge_last = DRAW_PIPE_EDGE_FLAG_1; + + flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle; + + for (i = 0; i+2 < count; i++, flags = edge_middle) { + + if (i + 3 == count) + flags |= edge_last; + + TRIANGLE( vcache, + flags, + get_elt(elts, i + 1), + get_elt(elts, i + 2), + get_elt(elts, 0)); + } + } + break; + + default: + assert(0); + break; + } + + vcache_flush( vcache ); +} + + +#undef TRIANGLE +#undef QUAD +#undef POINT +#undef LINE +#undef FUNC diff --git a/src/gallium/auxiliary/draw/draw_vbuf.h b/src/gallium/auxiliary/draw/draw_vbuf.h new file mode 100644 index 0000000000..b0aa2df309 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vbuf.h @@ -0,0 +1,113 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Vertex buffer drawing stage. + * + * \author Keith Whitwell <keith@tungstengraphics.com> + * \author José Fonseca <jrfonsec@tungstengraphics.com> + */ + +#ifndef DRAW_VBUF_H_ +#define DRAW_VBUF_H_ + + + +struct draw_context; +struct vertex_info; + + +/** + * Interface for hardware vertex buffer rendering. + */ +struct vbuf_render { + + /** + * Driver limits. May be tuned lower to improve cache hits on + * index list. + */ + unsigned max_indices; + unsigned max_vertex_buffer_bytes; + + /** + * Get the hardware vertex format. + * + * XXX: have this in draw_context instead? + */ + const struct vertex_info *(*get_vertex_info)( struct vbuf_render * ); + + /** + * Request a destination for vertices. + * Hardware renderers will use ttm memory, others will just malloc + * something. 
+ */ + void *(*allocate_vertices)( struct vbuf_render *, + ushort vertex_size, + ushort nr_vertices ); + + /** + * Notify the renderer of the current primitive when it changes. + * Must succeed for TRIANGLES, LINES and POINTS. Other prims at + * the discretion of the driver, for the benefit of the passthrough + * path. + */ + boolean (*set_primitive)( struct vbuf_render *, unsigned prim ); + + /** + * DrawElements, note indices are ushort. The driver must complete + * this call, if necessary splitting the index list itself. + */ + void (*draw)( struct vbuf_render *, + const ushort *indices, + uint nr_indices ); + + /* Draw Arrays path too. + */ + void (*draw_arrays)( struct vbuf_render *, + unsigned start, + uint nr ); + + /** + * Called when vbuf is done with this set of vertices: + */ + void (*release_vertices)( struct vbuf_render *, + void *vertices, + unsigned vertex_size, + unsigned vertices_used ); + + void (*destroy)( struct vbuf_render * ); +}; + + + +struct draw_stage * +draw_vbuf_stage( struct draw_context *draw, + struct vbuf_render *render ); + + +#endif /*DRAW_VBUF_H_*/ diff --git a/src/gallium/auxiliary/draw/draw_vertex.c b/src/gallium/auxiliary/draw/draw_vertex.c new file mode 100644 index 0000000000..3214213e44 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vertex.c @@ -0,0 +1,129 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Functions for specifying the post-transformation vertex layout. + * + * Author: + * Brian Paul + * Keith Whitwell + */ + + +#include "draw/draw_private.h" +#include "draw/draw_vertex.h" + + +/** + * Compute the size of a vertex, in dwords/floats, to update the + * vinfo->size field. 
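/*
 * Illustrative sketch (not part of the patch): the simplest possible
 * backend for the vbuf_render interface above just mallocs the vertex
 * buffer, which is what the allocate_vertices comment means by "others
 * will just malloc something"; a hardware renderer hands back mapped GPU
 * memory instead and keeps within max_vertex_buffer_bytes.  The wrapper
 * struct my_vbuf_render and the use of MALLOC/FREE from util/u_memory.h
 * are assumptions for the sketch.
 */
struct my_vbuf_render {
   struct vbuf_render base;
   void *vertex_storage;
};

static void *
my_allocate_vertices(struct vbuf_render *render,
                     ushort vertex_size, ushort nr_vertices)
{
   struct my_vbuf_render *my = (struct my_vbuf_render *)render;

   my->vertex_storage = MALLOC(vertex_size * nr_vertices);
   return my->vertex_storage;
}

static void
my_release_vertices(struct vbuf_render *render,
                    void *vertices, unsigned vertex_size,
                    unsigned vertices_used)
{
   struct my_vbuf_render *my = (struct my_vbuf_render *)render;

   assert(vertices == my->vertex_storage);
   FREE(my->vertex_storage);
   my->vertex_storage = NULL;
}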
+ */ +void +draw_compute_vertex_size(struct vertex_info *vinfo) +{ + uint i; + + vinfo->size = 0; + for (i = 0; i < vinfo->num_attribs; i++) { + switch (vinfo->attrib[i].emit) { + case EMIT_OMIT: + break; + case EMIT_4UB: + /* fall-through */ + case EMIT_1F_PSIZE: + /* fall-through */ + case EMIT_1F: + vinfo->size += 1; + break; + case EMIT_2F: + vinfo->size += 2; + break; + case EMIT_3F: + vinfo->size += 3; + break; + case EMIT_4F: + vinfo->size += 4; + break; + default: + assert(0); + } + } +} + + +void +draw_dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data) +{ + unsigned i, j; + + for (i = 0; i < vinfo->num_attribs; i++) { + j = vinfo->attrib[i].src_index; + switch (vinfo->attrib[i].emit) { + case EMIT_OMIT: + debug_printf("EMIT_OMIT:"); + break; + case EMIT_1F: + debug_printf("EMIT_1F:\t"); + debug_printf("%f ", *(float *)data); data += sizeof(float); + break; + case EMIT_1F_PSIZE: + debug_printf("EMIT_1F_PSIZE:\t"); + debug_printf("%f ", *(float *)data); data += sizeof(float); + break; + case EMIT_2F: + debug_printf("EMIT_2F:\t"); + debug_printf("%f ", *(float *)data); data += sizeof(float); + debug_printf("%f ", *(float *)data); data += sizeof(float); + break; + case EMIT_3F: + debug_printf("EMIT_3F:\t"); + debug_printf("%f ", *(float *)data); data += sizeof(float); + debug_printf("%f ", *(float *)data); data += sizeof(float); + debug_printf("%f ", *(float *)data); data += sizeof(float); + data += sizeof(float); + break; + case EMIT_4F: + debug_printf("EMIT_4F:\t"); + debug_printf("%f ", *(float *)data); data += sizeof(float); + debug_printf("%f ", *(float *)data); data += sizeof(float); + debug_printf("%f ", *(float *)data); data += sizeof(float); + debug_printf("%f ", *(float *)data); data += sizeof(float); + break; + case EMIT_4UB: + debug_printf("EMIT_4UB:\t"); + debug_printf("%u ", *data++); + debug_printf("%u ", *data++); + debug_printf("%u ", *data++); + debug_printf("%u ", *data++); + break; + default: + assert(0); + } + debug_printf("\n"); + } + debug_printf("\n"); +} diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h new file mode 100644 index 0000000000..a943607d7e --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vertex.h @@ -0,0 +1,163 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Post-transform vertex format info. The vertex_info struct is used by + * the draw_vbuf code to emit hardware-specific vertex layouts into hw + * vertex buffers. + * + * Author: + * Brian Paul + */ + + +#ifndef DRAW_VERTEX_H +#define DRAW_VERTEX_H + + +#include "pipe/p_state.h" + + +/** + * Vertex attribute emit modes + */ +enum attrib_emit { + EMIT_OMIT, /**< don't emit the attribute */ + EMIT_1F, + EMIT_1F_PSIZE, /**< insert constant point size */ + EMIT_2F, + EMIT_3F, + EMIT_4F, + EMIT_4UB /**< XXX may need variations for RGBA vs BGRA, etc */ +}; + + +/** + * Attribute interpolation mode + */ +enum interp_mode { + INTERP_NONE, /**< never interpolate vertex header info */ + INTERP_POS, /**< special case for frag position */ + INTERP_CONSTANT, + INTERP_LINEAR, + INTERP_PERSPECTIVE +}; + + +/** + * Information about hardware/rasterization vertex layout. + */ +struct vertex_info +{ + uint num_attribs; + uint hwfmt[4]; /**< hardware format info for this format */ + uint size; /**< total vertex size in dwords */ + + /* Keep this small and at the end of the struct to allow quick + * memcmp() comparisons. + */ + struct { + ubyte interp_mode:4; /**< INTERP_x */ + ubyte emit:4; /**< EMIT_x */ + ubyte src_index; /**< map to post-xform attribs */ + } attrib[PIPE_MAX_SHADER_INPUTS]; +}; + +static INLINE int +draw_vinfo_size( const struct vertex_info *a ) +{ + return ((const char *)&a->attrib[a->num_attribs] - + (const char *)a); +} + +static INLINE int +draw_vinfo_compare( const struct vertex_info *a, + const struct vertex_info *b ) +{ + unsigned sizea = draw_vinfo_size( a ); + return memcmp( a, b, sizea ); +} + +static INLINE void +draw_vinfo_copy( struct vertex_info *dst, + const struct vertex_info *src ) +{ + unsigned size = draw_vinfo_size( src ); + memcpy( dst, src, size ); +} + + + +/** + * Add another attribute to the given vertex_info object. + * \param src_index indicates which post-transformed vertex attrib slot + * corresponds to this attribute. + * \return slot in which the attribute was added + */ +static INLINE uint +draw_emit_vertex_attr(struct vertex_info *vinfo, + enum attrib_emit emit, + enum interp_mode interp, /* only used by softpipe??? 
*/ + uint src_index) +{ + const uint n = vinfo->num_attribs; + assert(n < PIPE_MAX_SHADER_INPUTS); + vinfo->attrib[n].emit = emit; + vinfo->attrib[n].interp_mode = interp; + vinfo->attrib[n].src_index = src_index; + vinfo->num_attribs++; + return n; +} + + +extern void draw_compute_vertex_size(struct vertex_info *vinfo); + +void draw_dump_emitted_vertex(const struct vertex_info *vinfo, + const uint8_t *data); + + +static INLINE unsigned draw_translate_vinfo_format(unsigned format ) +{ + switch (format) { + case EMIT_1F: + case EMIT_1F_PSIZE: + return PIPE_FORMAT_R32_FLOAT; + case EMIT_2F: + return PIPE_FORMAT_R32G32_FLOAT; + case EMIT_3F: + return PIPE_FORMAT_R32G32B32_FLOAT; + case EMIT_4F: + return PIPE_FORMAT_R32G32B32A32_FLOAT; + case EMIT_4UB: + return PIPE_FORMAT_R8G8B8A8_UNORM; + default: + return PIPE_FORMAT_NONE; + } +} + + +#endif /* DRAW_VERTEX_H */ diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c new file mode 100644 index 0000000000..7f305304ff --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs.c @@ -0,0 +1,267 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + * Brian Paul + */ + +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "pipe/p_shader_tokens.h" + +#include "draw_private.h" +#include "draw_context.h" +#include "draw_vs.h" + +#include "translate/translate.h" +#include "translate/translate_cache.h" + + + + +void draw_vs_set_constants( struct draw_context *draw, + const float (*constants)[4], + unsigned size ) +{ + if (((unsigned)constants) & 0xf) { + if (size > draw->vs.const_storage_size) { + if (draw->vs.aligned_constant_storage) + align_free((void *)draw->vs.aligned_constant_storage); + draw->vs.aligned_constant_storage = align_malloc( size, 16 ); + } + memcpy( (void*)draw->vs.aligned_constant_storage, + constants, + size ); + constants = draw->vs.aligned_constant_storage; + } + + draw->vs.aligned_constants = constants; + draw_vs_aos_machine_constants( draw->vs.aos_machine, constants ); +} + + +void draw_vs_set_viewport( struct draw_context *draw, + const struct pipe_viewport_state *viewport ) +{ + draw_vs_aos_machine_viewport( draw->vs.aos_machine, viewport ); +} + + + +struct draw_vertex_shader * +draw_create_vertex_shader(struct draw_context *draw, + const struct pipe_shader_state *shader) +{ + struct draw_vertex_shader *vs; + + vs = draw_create_vs_llvm( draw, shader ); + if (!vs) { + vs = draw_create_vs_sse( draw, shader ); + if (!vs) { + vs = draw_create_vs_ppc( draw, shader ); + if (!vs) { + vs = draw_create_vs_exec( draw, shader ); + } + } + } + + if (vs) + { + uint i; + for (i = 0; i < vs->info.num_outputs; i++) { + if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_POSITION && + vs->info.output_semantic_index[i] == 0) + vs->position_output = i; + } + } + + assert(vs); + return vs; +} + + +void +draw_bind_vertex_shader(struct draw_context *draw, + struct draw_vertex_shader *dvs) +{ + draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE ); + + if (dvs) + { + draw->vs.vertex_shader = dvs; + draw->vs.num_vs_outputs = dvs->info.num_outputs; + draw->vs.position_output = dvs->position_output; + dvs->prepare( dvs, draw ); + } + else { + draw->vs.vertex_shader = NULL; + draw->vs.num_vs_outputs = 0; + } +} + + +void +draw_delete_vertex_shader(struct draw_context *draw, + struct draw_vertex_shader *dvs) +{ + unsigned i; + + for (i = 0; i < dvs->nr_varients; i++) + dvs->varient[i]->destroy( dvs->varient[i] ); + + dvs->nr_varients = 0; + + dvs->delete( dvs ); +} + + + +boolean +draw_vs_init( struct draw_context *draw ) +{ + tgsi_exec_machine_init(&draw->vs.machine); + + /* FIXME: give this machine thing a proper constructor: + */ + draw->vs.machine.Inputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16); + if (!draw->vs.machine.Inputs) + return FALSE; + + draw->vs.machine.Outputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16); + if (!draw->vs.machine.Outputs) + return FALSE; + + draw->vs.emit_cache = translate_cache_create(); + if (!draw->vs.emit_cache) + return FALSE; + + draw->vs.fetch_cache = translate_cache_create(); + if (!draw->vs.fetch_cache) + return FALSE; + + draw->vs.aos_machine = draw_vs_aos_machine(); +#ifdef PIPE_ARCH_X86 + if (!draw->vs.aos_machine) + return FALSE; +#endif + + return TRUE; +} + +void +draw_vs_destroy( struct draw_context *draw ) +{ + if (draw->vs.machine.Inputs) + align_free(draw->vs.machine.Inputs); + + if (draw->vs.machine.Outputs) + align_free(draw->vs.machine.Outputs); + + if (draw->vs.fetch_cache) + 
translate_cache_destroy(draw->vs.fetch_cache); + + if (draw->vs.emit_cache) + translate_cache_destroy(draw->vs.emit_cache); + + if (draw->vs.aos_machine) + draw_vs_aos_machine_destroy(draw->vs.aos_machine); + + if (draw->vs.aligned_constant_storage) + align_free((void*)draw->vs.aligned_constant_storage); + + tgsi_exec_machine_free_data(&draw->vs.machine); + +} + + +struct draw_vs_varient * +draw_vs_lookup_varient( struct draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ) +{ + struct draw_vs_varient *varient; + unsigned i; + + /* Lookup existing varient: + */ + for (i = 0; i < vs->nr_varients; i++) + if (draw_vs_varient_key_compare(key, &vs->varient[i]->key) == 0) + return vs->varient[i]; + + /* Else have to create a new one: + */ + varient = vs->create_varient( vs, key ); + if (varient == NULL) + return NULL; + + /* Add it to our list, could be smarter: + */ + if (vs->nr_varients < Elements(vs->varient)) { + vs->varient[vs->nr_varients++] = varient; + } + else { + vs->last_varient++; + vs->last_varient %= Elements(vs->varient); + vs->varient[vs->last_varient]->destroy(vs->varient[vs->last_varient]); + vs->varient[vs->last_varient] = varient; + } + + /* Done + */ + return varient; +} + + +struct translate * +draw_vs_get_fetch( struct draw_context *draw, + struct translate_key *key ) +{ + if (!draw->vs.fetch || + translate_key_compare(&draw->vs.fetch->key, key) != 0) + { + translate_key_sanitize(key); + draw->vs.fetch = translate_cache_find(draw->vs.fetch_cache, key); + } + + return draw->vs.fetch; +} + +struct translate * +draw_vs_get_emit( struct draw_context *draw, + struct translate_key *key ) +{ + if (!draw->vs.emit || + translate_key_compare(&draw->vs.emit->key, key) != 0) + { + translate_key_sanitize(key); + draw->vs.emit = translate_cache_find(draw->vs.emit_cache, key); + } + + return draw->vs.emit; +} diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h new file mode 100644 index 0000000000..89ae158751 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs.h @@ -0,0 +1,224 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
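/*
 * Illustrative sketch (not part of the patch): the varient[] array used by
 * draw_vs_lookup_varient() above is a small fixed-size cache (16 entries)
 * with round-robin replacement, which is cheap and good enough because a
 * shader rarely cycles through many fetch/emit layouts.  The same pattern
 * in isolation, with hypothetical matches/make/destroy callbacks standing
 * in for the key compare and varient creation.
 */
#define N_SLOTS_EXAMPLE 16

struct slot_cache_example {
   void *slot[N_SLOTS_EXAMPLE];
   unsigned nr;       /* slots populated so far */
   unsigned last;     /* cursor for the next round-robin victim */
};

static void *
slot_cache_get_example(struct slot_cache_example *c, const void *key,
                       boolean (*matches)(const void *entry, const void *key),
                       void *(*make)(const void *key),
                       void (*destroy)(void *entry))
{
   void *entry;
   unsigned i;

   for (i = 0; i < c->nr; i++)              /* linear lookup, list is tiny */
      if (matches(c->slot[i], key))
         return c->slot[i];

   entry = make(key);
   if (!entry)
      return NULL;

   if (c->nr < N_SLOTS_EXAMPLE) {           /* room left: append */
      c->slot[c->nr++] = entry;
   }
   else {                                   /* full: evict round-robin */
      c->last = (c->last + 1) % N_SLOTS_EXAMPLE;
      destroy(c->slot[c->last]);
      c->slot[c->last] = entry;
   }
   return entry;
}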
+ * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef DRAW_VS_H +#define DRAW_VS_H + +#include "draw_context.h" +#include "draw_private.h" + + +struct draw_context; +struct pipe_shader_state; + +struct draw_varient_input +{ + enum pipe_format format; + unsigned buffer; + unsigned offset; +}; + +struct draw_varient_output +{ + enum pipe_format format; /* output format */ + unsigned vs_output:8; /* which vertex shader output is this? */ + unsigned offset:24; /* offset into output vertex */ +}; + +struct draw_varient_element { + struct draw_varient_input in; + struct draw_varient_output out; +}; + +struct draw_vs_varient_key { + unsigned output_stride; + unsigned nr_elements:8; /* max2(nr_inputs, nr_outputs) */ + unsigned nr_inputs:8; + unsigned nr_outputs:8; + unsigned viewport:1; + unsigned clip:1; + unsigned const_vbuffers:5; + struct draw_varient_element element[PIPE_MAX_ATTRIBS]; +}; + +struct draw_vs_varient; + + +struct draw_vs_varient { + struct draw_vs_varient_key key; + + struct draw_vertex_shader *vs; + + void (*set_buffer)( struct draw_vs_varient *, + unsigned i, + const void *ptr, + unsigned stride ); + + void (PIPE_CDECL *run_linear)( struct draw_vs_varient *shader, + unsigned start, + unsigned count, + void *output_buffer ); + + void (PIPE_CDECL *run_elts)( struct draw_vs_varient *shader, + const unsigned *elts, + unsigned count, + void *output_buffer ); + + void (*destroy)( struct draw_vs_varient * ); +}; + + +/** + * Private version of the compiled vertex_shader + */ +struct draw_vertex_shader { + struct draw_context *draw; + + /* This member will disappear shortly: + */ + struct pipe_shader_state state; + + struct tgsi_shader_info info; + unsigned position_output; + + /* Extracted from shader: + */ + const float (*immediates)[4]; + + /* + */ + struct draw_vs_varient *varient[16]; + unsigned nr_varients; + unsigned last_varient; + struct draw_vs_varient *(*create_varient)( struct draw_vertex_shader *shader, + const struct draw_vs_varient_key *key ); + + + void (*prepare)( struct draw_vertex_shader *shader, + struct draw_context *draw ); + + /* Run the shader - this interface will get cleaned up in the + * future: + */ + void (*run_linear)( struct draw_vertex_shader *shader, + const float (*input)[4], + float (*output)[4], + const float (*constants)[4], + unsigned count, + unsigned input_stride, + unsigned output_stride ); + + + void (*delete)( struct draw_vertex_shader * ); +}; + + +struct draw_vs_varient * +draw_vs_lookup_varient( struct draw_vertex_shader *base, + const struct draw_vs_varient_key *key ); + + +/******************************************************************************** + * Internal functions: + */ + +struct draw_vertex_shader * +draw_create_vs_exec(struct draw_context *draw, + const struct pipe_shader_state *templ); + +struct draw_vertex_shader * +draw_create_vs_sse(struct draw_context *draw, + const struct pipe_shader_state *templ); + +struct draw_vertex_shader * +draw_create_vs_ppc(struct draw_context *draw, + const struct pipe_shader_state *templ); + +struct draw_vertex_shader * +draw_create_vs_llvm(struct draw_context *draw, + const struct pipe_shader_state *templ); + + + +struct draw_vs_varient_key; +struct draw_vertex_shader; + +struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ); + + + +/******************************************************************************** + * 
Helpers for vs implementations that don't do their own fetch/emit varients. + * Means these can be shared between shaders. + */ +struct translate; +struct translate_key; + +struct translate *draw_vs_get_fetch( struct draw_context *draw, + struct translate_key *key ); + + +struct translate *draw_vs_get_emit( struct draw_context *draw, + struct translate_key *key ); + +struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ); + + + +static INLINE int draw_vs_varient_keysize( const struct draw_vs_varient_key *key ) +{ + return 2 * sizeof(int) + key->nr_elements * sizeof(struct draw_varient_element); +} + +static INLINE int draw_vs_varient_key_compare( const struct draw_vs_varient_key *a, + const struct draw_vs_varient_key *b ) +{ + int keysize = draw_vs_varient_keysize(a); + return memcmp(a, b, keysize); +} + + +struct aos_machine *draw_vs_aos_machine( void ); +void draw_vs_aos_machine_destroy( struct aos_machine *machine ); + +void draw_vs_aos_machine_constants( struct aos_machine *machine, + const float (*constants)[4] ); + +void draw_vs_aos_machine_viewport( struct aos_machine *machine, + const struct pipe_viewport_state *viewport ); + + +#define MAX_TGSI_VERTICES 4 + + + +#endif diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c new file mode 100644 index 0000000000..6141ba9cbf --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_aos.c @@ -0,0 +1,2246 @@ +/* + * Mesa 3-D graphics library + * Version: 6.3 + * + * Copyright (C) 1999-2004 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code + * using the rtasm runtime assembler. 
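/*
 * Illustrative sketch (not part of the patch): how a middle end is expected
 * to drive a varient from the draw_vs.h interface above: look it up by key,
 * point it at the vertex data, then run it over a linear range of vertices.
 * 'key', 'vbuf', 'stride', 'start', 'count' and 'out' are assumed to come
 * from the caller; real users also call run_elts() for indexed paths and
 * bind more than one vertex buffer.
 */
static boolean
run_varient_example(struct draw_vertex_shader *vs,
                    const struct draw_vs_varient_key *key,
                    const void *vbuf, unsigned stride,
                    unsigned start, unsigned count,
                    void *out)
{
   struct draw_vs_varient *v = draw_vs_lookup_varient(vs, key);

   if (!v)
      return FALSE;

   v->set_buffer(v, 0, vbuf, stride);    /* vertex buffer 0 only, for brevity */
   v->run_linear(v, start, count, out);  /* fetch, run the VS, emit vertices */
   return TRUE;
}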
Based on the old + * t_vb_arb_program_sse.c + */ + + +#include "util/u_memory.h" +#include "util/u_math.h" +#include "pipe/p_shader_tokens.h" +#include "pipe/p_debug.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_dump.h" + +#include "draw_vs.h" +#include "draw_vs_aos.h" + +#include "rtasm/rtasm_x86sse.h" + +#ifdef PIPE_ARCH_X86 +#define DISASSEM 0 +#define FAST_MATH 1 + +static const char *files[] = +{ + "NULL", + "CONST", + "IN", + "OUT", + "TEMP", + "SAMP", + "ADDR", + "IMM", + "INTERNAL", +}; + +static INLINE boolean eq( struct x86_reg a, + struct x86_reg b ) +{ + return (a.file == b.file && + a.idx == b.idx && + a.mod == b.mod && + a.disp == b.disp); +} + +struct x86_reg aos_get_x86( struct aos_compilation *cp, + unsigned which_reg, /* quick hack */ + unsigned value ) +{ + struct x86_reg reg; + + if (which_reg == 0) + reg = cp->temp_EBP; + else + reg = cp->tmp_EAX; + + if (cp->x86_reg[which_reg] != value) { + unsigned offset; + + switch (value) { + case X86_IMMEDIATES: + assert(which_reg == 0); + offset = Offset(struct aos_machine, immediates); + break; + case X86_CONSTANTS: + assert(which_reg == 1); + offset = Offset(struct aos_machine, constants); + break; + case X86_BUFFERS: + assert(which_reg == 0); + offset = Offset(struct aos_machine, buffer); + break; + default: + assert(0); + offset = 0; + } + + + x86_mov(cp->func, reg, + x86_make_disp(cp->machine_EDX, offset)); + + cp->x86_reg[which_reg] = value; + } + + return reg; +} + + +static struct x86_reg get_reg_ptr(struct aos_compilation *cp, + unsigned file, + unsigned idx ) +{ + struct x86_reg ptr = cp->machine_EDX; + + switch (file) { + case TGSI_FILE_INPUT: + assert(idx < MAX_INPUTS); + return x86_make_disp(ptr, Offset(struct aos_machine, input[idx])); + + case TGSI_FILE_OUTPUT: + return x86_make_disp(ptr, Offset(struct aos_machine, output[idx])); + + case TGSI_FILE_TEMPORARY: + assert(idx < MAX_TEMPS); + return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx])); + + case AOS_FILE_INTERNAL: + assert(idx < MAX_INTERNALS); + return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx])); + + case TGSI_FILE_IMMEDIATE: + assert(idx < MAX_IMMEDIATES); /* just a sanity check */ + return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float)); + + case TGSI_FILE_CONSTANT: + assert(idx < MAX_CONSTANTS); /* just a sanity check */ + return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float)); + + default: + ERROR(cp, "unknown reg file"); + return x86_make_reg(0,0); + } +} + + + +#define X87_CW_EXCEPTION_INV_OP (1<<0) +#define X87_CW_EXCEPTION_DENORM_OP (1<<1) +#define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2) +#define X87_CW_EXCEPTION_OVERFLOW (1<<3) +#define X87_CW_EXCEPTION_UNDERFLOW (1<<4) +#define X87_CW_EXCEPTION_PRECISION (1<<5) +#define X87_CW_PRECISION_SINGLE (0<<8) +#define X87_CW_PRECISION_RESERVED (1<<8) +#define X87_CW_PRECISION_DOUBLE (2<<8) +#define X87_CW_PRECISION_DOUBLE_EXT (3<<8) +#define X87_CW_PRECISION_MASK (3<<8) +#define X87_CW_ROUND_NEAREST (0<<10) +#define X87_CW_ROUND_DOWN (1<<10) +#define X87_CW_ROUND_UP (2<<10) +#define X87_CW_ROUND_ZERO (3<<10) +#define X87_CW_ROUND_MASK (3<<10) +#define X87_CW_INFINITY (1<<12) + + + + +static void spill( struct aos_compilation *cp, unsigned idx ) +{ + if (!cp->xmm[idx].dirty || + (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */ + cp->xmm[idx].file != TGSI_FILE_OUTPUT && + cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) { + ERROR(cp, 
"invalid spill"); + return; + } + else { + struct x86_reg oldval = get_reg_ptr(cp, + cp->xmm[idx].file, + cp->xmm[idx].idx); + + if (0) debug_printf("\nspill %s[%d]", + files[cp->xmm[idx].file], + cp->xmm[idx].idx); + + assert(cp->xmm[idx].dirty); + sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx)); + cp->xmm[idx].dirty = 0; + } +} + + +void aos_spill_all( struct aos_compilation *cp ) +{ + unsigned i; + + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + aos_release_xmm_reg(cp, i); + } +} + + +static struct x86_reg get_xmm_writable( struct aos_compilation *cp, + struct x86_reg reg ) +{ + if (reg.file != file_XMM || + cp->xmm[reg.idx].file != TGSI_FILE_NULL) + { + struct x86_reg tmp = aos_get_xmm_reg(cp); + sse_movaps(cp->func, tmp, reg); + reg = tmp; + } + + cp->xmm[reg.idx].last_used = cp->insn_counter; + return reg; +} + +static struct x86_reg get_xmm( struct aos_compilation *cp, + struct x86_reg reg ) +{ + if (reg.file != file_XMM) + { + struct x86_reg tmp = aos_get_xmm_reg(cp); + sse_movaps(cp->func, tmp, reg); + reg = tmp; + } + + cp->xmm[reg.idx].last_used = cp->insn_counter; + return reg; +} + + +/* Allocate an empty xmm register, either as a temporary or later to + * "adopt" as a shader reg. + */ +struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ) +{ + unsigned i; + unsigned oldest = 0; + boolean found = FALSE; + + for (i = 0; i < 8; i++) + if (cp->xmm[i].last_used != cp->insn_counter && + cp->xmm[i].file == TGSI_FILE_NULL) { + oldest = i; + found = TRUE; + } + + if (!found) { + for (i = 0; i < 8; i++) + if (cp->xmm[i].last_used < cp->xmm[oldest].last_used) + oldest = i; + } + + /* Need to write out the old value? + */ + if (cp->xmm[oldest].dirty) + spill(cp, oldest); + + assert(cp->xmm[oldest].last_used != cp->insn_counter); + + cp->xmm[oldest].file = TGSI_FILE_NULL; + cp->xmm[oldest].idx = 0; + cp->xmm[oldest].dirty = 0; + cp->xmm[oldest].last_used = cp->insn_counter; + return x86_make_reg(file_XMM, oldest); +} + +void aos_release_xmm_reg( struct aos_compilation *cp, + unsigned idx ) +{ + cp->xmm[idx].file = TGSI_FILE_NULL; + cp->xmm[idx].idx = 0; + cp->xmm[idx].dirty = 0; + cp->xmm[idx].last_used = 0; +} + + + + +/* Mark an xmm reg as holding the current copy of a shader reg. + */ +void aos_adopt_xmm_reg( struct aos_compilation *cp, + struct x86_reg reg, + unsigned file, + unsigned idx, + unsigned dirty ) +{ + unsigned i; + + if (reg.file != file_XMM) { + assert(0); + return; + } + + + /* If any xmm reg thinks it holds this shader reg, break the + * illusion. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].file == file && + cp->xmm[i].idx == idx) + { + /* If an xmm reg is already holding this shader reg, take into account its + * dirty flag... + */ + dirty |= cp->xmm[i].dirty; + aos_release_xmm_reg(cp, i); + } + } + + cp->xmm[reg.idx].file = file; + cp->xmm[reg.idx].idx = idx; + cp->xmm[reg.idx].dirty = dirty; + cp->xmm[reg.idx].last_used = cp->insn_counter; +} + + +/* Return a pointer to the in-memory copy of the reg, making sure it is uptodate. + */ +static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp, + unsigned file, + unsigned idx ) +{ + unsigned i; + + /* Ensure the in-memory copy of this reg is up-to-date + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].file == file && + cp->xmm[i].idx == idx && + cp->xmm[i].dirty) { + spill(cp, i); + } + } + + return get_reg_ptr( cp, file, idx ); +} + + +/* As above, but return a pointer. Note - this pointer may alias + * those returned by get_arg_ptr(). 
+ */ +static struct x86_reg get_dst_ptr( struct aos_compilation *cp, + const struct tgsi_full_dst_register *dst ) +{ + unsigned file = dst->DstRegister.File; + unsigned idx = dst->DstRegister.Index; + unsigned i; + + + /* Ensure in-memory copy of this reg is up-to-date and invalidate + * any xmm copies. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].file == file && + cp->xmm[i].idx == idx) + { + if (cp->xmm[i].dirty) + spill(cp, i); + + aos_release_xmm_reg(cp, i); + } + } + + return get_reg_ptr( cp, file, idx ); +} + + + + + +/* Return an XMM reg if the argument is resident, otherwise return a + * base+offset pointer to the saved value. + */ +struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, + unsigned file, + unsigned idx ) +{ + unsigned i; + + for (i = 0; i < 8; i++) { + if (cp->xmm[i].file == file && + cp->xmm[i].idx == idx) + { + cp->xmm[i].last_used = cp->insn_counter; + return x86_make_reg(file_XMM, i); + } + } + + /* If not found in the XMM register file, return an indirect + * reference to the in-memory copy: + */ + return get_reg_ptr( cp, file, idx ); +} + + + +static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp, + unsigned file, + unsigned idx ) +{ + struct x86_reg reg = get_xmm( cp, + aos_get_shader_reg( cp, file, idx ) ); + + aos_adopt_xmm_reg( cp, + reg, + file, + idx, + FALSE ); + + return reg; +} + + + +struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp, + unsigned imm ) +{ + return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm ); +} + + +struct x86_reg aos_get_internal( struct aos_compilation *cp, + unsigned imm ) +{ + return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm ); +} + + + + + +/* Emulate pshufd insn in regular SSE, if necessary: + */ +static void emit_pshufd( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg arg0, + ubyte shuf ) +{ + if (cp->have_sse2) { + sse2_pshufd(cp->func, dst, arg0, shuf); + } + else { + if (!eq(dst, arg0)) + sse_movaps(cp->func, dst, arg0); + + sse_shufps(cp->func, dst, dst, shuf); + } +} + +/* load masks (pack into negs??) + * pshufd - shuffle according to writemask + * and - result, mask + * nand - dest, mask + * or - dest, result + */ +static boolean mask_write( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg result, + unsigned mask ) +{ + struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + emit_pshufd(cp, tmp, imm_swz, + SHUF((mask & 1) ? 2 : 3, + (mask & 2) ? 2 : 3, + (mask & 4) ? 2 : 3, + (mask & 8) ? 2 : 3)); + + sse_andps(cp->func, dst, tmp); + sse_andnps(cp->func, tmp, result); + sse_orps(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + return TRUE; +} + + + + +/* Helper for writemask: + */ +static boolean emit_shuf_copy2( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg arg0, + struct x86_reg arg1, + ubyte shuf ) +{ + struct x86_reg tmp = aos_get_xmm_reg(cp); + + emit_pshufd(cp, dst, arg1, shuf); + emit_pshufd(cp, tmp, arg0, shuf); + sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W)); + emit_pshufd(cp, dst, dst, shuf); + + aos_release_xmm_reg(cp, tmp.idx); + return TRUE; +} + + + +#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6)) + + +/* Locate a source register and perform any required (simple) swizzle. + * + * Just fail on complex swizzles at this point. 
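+ *
+ * For reference, a scalar sketch of what the code below emits (the
+ * EXTSWIZZLE ZERO/ONE cases are simply rejected with an error):
+ *
+ *    for (i = 0; i < 4; i++) {
+ *       float v = arg0[swz[i]];
+ *       if (neg[i]) v = -v;        (multiply by a +1/-1 mask)
+ *       if (abs[i]) v = fabsf(v);  (max(v, -v); only full .xyzw abs)
+ *       dst[i] = v;
+ *    }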
+ */ +static struct x86_reg fetch_src( struct aos_compilation *cp, + const struct tgsi_full_src_register *src ) +{ + struct x86_reg arg0 = aos_get_shader_reg(cp, + src->SrcRegister.File, + src->SrcRegister.Index); + unsigned i; + ubyte swz = 0; + unsigned negs = 0; + unsigned abs = 0; + + for (i = 0; i < 4; i++) { + unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i ); + unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i ); + + switch (swizzle) { + case TGSI_EXTSWIZZLE_ZERO: + case TGSI_EXTSWIZZLE_ONE: + ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2"); + break; + + default: + swz |= (swizzle & 0x3) << (i * 2); + break; + } + + switch (neg) { + case TGSI_UTIL_SIGN_TOGGLE: + negs |= (1<<i); + break; + + case TGSI_UTIL_SIGN_KEEP: + break; + + case TGSI_UTIL_SIGN_CLEAR: + abs |= (1<<i); + break; + + default: + ERROR(cp, "unsupported sign-mode"); + break; + } + } + + if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) { + struct x86_reg dst = aos_get_xmm_reg(cp); + + if (swz != SSE_SWIZZLE_NOOP) + emit_pshufd(cp, dst, arg0, swz); + else + sse_movaps(cp->func, dst, arg0); + + if (negs && negs != 0xf) { + struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + /* Load 1,-1,0,0 + * Use neg as arg to pshufd + * Multiply + */ + emit_pshufd(cp, tmp, imm_swz, + SHUF((negs & 1) ? 1 : 0, + (negs & 2) ? 1 : 0, + (negs & 4) ? 1 : 0, + (negs & 8) ? 1 : 0)); + sse_mulps(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + } + else if (negs) { + struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS); + sse_mulps(cp->func, dst, imm_negs); + } + + + if (abs && abs != 0xf) { + ERROR(cp, "unsupported partial abs"); + } + else if (abs) { + struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + sse_movaps(cp->func, tmp, dst); + sse_mulps(cp->func, tmp, neg); + sse_maxps(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + } + + return dst; + } + + return arg0; +} + +static void x87_fld_src( struct aos_compilation *cp, + const struct tgsi_full_src_register *src, + unsigned channel ) +{ + struct x86_reg arg0 = aos_get_shader_reg_ptr(cp, + src->SrcRegister.File, + src->SrcRegister.Index); + + unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel ); + unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel ); + + switch (swizzle) { + case TGSI_EXTSWIZZLE_ZERO: + x87_fldz( cp->func ); + break; + + case TGSI_EXTSWIZZLE_ONE: + x87_fld1( cp->func ); + break; + + default: + x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) ); + break; + } + + + switch (neg) { + case TGSI_UTIL_SIGN_TOGGLE: + /* Flip the sign: + */ + x87_fchs( cp->func ); + break; + + case TGSI_UTIL_SIGN_KEEP: + break; + + case TGSI_UTIL_SIGN_CLEAR: + x87_fabs( cp->func ); + break; + + case TGSI_UTIL_SIGN_SET: + x87_fabs( cp->func ); + x87_fchs( cp->func ); + break; + + default: + ERROR(cp, "unsupported sign-mode"); + break; + } +} + + + + + + +/* Used to implement write masking. This and most of the other instructions + * here would be easier to implement if there had been a translation + * to a 2 argument format (dst/arg0, arg1) at the shader level before + * attempting to translate to x86/sse code. 
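+ *
+ * For reference, the scalar effect of the masked store below is just:
+ *
+ *    for (i = 0; i < 4; i++)
+ *       if (writemask & (1 << i))
+ *          dst[i] = result[i];
+ *
+ * Common masks are special-cased with movss/shufps; the general case
+ * goes through mask_write() (and/andnot/or against a pshufd-built mask).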
+ */ +static void store_dest( struct aos_compilation *cp, + const struct tgsi_full_dst_register *reg, + struct x86_reg result ) +{ + struct x86_reg dst; + + switch (reg->DstRegister.WriteMask) { + case 0: + return; + + case TGSI_WRITEMASK_XYZW: + aos_adopt_xmm_reg(cp, + get_xmm_writable(cp, result), + reg->DstRegister.File, + reg->DstRegister.Index, + TRUE); + return; + default: + break; + } + + dst = aos_get_shader_reg_xmm(cp, + reg->DstRegister.File, + reg->DstRegister.Index); + + switch (reg->DstRegister.WriteMask) { + case TGSI_WRITEMASK_X: + sse_movss(cp->func, dst, get_xmm(cp, result)); + break; + + case TGSI_WRITEMASK_ZW: + sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W)); + break; + + case TGSI_WRITEMASK_XY: + result = get_xmm_writable(cp, result); + sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W)); + dst = result; + break; + + case TGSI_WRITEMASK_YZW: + result = get_xmm_writable(cp, result); + sse_movss(cp->func, result, dst); + dst = result; + break; + + default: + mask_write(cp, dst, result, reg->DstRegister.WriteMask); + break; + } + + aos_adopt_xmm_reg(cp, + dst, + reg->DstRegister.File, + reg->DstRegister.Index, + TRUE); + +} + +static void inject_scalar( struct aos_compilation *cp, + struct x86_reg dst, + struct x86_reg result, + ubyte swizzle ) +{ + sse_shufps(cp->func, dst, dst, swizzle); + sse_movss(cp->func, dst, result); + sse_shufps(cp->func, dst, dst, swizzle); +} + + +static void store_scalar_dest( struct aos_compilation *cp, + const struct tgsi_full_dst_register *reg, + struct x86_reg result ) +{ + unsigned writemask = reg->DstRegister.WriteMask; + struct x86_reg dst; + + if (writemask != TGSI_WRITEMASK_X && + writemask != TGSI_WRITEMASK_Y && + writemask != TGSI_WRITEMASK_Z && + writemask != TGSI_WRITEMASK_W && + writemask != 0) + { + result = get_xmm_writable(cp, result); /* already true, right? 
*/ + sse_shufps(cp->func, result, result, SHUF(X,X,X,X)); + store_dest(cp, reg, result); + return; + } + + result = get_xmm(cp, result); + dst = aos_get_shader_reg_xmm(cp, + reg->DstRegister.File, + reg->DstRegister.Index); + + + + switch (reg->DstRegister.WriteMask) { + case TGSI_WRITEMASK_X: + sse_movss(cp->func, dst, result); + break; + + case TGSI_WRITEMASK_Y: + inject_scalar(cp, dst, result, SHUF(Y, X, Z, W)); + break; + + case TGSI_WRITEMASK_Z: + inject_scalar(cp, dst, result, SHUF(Z, Y, X, W)); + break; + + case TGSI_WRITEMASK_W: + inject_scalar(cp, dst, result, SHUF(W, Y, Z, X)); + break; + + default: + break; + } + + aos_adopt_xmm_reg(cp, + dst, + reg->DstRegister.File, + reg->DstRegister.Index, + TRUE); +} + + + +static void x87_fst_or_nop( struct x86_function *func, + unsigned writemask, + unsigned channel, + struct x86_reg ptr ) +{ + assert(ptr.file == file_REG32); + if (writemask & (1<<channel)) + x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) ); +} + +static void x87_fstp_or_pop( struct x86_function *func, + unsigned writemask, + unsigned channel, + struct x86_reg ptr ) +{ + assert(ptr.file == file_REG32); + if (writemask & (1<<channel)) + x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) ); + else + x87_fstp( func, x86_make_reg( file_x87, 0 )); +} + + + +/* + */ +static void x87_fstp_dest4( struct aos_compilation *cp, + const struct tgsi_full_dst_register *dst ) +{ + struct x86_reg ptr = get_dst_ptr(cp, dst); + unsigned writemask = dst->DstRegister.WriteMask; + + x87_fst_or_nop(cp->func, writemask, 0, ptr); + x87_fst_or_nop(cp->func, writemask, 1, ptr); + x87_fst_or_nop(cp->func, writemask, 2, ptr); + x87_fstp_or_pop(cp->func, writemask, 3, ptr); +} + +/* Save current x87 state and put it into single precision mode. 
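+ * (Only the control word is saved here, via fnstcw.  The rounding-mode
+ * changes themselves are made lazily by set_fpu_round_neg_inf() and
+ * set_fpu_round_nearest() below, and restore_fpu_state() reloads the
+ * saved word when the generated function exits.)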
+ */ +static void save_fpu_state( struct aos_compilation *cp ) +{ + x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, fpu_restore))); +} + +static void restore_fpu_state( struct aos_compilation *cp ) +{ + x87_fnclex(cp->func); + x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, fpu_restore))); +} + +static void set_fpu_round_neg_inf( struct aos_compilation *cp ) +{ + if (cp->fpucntl != FPU_RND_NEG) { + cp->fpucntl = FPU_RND_NEG; + x87_fnclex(cp->func); + x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, fpu_rnd_neg_inf))); + } +} + +static void set_fpu_round_nearest( struct aos_compilation *cp ) +{ + if (cp->fpucntl != FPU_RND_NEAREST) { + cp->fpucntl = FPU_RND_NEAREST; + x87_fnclex(cp->func); + x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, fpu_rnd_nearest))); + } +} + + +static void x87_emit_ex2( struct aos_compilation *cp ) +{ + struct x86_reg st0 = x86_make_reg(file_x87, 0); + struct x86_reg st1 = x86_make_reg(file_x87, 1); + int stack = cp->func->x87_stack; + +// set_fpu_round_neg_inf( cp ); + + x87_fld(cp->func, st0); /* a a */ + x87_fprndint( cp->func ); /* int(a) a*/ + x87_fsubr(cp->func, st1, st0); /* int(a) frc(a) */ + x87_fxch(cp->func, st1); /* frc(a) int(a) */ + x87_f2xm1(cp->func); /* (2^frc(a))-1 int(a) */ + x87_fld1(cp->func); /* 1 (2^frc(a))-1 int(a) */ + x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */ + x87_fscale(cp->func); /* (2^frac(a)*2^int(int(a))) int(a) */ + /* 2^a int(a) */ + x87_fstp(cp->func, st1); /* 2^a */ + + assert( stack == cp->func->x87_stack); + +} + +static void PIPE_CDECL print_reg( const char *msg, + const float *reg ) +{ + debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]); +} + +static void emit_print( struct aos_compilation *cp, + const char *message, /* must point to a static string! */ + unsigned file, + unsigned idx ) +{ + struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); + struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx ); + unsigned i; + + /* There shouldn't be anything on the x87 stack. Can add this + * capacity later if need be. + */ + assert(cp->func->x87_stack == 0); + + /* For absolute correctness, need to spill/invalidate all XMM regs + * too. We're obviously not concerned about performance on this + * debug path, so here goes: + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + + aos_release_xmm_reg(cp, i); + } + + /* Push caller-save (ie scratch) regs. + */ + x86_cdecl_caller_push_regs( cp->func ); + + + /* Push the arguments: + */ + x86_lea( cp->func, ecx, arg ); + x86_push( cp->func, ecx ); + x86_push_imm32( cp->func, (int)message ); + + /* Call the helper. Could call debug_printf directly, but + * print_reg is a nice place to put a breakpoint if need be. + */ + x86_mov_reg_imm( cp->func, ecx, (int)print_reg ); + x86_call( cp->func, ecx ); + x86_pop( cp->func, ecx ); + x86_pop( cp->func, ecx ); + + /* Pop caller-save regs + */ + x86_cdecl_caller_pop_regs( cp->func ); + + /* Done... + */ +} + +/** + * The traditional instructions. All operate on internal registers + * and ignore write masks and swizzling issues. 
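+ *
+ * Each emit_* below follows the same pattern: fetch_src() the operands
+ * (an XMM reg or a memory reference), do the arithmetic in a writable
+ * XMM reg, then store_dest()/store_scalar_dest() to apply the writemask.
+ * E.g. ABS is emitted as max(a, a * -1), ADD as a single addps.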
+ */ + +static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg neg = aos_get_internal(cp, IMM_NEGS); + struct x86_reg tmp = aos_get_xmm_reg(cp); + + sse_movaps(cp->func, tmp, arg0); + sse_mulps(cp->func, tmp, neg); + sse_maxps(cp->func, tmp, arg0); + + store_dest(cp, &op->FullDstRegisters[0], tmp); + return TRUE; +} + +static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_addps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); + x87_fcos(cp->func); + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + +/* The dotproduct instructions don't really do that well in sse: + * XXX: produces wrong results -- disabled. + */ +static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_mulps(cp->func, dst, arg1); + /* Now the hard bit: sum the first 3 values: + */ + sse_movhlps(cp->func, tmp, dst); + sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */ + emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); + sse_addss(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + store_scalar_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_mulps(cp->func, dst, arg1); + + /* Now the hard bit: sum the values: + */ + sse_movhlps(cp->func, tmp, dst); + sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */ + emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); + sse_addss(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + store_scalar_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_mulps(cp->func, dst, arg1); + + /* Now the hard bit: sum the values (from DP3): + */ + sse_movhlps(cp->func, tmp, dst); + sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? 
*/ + emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z)); + sse_addss(cp->func, dst, tmp); + emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W)); + sse_addss(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + store_scalar_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = aos_get_xmm_reg(cp); + struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + +/* dst[0] = 1.0 * 1.0F; */ +/* dst[1] = arg0[1] * arg1[1]; */ +/* dst[2] = arg0[2] * 1.0; */ +/* dst[3] = 1.0 * arg1[3]; */ + + emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y)); + emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W)); + sse_mulps(cp->func, dst, tmp); + + aos_release_xmm_reg(cp, tmp.idx); + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld1(cp->func); /* 1 */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 1 */ + x87_fyl2x(cp->func); /* log2(a0) */ + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + + +static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); + x87_emit_ex2(cp); + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + + +static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + int i; + + set_fpu_round_neg_inf( cp ); + + /* Load all sources first to avoid aliasing + */ + for (i = 3; i >= 0; i--) { + if (writemask & (1<<i)) { + x87_fld_src(cp, &op->FullSrcRegisters[0], i); + } + } + + for (i = 0; i < 4; i++) { + if (writemask & (1<<i)) { + x87_fprndint( cp->func ); + x87_fstp(cp->func, x86_make_disp(dst, i*4)); + } + } + + return TRUE; +} + + +static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + int i; + + set_fpu_round_nearest( cp ); + + /* Load all sources first to avoid aliasing + */ + for (i = 3; i >= 0; i--) { + if (writemask & (1<<i)) { + x87_fld_src(cp, &op->FullSrcRegisters[0], i); + } + } + + for (i = 0; i < 4; i++) { + if (writemask & (1<<i)) { + x87_fprndint( cp->func ); + x87_fstp(cp->func, x86_make_disp(dst, i*4)); + } + } + + return TRUE; +} + + +static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + struct x86_reg st0 = x86_make_reg(file_x87, 0); + struct x86_reg st1 = x86_make_reg(file_x87, 1); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + int i; + + set_fpu_round_neg_inf( cp ); + + /* suck all the source values onto the stack before writing out any + * dst, which may alias... 
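+ *
+ * (In scalar terms each written component computes
+ *     dst[i] = src[i] - floor(src[i])
+ *  with the FPU in round-to-negative-infinity mode so that fprndint
+ *  behaves as floor.)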
+ */ + for (i = 3; i >= 0; i--) { + if (writemask & (1<<i)) { + x87_fld_src(cp, &op->FullSrcRegisters[0], i); + } + } + + for (i = 0; i < 4; i++) { + if (writemask & (1<<i)) { + x87_fld(cp->func, st0); /* a a */ + x87_fprndint( cp->func ); /* flr(a) a */ + x87_fsubp(cp->func, st1); /* frc(a) */ + x87_fstp(cp->func, x86_make_disp(dst, i*4)); + } + } + + return TRUE; +} + + + + + + +static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + unsigned lit_count = cp->lit_count++; + struct x86_reg result, arg0; + unsigned i; + +#if 1 + /* For absolute correctness, need to spill/invalidate all XMM regs + * too. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + aos_release_xmm_reg(cp, i); + } +#endif + + if (writemask != TGSI_WRITEMASK_XYZW) + result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0])); + else + result = get_dst_ptr(cp, &op->FullDstRegisters[0]); + + + arg0 = fetch_src( cp, &op->FullSrcRegisters[0] ); + if (arg0.file == file_XMM) { + struct x86_reg tmp = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, tmp[1])); + sse_movaps( cp->func, tmp, arg0 ); + arg0 = tmp; + } + + + + /* Push caller-save (ie scratch) regs. + */ + x86_cdecl_caller_push_regs( cp->func ); + + /* Push the arguments: + */ + x86_push_imm32( cp->func, lit_count ); + + x86_lea( cp->func, ecx, arg0 ); + x86_push( cp->func, ecx ); + + x86_lea( cp->func, ecx, result ); + x86_push( cp->func, ecx ); + + x86_push( cp->func, cp->machine_EDX ); + + if (lit_count < MAX_LIT_INFO) { + x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX, + Offset(struct aos_machine, lit_info) + + lit_count * sizeof(struct lit_info) + + Offset(struct lit_info, func))); + } + else { + x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit ); + } + + x86_call( cp->func, ecx ); + + x86_pop( cp->func, ecx ); /* fixme... */ + x86_pop( cp->func, ecx ); + x86_pop( cp->func, ecx ); + x86_pop( cp->func, ecx ); + + x86_cdecl_caller_pop_regs( cp->func ); + + if (writemask != TGSI_WRITEMASK_XYZW) { + store_dest( cp, + &op->FullDstRegisters[0], + get_xmm_writable( cp, result ) ); + } + + return TRUE; +} + +#if 0 +static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); + unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask; + + if (writemask & TGSI_WRITEMASK_YZ) { + struct x86_reg st1 = x86_make_reg(file_x87, 1); + struct x86_reg st2 = x86_make_reg(file_x87, 2); + + /* a1' = a1 <= 0 ? 1 : a1; + */ + x87_fldz(cp->func); /* 1 0 */ +#if 1 + x87_fld1(cp->func); /* 1 0 */ +#else + /* Correct but slow due to fp exceptions generated in fyl2x - fix me. 
+ */ + x87_fldz(cp->func); /* 1 0 */ +#endif + x87_fld_src(cp, &op->FullSrcRegisters[0], 1); /* a1 1 0 */ + x87_fcomi(cp->func, st2); /* a1 1 0 */ + x87_fcmovb(cp->func, st1); /* a1' 1 0 */ + x87_fstp(cp->func, st1); /* a1' 0 */ + x87_fstp(cp->func, st1); /* a1' */ + + x87_fld_src(cp, &op->FullSrcRegisters[0], 3); /* a3 a1' */ + x87_fxch(cp->func, st1); /* a1' a3 */ + + + /* Compute pow(a1, a3) + */ + x87_fyl2x(cp->func); /* a3*log2(a1) */ + x87_emit_ex2( cp ); /* 2^(a3*log2(a1)) */ + + + /* a0' = max2(a0, 0): + */ + x87_fldz(cp->func); /* 0 r2 */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 0 r2 */ + x87_fcomi(cp->func, st1); + x87_fcmovb(cp->func, st1); /* a0' 0 r2 */ + + x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */ + + x87_fcomi(cp->func, st1); /* a0' 0 r2 */ + x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */ + + x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */ + x87_fpop(cp->func); /* r2 */ + x87_fpop(cp->func); + } + + if (writemask & TGSI_WRITEMASK_XW) { + x87_fld1(cp->func); + x87_fst_or_nop(cp->func, writemask, 0, dst); + x87_fstp_or_pop(cp->func, writemask, 3, dst); + } + + return TRUE; +} +#endif + + + +static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_maxps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_minps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + /* potentially nothing to do */ + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_mulps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg arg2 = fetch_src(cp, &op->FullSrcRegisters[2]); + + /* If we can't clobber old contents of arg0, get a temporary & copy + * it there, then clobber it... + */ + arg0 = get_xmm_writable(cp, arg0); + + sse_mulps(cp->func, arg0, arg1); + sse_addps(cp->func, arg0, arg2); + store_dest(cp, &op->FullDstRegisters[0], arg0); + return TRUE; +} + + + +/* A wrapper for powf(). + * Makes sure it is cdecl and operates on floats. 
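+ * With FAST_MATH it calls util_fast_pow(), an approximation built on the
+ * fast exp2/log2 helpers (roughly 2^(y*log2(x))); otherwise plain powf().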
+ */ +static float PIPE_CDECL _powerf( float x, float y ) +{ +#if FAST_MATH + return util_fast_pow(x, y); +#else + return powf( x, y ); +#endif +} + +#if FAST_MATH +static float PIPE_CDECL _exp2(float x) +{ + return util_fast_exp2(x); +} +#endif + + +/* Really not sufficient -- need to check for conditions that could + * generate inf/nan values, which will slow things down hugely. + */ +static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ +#if 0 + x87_fld_src(cp, &op->FullSrcRegisters[1], 0); /* a1.x */ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0.x a1.x */ + x87_fyl2x(cp->func); /* a1*log2(a0) */ + + x87_emit_ex2( cp ); /* 2^(a1*log2(a0)) */ + + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); +#else + uint i; + + /* For absolute correctness, need to spill/invalidate all XMM regs + * too. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + aos_release_xmm_reg(cp, i); + } + + /* Push caller-save (ie scratch) regs. + */ + x86_cdecl_caller_push_regs( cp->func ); + + x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) ); + + x87_fld_src( cp, &op->FullSrcRegisters[1], 0 ); + x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) ); + x87_fld_src( cp, &op->FullSrcRegisters[0], 0 ); + x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) ); + + /* tmp_EAX has been pushed & will be restored below */ + x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf ); + x86_call( cp->func, cp->tmp_EAX ); + + x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) ); + + x86_cdecl_caller_pop_regs( cp->func ); + + /* Note retval on x87 stack: + */ + cp->func->x87_stack++; + + x87_fstp_dest4( cp, &op->FullDstRegisters[0] ); +#endif + return TRUE; +} + + +#if FAST_MATH +static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + uint i; + + /* For absolute correctness, need to spill/invalidate all XMM regs + * too. + */ + for (i = 0; i < 8; i++) { + if (cp->xmm[i].dirty) + spill(cp, i); + aos_release_xmm_reg(cp, i); + } + + /* Push caller-save (ie scratch) regs. + */ + x86_cdecl_caller_push_regs( cp->func ); + + x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) ); + + x87_fld_src( cp, &op->FullSrcRegisters[0], 0 ); + x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) ); + + /* tmp_EAX has been pushed & will be restored below */ + x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 ); + x86_call( cp->func, cp->tmp_EAX ); + + x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) ); + + x86_cdecl_caller_pop_regs( cp->func ); + + /* Note retval on x87 stack: + */ + cp->func->x87_stack++; + + x87_fstp_dest4( cp, &op->FullDstRegisters[0] ); + + return TRUE; +} +#endif + + +static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg dst = aos_get_xmm_reg(cp); + + if (cp->have_sse2) { + sse2_rcpss(cp->func, dst, arg0); + /* extend precision here... 
+ */ + } + else { + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + sse_movss(cp->func, dst, ones); + sse_divss(cp->func, dst, arg0); + } + + store_scalar_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + + +/* Although rsqrtps() and rcpps() are low precision on some/all SSE + * implementations, it is possible to improve its precision at + * fairly low cost, using a newton/raphson step, as below: + * + * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a) + * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)] + * or: + * x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)] + * + * + * See: http://softwarecommunity.intel.com/articles/eng/1818.htm + */ +static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + + if (0) { + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg r = aos_get_xmm_reg(cp); + sse_rsqrtss(cp->func, r, arg0); + store_scalar_dest(cp, &op->FullDstRegisters[0], r); + return TRUE; + } + else { + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg r = aos_get_xmm_reg(cp); + + struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ ); + struct x86_reg one_point_five = x86_make_disp( neg_half, 4 ); + struct x86_reg src = get_xmm_writable( cp, arg0 ); + + sse_rsqrtss( cp->func, r, src ); /* rsqrtss(a) */ + sse_mulss( cp->func, src, neg_half ); /* -.5 * a */ + sse_mulss( cp->func, src, r ); /* -.5 * a * r */ + sse_mulss( cp->func, src, r ); /* -.5 * a * r * r */ + sse_addss( cp->func, src, one_point_five ); /* 1.5 - .5 * a * r * r */ + sse_mulss( cp->func, r, src ); /* r * (1.5 - .5 * a * r * r) */ + + store_scalar_dest(cp, &op->FullDstRegisters[0], r); + return TRUE; + } +} + + +static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_cmpps(cp->func, dst, arg1, cc_NotLessThan); + sse_andps(cp->func, dst, ones); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + x87_fld_src(cp, &op->FullSrcRegisters[0], 0); + x87_fsin(cp->func); + x87_fstp_dest4(cp, &op->FullDstRegisters[0]); + return TRUE; +} + + + +static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg ones = aos_get_internal(cp, IMM_ONES); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_cmpps(cp->func, dst, arg1, cc_LessThan); + sse_andps(cp->func, dst, ones); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg dst = get_xmm_writable(cp, arg0); + + sse_subps(cp->func, dst, arg1); + + store_dest(cp, &op->FullDstRegisters[0], dst); + return TRUE; +} + +static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg tmp0 = 
aos_get_xmm_reg(cp); + + sse2_cvttps2dq(cp->func, tmp0, arg0); + sse2_cvtdq2ps(cp->func, tmp0, tmp0); + + store_dest(cp, &op->FullDstRegisters[0], tmp0); + return TRUE; +} + +static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) +{ + struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]); + struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]); + struct x86_reg tmp0 = aos_get_xmm_reg(cp); + struct x86_reg tmp1 = aos_get_xmm_reg(cp); + + emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W)); + sse_mulps(cp->func, tmp1, arg0); + emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W)); + sse_mulps(cp->func, tmp0, arg1); + sse_subps(cp->func, tmp1, tmp0); + sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W)); + +/* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */ +/* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */ +/* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */ +/* dst[3] is undef */ + + + aos_release_xmm_reg(cp, tmp0.idx); + store_dest(cp, &op->FullDstRegisters[0], tmp1); + return TRUE; +} + + + +static boolean +emit_instruction( struct aos_compilation *cp, + struct tgsi_full_instruction *inst ) +{ + x87_assert_stack_empty(cp->func); + + switch( inst->Instruction.Opcode ) { + case TGSI_OPCODE_MOV: + return emit_MOV( cp, inst ); + + case TGSI_OPCODE_LIT: + return emit_LIT(cp, inst); + + case TGSI_OPCODE_RCP: + return emit_RCP(cp, inst); + + case TGSI_OPCODE_RSQ: + return emit_RSQ(cp, inst); + + case TGSI_OPCODE_EXP: + /*return emit_EXP(cp, inst);*/ + return FALSE; + + case TGSI_OPCODE_LOG: + /*return emit_LOG(cp, inst);*/ + return FALSE; + + case TGSI_OPCODE_MUL: + return emit_MUL(cp, inst); + + case TGSI_OPCODE_ADD: + return emit_ADD(cp, inst); + + case TGSI_OPCODE_DP3: + return emit_DP3(cp, inst); + + case TGSI_OPCODE_DP4: + return emit_DP4(cp, inst); + + case TGSI_OPCODE_DST: + return emit_DST(cp, inst); + + case TGSI_OPCODE_MIN: + return emit_MIN(cp, inst); + + case TGSI_OPCODE_MAX: + return emit_MAX(cp, inst); + + case TGSI_OPCODE_SLT: + return emit_SLT(cp, inst); + + case TGSI_OPCODE_SGE: + return emit_SGE(cp, inst); + + case TGSI_OPCODE_MAD: + return emit_MAD(cp, inst); + + case TGSI_OPCODE_SUB: + return emit_SUB(cp, inst); + + case TGSI_OPCODE_LERP: +// return emit_LERP(cp, inst); + return FALSE; + + case TGSI_OPCODE_FRAC: + return emit_FRC(cp, inst); + + case TGSI_OPCODE_CLAMP: +// return emit_CLAMP(cp, inst); + return FALSE; + + case TGSI_OPCODE_FLOOR: + return emit_FLR(cp, inst); + + case TGSI_OPCODE_ROUND: + return emit_RND(cp, inst); + + case TGSI_OPCODE_EXPBASE2: +#if FAST_MATH + return emit_EXPBASE2(cp, inst); +#elif 0 + /* this seems to fail for "larger" exponents. + * See glean tvertProg1's EX2 test. 
+ */ + return emit_EX2(cp, inst); +#else + return FALSE; +#endif + + case TGSI_OPCODE_LOGBASE2: + return emit_LG2(cp, inst); + + case TGSI_OPCODE_POWER: + return emit_POW(cp, inst); + + case TGSI_OPCODE_CROSSPRODUCT: + return emit_XPD(cp, inst); + + case TGSI_OPCODE_ABS: + return emit_ABS(cp, inst); + + case TGSI_OPCODE_DPH: + return emit_DPH(cp, inst); + + case TGSI_OPCODE_COS: + return emit_COS(cp, inst); + + case TGSI_OPCODE_SIN: + return emit_SIN(cp, inst); + + case TGSI_OPCODE_TRUNC: + return emit_TRUNC(cp, inst); + + case TGSI_OPCODE_END: + return TRUE; + + default: + return FALSE; + } +} + + +static boolean emit_viewport( struct aos_compilation *cp ) +{ + struct x86_reg pos = aos_get_shader_reg_xmm(cp, + TGSI_FILE_OUTPUT, + cp->vaos->draw->vs.position_output ); + + struct x86_reg scale = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, scale)); + + struct x86_reg translate = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, translate)); + + sse_mulps(cp->func, pos, scale); + sse_addps(cp->func, pos, translate); + + aos_adopt_xmm_reg( cp, + pos, + TGSI_FILE_OUTPUT, + cp->vaos->draw->vs.position_output, + TRUE ); + return TRUE; +} + + +/* This is useful to be able to see the results on softpipe. Doesn't + * do proper clipping, just assumes the backend can do it during + * rasterization -- for debug only... + */ +static boolean emit_rhw_viewport( struct aos_compilation *cp ) +{ + struct x86_reg tmp = aos_get_xmm_reg(cp); + struct x86_reg pos = aos_get_shader_reg_xmm(cp, + TGSI_FILE_OUTPUT, + cp->vaos->draw->vs.position_output); + + struct x86_reg scale = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, scale)); + + struct x86_reg translate = x86_make_disp(cp->machine_EDX, + Offset(struct aos_machine, translate)); + + + + emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W)); + sse2_rcpss(cp->func, tmp, tmp); + sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X)); + + sse_mulps(cp->func, pos, scale); + sse_mulps(cp->func, pos, tmp); + sse_addps(cp->func, pos, translate); + + /* Set pos[3] = w + */ + mask_write(cp, pos, tmp, TGSI_WRITEMASK_W); + + aos_adopt_xmm_reg( cp, + pos, + TGSI_FILE_OUTPUT, + cp->vaos->draw->vs.position_output, + TRUE ); + return TRUE; +} + + +#if 0 +static boolean note_immediate( struct aos_compilation *cp, + struct tgsi_full_immediate *imm ) +{ + unsigned pos = cp->num_immediates++; + unsigned j; + + for (j = 0; j < imm->Immediate.Size; j++) { + cp->vaos->machine->immediate[pos][j] = imm->u.ImmediateFloat32[j].Float; + } + + return TRUE; +} +#endif + + + + +static void find_last_write_outputs( struct aos_compilation *cp ) +{ + struct tgsi_parse_context parse; + unsigned this_instruction = 0; + unsigned i; + + tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens ); + + while (!tgsi_parse_end_of_tokens( &parse )) { + + tgsi_parse_token( &parse ); + + if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION) + continue; + + for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) { + if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File == + TGSI_FILE_OUTPUT) + { + unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index; + cp->output_last_write[idx] = this_instruction; + } + } + + this_instruction++; + } + + tgsi_parse_free( &parse ); +} + + +#define ARG_MACHINE 1 +#define ARG_START_ELTS 2 +#define ARG_COUNT 3 +#define ARG_OUTBUF 4 + + +static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient, + boolean linear ) +{ + struct tgsi_parse_context parse; + struct aos_compilation cp; + 
unsigned fixup, label; + + util_init_math(); + + tgsi_parse_init( &parse, varient->base.vs->state.tokens ); + + memset(&cp, 0, sizeof(cp)); + + cp.insn_counter = 1; + cp.vaos = varient; + cp.have_sse2 = 1; + cp.func = &varient->func[ linear ? 0 : 1 ]; + + cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX); + cp.idx_EBX = x86_make_reg(file_REG32, reg_BX); + cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX); + cp.machine_EDX = x86_make_reg(file_REG32, reg_DX); + cp.count_ESI = x86_make_reg(file_REG32, reg_SI); + cp.temp_EBP = x86_make_reg(file_REG32, reg_BP); + cp.stack_ESP = x86_make_reg( file_REG32, reg_SP ); + + x86_init_func(cp.func); + + find_last_write_outputs(&cp); + + x86_push(cp.func, cp.idx_EBX); + x86_push(cp.func, cp.count_ESI); + x86_push(cp.func, cp.temp_EBP); + + + /* Load arguments into regs: + */ + x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE)); + x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS)); + x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT)); + x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF)); + + + /* Compare count to zero and possibly bail. + */ + x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX); + x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX); + fixup = x86_jcc_forward(cp.func, cc_E); + + + save_fpu_state( &cp ); + set_fpu_round_nearest( &cp ); + + aos_init_inputs( &cp, linear ); + + cp.x86_reg[0] = 0; + cp.x86_reg[1] = 0; + + /* Note address for loop jump + */ + label = x86_get_label(cp.func); + { + /* Fetch inputs... TODO: fetch lazily... + */ + if (!aos_fetch_inputs( &cp, linear )) + goto fail; + + /* Emit the shader: + */ + while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error ) + { + tgsi_parse_token( &parse ); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: +#if 0 + if (!note_immediate( &cp, &parse.FullToken.FullImmediate )) + goto fail; +#endif + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + if (DISASSEM) + tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter ); + + if (!emit_instruction( &cp, &parse.FullToken.FullInstruction )) + goto fail; + break; + } + + x87_assert_stack_empty(cp.func); + cp.insn_counter++; + + if (DISASSEM) + debug_printf("\n"); + } + + + { + unsigned i; + for (i = 0; i < 8; i++) { + if (cp.xmm[i].file != TGSI_FILE_OUTPUT) { + cp.xmm[i].file = TGSI_FILE_NULL; + cp.xmm[i].dirty = 0; + } + } + } + + if (cp.error) + goto fail; + + if (cp.vaos->base.key.clip) { + /* not really handling clipping, just do the rhw so we can + * see the results... + */ + emit_rhw_viewport(&cp); + } + else if (cp.vaos->base.key.viewport) { + emit_viewport(&cp); + } + + /* Emit output... TODO: do this eagerly after the last write to a + * given output. + */ + if (!aos_emit_outputs( &cp )) + goto fail; + + + /* Next vertex: + */ + x86_lea(cp.func, + cp.outbuf_ECX, + x86_make_disp(cp.outbuf_ECX, + cp.vaos->base.key.output_stride)); + + /* Incr index + */ + aos_incr_inputs( &cp, linear ); + } + /* decr count, loop if not zero + */ + x86_dec(cp.func, cp.count_ESI); + x86_jcc(cp.func, cc_NZ, label); + + restore_fpu_state(&cp); + + /* Land forward jump here: + */ + x86_fixup_fwd_jump(cp.func, fixup); + + /* Exit mmx state? 
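+    * (emms is needed after any MMX instructions because the MMX registers
+    *  alias the x87 stack, and x87 code may run afterwards.)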
+ */ + if (cp.func->need_emms) + mmx_emms(cp.func); + + x86_pop(cp.func, cp.temp_EBP); + x86_pop(cp.func, cp.count_ESI); + x86_pop(cp.func, cp.idx_EBX); + + x87_assert_stack_empty(cp.func); + x86_ret(cp.func); + + tgsi_parse_free( &parse ); + return !cp.error; + + fail: + tgsi_parse_free( &parse ); + return FALSE; +} + + + +static void vaos_set_buffer( struct draw_vs_varient *varient, + unsigned buf, + const void *ptr, + unsigned stride ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + + if (buf < vaos->nr_vb) { + vaos->buffer[buf].base_ptr = (char *)ptr; + vaos->buffer[buf].stride = stride; + } + + if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride); +} + + + +static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient, + const unsigned *elts, + unsigned count, + void *output_buffer ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + struct aos_machine *machine = vaos->draw->vs.aos_machine; + + if (0) debug_printf("%s %d\n", __FUNCTION__, count); + + machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; + machine->constants = vaos->draw->vs.aligned_constants; + machine->immediates = vaos->base.vs->immediates; + machine->buffer = vaos->buffer; + + vaos->gen_run_elts( machine, + elts, + count, + output_buffer ); +} + +static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient, + unsigned start, + unsigned count, + void *output_buffer ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + struct aos_machine *machine = vaos->draw->vs.aos_machine; + + if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count, + vaos->base.key.const_vbuffers); + + machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size; + machine->constants = vaos->draw->vs.aligned_constants; + machine->immediates = vaos->base.vs->immediates; + machine->buffer = vaos->buffer; + + vaos->gen_run_linear( machine, + start, + count, + output_buffer ); + + /* Sanity spot checks to make sure we didn't trash our constants */ + assert(machine->internal[IMM_ONES][0] == 1.0f); + assert(machine->internal[IMM_IDENTITY][0] == 0.0f); + assert(machine->internal[IMM_NEGS][0] == -1.0f); +} + + + +static void vaos_destroy( struct draw_vs_varient *varient ) +{ + struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient; + + FREE( vaos->buffer ); + + x86_release_func( &vaos->func[0] ); + x86_release_func( &vaos->func[1] ); + + FREE(vaos); +} + + + +static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ) +{ + unsigned i; + struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse); + + if (!vaos) + goto fail; + + vaos->base.key = *key; + vaos->base.vs = vs; + vaos->base.set_buffer = vaos_set_buffer; + vaos->base.destroy = vaos_destroy; + vaos->base.run_linear = vaos_run_linear; + vaos->base.run_elts = vaos_run_elts; + + vaos->draw = vs->draw; + + for (i = 0; i < key->nr_inputs; i++) + vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 ); + + vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) ); + if (!vaos->buffer) + goto fail; + + if (0) + debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers); + +#if 0 + tgsi_dump(vs->state.tokens, 0); +#endif + + if (!build_vertex_program( vaos, TRUE )) + goto fail; + + if (!build_vertex_program( vaos, FALSE )) + goto fail; + + vaos->gen_run_linear 
= (vaos_run_linear_func)x86_get_func(&vaos->func[0]); + if (!vaos->gen_run_linear) + goto fail; + + vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]); + if (!vaos->gen_run_elts) + goto fail; + + return &vaos->base; + + fail: + if (vaos && vaos->buffer) + FREE(vaos->buffer); + + if (vaos) + x86_release_func( &vaos->func[0] ); + + if (vaos) + x86_release_func( &vaos->func[1] ); + + FREE(vaos); + + return NULL; +} + + +struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ) +{ + struct draw_vs_varient *varient = varient_aos_sse( vs, key ); + + if (varient == NULL) { + varient = draw_vs_varient_generic( vs, key ); + } + + return varient; +} + + + +#endif /* PIPE_ARCH_X86 */ diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h new file mode 100644 index 0000000000..264387517b --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_aos.h @@ -0,0 +1,253 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef DRAW_VS_AOS_H +#define DRAW_VS_AOS_H + +#include "pipe/p_config.h" + +#ifdef PIPE_ARCH_X86 + +struct tgsi_token; +struct x86_function; + +#include "pipe/p_state.h" +#include "rtasm/rtasm_x86sse.h" + + + + + +#define X 0 +#define Y 1 +#define Z 2 +#define W 3 + +#define MAX_INPUTS PIPE_MAX_ATTRIBS +#define MAX_OUTPUTS PIPE_MAX_SHADER_OUTPUTS +#define MAX_TEMPS TGSI_EXEC_NUM_TEMPS +#define MAX_CONSTANTS 1024 /** only used for sanity checking */ +#define MAX_IMMEDIATES 1024 /** only used for sanity checking */ +#define MAX_INTERNALS 8 /** see IMM_x values below */ + +#define AOS_FILE_INTERNAL TGSI_FILE_COUNT + +#define FPU_RND_NEG 1 +#define FPU_RND_NEAREST 2 + +struct aos_machine; +typedef void (PIPE_CDECL *lit_func)( struct aos_machine *, + float *result, + const float *in, + unsigned count ); + +void PIPE_CDECL aos_do_lit( struct aos_machine *machine, + float *result, + const float *in, + unsigned count ); + +struct shine_tab { + float exponent; + float values[258]; + unsigned last_used; +}; + +struct lit_info { + lit_func func; + struct shine_tab *shine_tab; +}; + +#define MAX_SHINE_TAB 4 +#define MAX_LIT_INFO 16 + +struct aos_buffer { + const void *base_ptr; + unsigned stride; + void *ptr; /* updated per vertex */ +}; + + + + +/* This is the temporary storage used by all the aos_sse vs varients. + * Create one per context and reuse by passing a pointer in at + * vs_varient creation?? + */ +struct aos_machine { + float input [MAX_INPUTS ][4]; + float output [MAX_OUTPUTS ][4]; + float temp [MAX_TEMPS ][4]; + float internal [MAX_INTERNALS ][4]; + + float scale[4]; /* viewport */ + float translate[4]; /* viewport */ + + float tmp[2][4]; /* scratch space for LIT */ + + struct shine_tab shine_tab[MAX_SHINE_TAB]; + struct lit_info lit_info[MAX_LIT_INFO]; + unsigned now; + + + ushort fpu_rnd_nearest; + ushort fpu_rnd_neg_inf; + ushort fpu_restore; + ushort fpucntl; /* one of FPU_* above */ + + const float (*immediates)[4]; /* points to shader data */ + const float (*constants)[4]; /* points to draw data */ + + const struct aos_buffer *buffer; /* points to ? */ +}; + + + + +struct aos_compilation { + struct x86_function *func; + struct draw_vs_varient_aos_sse *vaos; + + unsigned insn_counter; + unsigned num_immediates; + unsigned count; + unsigned lit_count; + + struct { + unsigned idx:16; + unsigned file:8; + unsigned dirty:8; + unsigned last_used; + } xmm[8]; + + unsigned x86_reg[2]; /* one of X86_* */ + + boolean input_fetched[PIPE_MAX_ATTRIBS]; + unsigned output_last_write[PIPE_MAX_ATTRIBS]; + + boolean have_sse2; + boolean error; + short fpucntl; + + /* these are actually known values, but putting them in a struct + * like this is helpful to keep them in sync across the file. 
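+    * (machine_EDX always holds the struct aos_machine pointer passed as
+    *  the first argument; see build_vertex_program() in draw_vs_aos.c.)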
+ */ + struct x86_reg tmp_EAX; + struct x86_reg idx_EBX; /* either start+i or &elt[i] */ + struct x86_reg outbuf_ECX; + struct x86_reg machine_EDX; + struct x86_reg count_ESI; /* decrements to zero */ + struct x86_reg temp_EBP; + struct x86_reg stack_ESP; +}; + +struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp ); +void aos_release_xmm_reg( struct aos_compilation *cp, unsigned idx ); + +void aos_adopt_xmm_reg( struct aos_compilation *cp, + struct x86_reg reg, + unsigned file, + unsigned idx, + unsigned dirty ); + +void aos_spill_all( struct aos_compilation *cp ); + +struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, + unsigned file, + unsigned idx ); + +boolean aos_init_inputs( struct aos_compilation *cp, boolean linear ); +boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ); +boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear ); + +boolean aos_emit_outputs( struct aos_compilation *cp ); + + +#define IMM_ONES 0 /* 1, 1,1,1 */ +#define IMM_SWZ 1 /* 1,-1,0, 0xffffffff */ +#define IMM_IDENTITY 2 /* 0, 0,0,1 */ +#define IMM_INV_255 3 /* 1/255, 1/255, 1/255, 1/255 */ +#define IMM_255 4 /* 255, 255, 255, 255 */ +#define IMM_NEGS 5 /* -1,-1,-1,-1 */ +#define IMM_RSQ 6 /* -.5,1.5,_,_ */ +#define IMM_PSIZE 7 /* not really an immediate - updated each run */ + +struct x86_reg aos_get_internal( struct aos_compilation *cp, + unsigned imm ); +struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp, + unsigned imm ); + + +#define ERROR(cp, msg) \ +do { \ + if (0) debug_printf("%s: x86 translation failed: %s\n", __FUNCTION__, msg); \ + cp->error = 1; \ +} while (0) + + +#define X86_NULL 0 +#define X86_IMMEDIATES 1 +#define X86_CONSTANTS 2 +#define X86_BUFFERS 3 + +struct x86_reg aos_get_x86( struct aos_compilation *cp, + unsigned which_reg, + unsigned value ); + + +typedef void (PIPE_CDECL *vaos_run_elts_func)( struct aos_machine *, + const unsigned *elts, + unsigned count, + void *output_buffer); + +typedef void (PIPE_CDECL *vaos_run_linear_func)( struct aos_machine *, + unsigned start, + unsigned count, + void *output_buffer); + + +struct draw_vs_varient_aos_sse { + struct draw_vs_varient base; + struct draw_context *draw; + + struct aos_buffer *buffer; + unsigned nr_vb; + + vaos_run_linear_func gen_run_linear; + vaos_run_elts_func gen_run_elts; + + + struct x86_function func[2]; +}; + + +#endif + +#endif + diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c new file mode 100644 index 0000000000..39f75b50b7 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c @@ -0,0 +1,462 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_exec.h" +#include "draw_vs.h" +#include "draw_vs_aos.h" +#include "draw_vertex.h" + +#include "rtasm/rtasm_x86sse.h" + +#ifdef PIPE_ARCH_X86 + +/* Note - don't yet have to worry about interacting with the code in + * draw_vs_aos.c as there is no intermingling of generated code... + * That may have to change, we'll see. + */ +static void emit_load_R32G32B32A32( struct aos_compilation *cp, + struct x86_reg data, + struct x86_reg src_ptr ) +{ + sse_movups(cp->func, data, src_ptr); +} + +static void emit_load_R32G32B32( struct aos_compilation *cp, + struct x86_reg data, + struct x86_reg src_ptr ) +{ +#if 1 + sse_movss(cp->func, data, x86_make_disp(src_ptr, 8)); + /* data = z ? ? ? */ + sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) ); + /* data = z ? 0 1 */ + sse_shufps(cp->func, data, data, SHUF(Y,Z,X,W) ); + /* data = ? 0 z 1 */ + sse_movlps(cp->func, data, src_ptr); + /* data = x y z 1 */ +#else + sse_movups(cp->func, data, src_ptr); + /* data = x y z ? */ + sse2_pshufd(cp->func, data, data, SHUF(W,X,Y,Z) ); + /* data = ? x y z */ + sse_movss(cp->func, data, aos_get_internal_xmm( cp, IMM_ONES ) ); + /* data = 1 x y z */ + sse2_pshufd(cp->func, data, data, SHUF(Y,Z,W,X) ); + /* data = x y z 1 */ +#endif +} + +static void emit_load_R32G32( struct aos_compilation *cp, + struct x86_reg data, + struct x86_reg src_ptr ) +{ + sse_movups(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) ); + sse_movlps(cp->func, data, src_ptr); +} + + +static void emit_load_R32( struct aos_compilation *cp, + struct x86_reg data, + struct x86_reg src_ptr ) +{ + sse_movss(cp->func, data, src_ptr); + sse_orps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) ); +} + + +static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp, + struct x86_reg data, + struct x86_reg src_ptr ) +{ + sse_movss(cp->func, data, src_ptr); + sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY )); + sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY )); + sse2_cvtdq2ps(cp->func, data, data); + sse_mulps(cp->func, data, aos_get_internal(cp, IMM_INV_255)); +} + + + +/* Extended swizzles? Maybe later. 
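
Side note, not from the patch: the SHUF() macro used by these loads (and by emit_swizzle() just below) is defined in the rtasm SSE headers, which are not part of this hunk. The sketch assumes the conventional shufps immediate layout of two selector bits per destination lane, lowest lane in the lowest bits; X, Y, Z, W are the 0..3 constants from draw_vs_aos.h above.

/* Assumed encoding; see rtasm_x86sse.h for the real definition. */
#define SHUF_SKETCH(x, y, z, w)  (((x) << 0) | ((y) << 2) | ((z) << 4) | ((w) << 6))

/* Example: SHUF(Z,Y,X,W), used in this file to swap the B and R channels,
 * would then be (2<<0) | (1<<2) | (0<<4) | (3<<6) = 0xc6, i.e. "take lane 2,
 * then lane 1, then lane 0, then lane 3".
 */
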
+ */ +static void emit_swizzle( struct aos_compilation *cp, + struct x86_reg dest, + struct x86_reg src, + ubyte shuffle ) +{ + sse_shufps(cp->func, dest, src, shuffle); +} + + + +static boolean get_buffer_ptr( struct aos_compilation *cp, + boolean linear, + unsigned buf_idx, + struct x86_reg elt, + struct x86_reg ptr) +{ + struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), + buf_idx * sizeof(struct aos_buffer)); + + struct x86_reg buf_stride = x86_make_disp(buf, + Offset(struct aos_buffer, stride)); + if (linear) { + struct x86_reg buf_ptr = x86_make_disp(buf, + Offset(struct aos_buffer, ptr)); + + + /* Calculate pointer to current attrib: + */ + x86_mov(cp->func, ptr, buf_ptr); + x86_mov(cp->func, elt, buf_stride); + x86_add(cp->func, elt, ptr); + if (buf_idx == 0) sse_prefetchnta(cp->func, x86_make_disp(elt, 192)); + x86_mov(cp->func, buf_ptr, elt); + } + else { + struct x86_reg buf_base_ptr = x86_make_disp(buf, + Offset(struct aos_buffer, base_ptr)); + + + /* Calculate pointer to current attrib: + */ + x86_mov(cp->func, ptr, buf_stride); + x86_imul(cp->func, ptr, elt); + x86_add(cp->func, ptr, buf_base_ptr); + } + + cp->insn_counter++; + + return TRUE; +} + + +static boolean load_input( struct aos_compilation *cp, + unsigned idx, + struct x86_reg bufptr ) +{ + unsigned format = cp->vaos->base.key.element[idx].in.format; + unsigned offset = cp->vaos->base.key.element[idx].in.offset; + struct x86_reg dataXMM = aos_get_xmm_reg(cp); + + /* Figure out source pointer address: + */ + struct x86_reg src = x86_make_disp(bufptr, offset); + + aos_adopt_xmm_reg( cp, + dataXMM, + TGSI_FILE_INPUT, + idx, + TRUE ); + + switch (format) { + case PIPE_FORMAT_R32_FLOAT: + emit_load_R32(cp, dataXMM, src); + break; + case PIPE_FORMAT_R32G32_FLOAT: + emit_load_R32G32(cp, dataXMM, src); + break; + case PIPE_FORMAT_R32G32B32_FLOAT: + emit_load_R32G32B32(cp, dataXMM, src); + break; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + emit_load_R32G32B32A32(cp, dataXMM, src); + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + emit_load_R8G8B8A8_UNORM(cp, dataXMM, src); + emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W)); + break; + case PIPE_FORMAT_R8G8B8A8_UNORM: + emit_load_R8G8B8A8_UNORM(cp, dataXMM, src); + break; + default: + ERROR(cp, "unhandled input format"); + return FALSE; + } + + return TRUE; +} + +static boolean load_inputs( struct aos_compilation *cp, + unsigned buffer, + struct x86_reg ptr ) +{ + unsigned i; + + for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) { + if (cp->vaos->base.key.element[i].in.buffer == buffer) { + + if (!load_input( cp, i, ptr )) + return FALSE; + + cp->insn_counter++; + } + } + + return TRUE; +} + +boolean aos_init_inputs( struct aos_compilation *cp, boolean linear ) +{ + unsigned i; + for (i = 0; i < cp->vaos->nr_vb; i++) { + struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), + i * sizeof(struct aos_buffer)); + + struct x86_reg buf_base_ptr = x86_make_disp(buf, + Offset(struct aos_buffer, base_ptr)); + + if (cp->vaos->base.key.const_vbuffers & (1<<i)) { + struct x86_reg ptr = cp->tmp_EAX; + + x86_mov(cp->func, ptr, buf_base_ptr); + + /* Load all inputs for this constant vertex buffer + */ + load_inputs( cp, i, x86_deref(ptr) ); + + /* Then just force them out to aos_machine.input[] + */ + aos_spill_all( cp ); + + } + else if (linear) { + + struct x86_reg elt = cp->idx_EBX; + struct x86_reg ptr = cp->tmp_EAX; + + struct x86_reg buf_stride = x86_make_disp(buf, + Offset(struct aos_buffer, stride)); + + struct x86_reg buf_ptr = x86_make_disp(buf, + 
Offset(struct aos_buffer, ptr)); + + + /* Calculate pointer to current attrib: + */ + x86_mov(cp->func, ptr, buf_stride); + x86_imul(cp->func, ptr, elt); + x86_add(cp->func, ptr, buf_base_ptr); + + + /* In the linear case, keep the buffer pointer instead of the + * index number. + */ + if (cp->vaos->nr_vb == 1) + x86_mov( cp->func, elt, ptr ); + else + x86_mov( cp->func, buf_ptr, ptr ); + + cp->insn_counter++; + } + } + + return TRUE; +} + +boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear ) +{ + unsigned j; + + for (j = 0; j < cp->vaos->nr_vb; j++) { + if (cp->vaos->base.key.const_vbuffers & (1<<j)) { + /* just retreive pre-transformed input */ + } + else if (linear && cp->vaos->nr_vb == 1) { + load_inputs( cp, 0, cp->idx_EBX ); + } + else { + struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX); + struct x86_reg ptr = cp->tmp_EAX; + + if (!get_buffer_ptr( cp, linear, j, elt, ptr )) + return FALSE; + + if (!load_inputs( cp, j, ptr )) + return FALSE; + } + } + + return TRUE; +} + +boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear ) +{ + if (linear && cp->vaos->nr_vb == 1) { + struct x86_reg stride = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), + (0 * sizeof(struct aos_buffer) + + Offset(struct aos_buffer, stride))); + + x86_add(cp->func, cp->idx_EBX, stride); + sse_prefetchnta(cp->func, x86_make_disp(cp->idx_EBX, 192)); + } + else if (linear) { + /* Nothing to do */ + } + else { + x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4)); + } + + return TRUE; +} + + + + + + +static void emit_store_R32G32B32A32( struct aos_compilation *cp, + struct x86_reg dst_ptr, + struct x86_reg dataXMM ) +{ + sse_movups(cp->func, dst_ptr, dataXMM); +} + +static void emit_store_R32G32B32( struct aos_compilation *cp, + struct x86_reg dst_ptr, + struct x86_reg dataXMM ) +{ + sse_movlps(cp->func, dst_ptr, dataXMM); + sse_shufps(cp->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! 
destructive */ + sse_movss(cp->func, x86_make_disp(dst_ptr,8), dataXMM); +} + +static void emit_store_R32G32( struct aos_compilation *cp, + struct x86_reg dst_ptr, + struct x86_reg dataXMM ) +{ + sse_movlps(cp->func, dst_ptr, dataXMM); +} + +static void emit_store_R32( struct aos_compilation *cp, + struct x86_reg dst_ptr, + struct x86_reg dataXMM ) +{ + sse_movss(cp->func, dst_ptr, dataXMM); +} + + + +static void emit_store_R8G8B8A8_UNORM( struct aos_compilation *cp, + struct x86_reg dst_ptr, + struct x86_reg dataXMM ) +{ + sse_mulps(cp->func, dataXMM, aos_get_internal(cp, IMM_255)); + sse2_cvtps2dq(cp->func, dataXMM, dataXMM); + sse2_packssdw(cp->func, dataXMM, dataXMM); + sse2_packuswb(cp->func, dataXMM, dataXMM); + sse_movss(cp->func, dst_ptr, dataXMM); +} + + + + + +static boolean emit_output( struct aos_compilation *cp, + struct x86_reg ptr, + struct x86_reg dataXMM, + unsigned format ) +{ + switch (format) { + case EMIT_1F: + case EMIT_1F_PSIZE: + emit_store_R32(cp, ptr, dataXMM); + break; + case EMIT_2F: + emit_store_R32G32(cp, ptr, dataXMM); + break; + case EMIT_3F: + emit_store_R32G32B32(cp, ptr, dataXMM); + break; + case EMIT_4F: + emit_store_R32G32B32A32(cp, ptr, dataXMM); + break; + case EMIT_4UB: + if (1) { + emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W)); + emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); + } + else { + emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM); + } + break; + default: + ERROR(cp, "unhandled output format"); + return FALSE; + } + + return TRUE; +} + + + +boolean aos_emit_outputs( struct aos_compilation *cp ) +{ + unsigned i; + + for (i = 0; i < cp->vaos->base.key.nr_outputs; i++) { + unsigned format = cp->vaos->base.key.element[i].out.format; + unsigned offset = cp->vaos->base.key.element[i].out.offset; + unsigned vs_output = cp->vaos->base.key.element[i].out.vs_output; + + struct x86_reg data; + + if (format == EMIT_1F_PSIZE) { + data = aos_get_internal_xmm( cp, IMM_PSIZE ); + } + else { + data = aos_get_shader_reg( cp, + TGSI_FILE_OUTPUT, + vs_output ); + } + + if (data.file != file_XMM) { + struct x86_reg tmp = aos_get_xmm_reg( cp ); + sse_movaps(cp->func, tmp, data); + data = tmp; + } + + if (!emit_output( cp, + x86_make_disp( cp->outbuf_ECX, offset ), + data, + format )) + return FALSE; + + aos_release_xmm_reg( cp, data.idx ); + + cp->insn_counter++; + } + + return TRUE; +} + +#endif diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c new file mode 100644 index 0000000000..b358bd2df4 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c @@ -0,0 +1,324 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
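
Side note, not from the patch: further down in this file, do_populate_lut() fills a 258-entry table with pow(i/256, exponent) and do_lit_lut() evaluates the specular term of LIT by linear interpolation into that table, falling back to powf() when the exponent changes or the base leaves [0,1]. The helper below is a self-contained model of that lookup; the name and the bare 256 constant are chosen here for illustration (the real table keeps two extra guard entries, which is why indexing k+1 at x == 1.0 stays in bounds).

/* Model of the shine-table lookup: tab[i] is roughly powf(i / 256.0f, exp). */
static float shine_lookup_sketch(const float tab[258], float x)
{
   /* caller guarantees 0.0 <= x <= 1.0, as do_lit_lut() checks first */
   float f    = x * 256.0f;
   int   k    = (int)f;
   float frac = f - (float)k;

   return tab[k] + frac * (tab[k + 1] - tab[k]);
}
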
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "pipe/p_config.h" + + +#include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_exec.h" +#include "draw_vs.h" +#include "draw_vs_aos.h" +#include "draw_vertex.h" + +#ifdef PIPE_ARCH_X86 + +#include "rtasm/rtasm_x86sse.h" + + +#define X87_CW_EXCEPTION_INV_OP (1<<0) +#define X87_CW_EXCEPTION_DENORM_OP (1<<1) +#define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2) +#define X87_CW_EXCEPTION_OVERFLOW (1<<3) +#define X87_CW_EXCEPTION_UNDERFLOW (1<<4) +#define X87_CW_EXCEPTION_PRECISION (1<<5) +#define X87_CW_PRECISION_SINGLE (0<<8) +#define X87_CW_PRECISION_RESERVED (1<<8) +#define X87_CW_PRECISION_DOUBLE (2<<8) +#define X87_CW_PRECISION_DOUBLE_EXT (3<<8) +#define X87_CW_PRECISION_MASK (3<<8) +#define X87_CW_ROUND_NEAREST (0<<10) +#define X87_CW_ROUND_DOWN (1<<10) +#define X87_CW_ROUND_UP (2<<10) +#define X87_CW_ROUND_ZERO (3<<10) +#define X87_CW_ROUND_MASK (3<<10) +#define X87_CW_INFINITY (1<<12) + + +void PIPE_CDECL aos_do_lit( struct aos_machine *machine, + float *result, + const float *in, + unsigned count ) +{ + if (in[0] > 0) + { + if (in[1] <= 0.0) + { + result[0] = 1.0F; + result[1] = in[0]; + result[2] = 1.0; + result[3] = 1.0F; + } + else + { + const float epsilon = 1.0F / 256.0F; + float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon)); + result[0] = 1.0F; + result[1] = in[0]; + result[2] = powf(in[1], exponent); + result[3] = 1.0; + } + } + else + { + result[0] = 1.0F; + result[1] = 0.0; + result[2] = 0.0; + result[3] = 1.0F; + } +} + + +static void PIPE_CDECL do_lit_lut( struct aos_machine *machine, + float *result, + const float *in, + unsigned count ) +{ + if (in[0] > 0) + { + if (in[1] <= 0.0) + { + result[0] = 1.0F; + result[1] = in[0]; + result[2] = 1.0; + result[3] = 1.0F; + return; + } + + if (machine->lit_info[count].shine_tab->exponent != in[3]) { + machine->lit_info[count].func = aos_do_lit; + goto no_luck; + } + + if (in[1] <= 1.0) + { + const float *tab = machine->lit_info[count].shine_tab->values; + float f = in[1] * 256; + int k = (int)f; + float frac = f - (float)k; + + result[0] = 1.0F; + result[1] = in[0]; + result[2] = tab[k] + frac*(tab[k+1]-tab[k]); + result[3] = 1.0; + return; + } + + no_luck: + { + const float epsilon = 1.0F / 256.0F; + float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon)); + result[0] = 1.0F; + result[1] = in[0]; + result[2] = powf(in[1], exponent); + result[3] = 1.0; + } + } + else + { + result[0] = 1.0F; + result[1] = 0.0; + result[2] = 0.0; + result[3] = 1.0F; + } +} + + +static void do_populate_lut( struct shine_tab *tab, + float unclamped_exponent ) +{ + const float epsilon = 1.0F / 256.0F; + float exponent = CLAMP(unclamped_exponent, -(128.0F - epsilon), (128.0F - epsilon)); + unsigned i; + + tab->exponent = unclamped_exponent; /* for later comparison */ + + tab->values[0] = 0; + if (exponent == 0) { + for (i = 1; i < 258; 
i++) { + tab->values[i] = 1.0; + } + } + else { + for (i = 1; i < 258; i++) { + tab->values[i] = powf((float)i * epsilon, exponent); + } + } +} + + + + +static void PIPE_CDECL populate_lut( struct aos_machine *machine, + float *result, + const float *in, + unsigned count ) +{ + unsigned i, tab; + + /* Search for an existing table for this value. Note that without + * static analysis we don't really know if in[3] will be constant, + * but it usually is... + */ + for (tab = 0; tab < 4; tab++) { + if (machine->shine_tab[tab].exponent == in[3]) { + goto found; + } + } + + for (tab = 0, i = 1; i < 4; i++) { + if (machine->shine_tab[i].last_used < machine->shine_tab[tab].last_used) + tab = i; + } + + if (machine->shine_tab[tab].last_used == machine->now) { + /* No unused tables (this is not a ffvertex program...). Just + * call pow each time: + */ + machine->lit_info[count].func = aos_do_lit; + machine->lit_info[count].func( machine, result, in, count ); + return; + } + else { + do_populate_lut( &machine->shine_tab[tab], in[3] ); + } + + found: + machine->shine_tab[tab].last_used = machine->now; + machine->lit_info[count].shine_tab = &machine->shine_tab[tab]; + machine->lit_info[count].func = do_lit_lut; + machine->lit_info[count].func( machine, result, in, count ); +} + + +void draw_vs_aos_machine_constants( struct aos_machine *machine, + const float (*constants)[4] ) +{ + machine->constants = constants; + + { + unsigned i; + for (i = 0; i < MAX_LIT_INFO; i++) { + machine->lit_info[i].func = populate_lut; + machine->now++; + } + } +} + + +void draw_vs_aos_machine_viewport( struct aos_machine *machine, + const struct pipe_viewport_state *viewport ) +{ + memcpy(machine->scale, viewport->scale, 4 * sizeof(float)); + memcpy(machine->translate, viewport->translate, 4 * sizeof(float)); +} + + + +void draw_vs_aos_machine_destroy( struct aos_machine *machine ) +{ + align_free(machine); +} + +struct aos_machine *draw_vs_aos_machine( void ) +{ + struct aos_machine *machine; + unsigned i; + float inv = 1.0f/255.0f; + float f255 = 255.0f; + + machine = align_malloc(sizeof(struct aos_machine), 16); + if (!machine) + return NULL; + + memset(machine, 0, sizeof(*machine)); + + ASSIGN_4V(machine->internal[IMM_SWZ], 1.0f, -1.0f, 0.0f, 1.0f); + *(unsigned *)&machine->internal[IMM_SWZ][3] = 0xffffffff; + + ASSIGN_4V(machine->internal[IMM_ONES], 1.0f, 1.0f, 1.0f, 1.0f); + ASSIGN_4V(machine->internal[IMM_NEGS], -1.0f, -1.0f, -1.0f, -1.0f); + ASSIGN_4V(machine->internal[IMM_IDENTITY], 0.0f, 0.0f, 0.0f, 1.0f); + ASSIGN_4V(machine->internal[IMM_INV_255], inv, inv, inv, inv); + ASSIGN_4V(machine->internal[IMM_255], f255, f255, f255, f255); + ASSIGN_4V(machine->internal[IMM_RSQ], -.5f, 1.5f, 0.0f, 0.0f); + + + machine->fpu_rnd_nearest = (X87_CW_EXCEPTION_INV_OP | + X87_CW_EXCEPTION_DENORM_OP | + X87_CW_EXCEPTION_ZERO_DIVIDE | + X87_CW_EXCEPTION_OVERFLOW | + X87_CW_EXCEPTION_UNDERFLOW | + X87_CW_EXCEPTION_PRECISION | + (1<<6) | + X87_CW_ROUND_NEAREST | + X87_CW_PRECISION_DOUBLE_EXT); + + assert(machine->fpu_rnd_nearest == 0x37f); + + machine->fpu_rnd_neg_inf = (X87_CW_EXCEPTION_INV_OP | + X87_CW_EXCEPTION_DENORM_OP | + X87_CW_EXCEPTION_ZERO_DIVIDE | + X87_CW_EXCEPTION_OVERFLOW | + X87_CW_EXCEPTION_UNDERFLOW | + X87_CW_EXCEPTION_PRECISION | + (1<<6) | + X87_CW_ROUND_DOWN | + X87_CW_PRECISION_DOUBLE_EXT); + + for (i = 0; i < MAX_SHINE_TAB; i++) + do_populate_lut( &machine->shine_tab[i], 1.0f ); + + return machine; +} + +#else + +void draw_vs_aos_machine_viewport( struct aos_machine *machine, + const struct 
pipe_viewport_state *viewport ) +{ +} + +void draw_vs_aos_machine_constants( struct aos_machine *machine, + const float (*constants)[4] ) +{ +} + +void draw_vs_aos_machine_destroy( struct aos_machine *machine ) +{ +} + +struct aos_machine *draw_vs_aos_machine( void ) +{ + return NULL; +} +#endif + diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c new file mode 100644 index 0000000000..b3200df811 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_exec.c @@ -0,0 +1,200 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + * Brian Paul + */ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" + +#include "draw_private.h" +#include "draw_context.h" +#include "draw_vs.h" + +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_scan.h" + + +struct exec_vertex_shader { + struct draw_vertex_shader base; + struct tgsi_exec_machine *machine; +}; + +static struct exec_vertex_shader *exec_vertex_shader( struct draw_vertex_shader *vs ) +{ + return (struct exec_vertex_shader *)vs; +} + + +/* Not required for run_linear. + */ +static void +vs_exec_prepare( struct draw_vertex_shader *shader, + struct draw_context *draw ) +{ + struct exec_vertex_shader *evs = exec_vertex_shader(shader); + + /* Specify the vertex program to interpret/execute. + * Avoid rebinding when possible. + */ + if (evs->machine->Tokens != shader->state.tokens) { + tgsi_exec_machine_bind_shader(evs->machine, + shader->state.tokens, + draw->vs.num_samplers, + draw->vs.samplers); + } +} + + + + +/* Simplified vertex shader interface for the pt paths. Given the + * complexity of code-generating all the above operations together, + * it's time to try doing all the other stuff separately. 
+ */ +static void +vs_exec_run_linear( struct draw_vertex_shader *shader, + const float (*input)[4], + float (*output)[4], + const float (*constants)[4], + unsigned count, + unsigned input_stride, + unsigned output_stride ) +{ + struct exec_vertex_shader *evs = exec_vertex_shader(shader); + struct tgsi_exec_machine *machine = evs->machine; + unsigned int i, j; + unsigned slot; + + machine->Consts = constants; + + for (i = 0; i < count; i += MAX_TGSI_VERTICES) { + unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i); + + /* Swizzle inputs. + */ + for (j = 0; j < max_vertices; j++) { +#if 0 + debug_printf("%d) Input vert:\n", i + j); + for (slot = 0; slot < shader->info.num_inputs; slot++) { + debug_printf("\t%d: %f %f %f %f\n", slot, + input[slot][0], + input[slot][1], + input[slot][2], + input[slot][3]); + } +#endif + + for (slot = 0; slot < shader->info.num_inputs; slot++) { + machine->Inputs[slot].xyzw[0].f[j] = input[slot][0]; + machine->Inputs[slot].xyzw[1].f[j] = input[slot][1]; + machine->Inputs[slot].xyzw[2].f[j] = input[slot][2]; + machine->Inputs[slot].xyzw[3].f[j] = input[slot][3]; + } + + input = (const float (*)[4])((const char *)input + input_stride); + } + + tgsi_set_exec_mask(machine, + 1, + max_vertices > 1, + max_vertices > 2, + max_vertices > 3); + + /* run interpreter */ + tgsi_exec_machine_run( machine ); + + /* Unswizzle all output results. + */ + for (j = 0; j < max_vertices; j++) { + for (slot = 0; slot < shader->info.num_outputs; slot++) { + output[slot][0] = machine->Outputs[slot].xyzw[0].f[j]; + output[slot][1] = machine->Outputs[slot].xyzw[1].f[j]; + output[slot][2] = machine->Outputs[slot].xyzw[2].f[j]; + output[slot][3] = machine->Outputs[slot].xyzw[3].f[j]; + + } + +#if 0 + debug_printf("%d) Post xform vert:\n", i + j); + for (slot = 0; slot < shader->info.num_outputs; slot++) { + debug_printf("\t%d: %f %f %f %f\n", slot, + output[slot][0], + output[slot][1], + output[slot][2], + output[slot][3]); + } +#endif + + output = (float (*)[4])((char *)output + output_stride); + } + + } +} + + + + +static void +vs_exec_delete( struct draw_vertex_shader *dvs ) +{ + FREE((void*) dvs->state.tokens); + FREE( dvs ); +} + + +struct draw_vertex_shader * +draw_create_vs_exec(struct draw_context *draw, + const struct pipe_shader_state *state) +{ + struct exec_vertex_shader *vs = CALLOC_STRUCT( exec_vertex_shader ); + + if (vs == NULL) + return NULL; + + /* we make a private copy of the tokens */ + vs->base.state.tokens = tgsi_dup_tokens(state->tokens); + if (!vs->base.state.tokens) { + FREE(vs); + return NULL; + } + + tgsi_scan_shader(state->tokens, &vs->base.info); + + vs->base.draw = draw; + vs->base.prepare = vs_exec_prepare; + vs->base.run_linear = vs_exec_run_linear; + vs->base.delete = vs_exec_delete; + vs->base.create_varient = draw_vs_varient_generic; + vs->machine = &draw->vs.machine; + + return &vs->base; +} diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c new file mode 100644 index 0000000000..727977bc3a --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c @@ -0,0 +1,161 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
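
Side note, not from the patch: vs_exec_run_linear() above shuttles vertices between the array-of-structures layout produced by vertex fetch and the structure-of-arrays layout the TGSI machine wants, walking the input with a byte stride (the PPC path in draw_vs_ppc.c later in this diff does the same in plain C). Below is a minimal stand-alone model of that packing step, with illustrative names:

/* soa[attrib][chan][vert] <- up to four AoS vertices, each consisting of
 * 'num_inputs' float[4] attributes, with consecutive vertices spaced
 * 'input_stride' bytes apart.
 */
static void pack_soa_sketch(float soa[][4][4],
                            const float (*aos)[4],
                            unsigned num_inputs,
                            unsigned input_stride,
                            unsigned num_verts)
{
   unsigned v, a, c;

   for (v = 0; v < num_verts; v++) {
      for (a = 0; a < num_inputs; a++) {
         for (c = 0; c < 4; c++)
            soa[a][c][v] = aos[a][c];
      }
      aos = (const float (*)[4])((const char *)aos + input_stride);
   }
}
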
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Zack Rusin + * Keith Whitwell <keith@tungstengraphics.com> + * Brian Paul + */ + +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "draw_private.h" +#include "draw_context.h" +#include "draw_vs.h" + +#include "tgsi/tgsi_parse.h" + +#ifdef MESA_LLVM + +#include "gallivm/gallivm.h" + +struct draw_llvm_vertex_shader { + struct draw_vertex_shader base; + struct gallivm_prog *llvm_prog; + struct tgsi_exec_machine *machine; +}; + + +static void +vs_llvm_prepare( struct draw_vertex_shader *base, + struct draw_context *draw ) +{ +} + + + + +static void +vs_llvm_run_linear( struct draw_vertex_shader *base, + const float (*input)[4], + float (*output)[4], + const float (*constants)[4], + unsigned count, + unsigned input_stride, + unsigned output_stride ) +{ + struct draw_llvm_vertex_shader *shader = + (struct draw_llvm_vertex_shader *)base; + + gallivm_cpu_vs_exec(shader->llvm_prog, shader->machine, + input, base->info.num_inputs, output, base->info.num_outputs, + constants, count, input_stride, output_stride); +} + + + +static void +vs_llvm_delete( struct draw_vertex_shader *base ) +{ + struct draw_llvm_vertex_shader *shader = + (struct draw_llvm_vertex_shader *)base; + + /* Do something to free compiled shader: + */ + + FREE( (void*) shader->base.state.tokens ); + FREE( shader ); +} + + + + +struct draw_vertex_shader * +draw_create_vs_llvm(struct draw_context *draw, + const struct pipe_shader_state *templ) +{ + struct draw_llvm_vertex_shader *vs; + + vs = CALLOC_STRUCT( draw_llvm_vertex_shader ); + if (vs == NULL) + return NULL; + + /* we make a private copy of the tokens */ + vs->base.state.tokens = tgsi_dup_tokens(templ->tokens); + if (!vs->base.state.tokens) { + FREE(vs); + return NULL; + } + + tgsi_scan_shader(vs->base.state.tokens, &vs->base.info); + + vs->base.draw = draw; + vs->base.prepare = vs_llvm_prepare; + vs->base.create_varient = draw_vs_varient_generic; + vs->base.run_linear = vs_llvm_run_linear; + vs->base.delete = vs_llvm_delete; + vs->machine = &draw->vs.machine; + + { + struct gallivm_ir *ir = gallivm_ir_new(GALLIVM_VS); + gallivm_ir_set_layout(ir, GALLIVM_SOA); + gallivm_ir_set_components(ir, 4); + gallivm_ir_fill_from_tgsi(ir, vs->base.state.tokens); + vs->llvm_prog = gallivm_ir_compile(ir); + gallivm_ir_delete(ir); + } + + 
draw->vs.engine = gallivm_global_cpu_engine(); + + /* XXX: Why are there two versions of this? Shouldn't creating the + * engine be a separate operation to compiling a shader? + */ + if (!draw->vs.engine) { + draw->vs.engine = gallivm_cpu_engine_create(vs->llvm_prog); + } + else { + gallivm_cpu_jit_compile(draw->vs.engine, vs->llvm_prog); + } + + return &vs->base; +} + + + + + +#else + +struct draw_vertex_shader * +draw_create_vs_llvm(struct draw_context *draw, + const struct pipe_shader_state *shader) +{ + return NULL; +} + +#endif diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c new file mode 100644 index 0000000000..8b75136144 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c @@ -0,0 +1,239 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + * Brian Paul + */ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_config.h" + +#include "draw_vs.h" + +#if defined(PIPE_ARCH_PPC) + +#include "pipe/p_shader_tokens.h" + +#include "draw_private.h" +#include "draw_context.h" + +#include "rtasm/rtasm_cpu.h" +#include "rtasm/rtasm_ppc.h" +#include "tgsi/tgsi_ppc.h" +#include "tgsi/tgsi_parse.h" + + + +typedef void (PIPE_CDECL *codegen_function) (float (*inputs)[4][4], + float (*outputs)[4][4], + float (*temps)[4][4], + float (*immeds)[4], + float (*consts)[4], + const float *builtins); + + +struct draw_ppc_vertex_shader { + struct draw_vertex_shader base; + struct ppc_function ppc_program; + + codegen_function func; +}; + + +static void +vs_ppc_prepare( struct draw_vertex_shader *base, + struct draw_context *draw ) +{ + /* nothing */ +} + + +/** + * Simplified vertex shader interface for the pt paths. Given the + * complexity of code-generating all the above operations together, + * it's time to try doing all the other stuff separately. 
+ */ +static void +vs_ppc_run_linear( struct draw_vertex_shader *base, + const float (*input)[4], + float (*output)[4], + const float (*constants)[4], + unsigned count, + unsigned input_stride, + unsigned output_stride ) +{ + struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base; + unsigned int i; + +#define MAX_VERTICES 4 + + /* loop over verts */ + for (i = 0; i < count; i += MAX_VERTICES) { + const uint max_vertices = MIN2(MAX_VERTICES, count - i); + float inputs_soa[PIPE_MAX_SHADER_INPUTS][4][4] ALIGN16_ATTRIB; + float outputs_soa[PIPE_MAX_SHADER_OUTPUTS][4][4] ALIGN16_ATTRIB; + float temps_soa[TGSI_EXEC_NUM_TEMPS][4][4] ALIGN16_ATTRIB; + uint attr; + + /* convert (up to) four input verts to SoA format */ + for (attr = 0; attr < base->info.num_inputs; attr++) { + const float *vIn = (const float *) input; + uint vert; + for (vert = 0; vert < max_vertices; vert++) { +#if 0 + if (attr==0) + printf("Input v%d a%d: %f %f %f %f\n", + vert, attr, vIn[0], vIn[1], vIn[2], vIn[3]); +#endif + inputs_soa[attr][0][vert] = vIn[attr * 4 + 0]; + inputs_soa[attr][1][vert] = vIn[attr * 4 + 1]; + inputs_soa[attr][2][vert] = vIn[attr * 4 + 2]; + inputs_soa[attr][3][vert] = vIn[attr * 4 + 3]; + vIn += input_stride / 4; + } + } + + /* run compiled shader + */ + shader->func(inputs_soa, outputs_soa, temps_soa, + (float (*)[4]) shader->base.immediates, + (float (*)[4]) constants, + ppc_builtin_constants); + + /* convert (up to) four output verts from SoA back to AoS format */ + for (attr = 0; attr < base->info.num_outputs; attr++) { + float *vOut = (float *) output; + uint vert; + for (vert = 0; vert < max_vertices; vert++) { + vOut[attr * 4 + 0] = outputs_soa[attr][0][vert]; + vOut[attr * 4 + 1] = outputs_soa[attr][1][vert]; + vOut[attr * 4 + 2] = outputs_soa[attr][2][vert]; + vOut[attr * 4 + 3] = outputs_soa[attr][3][vert]; +#if 0 + if (attr==0) + printf("Output v%d a%d: %f %f %f %f\n", + vert, attr, vOut[0], vOut[1], vOut[2], vOut[3]); +#endif + vOut += output_stride / 4; + } + } + + /* advance to next group of four input/output verts */ + input = (const float (*)[4])((const char *)input + input_stride * max_vertices); + output = (float (*)[4])((char *)output + output_stride * max_vertices); + } +} + + +static void +vs_ppc_delete( struct draw_vertex_shader *base ) +{ + struct draw_ppc_vertex_shader *shader = (struct draw_ppc_vertex_shader *)base; + + ppc_release_func( &shader->ppc_program ); + + align_free( (void *) shader->base.immediates ); + + FREE( (void*) shader->base.state.tokens ); + FREE( shader ); +} + + +struct draw_vertex_shader * +draw_create_vs_ppc(struct draw_context *draw, + const struct pipe_shader_state *templ) +{ + struct draw_ppc_vertex_shader *vs; + + vs = CALLOC_STRUCT( draw_ppc_vertex_shader ); + if (vs == NULL) + return NULL; + + /* we make a private copy of the tokens */ + vs->base.state.tokens = tgsi_dup_tokens(templ->tokens); + if (!vs->base.state.tokens) + goto fail; + + tgsi_scan_shader(templ->tokens, &vs->base.info); + + vs->base.draw = draw; +#if 0 + if (1) + vs->base.create_varient = draw_vs_varient_aos_ppc; + else +#endif + vs->base.create_varient = draw_vs_varient_generic; + vs->base.prepare = vs_ppc_prepare; + vs->base.run_linear = vs_ppc_run_linear; + vs->base.delete = vs_ppc_delete; + + vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 * + sizeof(float), 16); + + ppc_init_func( &vs->ppc_program ); + + if (!tgsi_emit_ppc( (struct tgsi_token *) vs->base.state.tokens, + &vs->ppc_program, + (float (*)[4]) vs->base.immediates, + TRUE )) + 
goto fail; + + vs->func = (codegen_function) ppc_get_func( &vs->ppc_program ); + if (!vs->func) { + goto fail; + } + + return &vs->base; + +fail: + /* + debug_error("tgsi_emit_ppc() failed, falling back to interpreter\n"); + */ + + ppc_release_func( &vs->ppc_program ); + + FREE(vs); + return NULL; +} + + + +#else /* PIPE_ARCH_PPC */ + + +struct draw_vertex_shader * +draw_create_vs_ppc( struct draw_context *draw, + const struct pipe_shader_state *templ ) +{ + return (void *) 0; +} + + +#endif /* PIPE_ARCH_PPC */ diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c new file mode 100644 index 0000000000..77ba5152f9 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_sse.c @@ -0,0 +1,226 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + * Brian Paul + */ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_config.h" + +#include "draw_vs.h" + +#if defined(PIPE_ARCH_X86) + +#include "pipe/p_shader_tokens.h" + +#include "draw_private.h" +#include "draw_context.h" + +#include "rtasm/rtasm_cpu.h" +#include "rtasm/rtasm_x86sse.h" +#include "tgsi/tgsi_sse2.h" +#include "tgsi/tgsi_parse.h" + +#define SSE_MAX_VERTICES 4 + +typedef void (PIPE_CDECL *codegen_function) ( + const struct tgsi_exec_vector *input, /* 1 */ + struct tgsi_exec_vector *output, /* 2 */ + float (*constant)[4], /* 3 */ + struct tgsi_exec_vector *temporary, /* 4 */ + float (*immediates)[4], /* 5 */ + const float (*aos_input)[4], /* 6 */ + uint num_inputs, /* 7 */ + uint input_stride, /* 8 */ + float (*aos_output)[4], /* 9 */ + uint num_outputs, /* 10 */ + uint output_stride ); /* 11 */ + +struct draw_sse_vertex_shader { + struct draw_vertex_shader base; + struct x86_function sse2_program; + + codegen_function func; + + struct tgsi_exec_machine *machine; +}; + + +static void +vs_sse_prepare( struct draw_vertex_shader *base, + struct draw_context *draw ) +{ +} + + + +/* Simplified vertex shader interface for the pt paths. Given the + * complexity of code-generating all the above operations together, + * it's time to try doing all the other stuff separately. 
+ */ +static void +vs_sse_run_linear( struct draw_vertex_shader *base, + const float (*input)[4], + float (*output)[4], + const float (*constants)[4], + unsigned count, + unsigned input_stride, + unsigned output_stride ) +{ + struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base; + struct tgsi_exec_machine *machine = shader->machine; + unsigned int i; + + /* By default, execute all channels. XXX move this inside the loop + * below when we support shader conditionals/loops. + */ + tgsi_set_exec_mask(machine, 1, 1, 1, 1); + + for (i = 0; i < count; i += MAX_TGSI_VERTICES) { + unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i); + + if (max_vertices < 4) { + /* disable the unused execution channels */ + tgsi_set_exec_mask(machine, + 1, + max_vertices > 1, + max_vertices > 2, + 0); + } + + /* run compiled shader + */ + shader->func(machine->Inputs, + machine->Outputs, + (float (*)[4])constants, + machine->Temps, + (float (*)[4])shader->base.immediates, + input, + base->info.num_inputs, + input_stride, + output, + base->info.num_outputs, + output_stride ); + + input = (const float (*)[4])((const char *)input + input_stride * max_vertices); + output = (float (*)[4])((char *)output + output_stride * max_vertices); + } +} + + + + +static void +vs_sse_delete( struct draw_vertex_shader *base ) +{ + struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base; + + x86_release_func( &shader->sse2_program ); + + align_free( (void *) shader->base.immediates ); + + FREE( (void*) shader->base.state.tokens ); + FREE( shader ); +} + + +struct draw_vertex_shader * +draw_create_vs_sse(struct draw_context *draw, + const struct pipe_shader_state *templ) +{ + struct draw_sse_vertex_shader *vs; + + if (!rtasm_cpu_has_sse2()) + return NULL; + + vs = CALLOC_STRUCT( draw_sse_vertex_shader ); + if (vs == NULL) + return NULL; + + /* we make a private copy of the tokens */ + vs->base.state.tokens = tgsi_dup_tokens(templ->tokens); + if (!vs->base.state.tokens) + goto fail; + + tgsi_scan_shader(templ->tokens, &vs->base.info); + + vs->base.draw = draw; + if (1) + vs->base.create_varient = draw_vs_varient_aos_sse; + else + vs->base.create_varient = draw_vs_varient_generic; + vs->base.prepare = vs_sse_prepare; + vs->base.run_linear = vs_sse_run_linear; + vs->base.delete = vs_sse_delete; + + vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 * + sizeof(float), 16); + + vs->machine = &draw->vs.machine; + + x86_init_func( &vs->sse2_program ); + + if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state.tokens, + &vs->sse2_program, + (float (*)[4])vs->base.immediates, + TRUE )) + goto fail; + + vs->func = (codegen_function) x86_get_func( &vs->sse2_program ); + if (!vs->func) { + goto fail; + } + + return &vs->base; + +fail: + debug_error("tgsi_emit_sse2() failed, falling back to interpreter\n"); + + x86_release_func( &vs->sse2_program ); + + FREE(vs); + return NULL; +} + + + +#else + +struct draw_vertex_shader * +draw_create_vs_sse( struct draw_context *draw, + const struct pipe_shader_state *templ ) +{ + return (void *) 0; +} + + +#endif + diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c new file mode 100644 index 0000000000..7ee567d478 --- /dev/null +++ b/src/gallium/auxiliary/draw/draw_vs_varient.c @@ -0,0 +1,322 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
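
Side note, not from the patch: draw_create_vs_sse() above, like the PPC and LLVM constructors earlier in this diff and draw_create_vs_exec(), returns NULL whenever its requirements are not met (no SSE2, codegen failure, backend not compiled in). The caller in draw_vs.c is therefore assumed to chain them roughly as sketched below; this illustrates the fallback idea and is not the actual creation function.

/* Prototypes assumed to come from draw_vs.h. */
static struct draw_vertex_shader *
create_vs_with_fallback_sketch(struct draw_context *draw,
                               const struct pipe_shader_state *state)
{
   struct draw_vertex_shader *vs;

   vs = draw_create_vs_llvm(draw, state);    /* only when MESA_LLVM is built   */
   if (!vs)
      vs = draw_create_vs_sse(draw, state);  /* needs SSE2 and codegen success */
   if (!vs)
      vs = draw_create_vs_ppc(draw, state);  /* PIPE_ARCH_PPC only             */
   if (!vs)
      vs = draw_create_vs_exec(draw, state); /* interpreter, always available  */
   return vs;
}
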
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "util/u_memory.h" +#include "util/u_math.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_vbuf.h" +#include "draw/draw_vertex.h" +#include "draw/draw_vs.h" +#include "translate/translate.h" +#include "translate/translate_cache.h" + +/* A first pass at incorporating vertex fetch/emit functionality into + */ +struct draw_vs_varient_generic { + struct draw_vs_varient base; + + struct draw_vertex_shader *shader; + struct draw_context *draw; + + /* Basic plan is to run these two translate functions before/after + * the vertex shader's existing run_linear() routine to simulate + * the inclusion of this functionality into the shader... + * + * Next will look at actually including it. 
+ */ + struct translate *fetch; + struct translate *emit; + + unsigned temp_vertex_stride; +}; + + + + + +static void vsvg_set_buffer( struct draw_vs_varient *varient, + unsigned buffer, + const void *ptr, + unsigned stride ) +{ + struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient; + + vsvg->fetch->set_buffer(vsvg->fetch, + buffer, + ptr, + stride); +} + + +/* Mainly for debug at this stage: + */ +static void do_rhw_viewport( struct draw_vs_varient_generic *vsvg, + unsigned count, + void *output_buffer ) +{ + char *ptr = (char *)output_buffer; + const float *scale = vsvg->base.vs->draw->viewport.scale; + const float *trans = vsvg->base.vs->draw->viewport.translate; + unsigned stride = vsvg->temp_vertex_stride; + unsigned j; + + ptr += vsvg->base.vs->position_output * 4 * sizeof(float); + + for (j = 0; j < count; j++, ptr += stride) { + float *data = (float *)ptr; + float w = 1.0f / data[3]; + + data[0] = data[0] * w * scale[0] + trans[0]; + data[1] = data[1] * w * scale[1] + trans[1]; + data[2] = data[2] * w * scale[2] + trans[2]; + data[3] = w; + } +} + +static void do_viewport( struct draw_vs_varient_generic *vsvg, + unsigned count, + void *output_buffer ) +{ + char *ptr = (char *)output_buffer; + const float *scale = vsvg->base.vs->draw->viewport.scale; + const float *trans = vsvg->base.vs->draw->viewport.translate; + unsigned stride = vsvg->temp_vertex_stride; + unsigned j; + + ptr += vsvg->base.vs->position_output * 4 * sizeof(float); + + for (j = 0; j < count; j++, ptr += stride) { + float *data = (float *)ptr; + + data[0] = data[0] * scale[0] + trans[0]; + data[1] = data[1] * scale[1] + trans[1]; + data[2] = data[2] * scale[2] + trans[2]; + } +} + + +static void PIPE_CDECL vsvg_run_elts( struct draw_vs_varient *varient, + const unsigned *elts, + unsigned count, + void *output_buffer) +{ + struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient; + unsigned temp_vertex_stride = vsvg->temp_vertex_stride; + void *temp_buffer = MALLOC( align(count,4) * temp_vertex_stride ); + + if (0) debug_printf("%s %d \n", __FUNCTION__, count); + + /* Want to do this in small batches for cache locality? + */ + + vsvg->fetch->run_elts( vsvg->fetch, + elts, + count, + temp_buffer ); + + vsvg->base.vs->run_linear( vsvg->base.vs, + temp_buffer, + temp_buffer, + (const float (*)[4])vsvg->base.vs->draw->pt.user.constants, + count, + temp_vertex_stride, + temp_vertex_stride); + + + if (vsvg->base.key.clip) { + /* not really handling clipping, just do the rhw so we can + * see the results... 
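
Side note, not from the patch: per vertex, do_rhw_viewport() above performs the divide by w and the viewport transform in place, and stores 1/w in the w slot for later perspective-correct use. The same math written out for a single position, with an illustrative name:

static void rhw_viewport_sketch(float pos[4],
                                const float scale[4],
                                const float trans[4])
{
   const float inv_w = 1.0f / pos[3];

   pos[0] = pos[0] * inv_w * scale[0] + trans[0];
   pos[1] = pos[1] * inv_w * scale[1] + trans[1];
   pos[2] = pos[2] * inv_w * scale[2] + trans[2];
   pos[3] = inv_w;   /* keep the reciprocal w, as the real helper does */
}
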
+ */ + do_rhw_viewport( vsvg, + count, + temp_buffer ); + } + else if (vsvg->base.key.viewport) { + do_viewport( vsvg, + count, + temp_buffer ); + } + + + vsvg->emit->set_buffer( vsvg->emit, + 0, + temp_buffer, + temp_vertex_stride ); + + vsvg->emit->set_buffer( vsvg->emit, + 1, + &vsvg->draw->rasterizer->point_size, + 0); + + vsvg->emit->run( vsvg->emit, + 0, count, + output_buffer ); + + FREE(temp_buffer); +} + + +static void PIPE_CDECL vsvg_run_linear( struct draw_vs_varient *varient, + unsigned start, + unsigned count, + void *output_buffer ) +{ + struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient; + unsigned temp_vertex_stride = vsvg->temp_vertex_stride; + void *temp_buffer = MALLOC( align(count,4) * temp_vertex_stride ); + + if (0) debug_printf("%s %d %d (sz %d, %d)\n", __FUNCTION__, start, count, + vsvg->base.key.output_stride, + temp_vertex_stride); + + vsvg->fetch->run( vsvg->fetch, + start, + count, + temp_buffer ); + + vsvg->base.vs->run_linear( vsvg->base.vs, + temp_buffer, + temp_buffer, + (const float (*)[4])vsvg->base.vs->draw->pt.user.constants, + count, + temp_vertex_stride, + temp_vertex_stride); + + if (vsvg->base.key.clip) { + /* not really handling clipping, just do the rhw so we can + * see the results... + */ + do_rhw_viewport( vsvg, + count, + temp_buffer ); + } + else if (vsvg->base.key.viewport) { + do_viewport( vsvg, + count, + temp_buffer ); + } + + vsvg->emit->set_buffer( vsvg->emit, + 0, + temp_buffer, + temp_vertex_stride ); + + vsvg->emit->set_buffer( vsvg->emit, + 1, + &vsvg->draw->rasterizer->point_size, + 0); + + vsvg->emit->run( vsvg->emit, + 0, count, + output_buffer ); + + FREE(temp_buffer); +} + + + + + +static void vsvg_destroy( struct draw_vs_varient *varient ) +{ + FREE(varient); +} + + +struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs, + const struct draw_vs_varient_key *key ) +{ + unsigned i; + struct translate_key fetch, emit; + + struct draw_vs_varient_generic *vsvg = CALLOC_STRUCT( draw_vs_varient_generic ); + if (vsvg == NULL) + return NULL; + + vsvg->base.key = *key; + vsvg->base.vs = vs; + vsvg->base.set_buffer = vsvg_set_buffer; + vsvg->base.run_elts = vsvg_run_elts; + vsvg->base.run_linear = vsvg_run_linear; + vsvg->base.destroy = vsvg_destroy; + + vsvg->draw = vs->draw; + + vsvg->temp_vertex_stride = MAX2(key->nr_inputs, + vsvg->base.vs->info.num_outputs) * 4 * sizeof(float); + + /* Build free-standing fetch and emit functions: + */ + fetch.nr_elements = key->nr_inputs; + fetch.output_stride = vsvg->temp_vertex_stride; + for (i = 0; i < key->nr_inputs; i++) { + fetch.element[i].input_format = key->element[i].in.format; + fetch.element[i].input_buffer = key->element[i].in.buffer; + fetch.element[i].input_offset = key->element[i].in.offset; + fetch.element[i].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT; + fetch.element[i].output_offset = i * 4 * sizeof(float); + assert(fetch.element[i].output_offset < fetch.output_stride); + } + + + emit.nr_elements = key->nr_outputs; + emit.output_stride = key->output_stride; + for (i = 0; i < key->nr_outputs; i++) { + if (key->element[i].out.format != EMIT_1F_PSIZE) + { + emit.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT; + emit.element[i].input_buffer = 0; + emit.element[i].input_offset = key->element[i].out.vs_output * 4 * sizeof(float); + emit.element[i].output_format = draw_translate_vinfo_format(key->element[i].out.format); + emit.element[i].output_offset = key->element[i].out.offset; + assert(emit.element[i].input_offset <= 
fetch.output_stride); + } + else { + emit.element[i].input_format = PIPE_FORMAT_R32_FLOAT; + emit.element[i].input_buffer = 1; + emit.element[i].input_offset = 0; + emit.element[i].output_format = PIPE_FORMAT_R32_FLOAT; + emit.element[i].output_offset = key->element[i].out.offset; + } + } + + vsvg->fetch = draw_vs_get_fetch( vs->draw, &fetch ); + vsvg->emit = draw_vs_get_emit( vs->draw, &emit ); + + return &vsvg->base; +} + + + + + diff --git a/src/gallium/auxiliary/gallivm/Makefile b/src/gallium/auxiliary/gallivm/Makefile new file mode 100644 index 0000000000..c3f7bfba93 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/Makefile @@ -0,0 +1,92 @@ +# -*-makefile-*- +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = gallivm + + +GALLIVM_SOURCES = \ + gallivm.cpp \ + gallivm_cpu.cpp \ + instructions.cpp \ + loweringpass.cpp \ + tgsitollvm.cpp \ + storage.cpp \ + storagesoa.cpp \ + instructionssoa.cpp + +INC_SOURCES = gallivm_builtins.cpp gallivmsoabuiltins.cpp + +CPP_SOURCES = \ + $(GALLIVM_SOURCES) + +C_SOURCES = +ASM_SOURCES = + +OBJECTS = $(C_SOURCES:.c=.o) \ + $(CPP_SOURCES:.cpp=.o) \ + $(ASM_SOURCES:.S=.o) + +### Include directories +INCLUDES = \ + -I. \ + -I$(TOP)/src/gallium/drivers \ + -I$(TOP)/src/gallium/auxiliary \ + -I$(TOP)/src/gallium/include \ + -I$(TOP)/src/mesa \ + -I$(TOP)/include + + +##### RULES ##### + +.c.o: + $(CC) -c $(INCLUDES) $(LLVM_CFLAGS) $(CFLAGS) $(DRIVER_DEFINES) $< -o $@ + +.cpp.o: + $(CXX) -c $(INCLUDES) $(LLVM_CXXFLAGS) $(CXXFLAGS) $(DRIVER_DEFINES) $< -o $@ + +.S.o: + $(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES) $< -o $@ + +##### TARGETS ##### + +default:: depend symlinks $(LIBNAME) + + +$(LIBNAME): $(OBJECTS) Makefile + $(TOP)/bin/mklib -o $@ -static $(OBJECTS) + + +depend: $(C_SOURCES) $(CPP_SOURCES) $(ASM_SOURCES) $(INC_SOURCES) + rm -f depend + touch depend + $(MKDEP) $(MKDEP_OPTIONS) $(DRIVER_DEFINES) $(INCLUDES) $(C_SOURCES) $(CPP_SOURCES) \ + $(ASM_SOURCES) $(INC_SOURCES) 2> /dev/null + + +gallivm_builtins.cpp: llvm_builtins.c + clang --emit-llvm < $< |llvm-as|opt -std-compile-opts > temp1.bin + (echo "static const unsigned char llvm_builtins_data[] = {"; od -txC temp1.bin | sed -e "s/^[0-9]*//" -e s"/ \([0-9a-f][0-9a-f]\)/0x\1,/g" -e"\$$d" | sed -e"\$$s/,$$/};/") >$@ + rm temp1.bin + +gallivmsoabuiltins.cpp: soabuiltins.c + clang --emit-llvm < $< |llvm-as|opt -std-compile-opts > temp2.bin + (echo "static const unsigned char soabuiltins_data[] = {"; od -txC temp2.bin | sed -e "s/^[0-9]*//" -e s"/ \([0-9a-f][0-9a-f]\)/0x\1,/g" -e"\$$d" | sed -e"\$$s/,$$/};/") >$@ + rm temp2.bin + +# Emacs tags +tags: + etags `find . 
-name \*.[ch]` `find ../include` + + +# Remove .o and backup files +clean: + -rm -f *.o */*.o *~ *.so *~ server/*.o + -rm -f depend depend.bak + -rm -f gallivm_builtins.cpp + -rm -f gallivmsoabuiltins.cpp + +symlinks: + + +include depend diff --git a/src/gallium/auxiliary/gallivm/SConscript b/src/gallium/auxiliary/gallivm/SConscript new file mode 100644 index 0000000000..c0aa51b90a --- /dev/null +++ b/src/gallium/auxiliary/gallivm/SConscript @@ -0,0 +1,16 @@ +Import('*') + +gallivm = env.ConvenienceLibrary( + target = 'gallivm', + source = [ + 'gallivm.cpp', + 'gallivm_cpu.cpp', + 'instructions.cpp', + 'loweringpass.cpp', + 'tgsitollvm.cpp', + 'storage.cpp', + 'storagesoa.cpp', + 'instructionssoa.cpp', + ]) + +auxiliaries.insert(0, gallivm) diff --git a/src/gallium/auxiliary/gallivm/gallivm.cpp b/src/gallium/auxiliary/gallivm/gallivm.cpp new file mode 100644 index 0000000000..29adeea47d --- /dev/null +++ b/src/gallium/auxiliary/gallivm/gallivm.cpp @@ -0,0 +1,332 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
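
Side note, not from the patch: the gallivm_builtins.cpp and gallivmsoabuiltins.cpp rules in the Makefile above compile the builtin C sources to LLVM bitcode and then hex-dump the result into a C array, so the bitcode travels inside libgallivm and can be re-parsed at runtime (gallivm.cpp below pulls in the MemoryBuffer and bitcode reader headers). The generated file has roughly the shape sketched here; the bytes are placeholders apart from the standard bitcode magic, and the _example suffix is added to avoid confusion with the real symbol emitted by the sed command.

static const unsigned char llvm_builtins_data_example[] = {
   0x42, 0x43, 0xc0, 0xde,   /* the LLVM bitcode magic bytes 'B','C',0xc0,0xde */
   /* ... the od|sed pipeline above emits the remaining bytes ... */
};
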
+ * + **************************************************************************/ + + /* + * Authors: + * Zack Rusin zack@tungstengraphics.com + */ +#ifdef MESA_LLVM + +#include "gallivm.h" +#include "gallivm_p.h" + +#include "instructions.h" +#include "loweringpass.h" +#include "storage.h" +#include "tgsitollvm.h" + +#include "pipe/p_context.h" +#include "pipe/p_shader_tokens.h" + +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_dump.h" + +#include <llvm/Module.h> +#include <llvm/CallingConv.h> +#include <llvm/Constants.h> +#include <llvm/DerivedTypes.h> +#include <llvm/Instructions.h> +#include <llvm/ModuleProvider.h> +#include <llvm/Pass.h> +#include <llvm/PassManager.h> +#include <llvm/ParameterAttributes.h> +#include <llvm/Support/PatternMatch.h> +#include <llvm/ExecutionEngine/JIT.h> +#include <llvm/ExecutionEngine/Interpreter.h> +#include <llvm/ExecutionEngine/GenericValue.h> +#include <llvm/Support/MemoryBuffer.h> +#include <llvm/LinkAllPasses.h> +#include <llvm/Analysis/Verifier.h> +#include <llvm/Analysis/LoopPass.h> +#include <llvm/Target/TargetData.h> +#include <llvm/Bitcode/ReaderWriter.h> +#include <llvm/Transforms/Utils/Cloning.h> + +#include <sstream> +#include <fstream> +#include <iostream> + +static int GLOBAL_ID = 0; + +using namespace llvm; + +static inline +void AddStandardCompilePasses(PassManager &PM) +{ + PM.add(new LoweringPass()); + PM.add(createVerifierPass()); // Verify that input is correct + + PM.add(createLowerSetJmpPass()); // Lower llvm.setjmp/.longjmp + + //PM.add(createStripSymbolsPass(true)); + + PM.add(createRaiseAllocationsPass()); // call %malloc -> malloc inst + PM.add(createCFGSimplificationPass()); // Clean up disgusting code + PM.add(createPromoteMemoryToRegisterPass());// Kill useless allocas + PM.add(createGlobalOptimizerPass()); // Optimize out global vars + PM.add(createGlobalDCEPass()); // Remove unused fns and globs + PM.add(createIPConstantPropagationPass());// IP Constant Propagation + PM.add(createDeadArgEliminationPass()); // Dead argument elimination + PM.add(createInstructionCombiningPass()); // Clean up after IPCP & DAE + PM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE + + PM.add(createPruneEHPass()); // Remove dead EH info + + PM.add(createFunctionInliningPass()); // Inline small functions + PM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args + + PM.add(createTailDuplicationPass()); // Simplify cfg by copying code + PM.add(createInstructionCombiningPass()); // Cleanup for scalarrepl. + PM.add(createCFGSimplificationPass()); // Merge & remove BBs + PM.add(createScalarReplAggregatesPass()); // Break up aggregate allocas + PM.add(createInstructionCombiningPass()); // Combine silly seq's + PM.add(createCondPropagationPass()); // Propagate conditionals + + PM.add(createTailCallEliminationPass()); // Eliminate tail calls + PM.add(createCFGSimplificationPass()); // Merge & remove BBs + PM.add(createReassociatePass()); // Reassociate expressions + PM.add(createLoopRotatePass()); + PM.add(createLICMPass()); // Hoist loop invariants + PM.add(createLoopUnswitchPass()); // Unswitch loops. + PM.add(createLoopIndexSplitPass()); // Index split loops. 
+ PM.add(createInstructionCombiningPass()); // Clean up after LICM/reassoc + PM.add(createIndVarSimplifyPass()); // Canonicalize indvars + PM.add(createLoopUnrollPass()); // Unroll small loops + PM.add(createInstructionCombiningPass()); // Clean up after the unroller + PM.add(createGVNPass()); // Remove redundancies + PM.add(createSCCPPass()); // Constant prop with SCCP + + // Run instcombine after redundancy elimination to exploit opportunities + // opened up by them. + PM.add(createInstructionCombiningPass()); + PM.add(createCondPropagationPass()); // Propagate conditionals + + PM.add(createDeadStoreEliminationPass()); // Delete dead stores + PM.add(createAggressiveDCEPass()); // SSA based 'Aggressive DCE' + PM.add(createCFGSimplificationPass()); // Merge & remove BBs + PM.add(createSimplifyLibCallsPass()); // Library Call Optimizations + PM.add(createDeadTypeEliminationPass()); // Eliminate dead types + PM.add(createConstantMergePass()); // Merge dup global constants +} + +void gallivm_prog_delete(struct gallivm_prog *prog) +{ + delete prog->module; + prog->module = 0; + prog->function = 0; + free(prog); +} + +static inline void +constant_interpolation(float (*inputs)[16][4], + const struct tgsi_interp_coef *coefs, + unsigned attrib, + unsigned chan) +{ + unsigned i; + + for (i = 0; i < QUAD_SIZE; ++i) { + inputs[i][attrib][chan] = coefs[attrib].a0[chan]; + } +} + +static inline void +linear_interpolation(float (*inputs)[16][4], + const struct tgsi_interp_coef *coefs, + unsigned attrib, + unsigned chan) +{ + unsigned i; + + for( i = 0; i < QUAD_SIZE; i++ ) { + const float x = inputs[i][0][0]; + const float y = inputs[i][0][1]; + + inputs[i][attrib][chan] = + coefs[attrib].a0[chan] + + coefs[attrib].dadx[chan] * x + + coefs[attrib].dady[chan] * y; + } +} + +static inline void +perspective_interpolation(float (*inputs)[16][4], + const struct tgsi_interp_coef *coefs, + unsigned attrib, + unsigned chan ) +{ + unsigned i; + + for( i = 0; i < QUAD_SIZE; i++ ) { + const float x = inputs[i][0][0]; + const float y = inputs[i][0][1]; + /* WPOS.w here is really 1/w */ + const float w = 1.0f / inputs[i][0][3]; + assert(inputs[i][0][3] != 0.0); + + inputs[i][attrib][chan] = + (coefs[attrib].a0[chan] + + coefs[attrib].dadx[chan] * x + + coefs[attrib].dady[chan] * y) * w; + } +} + +void gallivm_ir_dump(struct gallivm_ir *ir, const char *file_prefix) +{ + if (!ir || !ir->module) + return; + + if (file_prefix) { + std::ostringstream stream; + stream << file_prefix; + stream << ir->id; + stream << ".ll"; + std::string name = stream.str(); + std::ofstream out(name.c_str()); + if (!out) { + std::cerr<<"Can't open file : "<<stream.str()<<std::endl;; + return; + } + out << (*ir->module); + out.close(); + } else { + const llvm::Module::FunctionListType &funcs = ir->module->getFunctionList(); + llvm::Module::FunctionListType::const_iterator itr; + std::cout<<"; ---------- Start shader "<<ir->id<<std::endl; + for (itr = funcs.begin(); itr != funcs.end(); ++itr) { + const llvm::Function &func = (*itr); + std::string name = func.getName(); + const llvm::Function *found = 0; + if (name.find("vs_shader") != std::string::npos || + name.find("fs_shader") != std::string::npos || + name.find("function") != std::string::npos) + found = &func; + if (found) { + std::cout<<*found<<std::endl; + } + } + std::cout<<"; ---------- End shader "<<ir->id<<std::endl; + } +} + + +void gallivm_prog_inputs_interpolate(struct gallivm_prog *prog, + float (*inputs)[16][4], + const struct tgsi_interp_coef *coef) +{ + for (int i = 0; i < 
prog->num_interp; ++i) { + const gallivm_interpolate &interp = prog->interpolators[i]; + switch (interp.type) { + case TGSI_INTERPOLATE_CONSTANT: + constant_interpolation(inputs, coef, interp.attrib, interp.chan); + break; + + case TGSI_INTERPOLATE_LINEAR: + linear_interpolation(inputs, coef, interp.attrib, interp.chan); + break; + + case TGSI_INTERPOLATE_PERSPECTIVE: + perspective_interpolation(inputs, coef, interp.attrib, interp.chan); + break; + + default: + assert( 0 ); + } + } +} + + +struct gallivm_ir * gallivm_ir_new(enum gallivm_shader_type type) +{ + struct gallivm_ir *ir = + (struct gallivm_ir *)calloc(1, sizeof(struct gallivm_ir)); + ++GLOBAL_ID; + ir->id = GLOBAL_ID; + ir->type = type; + + return ir; +} + +void gallivm_ir_set_layout(struct gallivm_ir *ir, + enum gallivm_vector_layout layout) +{ + ir->layout = layout; +} + +void gallivm_ir_set_components(struct gallivm_ir *ir, int num) +{ + ir->num_components = num; +} + +void gallivm_ir_fill_from_tgsi(struct gallivm_ir *ir, + const struct tgsi_token *tokens) +{ + std::cout << "Creating llvm from: " <<std::endl; + tgsi_dump(tokens, 0); + + llvm::Module *mod = tgsi_to_llvmir(ir, tokens); + ir->module = mod; + gallivm_ir_dump(ir, 0); +} + +void gallivm_ir_delete(struct gallivm_ir *ir) +{ + delete ir->module; + free(ir); +} + +struct gallivm_prog * gallivm_ir_compile(struct gallivm_ir *ir) +{ + struct gallivm_prog *prog = + (struct gallivm_prog *)calloc(1, sizeof(struct gallivm_prog)); + + std::cout << "Before optimizations:"<<std::endl; + ir->module->dump(); + std::cout<<"-------------------------------"<<std::endl; + + PassManager veri; + veri.add(createVerifierPass()); + veri.run(*ir->module); + llvm::Module *mod = llvm::CloneModule(ir->module); + prog->num_consts = ir->num_consts; + memcpy(prog->interpolators, ir->interpolators, sizeof(prog->interpolators)); + prog->num_interp = ir->num_interp; + + /* Run optimization passes over it */ + PassManager passes; + passes.add(new TargetData(mod)); + AddStandardCompilePasses(passes); + passes.run(*mod); + prog->module = mod; + + std::cout << "After optimizations:"<<std::endl; + mod->dump(); + + return prog; +} + +#endif /* MESA_LLVM */ diff --git a/src/gallium/auxiliary/gallivm/gallivm.h b/src/gallium/auxiliary/gallivm/gallivm.h new file mode 100644 index 0000000000..36a64a7747 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/gallivm.h @@ -0,0 +1,118 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Zack Rusin zack@tungstengraphics.com + */ + +#ifndef GALLIVM_H +#define GALLIVM_H + +/* + LLVM representation consists of two stages - layout independent + intermediate representation gallivm_ir and driver specific + gallivm_prog. TGSI is first being translated into gallivm_ir + after that driver can set number of options on gallivm_ir and + have it compiled into gallivm_prog. gallivm_prog can be either + executed (assuming there's LLVM JIT backend for the current + target) or machine code generation can be done (assuming there's + a LLVM code generator for thecurrent target) + */ +#if defined __cplusplus +extern "C" { +#endif + +#include "pipe/p_state.h" + +#ifdef MESA_LLVM + +struct tgsi_token; + +struct gallivm_ir; +struct gallivm_prog; +struct gallivm_cpu_engine; +struct tgsi_interp_coef; +struct tgsi_sampler; +struct tgsi_exec_vector; + +enum gallivm_shader_type { + GALLIVM_VS, + GALLIVM_FS +}; + +enum gallivm_vector_layout { + GALLIVM_AOS, + GALLIVM_SOA +}; + +struct gallivm_ir *gallivm_ir_new(enum gallivm_shader_type type); +void gallivm_ir_set_layout(struct gallivm_ir *ir, + enum gallivm_vector_layout layout); +void gallivm_ir_set_components(struct gallivm_ir *ir, int num); +void gallivm_ir_fill_from_tgsi(struct gallivm_ir *ir, + const struct tgsi_token *tokens); +void gallivm_ir_delete(struct gallivm_ir *ir); + + +struct gallivm_prog *gallivm_ir_compile(struct gallivm_ir *ir); + +void gallivm_prog_inputs_interpolate(struct gallivm_prog *prog, + float (*inputs)[PIPE_MAX_SHADER_INPUTS][4], + const struct tgsi_interp_coef *coefs); +void gallivm_prog_dump(struct gallivm_prog *prog, const char *file_prefix); + + +struct gallivm_cpu_engine *gallivm_cpu_engine_create(struct gallivm_prog *prog); +struct gallivm_cpu_engine *gallivm_global_cpu_engine(); +int gallivm_cpu_vs_exec(struct gallivm_prog *prog, + struct tgsi_exec_machine *machine, + const float (*input)[4], + unsigned num_inputs, + float (*output)[4], + unsigned num_outputs, + const float (*constants)[4], + unsigned count, + unsigned input_stride, + unsigned output_stride); +int gallivm_cpu_fs_exec(struct gallivm_prog *prog, + float x, float y, + float (*dests)[PIPE_MAX_SHADER_INPUTS][4], + float (*inputs)[PIPE_MAX_SHADER_INPUTS][4], + float (*consts)[4], + struct tgsi_sampler *samplers); +void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *ee, struct gallivm_prog *prog); +void gallivm_cpu_engine_delete(struct gallivm_cpu_engine *ee); + + +#endif /* MESA_LLVM */ + +#if defined __cplusplus +} +#endif + +#endif diff --git a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp new file mode 100644 index 0000000000..fcc5c05794 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp @@ -0,0 +1,140 @@ +static const unsigned char llvm_builtins_data[] = { +0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x27,0x02,0x00,0x00,0x01,0x10,0x00,0x00, +0x10,0x00,0x00,0x00,0x07,0x81,0x23,0x91,0x41,0xc8,0x04,0x49,0x06,0x10,0x32,0x39, +0x92,0x01,0x84,0x0c,0x25,0x05,0x08,0x19,0x1e,0x04,0x8b,0x62,0x80,0x14,0x45,0x02, +0x42,0x92,0x0b,0x42,0xa4,0x10,0x32,0x14,0x38,0x08,0x18,0x49,0x0a,0x32,0x44,0x24, 
+0x48,0x0a,0x90,0x21,0x23,0x44,0x72,0x80,0x8c,0x14,0x21,0x86,0x0a,0x8a,0x0a,0x64, +0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x03,0x00,0x00,0x00,0x0b,0x84,0xff,0xff, +0xff,0xff,0x1f,0xc0,0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00, +0x32,0x22,0x48,0x09,0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45, +0xc6,0x05,0x42,0x52,0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83, +0x29,0x80,0x21,0x00,0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47, +0x80,0x50,0x2b,0x03,0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40, +0x14,0x01,0x80,0x11,0x80,0x22,0x88,0x00,0x13,0x30,0x7c,0xc0,0x03,0x3b,0xf8,0x05, +0x3b,0xa0,0x83,0x36,0xa8,0x07,0x77,0x58,0x07,0x77,0x78,0x87,0x7b,0x70,0x87,0x36, +0x60,0x87,0x74,0x70,0x87,0x7a,0xc0,0x87,0x36,0x38,0x07,0x77,0xa8,0x87,0x0d,0xf7, +0x50,0x0e,0x6d,0x00,0x0f,0x7a,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60, +0x07,0x74,0xd0,0x06,0xe9,0x10,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e, +0x78,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06,0xe9,0x10,0x07,0x76,0xa0,0x07,0x71, +0x60,0x07,0x7a,0x10,0x07,0x76,0xd0,0x06,0xe9,0x30,0x07,0x72,0xa0,0x07,0x73,0x20, +0x07,0x7a,0x30,0x07,0x72,0xd0,0x06,0xe9,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07, +0x7a,0x60,0x07,0x74,0xd0,0x06,0xe6,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x7a, +0x30,0x07,0x72,0xd0,0x06,0xe6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60, +0x07,0x74,0xd0,0x06,0xf6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,0x07, +0x74,0xd0,0x06,0xf6,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a, +0x10,0x07,0x72,0x80,0x07,0x6d,0x10,0x0e,0x70,0xa0,0x07,0x70,0xa0,0x07,0x76,0x40, +0x07,0x6d,0x60,0x0e,0x78,0x00,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07, +0x72,0x80,0x07,0x3a,0x0f,0x84,0x48,0x20,0x23,0x24,0x40,0x00,0x62,0x67,0x88,0x9f, +0x19,0x92,0x24,0x00,0x10,0x04,0x00,0x00,0x00,0x43,0x92,0x04,0x08,0x00,0x00,0x00, +0x00,0x60,0x48,0xa2,0x00,0x40,0x10,0x00,0x00,0x00,0x0c,0x49,0x16,0x00,0x08,0x02, +0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x41,0x00,0x00,0x00,0x30,0x24,0x61,0x80,0x00, +0x00,0x00,0x00,0x00,0x86,0x24,0x07,0x10,0x00,0x00,0x00,0x00,0xc0,0x90,0x44,0x01, +0x80,0x20,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12, +0x05,0x00,0x82,0x00,0x00,0x00,0x60,0x48,0x52,0x00,0x40,0x10,0x00,0x00,0x00,0x64, +0x81,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90, +0x8c,0x09,0x26,0x47,0xc6,0x04,0x43,0x8a,0x8a,0x59,0x8b,0x43,0x50,0xd2,0x09,0x02, +0x81,0xd2,0x73,0x50,0xc9,0x0c,0x2a,0x99,0x41,0x25,0x33,0xa8,0x64,0x56,0x28,0x66, +0x2d,0x0e,0x41,0xcf,0x2a,0x15,0x04,0x4a,0xcf,0x41,0x25,0x33,0xa8,0x64,0x06,0x95, +0xcc,0xa0,0x92,0x59,0x01,0x00,0x00,0x00,0x53,0x82,0x26,0x0c,0x04,0x00,0x00,0x00, +0x22,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00, +0x04,0xc6,0x08,0x40,0x10,0x04,0xe1,0x70,0x18,0x23,0x00,0x41,0x10,0x84,0xc3,0x60, +0x04,0x00,0x00,0x00,0xc3,0x0d,0xce,0x43,0x4c,0x37,0x3c,0x8e,0x34,0xdc,0x30,0x41, +0xc2,0x74,0x03,0x34,0x51,0xc3,0x0d,0x4d,0x44,0x4c,0x37,0x44,0x8d,0x35,0x56,0x01, +0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0xd6,0x10,0x00,0xe6, +0x10,0x04,0x76,0x81,0x00,0x3e,0x30,0x0c,0x91,0x4f,0x1b,0x05,0x21,0x30,0x8f,0x6d, +0x13,0x48,0xe0,0x03,0xc3,0x10,0xf9,0xb4,0x55,0x20,0x81,0x0f,0x0c,0x43,0xe4,0xd7, +0x66,0x41,0x08,0xcc,0xa3,0x1f,0x40,0x41,0x34,0x53,0x84,0x99,0xc4,0x20,0x30,0x8f, +0x61,0x10,0x02,0xb0,0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, +0x27,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00, 
+0x24,0x8a,0xa0,0x0c,0x46,0x00,0x4a,0x80,0xc2,0x1c,0x84,0x55,0x55,0xd6,0x1c,0x84, +0x45,0x51,0x16,0x81,0x19,0x80,0x11,0x80,0x31,0x02,0x10,0x04,0x41,0xfc,0x03,0x00, +0x63,0x08,0x0d,0x34,0xdc,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15, +0x8d,0x21,0x34,0xd1,0x18,0x42,0xf3,0x8c,0x55,0x00,0x81,0xa0,0x6d,0x73,0x0c,0x19, +0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x10,0x00,0x00,0x00,0x27,0x50,0x20,0x05, +0xd1,0x0c,0x17,0x60,0x20,0xc5,0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d, +0x14,0x13,0xf3,0xd4,0xb8,0x69,0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4, +0xba,0x35,0x0c,0x13,0xf3,0xd8,0x05,0x31,0x31,0x8f,0x6e,0x1c,0x84,0x00,0x2c,0xcb, +0x01,0x14,0x44,0x33,0x45,0x98,0x61,0x0c,0x02,0xf3,0x00,0x00,0x00,0x00,0x00,0x00, +0x61,0x20,0x00,0x00,0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00, +0x04,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91, +0x11,0x00,0x00,0x00,0x63,0x08,0x4d,0x64,0x16,0xc1,0xe1,0x86,0xab,0x22,0x66,0x19, +0x02,0x01,0x1b,0x43,0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81, +0x0a,0x20,0x0b,0x34,0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x70, +0x83,0x57,0x11,0xb3,0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0, +0x48,0xb3,0x04,0xc6,0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48, +0x63,0x08,0xcd,0x64,0x64,0x40,0x70,0xb8,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc, +0x60,0x0c,0xc1,0x99,0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80, +0x33,0x38,0xd0,0x00,0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80, +0xe0,0x70,0x03,0x1b,0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12, +0x30,0x63,0x08,0x0f,0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7, +0xa0,0x06,0x70,0x00,0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00, +0x76,0x52,0x4c,0xcc,0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46, +0xc6,0x50,0x8a,0x89,0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89, +0x79,0x68,0x73,0x20,0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62, +0x9e,0xdb,0x32,0x88,0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98, +0xa7,0xb7,0x95,0x62,0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a, +0x34,0x35,0x56,0x62,0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25, +0x2c,0x82,0xd3,0x0c,0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7, +0x24,0x01,0x63,0xec,0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9, +0xfc,0xc4,0xd0,0x90,0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6, +0xc1,0x71,0x7b,0x29,0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01, +0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08, +0x4e,0x33,0x58,0x47,0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e, +0x33,0xe1,0xbc,0xa5,0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e, +0x3a,0x40,0xc6,0xde,0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee, +0x6f,0x20,0x11,0x9c,0x66,0xf0,0x05,0xa7,0x99,0xec,0x82,0x10,0x9c,0xa6,0x32,0x93, +0x42,0x60,0x1e,0x7b,0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a, +0xa7,0x6d,0xa4,0x98,0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10, +0x9c,0x66,0xc0,0x7b,0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06, +0x8b,0xe0,0x34,0x83,0x2f,0x38,0xcd,0x64,0xd3,0x07,0x50,0x10,0xcd,0x14,0x61,0xe6, +0x61,0x08,0x4e,0x53,0xd5,0x36,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, +0x4a,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00, 
+0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10, +0x1e,0xe1,0x19,0xc6,0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63, +0x15,0xc1,0x31,0x84,0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1, +0x71,0x6c,0x23,0x38,0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48, +0x35,0xc7,0x20,0x79,0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58, +0x04,0x10,0x20,0xd5,0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7, +0x30,0x06,0x64,0xe0,0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82, +0x63,0x21,0x40,0x70,0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00, +0x25,0x00,0x00,0x00,0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb, +0x05,0x31,0x31,0xcf,0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6, +0x41,0x08,0xc0,0xb2,0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5, +0x9b,0x8a,0x21,0x00,0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18, +0x82,0xd3,0x54,0xb7,0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f, +0x9d,0x9b,0x87,0x21,0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59, +0x10,0x82,0xd3,0x54,0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8, +0xb4,0x8d,0x14,0x13,0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86, +0x00,0x2c,0x8b,0xcd,0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72, +0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, +0x19,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00, +0x24,0x4a,0x60,0x04,0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33, +0x16,0x01,0x04,0x48,0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3, +0x44,0x39,0x58,0x85,0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x26,0x41,0x08,0xc0, +0xb2,0x18,0x45,0x21,0x00,0xcb,0xb2,0x5b,0x04,0x31,0x31,0x8f,0x6d,0x13,0xc4,0xc4, +0x3c,0xb9,0x35,0x0c,0x15,0xb0,0x58,0x05,0x31,0x31,0x4f,0x7f,0x00,0x05,0xd1,0x4c, +0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00, +0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04, +0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca, +0x34,0xc7,0x20,0x51,0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c, +0x94,0x83,0x58,0x38,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x76,0x51,0x4c,0xcc, +0x53,0xdb,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31, +0x31,0x8f,0x6e,0x16,0xc4,0xc4,0x3c,0xbd,0x51,0x10,0x02,0xb0,0x2c,0xd6,0x30,0x54, +0xc0,0x72,0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00, +0x2c,0x00,0x00,0x00,0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00, +0x24,0xca,0xa0,0x04,0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34, +0xdc,0x30,0x49,0xc4,0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xdc,0x50,0x49,0xc4, +0x2c,0x03,0x21,0x58,0x63,0x08,0x4d,0x34,0xdc,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60, +0x63,0x08,0x8d,0x33,0xdc,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00, +0x17,0x00,0x00,0x00,0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb, +0x04,0x31,0x31,0x4f,0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x76, +0x41,0x4c,0xcc,0xb3,0x1f,0x81,0x11,0x11,0x13,0x15,0x35,0x37,0x90,0x2c,0x4e,0xf4, +0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87,0x74,0x02,0xc8,0xe2,0x44, +0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d,0x18,0x11,0x31,0x55,0xc0, +0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x46,0x31,0x08,0xcc,0x03, 
+0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82, +0x23,0x19,0xc3,0xa0,0x20,0x8b,0x1d,0x18,0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3, +0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5,0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c, +0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73,0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84, +0x84,0x34,0x85,0x25,0x0c,0x92,0x20,0x59,0xc1,0x20,0x30,0x8f,0x2d,0x10,0x95,0x84, +0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp new file mode 100644 index 0000000000..93a9748bdb --- /dev/null +++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp @@ -0,0 +1,243 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + /* + * Authors: + * Zack Rusin zack@tungstengraphics.com + */ +#ifdef MESA_LLVM + +#include "gallivm.h" +#include "gallivm_p.h" + +#include "instructions.h" +#include "loweringpass.h" +#include "storage.h" +#include "tgsitollvm.h" + +#include "pipe/p_context.h" +#include "pipe/p_shader_tokens.h" + +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_dump.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include <llvm/Module.h> +#include <llvm/CallingConv.h> +#include <llvm/Constants.h> +#include <llvm/DerivedTypes.h> +#include <llvm/Instructions.h> +#include <llvm/ModuleProvider.h> +#include <llvm/Pass.h> +#include <llvm/PassManager.h> +#include <llvm/ParameterAttributes.h> +#include <llvm/Support/PatternMatch.h> +#include <llvm/ExecutionEngine/JIT.h> +#include <llvm/ExecutionEngine/Interpreter.h> +#include <llvm/ExecutionEngine/GenericValue.h> +#include <llvm/Support/MemoryBuffer.h> +#include <llvm/LinkAllPasses.h> +#include <llvm/Analysis/Verifier.h> +#include <llvm/Analysis/LoopPass.h> +#include <llvm/Target/TargetData.h> +#include <llvm/Bitcode/ReaderWriter.h> +#include <llvm/Transforms/Utils/Cloning.h> + +#include <sstream> +#include <fstream> +#include <iostream> + +struct gallivm_cpu_engine { + llvm::ExecutionEngine *engine; +}; + +static struct gallivm_cpu_engine *CPU = 0; + +typedef int (*fragment_shader_runner)(float x, float y, + float (*dests)[16][4], + float (*inputs)[16][4], + int num_attribs, + float (*consts)[4], int num_consts, + struct tgsi_sampler *samplers); + +int gallivm_cpu_fs_exec(struct gallivm_prog *prog, + float fx, float fy, + float (*dests)[16][4], + float (*inputs)[16][4], + float (*consts)[4], + struct tgsi_sampler *samplers) +{ + fragment_shader_runner runner = reinterpret_cast<fragment_shader_runner>(prog->function); + assert(runner); + + return runner(fx, fy, dests, inputs, prog->num_interp, + consts, prog->num_consts, + samplers); +} + +static inline llvm::Function *func_for_shader(struct gallivm_prog *prog) +{ + llvm::Module *mod = prog->module; + llvm::Function *func = 0; + + switch (prog->type) { + case GALLIVM_VS: + func = mod->getFunction("vs_shader"); + break; + case GALLIVM_FS: + func = mod->getFunction("fs_shader"); + break; + default: + assert(!"Unknown shader type!"); + break; + } + return func; +} + +/*! + This function creates a CPU based execution engine for the given gallivm_prog. + gallivm_cpu_engine should be used as a singleton throughout the library. Before + executing gallivm_prog_exec one needs to call gallivm_cpu_jit_compile. + The gallivm_prog instance which is being passed to the constructor is being + automatically JIT compiled so one shouldn't call gallivm_cpu_jit_compile + with it again. + */ +struct gallivm_cpu_engine * gallivm_cpu_engine_create(struct gallivm_prog *prog) +{ + struct gallivm_cpu_engine *cpu = (struct gallivm_cpu_engine *) + calloc(1, sizeof(struct gallivm_cpu_engine)); + llvm::Module *mod = static_cast<llvm::Module*>(prog->module); + llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod); + llvm::ExecutionEngine *ee = llvm::ExecutionEngine::create(mp, false); + ee->DisableLazyCompilation(); + cpu->engine = ee; + + llvm::Function *func = func_for_shader(prog); + + prog->function = ee->getPointerToFunction(func); + CPU = cpu; + return cpu; +} + + +/*! + This function JIT compiles the given gallivm_prog with the given cpu based execution engine. 
+ The reference to the generated machine code entry point will be stored + in the gallivm_prog program. After executing this function one can call gallivm_prog_exec + in order to execute the gallivm_prog on the CPU. + */ +void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *cpu, struct gallivm_prog *prog) +{ + llvm::Module *mod = static_cast<llvm::Module*>(prog->module); + llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod); + llvm::ExecutionEngine *ee = cpu->engine; + assert(ee); + /*FIXME : why was this disabled ? we need it for pow/sqrt/... */ + ee->DisableLazyCompilation(false); + ee->addModuleProvider(mp); + + llvm::Function *func = func_for_shader(prog); + prog->function = ee->getPointerToFunction(func); +} + +void gallivm_cpu_engine_delete(struct gallivm_cpu_engine *cpu) +{ + free(cpu); +} + +struct gallivm_cpu_engine * gallivm_global_cpu_engine() +{ + return CPU; +} + + +typedef void (*vertex_shader_runner)(void *ainputs, + void *dests, + float (*aconsts)[4]); + +#define MAX_TGSI_VERTICES 4 +/*! + This function is used to execute the gallivm_prog in software. Before calling + this function the gallivm_prog has to be JIT compiled with the gallivm_cpu_jit_compile + function. + */ +int gallivm_cpu_vs_exec(struct gallivm_prog *prog, + struct tgsi_exec_machine *machine, + const float (*input)[4], + unsigned num_inputs, + float (*output)[4], + unsigned num_outputs, + const float (*constants)[4], + unsigned count, + unsigned input_stride, + unsigned output_stride ) +{ + unsigned int i, j; + unsigned slot; + vertex_shader_runner runner = reinterpret_cast<vertex_shader_runner>(prog->function); + assert(runner); + + for (i = 0; i < count; i += MAX_TGSI_VERTICES) { + unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i); + + /* Swizzle inputs. 
+ */ + for (j = 0; j < max_vertices; j++) { + for (slot = 0; slot < num_inputs; slot++) { + machine->Inputs[slot].xyzw[0].f[j] = input[slot][0]; + machine->Inputs[slot].xyzw[1].f[j] = input[slot][1]; + machine->Inputs[slot].xyzw[2].f[j] = input[slot][2]; + machine->Inputs[slot].xyzw[3].f[j] = input[slot][3]; + } + + input = (const float (*)[4])((const char *)input + input_stride); + } + + /* run shader */ + runner(machine->Inputs, + machine->Outputs, + (float (*)[4]) constants); + + /* Unswizzle all output results + */ + for (j = 0; j < max_vertices; j++) { + for (slot = 0; slot < num_outputs; slot++) { + output[slot][0] = machine->Outputs[slot].xyzw[0].f[j]; + output[slot][1] = machine->Outputs[slot].xyzw[1].f[j]; + output[slot][2] = machine->Outputs[slot].xyzw[2].f[j]; + output[slot][3] = machine->Outputs[slot].xyzw[3].f[j]; + } + output = (float (*)[4])((char *)output + output_stride); + } + } + + return 0; +} + +#endif diff --git a/src/gallium/auxiliary/gallivm/gallivm_p.h b/src/gallium/auxiliary/gallivm/gallivm_p.h new file mode 100644 index 0000000000..d2c5852bdf --- /dev/null +++ b/src/gallium/auxiliary/gallivm/gallivm_p.h @@ -0,0 +1,110 @@ +#ifndef GALLIVM_P_H +#define GALLIVM_P_H + +#ifdef MESA_LLVM + +#include "gallivm.h" +#include "pipe/p_shader_tokens.h" +#include "pipe/p_compiler.h" + +namespace llvm { + class Module; +} + +#if defined __cplusplus +extern "C" { +#endif + +enum gallivm_shader_type; +enum gallivm_vector_layout; + +struct gallivm_interpolate { + int attrib; + int chan; + int type; +}; + +struct gallivm_ir { + llvm::Module *module; + int id; + enum gallivm_shader_type type; + enum gallivm_vector_layout layout; + int num_components; + int num_consts; + + /* FIXME: this might not be enough for some shaders */ + struct gallivm_interpolate interpolators[32*4]; + int num_interp; +}; + +struct gallivm_prog { + llvm::Module *module; + void *function; + + int id; + enum gallivm_shader_type type; + + int num_consts; + + /* FIXME: this might not be enough for some shaders */ + struct gallivm_interpolate interpolators[32*4]; + int num_interp; +}; + +static INLINE void gallivm_swizzle_components(int swizzle, + int *xc, int *yc, + int *zc, int *wc) +{ + int x = swizzle / 1000; swizzle -= x * 1000; + int y = swizzle / 100; swizzle -= y * 100; + int z = swizzle / 10; swizzle -= z * 10; + int w = swizzle; + + if (xc) *xc = x; + if (yc) *yc = y; + if (zc) *zc = z; + if (wc) *wc = w; +} + +static INLINE boolean gallivm_is_swizzle(int swizzle) +{ + const int NO_SWIZZLE = TGSI_SWIZZLE_X * 1000 + TGSI_SWIZZLE_Y * 100 + + TGSI_SWIZZLE_Z * 10 + TGSI_SWIZZLE_W; + return swizzle != NO_SWIZZLE; +} + +static INLINE int gallivm_x_swizzle(int swizzle) +{ + int x; + gallivm_swizzle_components(swizzle, &x, 0, 0, 0); + return x; +} + +static INLINE int gallivm_y_swizzle(int swizzle) +{ + int y; + gallivm_swizzle_components(swizzle, 0, &y, 0, 0); + return y; +} + +static INLINE int gallivm_z_swizzle(int swizzle) +{ + int z; + gallivm_swizzle_components(swizzle, 0, 0, &z, 0); + return z; +} + +static INLINE int gallivm_w_swizzle(int swizzle) +{ + int w; + gallivm_swizzle_components(swizzle, 0, 0, 0, &w); + return w; +} + +#if defined __cplusplus +} +#endif + +#endif /* MESA_LLVM */ + +#endif diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp new file mode 100644 index 0000000000..599975d5ad --- /dev/null +++ b/src/gallium/auxiliary/gallivm/instructions.cpp @@ -0,0 +1,1193 @@ 
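Taken together, the files above (gallivm.h, gallivm.cpp, gallivm_cpu.cpp and gallivm_p.h) describe a two-stage flow: TGSI tokens are first translated into a layout-independent gallivm_ir, the ir is then compiled into a gallivm_prog, and the singleton CPU engine JIT-compiles that prog so it can be executed on the host. Before the per-opcode emitters of instructions.cpp below, here is a minimal caller-side sketch of that flow for a vertex shader. It is not part of this commit: the tokens, the tgsi_exec_machine and the vertex arrays are assumed to come from the caller (for example the draw module), and the AoS layout choice is purely illustrative.

    #include "gallivm/gallivm.h"

    /* tokens, machine, input/output arrays, counts and strides are assumed
     * to be provided by the surrounding driver code. */
    struct gallivm_ir *ir = gallivm_ir_new(GALLIVM_VS);
    gallivm_ir_set_layout(ir, GALLIVM_AOS);             /* layout choice illustrative */
    gallivm_ir_fill_from_tgsi(ir, tokens);              /* TGSI -> LLVM module */

    struct gallivm_prog *prog = gallivm_ir_compile(ir); /* clone + optimization passes */
    gallivm_ir_delete(ir);                              /* compile() worked on a clone */

    struct gallivm_cpu_engine *ee = gallivm_global_cpu_engine();
    if (!ee)
       ee = gallivm_cpu_engine_create(prog);            /* creates the JIT; also compiles prog */
    else
       gallivm_cpu_jit_compile(ee, prog);               /* reuse the singleton engine */

    gallivm_cpu_vs_exec(prog, machine,
                        input, num_inputs,
                        output, num_outputs,
                        constants, vertex_count,
                        input_stride, output_stride);

As the comment on gallivm_cpu_engine_create() above points out, the engine constructor already JIT-compiles the prog it is handed, so gallivm_cpu_jit_compile() is only needed for additional programs on an existing engine.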
+/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Zack Rusin zack@tungstengraphics.com + */ +#ifdef MESA_LLVM + +#include "instructions.h" + +#include "storage.h" + +#include "util/u_memory.h" + +#include <llvm/CallingConv.h> +#include <llvm/Constants.h> +#include <llvm/DerivedTypes.h> +#include <llvm/Function.h> +#include <llvm/InstrTypes.h> +#include <llvm/Instructions.h> +#include <llvm/ParameterAttributes.h> +#include <llvm/Support/MemoryBuffer.h> +#include <llvm/Bitcode/ReaderWriter.h> + +#include <sstream> +#include <fstream> +#include <iostream> + +using namespace llvm; + +#include "gallivm_builtins.cpp" + +#if 0 +llvm::Value *arrayFromChannels(std::vector<llvm::Value*> &vals) +{ + VectorType *vectorType = VectorType::get(Type::FloatTy, 4); + ArrayType *vectorArray = ArrayType::get(vectorType, 4); +} +#endif + +static inline std::string createFuncName(int label) +{ + std::ostringstream stream; + stream << "function"; + stream << label; + return stream.str(); +} + +Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicBlock *block, + Storage *storage) + : m_mod(mod), m_func(func), m_builder(block), m_idx(0), + m_storage(storage) +{ + m_floatVecType = VectorType::get(Type::FloatTy, 4); + + m_llvmFSqrt = 0; + m_llvmFAbs = 0; + m_llvmPow = 0; + m_llvmFloor = 0; + m_llvmFlog = 0; + m_llvmFexp = 0; + m_llvmLit = 0; + m_fmtPtr = 0; + + MemoryBuffer *buffer = MemoryBuffer::getMemBuffer( + (const char*)&llvm_builtins_data[0], + (const char*)&llvm_builtins_data[Elements(llvm_builtins_data)-1]); + m_mod = ParseBitcodeFile(buffer); +} + +llvm::BasicBlock * Instructions::currentBlock() const +{ + return m_builder.GetInsertBlock(); +} + +llvm::Value * Instructions::abs(llvm::Value *in) +{ + std::vector<llvm::Value*> vec = extractVector(in); + Value *xabs = callFAbs(vec[0]); + Value *yabs = callFAbs(vec[1]); + Value *zabs = callFAbs(vec[2]); + Value *wabs = callFAbs(vec[3]); + return vectorFromVals(xabs, yabs, zabs, wabs); +} + +llvm::Value * Instructions::add(llvm::Value *in1, llvm::Value *in2) +{ + return m_builder.CreateAdd(in1, in2, name("add")); +} + +llvm::Value * Instructions::arl(llvm::Value *in) +{ + return floor(in); +} + 
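// Editor's note, a sketch that is not part of the commit: abs() above and
// ceil()/floor()/exp()/log() further down all follow the same shape: pull
// the four channels out with extractVector(), run a scalar helper on each,
// and rebuild the vector with vectorFromVals().  A hypothetical member
// (the name mapChannels is invented here) that factors the pattern out,
// using only calls already present in this file, could look like this:
llvm::Value * Instructions::mapChannels(llvm::Value *in,
                                        llvm::Value *(Instructions::*op)(llvm::Value *))
{
   std::vector<llvm::Value*> vec = extractVector(in);
   return vectorFromVals((this->*op)(vec[0]), (this->*op)(vec[1]),
                         (this->*op)(vec[2]), (this->*op)(vec[3]));
}
// With it, abs(in) would reduce to mapChannels(in, &Instructions::callFAbs).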
+void Instructions::beginLoop() +{ + BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0); + BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0); + + m_builder.CreateBr(begin); + Loop loop; + loop.begin = begin; + loop.end = end; + m_builder.SetInsertPoint(begin); + m_loopStack.push(loop); +} + +void Instructions::bgnSub(unsigned label) +{ + llvm::Function *func = findFunction(label); + + Function::arg_iterator args = func->arg_begin(); + Value *ptr_INPUT = args++; + ptr_INPUT->setName("INPUT"); + m_storage->pushArguments(ptr_INPUT); + + llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0); + + m_func = func; + m_builder.SetInsertPoint(entry); +} + +void Instructions::brk() +{ + assert(!m_loopStack.empty()); + BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0); + m_builder.CreateBr(m_loopStack.top().end); + m_builder.SetInsertPoint(unr); +} + +void Instructions::cal(int label, llvm::Value *input) +{ + std::vector<Value*> params; + params.push_back(input); + llvm::Function *func = findFunction(label); + + m_builder.CreateCall(func, params.begin(), params.end()); +} + +llvm::Value * Instructions::ceil(llvm::Value *in) +{ + std::vector<llvm::Value*> vec = extractVector(in); + return vectorFromVals(callCeil(vec[0]), callCeil(vec[1]), + callCeil(vec[2]), callCeil(vec[3])); +} + +llvm::Value * Instructions::clamp(llvm::Value *in1) +{ + llvm::Value *zero = constVector(0.0f, 0.0f, 0.0f, 0.0f); + llvm::Value *one = constVector(1.0f, 1.0f, 1.0f, 1.0f); + return min( max(zero, in1), one); +} + +llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) +{ + llvm::Function *func = m_mod->getFunction("cmp"); + assert(func); + + std::vector<Value*> params; + params.push_back(in1); + params.push_back(in2); + params.push_back(in3); + CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres")); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) +{ + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + std::vector<llvm::Value*> vec3 = extractVector(in3); + Constant *half = ConstantFP::get(APFloat(0.5f)); + + Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], half, name("xcmp")); + Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0], + name("selx")); + + Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], half, name("ycmp")); + Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1], + name("sely")); + + Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], half, name("zcmp")); + Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2], + name("selz")); + + Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], half, name("wcmp")); + Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3], + name("selw")); + + return vectorFromVals(selx, sely, selz, selw); +} + +llvm::Value * Instructions::cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) +{ + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + std::vector<llvm::Value*> vec3 = extractVector(in3); + Constant *zero = Constant::getNullValue(Type::FloatTy); + + Value *xcmp = m_builder.CreateFCmpOGE(vec1[0], zero, name("xcmp")); + Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0], + name("selx")); + + Value *ycmp = m_builder.CreateFCmpOGE(vec1[1], zero, name("ycmp")); + Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1], + name("sely")); + + 
Value *zcmp = m_builder.CreateFCmpOGE(vec1[2], zero, name("zcmp")); + Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2], + name("selz")); + + Value *wcmp = m_builder.CreateFCmpOGE(vec1[3], zero, name("wcmp")); + Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3], + name("selw")); + + return vectorFromVals(selx, sely, selz, selw); +} + +llvm::Value * Instructions::cos(llvm::Value *in) +{ +#if 0 + llvm::Function *func = m_mod->getFunction("vcos"); + assert(func); + + CallInst *call = m_builder.CreateCall(func, in, name("cosres")); + call->setTailCall(false); + return call; +#else + std::vector<llvm::Value*> elems = extractVector(in); + Function *func = m_mod->getFunction("cosf"); + assert(func); + CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres")); + cos->setCallingConv(CallingConv::C); + cos->setTailCall(true); + return vectorFromVals(cos, cos, cos, cos); +#endif +} + +llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2) +{ + Value *x1 = m_builder.CreateExtractElement(in1, + m_storage->constantInt(0), + name("x1")); + Value *y1 = m_builder.CreateExtractElement(in1, + m_storage->constantInt(1), + name("y1")); + Value *z1 = m_builder.CreateExtractElement(in1, + m_storage->constantInt(2), + name("z1")); + + Value *x2 = m_builder.CreateExtractElement(in2, + m_storage->constantInt(0), + name("x2")); + Value *y2 = m_builder.CreateExtractElement(in2, + m_storage->constantInt(1), + name("y2")); + Value *z2 = m_builder.CreateExtractElement(in2, + m_storage->constantInt(2), + name("z2")); + Value *y1z2 = mul(y1, z2); + Value *z1y2 = mul(z1, y2); + + Value *z1x2 = mul(z1, x2); + Value *x1z2 = mul(x1, z2); + + Value *x1y2 = mul(x1, y2); + Value *y1x2 = mul(y1, x2); + + return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2)); +} + +llvm::Value * Instructions::ddx(llvm::Value *in) +{ + // FIXME + assert(0); +} + +llvm::Value * Instructions::ddy(llvm::Value *in) +{ + // FIXME + assert(0); +} + +llvm::Value * Instructions::div(llvm::Value *in1, llvm::Value *in2) +{ + return m_builder.CreateFDiv(in1, in2, name("div")); +} + +llvm::Value * Instructions::dot2add(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) +{ + Value *mulRes = mul(in1, in2); + Value *x = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(0), + name("extractx")); + Value *y = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(1), + name("extracty")); + Value *z = m_builder.CreateExtractElement(in3, + m_storage->constantInt(2), + name("extractz")); + Value *xy = m_builder.CreateAdd(x, y,name("xy")); + Value *dot2add = m_builder.CreateAdd(xy, z, name("dot2add")); + return vectorFromVals(dot2add, dot2add, dot2add, dot2add); +} + +llvm::Value * Instructions::dp2(llvm::Value *in1, llvm::Value *in2) +{ + Value *mulRes = mul(in1, in2); + Value *x = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(0), + name("extractx")); + Value *y = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(1), + name("extracty")); + Value *xy = m_builder.CreateAdd(x, y,name("xy")); + return vectorFromVals(xy, xy, xy, xy); +} + +llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2) +{ + Value *mulRes = mul(in1, in2); + Value *x = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(0), + name("extractx")); + Value *y = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(1), + name("extracty")); + Value *z = m_builder.CreateExtractElement(mulRes, + m_storage->constantInt(2), + name("extractz")); + Value *xy 
= m_builder.CreateAdd(x, y,name("xy")); + Value *dot3 = m_builder.CreateAdd(xy, z, name("dot3")); + return vectorFromVals(dot3, dot3, dot3, dot3); +} + +llvm::Value * Instructions::dp4(llvm::Value *in1, llvm::Value *in2) +{ + Value *mulRes = mul(in1, in2); + std::vector<llvm::Value*> vec = extractVector(mulRes); + Value *xy = m_builder.CreateAdd(vec[0], vec[1], name("xy")); + Value *xyz = m_builder.CreateAdd(xy, vec[2], name("xyz")); + Value *dot4 = m_builder.CreateAdd(xyz, vec[3], name("dot4")); + return vectorFromVals(dot4, dot4, dot4, dot4); +} + +llvm::Value * Instructions::dph(llvm::Value *in1, llvm::Value *in2) +{ + Value *mulRes = mul(in1, in2); + std::vector<llvm::Value*> vec1 = extractVector(mulRes); + Value *xy = m_builder.CreateAdd(vec1[0], vec1[1], name("xy")); + Value *xyz = m_builder.CreateAdd(xy, vec1[2], name("xyz")); + Value *dph = m_builder.CreateAdd(xyz, vec1[3], name("dph")); + return vectorFromVals(dph, dph, dph, dph); +} + +llvm::Value * Instructions::dst(llvm::Value *in1, llvm::Value *in2) +{ + Value *y1 = m_builder.CreateExtractElement(in1, + m_storage->constantInt(1), + name("y1")); + Value *z = m_builder.CreateExtractElement(in1, + m_storage->constantInt(2), + name("z")); + Value *y2 = m_builder.CreateExtractElement(in2, + m_storage->constantInt(1), + name("y2")); + Value *w = m_builder.CreateExtractElement(in2, + m_storage->constantInt(3), + name("w")); + Value *ry = m_builder.CreateMul(y1, y2, name("tyuy")); + return vectorFromVals(ConstantFP::get(APFloat(1.f)), + ry, z, w); +} + +void Instructions::elseop() +{ + assert(!m_ifStack.empty()); + BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0); + m_builder.CreateBr(ifend); + m_builder.SetInsertPoint(m_ifStack.top()); + currentBlock()->setName(name("ifelse")); + m_ifStack.pop(); + m_ifStack.push(ifend); +} + +void Instructions::endif() +{ + assert(!m_ifStack.empty()); + m_builder.CreateBr(m_ifStack.top()); + m_builder.SetInsertPoint(m_ifStack.top()); + m_ifStack.pop(); +} + +void Instructions::endLoop() +{ + assert(!m_loopStack.empty()); + Loop loop = m_loopStack.top(); + m_builder.CreateBr(loop.begin); + loop.end->moveAfter(currentBlock()); + m_builder.SetInsertPoint(loop.end); + m_loopStack.pop(); +} + +void Instructions::end() +{ + m_builder.CreateRetVoid(); +} + +void Instructions::endSub() +{ + m_func = 0; + m_builder.SetInsertPoint(0); +} + +llvm::Value * Instructions::exp(llvm::Value *in) +{ + std::vector<llvm::Value*> vec = extractVector(in); + return vectorFromVals(callFExp(vec[0]), callFExp(vec[1]), + callFExp(vec[2]), callFExp(vec[3])); +} + +llvm::Value * Instructions::ex2(llvm::Value *in) +{ + llvm::Value *val = callPow(ConstantFP::get(APFloat(2.f)), + m_builder.CreateExtractElement( + in, m_storage->constantInt(0), + name("x1"))); + return vectorFromVals(val, val, val, val); +} + +llvm::Value * Instructions::floor(llvm::Value *in) +{ + std::vector<llvm::Value*> vec = extractVector(in); + return vectorFromVals(callFloor(vec[0]), callFloor(vec[1]), + callFloor(vec[2]), callFloor(vec[3])); +} + +llvm::Value * Instructions::frc(llvm::Value *in) +{ + llvm::Value *flr = floor(in); + return sub(in, flr); +} + +void Instructions::ifop(llvm::Value *in) +{ + BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0); + BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0); + + //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0); + //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0); + //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0); + + 
Constant *float0 = Constant::getNullValue(Type::FloatTy); + + Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0), + name("extractx")); + Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp")); + m_builder.CreateCondBr(xcmp, ifthen, ifend); + //m_builder.SetInsertPoint(yblock); + + m_builder.SetInsertPoint(ifthen); + m_ifStack.push(ifend); +} + +llvm::Value * Instructions::kil(llvm::Value *in) +{ + llvm::Function *func = m_mod->getFunction("kil"); + assert(func); + + CallInst *call = m_builder.CreateCall(func, in, name("kilpres")); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2, + llvm::Value *in3) +{ + llvm::Value *m = mul(in1, in2); + llvm::Value *vec1 = constVector(1.f, 1.f, 1.f, 1.f); + llvm::Value *s = sub(vec1, in1); + return add(m, mul(s, in3)); +} + +llvm::Value * Instructions::lg2(llvm::Value *in) +{ + std::vector<llvm::Value*> vec = extractVector(in); + llvm::Value *const_vec = constVector(1.442695f, 1.442695f, + 1.442695f, 1.442695f); + return mul(vectorFromVals(callFLog(vec[0]), callFLog(vec[1]), + callFLog(vec[2]), callFLog(vec[3])), const_vec); +} + +llvm::Value * Instructions::lit(llvm::Value *in) +{ + if (!m_llvmLit) { + m_llvmLit = m_mod->getFunction("lit"); + } + CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::log(llvm::Value *in) +{ + std::vector<llvm::Value*> vec = extractVector(in); + return vectorFromVals(callFLog(vec[0]), callFLog(vec[1]), + callFLog(vec[2]), callFLog(vec[3])); +} + +llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2, + llvm::Value *in3) +{ + Value *mulRes = mul(in1, in2); + return add(mulRes, in3); +} + +llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2) +{ + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + + Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], + name("xcmp")); + Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0], + name("selx")); + + Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], + name("ycmp")); + Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1], + name("sely")); + + Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], + name("zcmp")); + Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2], + name("selz")); + + Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], + name("wcmp")); + Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3], + name("selw")); + + return vectorFromVals(selx, sely, selz, selw); +} + +llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2) +{ + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + + Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp")); + Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0], + name("selx")); + + Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp")); + Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1], + name("sely")); + + Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp")); + Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2], + name("selz")); + + Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp")); + Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3], + name("selw")); + + return vectorFromVals(selx, sely, selz, selw); +} + 
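// Editor's note, not part of the commit: max()/min() above and the set-on
// opcodes that follow (seq, sge, sgt, sle, slt, sne) are all emitted as a
// per-channel ordered float compare feeding a select; the set-on family
// selects between 1.0 and 0.0.  A plain scalar reference for slt, matching
// what the generated IR computes channel by channel:
static void ref_slt(const float src0[4], const float src1[4], float dst[4])
{
   for (int c = 0; c < 4; ++c)
      dst[c] = (src0[c] < src1[c]) ? 1.0f : 0.0f;   /* FCmpOLT + select */
}
// For example src0 = (1, 2, 3, 4) and src1 = (2.5, 2.5, 2.5, 2.5) give
// dst = (1, 1, 0, 0).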
+llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2) +{ + return m_builder.CreateMul(in1, in2, name("mul")); +} + +llvm::Value * Instructions::neg(llvm::Value *in) +{ + Value *neg = m_builder.CreateNeg(in, name("neg")); + return neg; +} + +llvm::Value * Instructions::nrm(llvm::Value *in) +{ + llvm::Value *v = rsq(in); + return mul(v, in); +} + +llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2) +{ + Value *x1 = m_builder.CreateExtractElement(in1, + m_storage->constantInt(0), + name("x1")); + Value *x2 = m_builder.CreateExtractElement(in2, + m_storage->constantInt(0), + name("x2")); + llvm::Value *val = callPow(x1, x2); + return vectorFromVals(val, val, val, val); +} + +llvm::Value * Instructions::rcp(llvm::Value *in1) +{ + Value *x1 = m_builder.CreateExtractElement(in1, + m_storage->constantInt(0), + name("x1")); + Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)), + x1, name("rcp")); + return vectorFromVals(res, res, res, res); +} + +llvm::Value * Instructions::rsq(llvm::Value *in1) +{ + Value *x = m_builder.CreateExtractElement(in1, + m_storage->constantInt(0), + name("extractx")); + Value *abs = callFAbs(x); + Value *sqrt = callFSqrt(abs); + + Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)), + sqrt, + name("rsqrt")); + return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt); +} + +llvm::Value * Instructions::scs(llvm::Value *in) +{ + llvm::Function *func = m_mod->getFunction("scs"); + assert(func); + + CallInst *call = m_builder.CreateCall(func, in, name("scsres")); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::seq(llvm::Value *in1, llvm::Value *in2) +{ + Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); + Constant *const0f = Constant::getNullValue(Type::FloatTy); + + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + + Value *xcmp = m_builder.CreateFCmpOEQ(vec1[0], vec2[0], name("xcmp")); + Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); + + Value *ycmp = m_builder.CreateFCmpOEQ(vec1[1], vec2[1], name("ycmp")); + Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); + + Value *zcmp = m_builder.CreateFCmpOEQ(vec1[2], vec2[2], name("zcmp")); + Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); + + Value *wcmp = m_builder.CreateFCmpOEQ(vec1[3], vec2[3], name("wcmp")); + Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); + + return vectorFromVals(x, y, z, w); +} + +llvm::Value * Instructions::sfl(llvm::Value *in1, llvm::Value *in2) +{ + Constant *const0f = Constant::getNullValue(Type::FloatTy); + + return vectorFromVals(const0f, const0f, const0f, const0f); +} + +llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2) +{ + Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); + Constant *const0f = Constant::getNullValue(Type::FloatTy); + + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + + Value *xcmp = m_builder.CreateFCmpOGE(vec1[0], vec2[0], name("xcmp")); + Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); + + Value *ycmp = m_builder.CreateFCmpOGE(vec1[1], vec2[1], name("ycmp")); + Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); + + Value *zcmp = m_builder.CreateFCmpOGE(vec1[2], vec2[2], name("zcmp")); + Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); + + Value *wcmp = 
m_builder.CreateFCmpOGE(vec1[3], vec2[3], name("wcmp")); + Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); + + return vectorFromVals(x, y, z, w); +} + +llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2) +{ + Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); + Constant *const0f = Constant::getNullValue(Type::FloatTy); + + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], name("xcmp")); + Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); + + Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], name("ycmp")); + Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); + + Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], name("zcmp")); + Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); + + Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], name("wcmp")); + Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); + + return vectorFromVals(x, y, z, w); +} + +llvm::Value * Instructions::sin(llvm::Value *in) +{ + llvm::Function *func = m_mod->getFunction("vsin"); + assert(func); + + CallInst *call = m_builder.CreateCall(func, in, name("sinres")); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::sle(llvm::Value *in1, llvm::Value *in2) +{ + Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); + Constant *const0f = Constant::getNullValue(Type::FloatTy); + + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + + Value *xcmp = m_builder.CreateFCmpOLE(vec1[0], vec2[0], name("xcmp")); + Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); + + Value *ycmp = m_builder.CreateFCmpOLE(vec1[1], vec2[1], name("ycmp")); + Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); + + Value *zcmp = m_builder.CreateFCmpOLE(vec1[2], vec2[2], name("zcmp")); + Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); + + Value *wcmp = m_builder.CreateFCmpOLE(vec1[3], vec2[3], name("wcmp")); + Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); + + return vectorFromVals(x, y, z, w); +} + +llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2) +{ + Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); + Constant *const0f = Constant::getNullValue(Type::FloatTy); + + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + + Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp")); + Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); + + Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp")); + Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); + + Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp")); + Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); + + Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp")); + Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); + + return vectorFromVals(x, y, z, w); +} + +llvm::Value * Instructions::sne(llvm::Value *in1, llvm::Value *in2) +{ + Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); + Constant *const0f = Constant::getNullValue(Type::FloatTy); + + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 
= extractVector(in2); + + Value *xcmp = m_builder.CreateFCmpONE(vec1[0], vec2[0], name("xcmp")); + Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel")); + + Value *ycmp = m_builder.CreateFCmpONE(vec1[1], vec2[1], name("ycmp")); + Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel")); + + Value *zcmp = m_builder.CreateFCmpONE(vec1[2], vec2[2], name("zcmp")); + Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel")); + + Value *wcmp = m_builder.CreateFCmpONE(vec1[3], vec2[3], name("wcmp")); + Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel")); + + return vectorFromVals(x, y, z, w); +} + +llvm::Value * Instructions::str(llvm::Value *in1, llvm::Value *in2) +{ + Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f)); + + return vectorFromVals(const1f, const1f, const1f, const1f); +} + +llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2) +{ + Value *res = m_builder.CreateSub(in1, in2, name("sub")); + return res; +} + +llvm::Value * Instructions::trunc(llvm::Value *in) +{ + std::vector<llvm::Value*> vec = extractVector(in); + Value *icastx = m_builder.CreateFPToSI(vec[0], IntegerType::get(32), + name("ftoix")); + Value *icasty = m_builder.CreateFPToSI(vec[1], IntegerType::get(32), + name("ftoiy")); + Value *icastz = m_builder.CreateFPToSI(vec[2], IntegerType::get(32), + name("ftoiz")); + Value *icastw = m_builder.CreateFPToSI(vec[3], IntegerType::get(32), + name("ftoiw")); + Value *fx = m_builder.CreateSIToFP(icastx, Type::FloatTy, + name("fx")); + Value *fy = m_builder.CreateSIToFP(icasty, Type::FloatTy, + name("fy")); + Value *fz = m_builder.CreateSIToFP(icastz, Type::FloatTy, + name("fz")); + Value *fw = m_builder.CreateSIToFP(icastw, Type::FloatTy, + name("fw")); + return vectorFromVals(fx, fy, fz, fw); +} + +llvm::Value * Instructions::x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3) +{ + std::vector<llvm::Value*> vec1 = extractVector(in1); + std::vector<llvm::Value*> vec2 = extractVector(in2); + std::vector<llvm::Value*> vec3 = extractVector(in3); + + Value *x2x3 = m_builder.CreateMul( vec2[0], vec3[0], name("x2x3")); + Value *y2y3 = m_builder.CreateMul( vec2[1], vec3[1], name("y2y3")); + Value *x1px2x3 = m_builder.CreateAdd (vec1[0], x2x3, name("x1 + x2x3")); + Value *x1px2x3py2y3 = m_builder.CreateAdd (x1px2x3, y2y3, name("x1 + x2x3 + y2y3")); + + Value *x2z3 = m_builder.CreateMul( vec2[0], vec3[2], name("x2z3")); + Value *y2w3 = m_builder.CreateMul( vec2[1], vec3[3], name("y2w3")); + Value *y1px2z3 = m_builder.CreateAdd (vec1[1], x2z3, name("y1 + x2z3")); + Value *y1px2z3py2w3 = m_builder.CreateAdd (y1px2z3, y2w3, name("y1 + x2z3 + y2w3")); + + return vectorFromVals(x1px2x3py2y3, y1px2z3py2w3, x1px2x3py2y3, y1px2z3py2w3); +} + +void Instructions::printVector(llvm::Value *val) +{ + static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A"; + + if (!m_fmtPtr) { + Constant *format = ConstantArray::get(frmt, true); + ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1); + GlobalVariable* globalFormat = new GlobalVariable( + /*Type=*/arrayTy, + /*isConstant=*/true, + /*Linkage=*/GlobalValue::InternalLinkage, + /*Initializer=*/0, // has initializer, specified below + /*Name=*/name(".str"), + m_mod); + globalFormat->setInitializer(format); + + Constant* const_int0 = Constant::getNullValue(IntegerType::get(32)); + std::vector<Constant*> const_ptr_21_indices; + const_ptr_21_indices.push_back(const_int0); + const_ptr_21_indices.push_back(const_int0); + m_fmtPtr = 
ConstantExpr::getGetElementPtr(globalFormat, + &const_ptr_21_indices[0], const_ptr_21_indices.size()); + } + + Function *func_printf = m_mod->getFunction("printf"); + if (!func_printf) + func_printf = declarePrintf(); + assert(func_printf); + std::vector<llvm::Value*> vec = extractVector(val); + Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx")); + Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy")); + Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz")); + Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw")); + std::vector<Value*> params; + params.push_back(m_fmtPtr); + params.push_back(dx); + params.push_back(dy); + params.push_back(dz); + params.push_back(dw); + CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(), + name("printf")); + call->setCallingConv(CallingConv::C); + call->setTailCall(true); +} + +const char * Instructions::name(const char *prefix) +{ + ++m_idx; + snprintf(m_name, 32, "%s%d", prefix, m_idx); + return m_name; +} + +llvm::Value * Instructions::callCeil(llvm::Value *val) +{ + if (!m_llvmCeil) { + // predeclare the intrinsic + std::vector<const Type*> ceilArgs; + ceilArgs.push_back(Type::FloatTy); + AttrListPtr ceilPal; + FunctionType* ceilType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/ceilArgs, + /*isVarArg=*/false); + m_llvmCeil = Function::Create( + /*Type=*/ceilType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"ceilf", m_mod); + m_llvmCeil->setCallingConv(CallingConv::C); + m_llvmCeil->setAttributes(ceilPal); + } + CallInst *call = m_builder.CreateCall(m_llvmCeil, val, + name("ceilf")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value *Instructions::callFAbs(llvm::Value *val) +{ + if (!m_llvmFAbs) { + // predeclare the intrinsic + std::vector<const Type*> fabsArgs; + fabsArgs.push_back(Type::FloatTy); + AttrListPtr fabsPal; + FunctionType* fabsType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/fabsArgs, + /*isVarArg=*/false); + m_llvmFAbs = Function::Create( + /*Type=*/fabsType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"fabs", m_mod); + m_llvmFAbs->setCallingConv(CallingConv::C); + m_llvmFAbs->setAttributes(fabsPal); + } + CallInst *call = m_builder.CreateCall(m_llvmFAbs, val, + name("fabs")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::callFExp(llvm::Value *val) +{ + if (!m_llvmFexp) { + // predeclare the intrinsic + std::vector<const Type*> fexpArgs; + fexpArgs.push_back(Type::FloatTy); + AttrListPtr fexpPal; + FunctionType* fexpType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/fexpArgs, + /*isVarArg=*/false); + m_llvmFexp = Function::Create( + /*Type=*/fexpType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"expf", m_mod); + m_llvmFexp->setCallingConv(CallingConv::C); + m_llvmFexp->setAttributes(fexpPal); + } + CallInst *call = m_builder.CreateCall(m_llvmFexp, val, + name("expf")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::callFLog(llvm::Value *val) +{ + if (!m_llvmFlog) { + // predeclare the intrinsic + std::vector<const Type*> flogArgs; + flogArgs.push_back(Type::FloatTy); + AttrListPtr flogPal; + FunctionType* flogType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/flogArgs, + /*isVarArg=*/false); + m_llvmFlog = Function::Create( + /*Type=*/flogType, + 
/*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"logf", m_mod); + m_llvmFlog->setCallingConv(CallingConv::C); + m_llvmFlog->setAttributes(flogPal); + } + CallInst *call = m_builder.CreateCall(m_llvmFlog, val, + name("logf")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::callFloor(llvm::Value *val) +{ + if (!m_llvmFloor) { + // predeclare the intrinsic + std::vector<const Type*> floorArgs; + floorArgs.push_back(Type::FloatTy); + AttrListPtr floorPal; + FunctionType* floorType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/floorArgs, + /*isVarArg=*/false); + m_llvmFloor = Function::Create( + /*Type=*/floorType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"floorf", m_mod); + m_llvmFloor->setCallingConv(CallingConv::C); + m_llvmFloor->setAttributes(floorPal); + } + CallInst *call = m_builder.CreateCall(m_llvmFloor, val, + name("floorf")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value *Instructions::callFSqrt(llvm::Value *val) +{ + if (!m_llvmFSqrt) { + // predeclare the intrinsic + std::vector<const Type*> fsqrtArgs; + fsqrtArgs.push_back(Type::FloatTy); + AttrListPtr fsqrtPal; + FunctionType* fsqrtType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/fsqrtArgs, + /*isVarArg=*/false); + m_llvmFSqrt = Function::Create( + /*Type=*/fsqrtType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"llvm.sqrt.f32", m_mod); + m_llvmFSqrt->setCallingConv(CallingConv::C); + m_llvmFSqrt->setAttributes(fsqrtPal); + } + CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val, + name("sqrt")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2) +{ + if (!m_llvmPow) { + // predeclare the intrinsic + std::vector<const Type*> powArgs; + powArgs.push_back(Type::FloatTy); + powArgs.push_back(Type::FloatTy); + AttrListPtr powPal; + FunctionType* powType = FunctionType::get( + /*Result=*/Type::FloatTy, + /*Params=*/powArgs, + /*isVarArg=*/false); + m_llvmPow = Function::Create( + /*Type=*/powType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"llvm.pow.f32", m_mod); + m_llvmPow->setCallingConv(CallingConv::C); + m_llvmPow->setAttributes(powPal); + } + std::vector<Value*> params; + params.push_back(val1); + params.push_back(val2); + CallInst *call = m_builder.CreateCall(m_llvmPow, params.begin(), params.end(), + name("pow")); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + return call; +} + +llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y, + llvm::Value *z, llvm::Value *w) +{ + Constant *const_vec = Constant::getNullValue(m_floatVecType); + Value *res = m_builder.CreateInsertElement(const_vec, x, + m_storage->constantInt(0), + name("vecx")); + res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1), + name("vecxy")); + res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2), + name("vecxyz")); + if (w) + res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3), + name("vecxyzw")); + return res; +} + +llvm::Value * Instructions::constVector(float x, float y, float z, float w) +{ + std::vector<Constant*> vec(4); + vec[0] = ConstantFP::get(APFloat(x)); + vec[1] = ConstantFP::get(APFloat(y)); + vec[2] = ConstantFP::get(APFloat(z)); + vec[3] = ConstantFP::get(APFloat(w)); + return ConstantVector::get(m_floatVecType, vec); +} + +llvm::Function * 
Instructions::declarePrintf() +{ + std::vector<const Type*> args; + AttrListPtr params; + FunctionType* funcTy = FunctionType::get( + /*Result=*/IntegerType::get(32), + /*Params=*/args, + /*isVarArg=*/true); + Function* func_printf = Function::Create( + /*Type=*/funcTy, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/"printf", m_mod); + func_printf->setCallingConv(CallingConv::C); + func_printf->setAttributes(params); + return func_printf; +} + +llvm::Function * Instructions::declareFunc(int label) +{ + PointerType *vecPtr = PointerType::getUnqual(m_floatVecType); + std::vector<const Type*> args; + args.push_back(vecPtr); + args.push_back(vecPtr); + args.push_back(vecPtr); + args.push_back(vecPtr); + AttrListPtr params; + FunctionType *funcType = FunctionType::get( + /*Result=*/Type::VoidTy, + /*Params=*/args, + /*isVarArg=*/false); + std::string name = createFuncName(label); + Function *func = Function::Create( + /*Type=*/funcType, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Name=*/name.c_str(), m_mod); + func->setCallingConv(CallingConv::C); + func->setAttributes(params); + return func; +} + +llvm::Function * Instructions::findFunction(int label) +{ + llvm::Function *func = m_functions[label]; + if (!func) { + func = declareFunc(label); + m_functions[label] = func; + } + return func; +} + +std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec) +{ + std::vector<llvm::Value*> elems(4); + elems[0] = m_builder.CreateExtractElement(vec, m_storage->constantInt(0), + name("x")); + elems[1] = m_builder.CreateExtractElement(vec, m_storage->constantInt(1), + name("y")); + elems[2] = m_builder.CreateExtractElement(vec, m_storage->constantInt(2), + name("z")); + elems[3] = m_builder.CreateExtractElement(vec, m_storage->constantInt(3), + name("w")); + return elems; +} + + +#endif //MESA_LLVM + + diff --git a/src/gallium/auxiliary/gallivm/instructions.h b/src/gallium/auxiliary/gallivm/instructions.h new file mode 100644 index 0000000000..e18571251e --- /dev/null +++ b/src/gallium/auxiliary/gallivm/instructions.h @@ -0,0 +1,175 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + /* + * Authors: + * Zack Rusin zack@tungstengraphics.com + */ + +#ifndef INSTRUCTIONS_H +#define INSTRUCTIONS_H + +#include <llvm/BasicBlock.h> +#include <llvm/Module.h> +#include <llvm/Value.h> +#include <llvm/Support/IRBuilder.h> + +#include <map> +#include <stack> + +namespace llvm { + class VectorType; + class Function; +} + +class Storage; + +class Instructions +{ +public: + Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicBlock *block, + Storage *storage); + + llvm::BasicBlock *currentBlock() const; + + llvm::Value *abs(llvm::Value *in1); + llvm::Value *add(llvm::Value *in1, llvm::Value *in2); + llvm::Value *arl(llvm::Value *in1); + void beginLoop(); + void bgnSub(unsigned); + void brk(); + void cal(int label, llvm::Value *input); + llvm::Value *ceil(llvm::Value *in); + llvm::Value *clamp(llvm::Value *in); + llvm::Value *cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); + llvm::Value *cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); + llvm::Value *cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); + llvm::Value *cos(llvm::Value *in); + llvm::Value *cross(llvm::Value *in1, llvm::Value *in2); + llvm::Value *ddx(llvm::Value *in); + llvm::Value *ddy(llvm::Value *in); + llvm::Value *div(llvm::Value *in1, llvm::Value *in2); + llvm::Value *dot2add(llvm::Value *in, llvm::Value *in2, llvm::Value *in3); + llvm::Value *dp2(llvm::Value *in1, llvm::Value *in2); + llvm::Value *dp3(llvm::Value *in1, llvm::Value *in2); + llvm::Value *dp4(llvm::Value *in1, llvm::Value *in2); + llvm::Value *dph(llvm::Value *in1, llvm::Value *in2); + llvm::Value *dst(llvm::Value *in1, llvm::Value *in2); + void elseop(); + void endif(); + void endLoop(); + void end(); + void endSub(); + llvm::Value *exp(llvm::Value *in); + llvm::Value *ex2(llvm::Value *in); + llvm::Value *floor(llvm::Value *in); + llvm::Value *frc(llvm::Value *in); + void ifop(llvm::Value *in); + llvm::Value *kil(llvm::Value *in); + llvm::Value *lerp(llvm::Value *in1, llvm::Value *in2, + llvm::Value *in3); + llvm::Value *lg2(llvm::Value *in); + llvm::Value *lit(llvm::Value *in); + llvm::Value *log(llvm::Value *in); + llvm::Value *madd(llvm::Value *in1, llvm::Value *in2, + llvm::Value *in3); + llvm::Value *max(llvm::Value *in1, llvm::Value *in2); + llvm::Value *min(llvm::Value *in1, llvm::Value *in2); + llvm::Value *mul(llvm::Value *in1, llvm::Value *in2); + llvm::Value *neg(llvm::Value *in); + llvm::Value *nrm(llvm::Value *in); + llvm::Value *pow(llvm::Value *in1, llvm::Value *in2); + llvm::Value *rcp(llvm::Value *in); + llvm::Value *rsq(llvm::Value *in); + llvm::Value *scs(llvm::Value *in); + llvm::Value *seq(llvm::Value *in1, llvm::Value *in2); + llvm::Value *sfl(llvm::Value *in1, llvm::Value *in2); + llvm::Value *sge(llvm::Value *in1, llvm::Value *in2); + llvm::Value *sgt(llvm::Value *in1, llvm::Value *in2); + llvm::Value *sin(llvm::Value *in); + llvm::Value *sle(llvm::Value *in1, llvm::Value *in2); + llvm::Value *slt(llvm::Value *in1, llvm::Value *in2); + llvm::Value *sne(llvm::Value *in1, llvm::Value *in2); + llvm::Value *str(llvm::Value *in1, llvm::Value *in2); + llvm::Value *sub(llvm::Value *in1, llvm::Value *in2); + llvm::Value *trunc(llvm::Value *in); + llvm::Value *x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3); + + void printVector(llvm::Value *val); +private: + const char *name(const char *prefix); + + llvm::Value *callCeil(llvm::Value *val); + llvm::Value *callFAbs(llvm::Value *val); + llvm::Value 
*callFExp(llvm::Value *val); + llvm::Value *callFLog(llvm::Value *val); + llvm::Value *callFloor(llvm::Value *val); + llvm::Value *callFSqrt(llvm::Value *val); + llvm::Value *callPow(llvm::Value *val1, llvm::Value *val2); + + llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y, + llvm::Value *z, llvm::Value *w=0); + + llvm::Value *constVector(float x, float y, float z, float w); + + llvm::Function *declarePrintf(); + llvm::Function *declareFunc(int label); + + llvm::Function *findFunction(int label); + + std::vector<llvm::Value*> extractVector(llvm::Value *vec); +private: + llvm::Module *m_mod; + llvm::Function *m_func; + char m_name[32]; + llvm::IRBuilder<> m_builder; + int m_idx; + + llvm::VectorType *m_floatVecType; + + llvm::Function *m_llvmCeil; + llvm::Function *m_llvmFSqrt; + llvm::Function *m_llvmFAbs; + llvm::Function *m_llvmPow; + llvm::Function *m_llvmFloor; + llvm::Function *m_llvmFlog; + llvm::Function *m_llvmFexp; + llvm::Function *m_llvmLit; + + llvm::Constant *m_fmtPtr; + + std::stack<llvm::BasicBlock*> m_ifStack; + struct Loop { + llvm::BasicBlock *begin; + llvm::BasicBlock *end; + }; + std::stack<Loop> m_loopStack; + std::map<int, llvm::Function*> m_functions; + Storage *m_storage; +}; + +#endif diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp new file mode 100644 index 0000000000..d5600fd22d --- /dev/null +++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp @@ -0,0 +1,522 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ +#include "instructionssoa.h" + +#include "storagesoa.h" + +#include "pipe/p_shader_tokens.h" +#include "util/u_memory.h" + +#include <llvm/CallingConv.h> +#include <llvm/Constants.h> +#include <llvm/Module.h> +#include <llvm/Function.h> +#include <llvm/Instructions.h> +#include <llvm/Transforms/Utils/Cloning.h> +#include <llvm/ParameterAttributes.h> +#include <llvm/Support/MemoryBuffer.h> +#include <llvm/Bitcode/ReaderWriter.h> + + +#include <iostream> + + +/* disable some warnings. 
this file is autogenerated */ +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wunused-variable" +#endif +using namespace llvm; +#include "gallivmsoabuiltins.cpp" +#if defined(__GNUC__) +#pragma GCC diagnostic warning "-Wunused-variable" +#endif + +InstructionsSoa::InstructionsSoa(llvm::Module *mod, llvm::Function *func, + llvm::BasicBlock *block, StorageSoa *storage) + : m_builder(block), + m_storage(storage), + m_idx(0) +{ + createFunctionMap(); + createBuiltins(); +} + +const char * InstructionsSoa::name(const char *prefix) const +{ + ++m_idx; + snprintf(m_name, 32, "%s%d", prefix, m_idx); + return m_name; +} + +llvm::Value * InstructionsSoa::vectorFromVals(llvm::Value *x, llvm::Value *y, + llvm::Value *z, llvm::Value *w) +{ + VectorType *vectorType = VectorType::get(Type::FloatTy, 4); + Constant *constVector = Constant::getNullValue(vectorType); + Value *res = m_builder.CreateInsertElement(constVector, x, + m_storage->constantInt(0), + name("vecx")); + res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1), + name("vecxy")); + res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2), + name("vecxyz")); + if (w) + res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3), + name("vecxyzw")); + return res; +} + +void InstructionsSoa::end() +{ + m_builder.CreateRetVoid(); +} + +std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector) +{ + std::vector<llvm::Value*> res(4); + res[0] = m_builder.CreateExtractElement(vector, + m_storage->constantInt(0), + name("extract1X")); + res[1] = m_builder.CreateExtractElement(vector, + m_storage->constantInt(1), + name("extract2X")); + res[2] = m_builder.CreateExtractElement(vector, + m_storage->constantInt(2), + name("extract3X")); + res[3] = m_builder.CreateExtractElement(vector, + m_storage->constantInt(3), + name("extract4X")); + + return res; +} + +llvm::IRBuilder<>* InstructionsSoa::getIRBuilder() +{ + return &m_builder; +} + +void InstructionsSoa::createFunctionMap() +{ + m_functionsMap[TGSI_OPCODE_ABS] = "abs"; + m_functionsMap[TGSI_OPCODE_DP3] = "dp3"; + m_functionsMap[TGSI_OPCODE_DP4] = "dp4"; + m_functionsMap[TGSI_OPCODE_MIN] = "min"; + m_functionsMap[TGSI_OPCODE_MAX] = "max"; + m_functionsMap[TGSI_OPCODE_POWER] = "pow"; + m_functionsMap[TGSI_OPCODE_LIT] = "lit"; + m_functionsMap[TGSI_OPCODE_RSQ] = "rsq"; + m_functionsMap[TGSI_OPCODE_SLT] = "slt"; +} + +void InstructionsSoa::createDependencies() +{ + { + std::vector<std::string> powDeps(2); + powDeps[0] = "powf"; + powDeps[1] = "powvec"; + m_builtinDependencies["pow"] = powDeps; + } + { + std::vector<std::string> absDeps(2); + absDeps[0] = "fabsf"; + absDeps[1] = "absvec"; + m_builtinDependencies["abs"] = absDeps; + } + { + std::vector<std::string> maxDeps(1); + maxDeps[0] = "maxvec"; + m_builtinDependencies["max"] = maxDeps; + } + { + std::vector<std::string> minDeps(1); + minDeps[0] = "minvec"; + m_builtinDependencies["min"] = minDeps; + } + { + std::vector<std::string> litDeps(4); + litDeps[0] = "minvec"; + litDeps[1] = "maxvec"; + litDeps[2] = "powf"; + litDeps[3] = "powvec"; + m_builtinDependencies["lit"] = litDeps; + } + { + std::vector<std::string> rsqDeps(4); + rsqDeps[0] = "sqrtf"; + rsqDeps[1] = "sqrtvec"; + rsqDeps[2] = "fabsf"; + rsqDeps[3] = "absvec"; + m_builtinDependencies["rsq"] = rsqDeps; + } +} + +llvm::Function * InstructionsSoa::function(int op) +{ + if (m_functions.find(op) != m_functions.end()) + return m_functions[op]; + + std::string name = m_functionsMap[op]; + + std::cout <<"For op = "<<op<<", func 
is '"<<name<<"'"<<std::endl; + + std::vector<std::string> deps = m_builtinDependencies[name]; + for (unsigned int i = 0; i < deps.size(); ++i) { + llvm::Function *func = m_builtins->getFunction(deps[i]); + std::cout <<"\tinjecting dep = '"<<func->getName()<<"'"<<std::endl; + injectFunction(func); + } + + llvm::Function *originalFunc = m_builtins->getFunction(name); + injectFunction(originalFunc, op); + return m_functions[op]; +} + +llvm::Module * InstructionsSoa::currentModule() const +{ + BasicBlock *block = m_builder.GetInsertBlock(); + if (!block || !block->getParent()) + return 0; + + return block->getParent()->getParent(); +} + +void InstructionsSoa::createBuiltins() +{ + MemoryBuffer *buffer = MemoryBuffer::getMemBuffer( + (const char*)&soabuiltins_data[0], + (const char*)&soabuiltins_data[Elements(soabuiltins_data)]); + m_builtins = ParseBitcodeFile(buffer); + std::cout<<"Builtins created at "<<m_builtins<<std::endl; + assert(m_builtins); + createDependencies(); +} + + +std::vector<llvm::Value*> InstructionsSoa::abs(const std::vector<llvm::Value*> in1) +{ + llvm::Function *func = function(TGSI_OPCODE_ABS); + return callBuiltin(func, in1); +} + +std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + std::vector<llvm::Value*> res(4); + + res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx")); + res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy")); + res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz")); + res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw")); + + return res; +} + +std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in) +{ + std::vector<llvm::Value*> res(4); + + //Extract x's + llvm::Value *x1 = m_builder.CreateExtractElement(in[0], + m_storage->constantInt(0), + name("extractX")); + //cast it to an unsigned int + x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast")); + + res[0] = x1;//vectorFromVals(x1, x2, x3, x4); + //only x is valid. 
the others shouldn't be necessary + /* + res[1] = Constant::getNullValue(m_floatVecType); + res[2] = Constant::getNullValue(m_floatVecType); + res[3] = Constant::getNullValue(m_floatVecType); + */ + + return res; +} + +std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + llvm::Function *func = function(TGSI_OPCODE_DP3); + return callBuiltin(func, in1, in2); +} + +std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in) +{ + llvm::Function *func = function(TGSI_OPCODE_LIT); + return callBuiltin(func, in); +} + +std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2, + const std::vector<llvm::Value*> in3) +{ + std::vector<llvm::Value*> res = mul(in1, in2); + return add(res, in3); +} + +std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + llvm::Function *func = function(TGSI_OPCODE_MAX); + return callBuiltin(func, in1, in2); +} + +std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + llvm::Function *func = function(TGSI_OPCODE_MIN); + return callBuiltin(func, in1, in2); +} + +std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + std::vector<llvm::Value*> res(4); + + res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx")); + res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly")); + res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz")); + res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw")); + + return res; +} + +std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + llvm::Function *func = function(TGSI_OPCODE_POWER); + return callBuiltin(func, in1, in2); +} + +std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in) +{ + llvm::Function *func = function(TGSI_OPCODE_RSQ); + return callBuiltin(func, in); +} + +std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + llvm::Function *func = function(TGSI_OPCODE_SLT); + return callBuiltin(func, in1, in2); +} + +std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + std::vector<llvm::Value*> res(4); + + res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx")); + res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby")); + res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz")); + res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw")); + + return res; +} + +void checkFunction(Function *func) +{ + for (Function::const_iterator BI = func->begin(), BE = func->end(); + BI != BE; ++BI) { + const BasicBlock &BB = *BI; + for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end(); + II != IE; ++II) { + const Instruction &I = *II; + std::cout<< "Instr = "<<I; + for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) { + const Value *Op = I.getOperand(op); + std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl; + //I->setOperand(op, V); + } + } + } +} + +llvm::Value * InstructionsSoa::allocaTemp() +{ + VectorType *vector = VectorType::get(Type::FloatTy, 4); + ArrayType *vecArray = ArrayType::get(vector, 4); + AllocaInst *alloca = new AllocaInst(vecArray, name("tmpRes"), + m_builder.GetInsertBlock()); + + 
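+   /* The alloca above reserves a stack-local [4 x <4 x float>] array to
+    * receive a builtin's result; the GEP below takes the address of its
+    * first <4 x float> element so it can be passed as the float4 *res
+    * output parameter of the soabuiltins functions. */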
std::vector<Value*> indices; + indices.push_back(m_storage->constantInt(0)); + indices.push_back(m_storage->constantInt(0)); + GetElementPtrInst *getElem = GetElementPtrInst::Create(alloca, + indices.begin(), + indices.end(), + name("allocaPtr"), + m_builder.GetInsertBlock()); + return getElem; +} + +std::vector<llvm::Value*> InstructionsSoa::allocaToResult(llvm::Value *allocaPtr) +{ + GetElementPtrInst *xElemPtr = GetElementPtrInst::Create(allocaPtr, + m_storage->constantInt(0), + name("xPtr"), + m_builder.GetInsertBlock()); + GetElementPtrInst *yElemPtr = GetElementPtrInst::Create(allocaPtr, + m_storage->constantInt(1), + name("yPtr"), + m_builder.GetInsertBlock()); + GetElementPtrInst *zElemPtr = GetElementPtrInst::Create(allocaPtr, + m_storage->constantInt(2), + name("zPtr"), + m_builder.GetInsertBlock()); + GetElementPtrInst *wElemPtr = GetElementPtrInst::Create(allocaPtr, + m_storage->constantInt(3), + name("wPtr"), + m_builder.GetInsertBlock()); + + std::vector<llvm::Value*> res(4); + res[0] = new LoadInst(xElemPtr, name("xRes"), false, m_builder.GetInsertBlock()); + res[1] = new LoadInst(yElemPtr, name("yRes"), false, m_builder.GetInsertBlock()); + res[2] = new LoadInst(zElemPtr, name("zRes"), false, m_builder.GetInsertBlock()); + res[3] = new LoadInst(wElemPtr, name("wRes"), false, m_builder.GetInsertBlock()); + + return res; +} + +std::vector<llvm::Value*> InstructionsSoa::dp4(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + llvm::Function *func = function(TGSI_OPCODE_DP4); + return callBuiltin(func, in1, in2); +} + +std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std::vector<llvm::Value*> in1) +{ + std::vector<Value*> params; + + llvm::Value *allocaPtr = allocaTemp(); + params.push_back(allocaPtr); + params.push_back(in1[0]); + params.push_back(in1[1]); + params.push_back(in1[2]); + params.push_back(in1[3]); + CallInst *call = m_builder.CreateCall(func, params.begin(), params.end()); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + + return allocaToResult(allocaPtr); +} + +std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2) +{ + std::vector<Value*> params; + + llvm::Value *allocaPtr = allocaTemp(); + params.push_back(allocaPtr); + params.push_back(in1[0]); + params.push_back(in1[1]); + params.push_back(in1[2]); + params.push_back(in1[3]); + params.push_back(in2[0]); + params.push_back(in2[1]); + params.push_back(in2[2]); + params.push_back(in2[3]); + CallInst *call = m_builder.CreateCall(func, params.begin(), params.end()); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + + return allocaToResult(allocaPtr); +} + +std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2, + const std::vector<llvm::Value*> in3) +{ + std::vector<Value*> params; + + llvm::Value *allocaPtr = allocaTemp(); + params.push_back(allocaPtr); + params.push_back(in1[0]); + params.push_back(in1[1]); + params.push_back(in1[2]); + params.push_back(in1[3]); + params.push_back(in2[0]); + params.push_back(in2[1]); + params.push_back(in2[2]); + params.push_back(in2[3]); + params.push_back(in3[0]); + params.push_back(in3[1]); + params.push_back(in3[2]); + params.push_back(in3[3]); + CallInst *call = m_builder.CreateCall(func, params.begin(), params.end()); + call->setCallingConv(CallingConv::C); + call->setTailCall(false); + + return 
allocaToResult(allocaPtr); +} + +void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op) +{ + assert(originalFunc); + std::cout << "injecting function originalFunc " <<originalFunc->getName() <<std::endl; + if (op != TGSI_OPCODE_LAST) { + /* in this case it's possible the function has been already + * injected as part of the dependency chain, which gets + * injected below */ + llvm::Function *func = currentModule()->getFunction(originalFunc->getName()); + if (func) { + m_functions[op] = func; + return; + } + } + llvm::Function *func = 0; + if (originalFunc->isDeclaration()) { + func = Function::Create(originalFunc->getFunctionType(), GlobalValue::ExternalLinkage, + originalFunc->getName(), currentModule()); + func->setCallingConv(CallingConv::C); + const AttrListPtr pal; + func->setAttributes(pal); + currentModule()->dump(); + } else { + DenseMap<const Value*, Value *> val; + val[m_builtins->getFunction("fabsf")] = currentModule()->getFunction("fabsf"); + val[m_builtins->getFunction("powf")] = currentModule()->getFunction("powf"); + val[m_builtins->getFunction("sqrtf")] = currentModule()->getFunction("sqrtf"); + func = CloneFunction(originalFunc, val); +#if 0 + std::cout <<" replacing "<<m_builtins->getFunction("powf") + <<", with " <<currentModule()->getFunction("powf")<<std::endl; + std::cout<<"1111-------------------------------"<<std::endl; + checkFunction(originalFunc); + std::cout<<"2222-------------------------------"<<std::endl; + checkFunction(func); + std::cout <<"XXXX = " <<val[m_builtins->getFunction("powf")]<<std::endl; +#endif + currentModule()->getFunctionList().push_back(func); + } + if (op != TGSI_OPCODE_LAST) { + m_functions[op] = func; + } +} + + diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h new file mode 100644 index 0000000000..d6831e0a6b --- /dev/null +++ b/src/gallium/auxiliary/gallivm/instructionssoa.h @@ -0,0 +1,116 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef INSTRUCTIONSSOA_H +#define INSTRUCTIONSSOA_H + +#include <pipe/p_shader_tokens.h> +#include <llvm/Support/IRBuilder.h> + +#include <map> +#include <vector> + +namespace llvm { + class Module; + class Function; + class BasicBlock; + class Value; +} +class StorageSoa; + +class InstructionsSoa +{ +public: + InstructionsSoa(llvm::Module *mod, llvm::Function *func, + llvm::BasicBlock *block, StorageSoa *storage); + + std::vector<llvm::Value*> abs(const std::vector<llvm::Value*> in1); + std::vector<llvm::Value*> arl(const std::vector<llvm::Value*> in); + std::vector<llvm::Value*> add(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2); + std::vector<llvm::Value*> dp3(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2); + std::vector<llvm::Value*> dp4(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2); + std::vector<llvm::Value*> lit(const std::vector<llvm::Value*> in); + std::vector<llvm::Value*> madd(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2, + const std::vector<llvm::Value*> in3); + std::vector<llvm::Value*> max(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2); + std::vector<llvm::Value*> min(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2); + std::vector<llvm::Value*> mul(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2); + std::vector<llvm::Value*> pow(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2); + std::vector<llvm::Value*> rsq(const std::vector<llvm::Value*> in1); + std::vector<llvm::Value*> slt(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2); + std::vector<llvm::Value*> sub(const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2); + void end(); + + std::vector<llvm::Value*> extractVector(llvm::Value *vector); + llvm::IRBuilder<>* getIRBuilder(); +private: + const char * name(const char *prefix) const; + llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y, + llvm::Value *z, llvm::Value *w); + void createFunctionMap(); + void createBuiltins(); + void createDependencies(); + llvm::Function *function(int); + llvm::Module *currentModule() const; + llvm::Value *allocaTemp(); + std::vector<llvm::Value*> allocaToResult(llvm::Value *allocaPtr); + std::vector<llvm::Value*> callBuiltin(llvm::Function *func, + const std::vector<llvm::Value*> in1); + std::vector<llvm::Value*> callBuiltin(llvm::Function *func, + const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2); + std::vector<llvm::Value*> callBuiltin(llvm::Function *func, + const std::vector<llvm::Value*> in1, + const std::vector<llvm::Value*> in2, + const std::vector<llvm::Value*> in3); + void injectFunction(llvm::Function *originalFunc, int op = TGSI_OPCODE_LAST); +private: + llvm::IRBuilder<> m_builder; + StorageSoa *m_storage; + + std::map<int, std::string> m_functionsMap; + std::map<int, llvm::Function*> m_functions; + llvm::Module *m_builtins; + std::map<std::string, std::vector<std::string> > m_builtinDependencies; + +private: + mutable int m_idx; + mutable char m_name[32]; +}; + + +#endif diff --git a/src/gallium/auxiliary/gallivm/llvm_builtins.c b/src/gallium/auxiliary/gallivm/llvm_builtins.c new file mode 100644 index 0000000000..d5a003a48b --- /dev/null +++ b/src/gallium/auxiliary/gallivm/llvm_builtins.c @@ -0,0 +1,114 @@ 
+/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Zack Rusin zack@tungstengraphics.com + */ +typedef __attribute__(( ext_vector_type(4) )) float float4; + +extern float powf(float a, float b); + +inline float approx(float a, float b) +{ + if (b < -128.0f) b = -128.0f; + if (b > 128.0f) b = 128.0f; + if (a < 0) a = 0; + return powf(a, b); +} + +inline float4 lit(float4 tmp) +{ + float4 result; + result.x = 1.0; + result.w = 1.0; + if (tmp.x > 0) { + result.y = tmp.x; + result.z = approx(tmp.y, tmp.w); + } else { + result.y = 0; + result.z = 0; + } + return result; +} + +inline float4 cmp(float4 tmp0, float4 tmp1, float4 tmp2) +{ + float4 result; + + result.x = (tmp0.x < 0.0) ? tmp1.x : tmp2.x; + result.y = (tmp0.y < 0.0) ? tmp1.y : tmp2.y; + result.z = (tmp0.z < 0.0) ? tmp1.z : tmp2.z; + result.w = (tmp0.w < 0.0) ? 
tmp1.w : tmp2.w; + + return result; +} + +extern float cosf(float val); +extern float sinf(float val); + +inline float4 vcos(float4 val) +{ + float4 result; + printf("VEC IN is %f %f %f %f\n", val.x, val.y, val.z, val.w); + result.x = cosf(val.x); + result.y = cosf(val.x); + result.z = cosf(val.x); + result.w = cosf(val.x); + printf("VEC OUT is %f %f %f %f\n", result.x, result.y, result.z, result.w); + return result; +} + +inline float4 scs(float4 val) +{ + float4 result; + float tmp = val.x; + result.x = cosf(tmp); + result.y = sinf(tmp); + return result; +} + + +inline float4 vsin(float4 val) +{ + float4 result; + float tmp = val.x; + float res = sinf(tmp); + result.x = res; + result.y = res; + result.z = res; + result.w = res; + return result; +} + +inline int kil(float4 val) +{ + if (val.x < 0 || val.y < 0 || val.z < 0 || val.w < 0) + return 1; + else + return 0; +} diff --git a/src/gallium/auxiliary/gallivm/loweringpass.cpp b/src/gallium/auxiliary/gallivm/loweringpass.cpp new file mode 100644 index 0000000000..556dbec366 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/loweringpass.cpp @@ -0,0 +1,17 @@ +#include "loweringpass.h" + +using namespace llvm; + +char LoweringPass::ID = 0; +RegisterPass<LoweringPass> X("lowering", "Lowering Pass"); + +LoweringPass::LoweringPass() + : ModulePass((intptr_t)&ID) +{ +} + +bool LoweringPass::runOnModule(Module &m) +{ + llvm::cerr << "Hello: " << m.getModuleIdentifier() << "\n"; + return false; +} diff --git a/src/gallium/auxiliary/gallivm/loweringpass.h b/src/gallium/auxiliary/gallivm/loweringpass.h new file mode 100644 index 0000000000..f62dcf6ba7 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/loweringpass.h @@ -0,0 +1,15 @@ +#ifndef LOWERINGPASS_H +#define LOWERINGPASS_H + +#include "llvm/Pass.h" +#include "llvm/Module.h" + +struct LoweringPass : public llvm::ModulePass +{ + static char ID; + LoweringPass(); + + virtual bool runOnModule(llvm::Module &m); +}; + +#endif diff --git a/src/gallium/auxiliary/gallivm/soabuiltins.c b/src/gallium/auxiliary/gallivm/soabuiltins.c new file mode 100644 index 0000000000..cb85e1734e --- /dev/null +++ b/src/gallium/auxiliary/gallivm/soabuiltins.c @@ -0,0 +1,210 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + /* + * This file is compiled with clang into the LLVM bitcode + * + * Authors: + * Zack Rusin zack@tungstengraphics.com + */ +typedef __attribute__(( ext_vector_type(4) )) float float4; + + +extern float fabsf(float val); + +/* helpers */ + +float4 absvec(float4 vec) +{ + float4 res; + res.x = fabsf(vec.x); + res.y = fabsf(vec.y); + res.z = fabsf(vec.z); + res.w = fabsf(vec.w); + + return res; +} + +float4 maxvec(float4 a, float4 b) +{ + return (float4){(a.x > b.x) ? a.x : b.x, + (a.y > b.y) ? a.y : b.y, + (a.z > b.z) ? a.z : b.z, + (a.w > b.w) ? a.w : b.w}; +} + +float4 minvec(float4 a, float4 b) +{ + return (float4){(a.x < b.x) ? a.x : b.x, + (a.y < b.y) ? a.y : b.y, + (a.z < b.z) ? a.z : b.z, + (a.w < b.w) ? a.w : b.w}; +} + +extern float powf(float num, float p); +extern float sqrtf(float x); + +float4 powvec(float4 vec, float4 q) +{ + float4 p; + p.x = powf(vec.x, q.x); + p.y = powf(vec.y, q.y); + p.z = powf(vec.z, q.z); + p.w = powf(vec.w, q.w); + return p; +} + +float4 sqrtvec(float4 vec) +{ + float4 p; + p.x = sqrtf(vec.x); + p.y = sqrtf(vec.y); + p.z = sqrtf(vec.z); + p.w = sqrtf(vec.w); + return p; +} + +float4 sltvec(float4 v1, float4 v2) +{ + float4 p; + p.x = (v1.x < v2.x) ? 1.0 : 0.0; + p.y = (v1.y < v2.y) ? 1.0 : 0.0; + p.z = (v1.z < v2.z) ? 1.0 : 0.0; + p.w = (v1.w < v2.w) ? 1.0 : 0.0; + return p; +} + + +/* instructions */ + +void abs(float4 *res, + float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w) +{ + res[0] = absvec(tmp0x); + res[1] = absvec(tmp0y); + res[2] = absvec(tmp0z); + res[3] = absvec(tmp0w); +} + +void dp3(float4 *res, + float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, + float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) +{ + float4 dot = (tmp0x * tmp1x) + (tmp0y * tmp1y) + + (tmp0z * tmp1z); + + res[0] = dot; + res[1] = dot; + res[2] = dot; + res[3] = dot; +} + +void dp4(float4 *res, + float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, + float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) +{ + float4 dot = (tmp0x * tmp1x) + (tmp0y * tmp1y) + + (tmp0z * tmp1z) + (tmp0w * tmp1w); + + res[0] = dot; + res[1] = dot; + res[2] = dot; + res[3] = dot; +} + +void lit(float4 *res, + float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w) +{ + const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0}; + const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f}; + const float4 plus128 = (float4) {128.f, 128.f, 128.f, 128.f}; + + res[0] = (float4){1.0, 1.0, 1.0, 1.0}; + if (tmp0x.x > 0) { + float4 tmpy = maxvec(tmp0y, zerovec); + float4 tmpw = minvec(tmp0w, plus128); + tmpw = maxvec(tmpw, min128); + res[1] = tmp0x; + res[2] = powvec(tmpy, tmpw); + } else { + res[1] = zerovec; + res[2] = zerovec; + } + res[3] = (float4){1.0, 1.0, 1.0, 1.0}; +} + +void min(float4 *res, + float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, + float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) +{ + res[0] = minvec(tmp0x, tmp1x); + res[1] = minvec(tmp0y, tmp1y); + res[2] = minvec(tmp0z, tmp1z); + res[3] = minvec(tmp0w, tmp1w); +} + + +void max(float4 *res, + float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, + float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) +{ + res[0] = maxvec(tmp0x, tmp1x); + res[1] = maxvec(tmp0y, tmp1y); + res[2] = maxvec(tmp0z, tmp1z); + res[3] = maxvec(tmp0w, tmp1w); +} + +void pow(float4 *res, + float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, + float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) +{ + res[0] = powvec(tmp0x, 
tmp1x); + res[1] = res[0]; + res[2] = res[0]; + res[3] = res[0]; +} + +void rsq(float4 *res, + float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w) +{ + const float4 onevec = (float4) {1., 1., 1., 1.}; + res[0] = onevec/sqrtvec(absvec(tmp0x)); + res[1] = onevec/sqrtvec(absvec(tmp0y)); + res[2] = onevec/sqrtvec(absvec(tmp0z)); + res[3] = onevec/sqrtvec(absvec(tmp0w)); +} + +void slt(float4 *res, + float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w, + float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w) +{ + res[0] = sltvec(tmp0x, tmp1x); + res[1] = sltvec(tmp0y, tmp1y); + res[2] = sltvec(tmp0z, tmp1z); + res[3] = sltvec(tmp0w, tmp1w); +} + diff --git a/src/gallium/auxiliary/gallivm/storage.cpp b/src/gallium/auxiliary/gallivm/storage.cpp new file mode 100644 index 0000000000..73df24c976 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/storage.cpp @@ -0,0 +1,364 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + /* + * Authors: + * Zack Rusin zack@tungstengraphics.com + */ +#ifdef MESA_LLVM + +#include "storage.h" + +#include "gallivm_p.h" + +#include "pipe/p_shader_tokens.h" +#include <llvm/BasicBlock.h> +#include <llvm/Module.h> +#include <llvm/Value.h> + +#include <llvm/CallingConv.h> +#include <llvm/Constants.h> +#include <llvm/DerivedTypes.h> +#include <llvm/InstrTypes.h> +#include <llvm/Instructions.h> + +using namespace llvm; + +Storage::Storage(llvm::BasicBlock *block, llvm::Value *input) + : m_block(block), + m_INPUT(input), + m_addrs(32), + m_idx(0) +{ + m_floatVecType = VectorType::get(Type::FloatTy, 4); + m_intVecType = VectorType::get(IntegerType::get(32), 4); + + m_undefFloatVec = UndefValue::get(m_floatVecType); + m_undefIntVec = UndefValue::get(m_intVecType); + m_extSwizzleVec = 0; + + m_numConsts = 0; +} + +//can only build vectors with all members in the [0, 9] range +llvm::Constant *Storage::shuffleMask(int vec) +{ + if (!m_extSwizzleVec) { + std::vector<Constant*> elems; + elems.push_back(ConstantFP::get(APFloat(0.f))); + elems.push_back(ConstantFP::get(APFloat(1.f))); + elems.push_back(ConstantFP::get(APFloat(0.f))); + elems.push_back(ConstantFP::get(APFloat(1.f))); + m_extSwizzleVec = ConstantVector::get(m_floatVecType, elems); + } + + if (m_intVecs.find(vec) != m_intVecs.end()) { + return m_intVecs[vec]; + } + int origVec = vec; + Constant* const_vec = 0; + if (origVec == 0) { + const_vec = Constant::getNullValue(m_intVecType); + } else { + int x = gallivm_x_swizzle(vec); + int y = gallivm_y_swizzle(vec); + int z = gallivm_z_swizzle(vec); + int w = gallivm_w_swizzle(vec); + std::vector<Constant*> elems; + elems.push_back(constantInt(x)); + elems.push_back(constantInt(y)); + elems.push_back(constantInt(z)); + elems.push_back(constantInt(w)); + const_vec = ConstantVector::get(m_intVecType, elems); + } + + m_intVecs[origVec] = const_vec; + return const_vec; +} + +llvm::ConstantInt *Storage::constantInt(int idx) +{ + if (m_constInts.find(idx) != m_constInts.end()) { + return m_constInts[idx]; + } + ConstantInt *const_int = ConstantInt::get(APInt(32, idx)); + m_constInts[idx] = const_int; + return const_int; +} + +llvm::Value *Storage::inputElement(int idx, llvm::Value *indIdx) +{ + Value *val = element(InputsArg, idx, indIdx); + LoadInst *load = new LoadInst(val, name("input"), false, m_block); + load->setAlignment(8); + + return load; +} + +llvm::Value *Storage::constElement(int idx, llvm::Value *indIdx) +{ + m_numConsts = ((idx + 1) > m_numConsts) ? 
(idx + 1) : m_numConsts; + + Value *elem = element(ConstsArg, idx, indIdx); + LoadInst *load = new LoadInst(elem, name("const"), false, m_block); + load->setAlignment(8); + return load; +} + +llvm::Value *Storage::shuffleVector(llvm::Value *vec, int shuffle) +{ + Constant *mask = shuffleMask(shuffle); + ShuffleVectorInst *res = + new ShuffleVectorInst(vec, m_extSwizzleVec, mask, + name("shuffle"), m_block); + return res; +} + + +llvm::Value *Storage::tempElement(int idx, llvm::Value *indIdx) +{ + Value *elem = element(TempsArg, idx, indIdx); + + LoadInst *load = new LoadInst(elem, name("temp"), false, m_block); + load->setAlignment(8); + + return load; +} + +void Storage::setTempElement(int idx, llvm::Value *val, int mask) +{ + if (mask != TGSI_WRITEMASK_XYZW) { + llvm::Value *templ = 0; + if (m_tempWriteMap[idx]) + templ = tempElement(idx); + val = maskWrite(val, mask, templ); + } + Value *elem = element(TempsArg, idx); + StoreInst *st = new StoreInst(val, elem, false, m_block); + st->setAlignment(8); + m_tempWriteMap[idx] = true; +} + +void Storage::setOutputElement(int dstIdx, llvm::Value *val, int mask) +{ + if (mask != TGSI_WRITEMASK_XYZW) { + llvm::Value *templ = 0; + if (m_destWriteMap[dstIdx]) + templ = outputElement(dstIdx); + val = maskWrite(val, mask, templ); + } + + Value *elem = element(DestsArg, dstIdx); + StoreInst *st = new StoreInst(val, elem, false, m_block); + st->setAlignment(8); + m_destWriteMap[dstIdx] = true; +} + +llvm::Value *Storage::maskWrite(llvm::Value *src, int mask, llvm::Value *templ) +{ + llvm::Value *dst = templ; + if (!dst) + dst = Constant::getNullValue(m_floatVecType); + if ((mask & TGSI_WRITEMASK_X)) { + llvm::Value *x = new ExtractElementInst(src, unsigned(0), + name("x"), m_block); + dst = InsertElementInst::Create(dst, x, unsigned(0), + name("dstx"), m_block); + } + if ((mask & TGSI_WRITEMASK_Y)) { + llvm::Value *y = new ExtractElementInst(src, unsigned(1), + name("y"), m_block); + dst = InsertElementInst::Create(dst, y, unsigned(1), + name("dsty"), m_block); + } + if ((mask & TGSI_WRITEMASK_Z)) { + llvm::Value *z = new ExtractElementInst(src, unsigned(2), + name("z"), m_block); + dst = InsertElementInst::Create(dst, z, unsigned(2), + name("dstz"), m_block); + } + if ((mask & TGSI_WRITEMASK_W)) { + llvm::Value *w = new ExtractElementInst(src, unsigned(3), + name("w"), m_block); + dst = InsertElementInst::Create(dst, w, unsigned(3), + name("dstw"), m_block); + } + return dst; +} + +const char * Storage::name(const char *prefix) +{ + ++m_idx; + snprintf(m_name, 32, "%s%d", prefix, m_idx); + return m_name; +} + +int Storage::numConsts() const +{ + return m_numConsts; +} + +llvm::Value * Storage::addrElement(int idx) const +{ + Value *ret = m_addrs[idx]; + if (!ret) + return m_undefFloatVec; + return ret; +} + +void Storage::setAddrElement(int idx, llvm::Value *val, int mask) +{ + if (mask != TGSI_WRITEMASK_XYZW) { + llvm::Value *templ = m_addrs[idx]; + val = maskWrite(val, mask, templ); + } + m_addrs[idx] = val; +} + +llvm::Value * Storage::extractIndex(llvm::Value *vec) +{ + llvm::Value *x = new ExtractElementInst(vec, unsigned(0), + name("x"), m_block); + return new FPToSIInst(x, IntegerType::get(32), name("intidx"), m_block); +} + +void Storage::setCurrentBlock(llvm::BasicBlock *block) +{ + m_block = block; +} + +llvm::Value * Storage::outputElement(int idx, llvm::Value *indIdx) +{ + Value *elem = element(DestsArg, idx, indIdx); + LoadInst *load = new LoadInst(elem, name("output"), false, m_block); + load->setAlignment(8); + + return load; +} + 
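+/* m_INPUT points to the argument block handed to the generated code;
+ * elemPtr() below indexes its fields (dests, inputs, temps, consts, kil)
+ * via the Args enum before the per-element loads and stores above are
+ * performed. */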
+llvm::Value * Storage::inputPtr() const +{ + return m_INPUT; +} + +void Storage::pushArguments(llvm::Value *input) +{ + m_argStack.push(m_INPUT); + + m_INPUT = input; +} + +void Storage::popArguments() +{ + m_INPUT = m_argStack.top(); + m_argStack.pop(); +} + +void Storage::pushTemps() +{ + m_extSwizzleVec = 0; +} + +void Storage::popTemps() +{ +} + +llvm::Value * Storage::immediateElement(int idx) +{ + return m_immediates[idx]; +} + +void Storage::addImmediate(float *val) +{ + std::vector<Constant*> vec(4); + vec[0] = ConstantFP::get(APFloat(val[0])); + vec[1] = ConstantFP::get(APFloat(val[1])); + vec[2] = ConstantFP::get(APFloat(val[2])); + vec[3] = ConstantFP::get(APFloat(val[3])); + m_immediates.push_back(ConstantVector::get(m_floatVecType, vec)); +} + + +llvm::Value * Storage::elemPtr(Args arg) +{ + std::vector<Value*> indices; + indices.push_back(constantInt(0)); + indices.push_back(constantInt(static_cast<int>(arg))); + GetElementPtrInst *getElem = GetElementPtrInst::Create(m_INPUT, + indices.begin(), + indices.end(), + name("input_ptr"), + m_block); + return new LoadInst(getElem, name("input_field"), false, m_block); +} + +llvm::Value * Storage::elemIdx(llvm::Value *ptr, int idx, + llvm::Value *indIdx ) +{ + GetElementPtrInst *getElem = 0; + + if (indIdx) { + getElem = GetElementPtrInst::Create(ptr, + BinaryOperator::Create(Instruction::Add, + indIdx, + constantInt(idx), + name("add"), + m_block), + name("field"), + m_block); + } else { + getElem = GetElementPtrInst::Create(ptr, + constantInt(idx), + name("field"), + m_block); + } + return getElem; +} + +llvm::Value * Storage::element(Args arg, int idx, llvm::Value *indIdx ) +{ + Value *val = elemPtr(arg); + return elemIdx(val, idx, indIdx); +} + +void Storage::setKilElement(llvm::Value *val) +{ + std::vector<Value*> indices; + indices.push_back(constantInt(0)); + indices.push_back(constantInt(static_cast<int>(KilArg))); + GetElementPtrInst *elem = GetElementPtrInst::Create(m_INPUT, + indices.begin(), + indices.end(), + name("kil_ptr"), + m_block); + StoreInst *st = new StoreInst(val, elem, false, m_block); + st->setAlignment(8); +} + +#endif //MESA_LLVM + + diff --git a/src/gallium/auxiliary/gallivm/storage.h b/src/gallium/auxiliary/gallivm/storage.h new file mode 100644 index 0000000000..8574f7554e --- /dev/null +++ b/src/gallium/auxiliary/gallivm/storage.h @@ -0,0 +1,133 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Zack Rusin zack@tungstengraphics.com + */ + +#ifndef STORAGE_H +#define STORAGE_H + +#include <map> +#include <set> +#include <stack> +#include <vector> + +namespace llvm { + class BasicBlock; + class Constant; + class ConstantInt; + class LoadInst; + class Value; + class VectorType; +} + +class Storage +{ +public: + Storage(llvm::BasicBlock *block, + llvm::Value *input); + + llvm::Value *inputPtr() const; + + void setCurrentBlock(llvm::BasicBlock *block); + + llvm::ConstantInt *constantInt(int); + llvm::Constant *shuffleMask(int vec); + llvm::Value *inputElement(int idx, llvm::Value *indIdx =0); + llvm::Value *constElement(int idx, llvm::Value *indIdx =0); + llvm::Value *outputElement(int idx, llvm::Value *indIdx =0); + llvm::Value *tempElement(int idx, llvm::Value *indIdx =0); + llvm::Value *immediateElement(int idx); + + void setOutputElement(int dstIdx, llvm::Value *val, int mask); + void setTempElement(int idx, llvm::Value *val, int mask); + + llvm::Value *addrElement(int idx) const; + void setAddrElement(int idx, llvm::Value *val, int mask); + + void setKilElement(llvm::Value *val); + + llvm::Value *shuffleVector(llvm::Value *vec, int shuffle); + + llvm::Value *extractIndex(llvm::Value *vec); + + int numConsts() const; + + void pushArguments(llvm::Value *input); + void popArguments(); + void pushTemps(); + void popTemps(); + + void addImmediate(float *val); + +private: + llvm::Value *maskWrite(llvm::Value *src, int mask, llvm::Value *templ); + const char *name(const char *prefix); + + enum Args { + DestsArg = 0, + InputsArg = 1, + TempsArg = 2, + ConstsArg = 3, + KilArg = 4 + }; + llvm::Value *elemPtr(Args arg); + llvm::Value *elemIdx(llvm::Value *ptr, int idx, + llvm::Value *indIdx = 0); + llvm::Value *element(Args arg, int idx, llvm::Value *indIdx = 0); + +private: + llvm::BasicBlock *m_block; + llvm::Value *m_INPUT; + + std::map<int, llvm::ConstantInt*> m_constInts; + std::map<int, llvm::Constant*> m_intVecs; + std::vector<llvm::Value*> m_addrs; + std::vector<llvm::Constant*> m_immediates; + + llvm::VectorType *m_floatVecType; + llvm::VectorType *m_intVecType; + + char m_name[32]; + int m_idx; + + int m_numConsts; + + std::map<int, bool > m_destWriteMap; + std::map<int, bool > m_tempWriteMap; + + llvm::Value *m_undefFloatVec; + llvm::Value *m_undefIntVec; + llvm::Value *m_extSwizzleVec; + + std::stack<llvm::Value*> m_argStack; + std::stack<std::vector<llvm::Value*> > m_tempStack; +}; + +#endif diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp new file mode 100644 index 0000000000..e1e5cabcf5 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp @@ -0,0 +1,438 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "storagesoa.h" + +#include "gallivm_p.h" + +#include "pipe/p_shader_tokens.h" +#include "pipe/p_debug.h" + +#include <llvm/BasicBlock.h> +#include <llvm/Module.h> +#include <llvm/Value.h> + +#include <llvm/CallingConv.h> +#include <llvm/Constants.h> +#include <llvm/DerivedTypes.h> +#include <llvm/InstrTypes.h> +#include <llvm/Instructions.h> + +using namespace llvm; + + +StorageSoa::StorageSoa(llvm::BasicBlock *block, + llvm::Value *input, + llvm::Value *output, + llvm::Value *consts) + : m_block(block), + m_input(input), + m_output(output), + m_consts(consts), + m_immediates(0), + m_idx(0) +{ +} + +void StorageSoa::addImmediate(float *vec) +{ + std::vector<float> vals(4); + vals[0] = vec[0]; + vals[1] = vec[1]; + vals[2] = vec[2]; + vals[3] = vec[3]; + m_immediatesToFlush.push_back(vals); +} + +void StorageSoa::declareImmediates() +{ + if (m_immediatesToFlush.empty()) + return; + + VectorType *vectorType = VectorType::get(Type::FloatTy, 4); + ArrayType *vectorChannels = ArrayType::get(vectorType, 4); + ArrayType *arrayType = ArrayType::get(vectorChannels, m_immediatesToFlush.size()); + + m_immediates = new GlobalVariable( + /*Type=*/arrayType, + /*isConstant=*/false, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Initializer=*/0, // has initializer, specified below + /*Name=*/name("immediates"), + currentModule()); + + std::vector<Constant*> arrayVals; + for (unsigned int i = 0; i < m_immediatesToFlush.size(); ++i) { + std::vector<float> vec = m_immediatesToFlush[i]; + std::vector<float> vals(4); + std::vector<Constant*> channelArray; + + vals[0] = vec[0]; vals[1] = vec[1]; vals[2] = vec[2]; vals[3] = vec[3]; + llvm::Constant *xChannel = createConstGlobalVector(vals); + + vals[0] = vec[1]; vals[1] = vec[1]; vals[2] = vec[1]; vals[3] = vec[1]; + llvm::Constant *yChannel = createConstGlobalVector(vals); + + vals[0] = vec[2]; vals[1] = vec[2]; vals[2] = vec[2]; vals[3] = vec[2]; + llvm::Constant *zChannel = createConstGlobalVector(vals); + + vals[0] = vec[3]; vals[1] = vec[3]; vals[2] = vec[3]; vals[3] = vec[3]; + llvm::Constant *wChannel = createConstGlobalVector(vals); + channelArray.push_back(xChannel); + channelArray.push_back(yChannel); + channelArray.push_back(zChannel); + channelArray.push_back(wChannel); + Constant *constChannels = ConstantArray::get(vectorChannels, + 
channelArray); + arrayVals.push_back(constChannels); + } + Constant *constArray = ConstantArray::get(arrayType, arrayVals); + m_immediates->setInitializer(constArray); + + m_immediatesToFlush.clear(); +} + +llvm::Value *StorageSoa::addrElement(int idx) const +{ + std::map<int, llvm::Value*>::const_iterator itr = m_addresses.find(idx); + if (itr == m_addresses.end()) { + debug_printf("Trying to access invalid shader 'address'\n"); + return 0; + } + llvm::Value * res = (*itr).second; + + res = new LoadInst(res, name("addr"), false, m_block); + + return res; +} + +std::vector<llvm::Value*> StorageSoa::inputElement(llvm::Value *idx) +{ + std::vector<llvm::Value*> res(4); + + res[0] = element(m_input, idx, 0); + res[1] = element(m_input, idx, 1); + res[2] = element(m_input, idx, 2); + res[3] = element(m_input, idx, 3); + + return res; +} + +llvm::Value* StorageSoa::unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value* vector, int cc) +{ + std::vector<llvm::Value*> x(4); + x[0] = m_builder->CreateExtractElement(vector, + constantInt(cc), + name("x")); + + VectorType *vectorType = VectorType::get(Type::FloatTy, 4); + Constant *constVector = Constant::getNullValue(vectorType); + Value *res = m_builder->CreateInsertElement(constVector, x[0], + constantInt(0), + name("vecx")); + res = m_builder->CreateInsertElement(res, x[0], constantInt(1), + name("vecxx")); + res = m_builder->CreateInsertElement(res, x[0], constantInt(2), + name("vecxxx")); + res = m_builder->CreateInsertElement(res, x[0], constantInt(3), + name("vecxxxx")); + return res; +} + +std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder, llvm::Value *idx) +{ + llvm::Value* res; + std::vector<llvm::Value*> res2(4); + llvm::Value *xChannel; + + xChannel = elementPointer(m_consts, idx, 0); + + res = alignedArrayLoad(xChannel); + + res2[0]=unpackConstElement(m_builder, res,0); + res2[1]=unpackConstElement(m_builder, res,1); + res2[2]=unpackConstElement(m_builder, res,2); + res2[3]=unpackConstElement(m_builder, res,3); + + return res2; +} + +std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx) +{ + std::vector<llvm::Value*> res(4); + + res[0] = element(m_output, idx, 0); + res[1] = element(m_output, idx, 1); + res[2] = element(m_output, idx, 2); + res[3] = element(m_output, idx, 3); + + return res; +} + +std::vector<llvm::Value*> StorageSoa::tempElement(llvm::IRBuilder<>* m_builder, int idx) +{ + std::vector<llvm::Value*> res(4); + llvm::Value *temp = m_temps[idx]; + + res[0] = element(temp, constantInt(0), 0); + res[1] = element(temp, constantInt(0), 1); + res[2] = element(temp, constantInt(0), 2); + res[3] = element(temp, constantInt(0), 3); + + return res; +} + +std::vector<llvm::Value*> StorageSoa::immediateElement(llvm::Value *idx) +{ + std::vector<llvm::Value*> res(4); + + res[0] = element(m_immediates, idx, 0); + res[1] = element(m_immediates, idx, 1); + res[2] = element(m_immediates, idx, 2); + res[3] = element(m_immediates, idx, 3); + + return res; +} + +llvm::Value * StorageSoa::elementPointer(llvm::Value *ptr, llvm::Value *index, + int channel) const +{ + std::vector<Value*> indices; + if (m_immediates == ptr) + indices.push_back(constantInt(0)); + indices.push_back(index); + indices.push_back(constantInt(channel)); + + GetElementPtrInst *getElem = GetElementPtrInst::Create(ptr, + indices.begin(), + indices.end(), + name("ptr"), + m_block); + return getElem; +} + +llvm::Value * StorageSoa::element(llvm::Value *ptr, llvm::Value *index, + int channel) const +{ + llvm::Value *res = 
elementPointer(ptr, index, channel); + LoadInst *load = new LoadInst(res, name("element"), false, m_block); + //load->setAlignment(8); + return load; +} + +const char * StorageSoa::name(const char *prefix) const +{ + ++m_idx; + snprintf(m_name, 32, "%s%d", prefix, m_idx); + return m_name; +} + +llvm::ConstantInt * StorageSoa::constantInt(int idx) const +{ + if (m_constInts.find(idx) != m_constInts.end()) { + return m_constInts[idx]; + } + ConstantInt *constInt = ConstantInt::get(APInt(32, idx)); + m_constInts[idx] = constInt; + return constInt; +} + +llvm::Value *StorageSoa::alignedArrayLoad(llvm::Value *val) +{ + VectorType *vectorType = VectorType::get(Type::FloatTy, 4); + PointerType *vectorPtr = PointerType::get(vectorType, 0); + + CastInst *cast = new BitCastInst(val, vectorPtr, name("toVector"), m_block); + LoadInst *load = new LoadInst(cast, name("alignLoad"), false, m_block); + load->setAlignment(8); + return load; +} + +llvm::Module * StorageSoa::currentModule() const +{ + if (!m_block || !m_block->getParent()) + return 0; + + return m_block->getParent()->getParent(); +} + +llvm::Constant * StorageSoa::createConstGlobalFloat(const float val) +{ + Constant*c = ConstantFP::get(APFloat(val)); + return c; +} + +llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &vec) +{ + VectorType *vectorType = VectorType::get(Type::FloatTy, 4); + std::vector<Constant*> immValues; + ConstantFP *constx = ConstantFP::get(APFloat(vec[0])); + ConstantFP *consty = ConstantFP::get(APFloat(vec[1])); + ConstantFP *constz = ConstantFP::get(APFloat(vec[2])); + ConstantFP *constw = ConstantFP::get(APFloat(vec[3])); + immValues.push_back(constx); + immValues.push_back(consty); + immValues.push_back(constz); + immValues.push_back(constw); + Constant *constVector = ConstantVector::get(vectorType, immValues); + + return constVector; +} + +std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, int swizzle, + llvm::IRBuilder<>* m_builder,llvm::Value *indIdx) +{ + std::vector<llvm::Value*> val(4); + + //if we have an indirect index, always use that + // if not use the integer offset to create one + llvm::Value *realIndex = 0; + if (indIdx) + realIndex = indIdx; + else + realIndex = constantInt(idx); + debug_printf("XXXXXXXXX realIdx = %p, indIdx = %p\n", realIndex, indIdx); + + switch(type) { + case TGSI_FILE_INPUT: + val = inputElement(realIndex); + break; + case TGSI_FILE_OUTPUT: + val = outputElement(realIndex); + break; + case TGSI_FILE_TEMPORARY: + val = tempElement(m_builder, idx); + break; + case TGSI_FILE_CONSTANT: + val = constElement(m_builder, realIndex); + break; + case TGSI_FILE_IMMEDIATE: + val = immediateElement(realIndex); + break; + case TGSI_FILE_ADDRESS: + debug_printf("Address not handled in the load phase!\n"); + assert(0); + break; + default: + debug_printf("Unknown load!\n"); + assert(0); + break; + } + if (!gallivm_is_swizzle(swizzle)) + return val; + + std::vector<llvm::Value*> res(4); + + res[0] = val[gallivm_x_swizzle(swizzle)]; + res[1] = val[gallivm_y_swizzle(swizzle)]; + res[2] = val[gallivm_z_swizzle(swizzle)]; + res[3] = val[gallivm_w_swizzle(swizzle)]; + return res; +} + +llvm::Value * StorageSoa::allocaTemp(llvm::IRBuilder<>* m_builder) +{ + VectorType *vector = VectorType::get(Type::FloatTy, 4); + ArrayType *vecArray = ArrayType::get(vector, 4); + AllocaInst *alloca = new AllocaInst(vecArray, "temp", + m_builder->GetInsertBlock()); + + return alloca; +} + + +void StorageSoa::store(enum tgsi_file_type type, int idx, const 
std::vector<llvm::Value*> &val, + int mask, llvm::IRBuilder<>* m_builder) +{ + llvm::Value *out = 0; + llvm::Value *realIndex = 0; + switch(type) { + case TGSI_FILE_OUTPUT: + out = m_output; + realIndex = constantInt(idx); + break; + case TGSI_FILE_TEMPORARY: + // if that temp doesn't already exist, alloca it + if (m_temps.find(idx) == m_temps.end()) + m_temps[idx] = allocaTemp(m_builder); + + out = m_temps[idx]; + + realIndex = constantInt(0); + break; + case TGSI_FILE_INPUT: + out = m_input; + realIndex = constantInt(idx); + break; + case TGSI_FILE_ADDRESS: { + llvm::Value *addr = m_addresses[idx]; + if (!addr) { + addAddress(idx); + addr = m_addresses[idx]; + assert(addr); + } + new StoreInst(val[0], addr, false, m_block); + return; + break; + } + default: + debug_printf("Can't save output of this type: %d !\n", type); + assert(0); + break; + } + if ((mask & TGSI_WRITEMASK_X)) { + llvm::Value *xChannel = elementPointer(out, realIndex, 0); + new StoreInst(val[0], xChannel, false, m_block); + } + if ((mask & TGSI_WRITEMASK_Y)) { + llvm::Value *yChannel = elementPointer(out, realIndex, 1); + new StoreInst(val[1], yChannel, false, m_block); + } + if ((mask & TGSI_WRITEMASK_Z)) { + llvm::Value *zChannel = elementPointer(out, realIndex, 2); + new StoreInst(val[2], zChannel, false, m_block); + } + if ((mask & TGSI_WRITEMASK_W)) { + llvm::Value *wChannel = elementPointer(out, realIndex, 3); + new StoreInst(val[3], wChannel, false, m_block); + } +} + +void StorageSoa::addAddress(int idx) +{ + GlobalVariable *val = new GlobalVariable( + /*Type=*/IntegerType::get(32), + /*isConstant=*/false, + /*Linkage=*/GlobalValue::ExternalLinkage, + /*Initializer=*/0, // has initializer, specified below + /*Name=*/name("address"), + currentModule()); + val->setInitializer(Constant::getNullValue(IntegerType::get(32))); + + debug_printf("adding to %d\n", idx); + m_addresses[idx] = val; +} diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h b/src/gallium/auxiliary/gallivm/storagesoa.h new file mode 100644 index 0000000000..56886f85e7 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/storagesoa.h @@ -0,0 +1,107 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef STORAGESOA_H +#define STORAGESOA_H + +#include <pipe/p_shader_tokens.h> +#include <llvm/Support/IRBuilder.h> + +#include <vector> +#include <list> +#include <map> + +namespace llvm { + class BasicBlock; + class Constant; + class ConstantInt; + class GlobalVariable; + class LoadInst; + class Value; + class VectorType; + class Module; +} + +class StorageSoa +{ +public: + StorageSoa(llvm::BasicBlock *block, + llvm::Value *input, + llvm::Value *output, + llvm::Value *consts); + + + std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle, + llvm::IRBuilder<>* m_builder, llvm::Value *indIdx =0); + void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val, + int mask, llvm::IRBuilder<>* m_builder); + + void addImmediate(float *vec); + void declareImmediates(); + + void addAddress(int idx); + + llvm::Value * addrElement(int idx) const; + + llvm::ConstantInt *constantInt(int) const; +private: + llvm::Value *elementPointer(llvm::Value *ptr, llvm::Value *indIdx, + int channel) const; + llvm::Value *element(llvm::Value *ptr, llvm::Value *idx, + int channel) const; + const char *name(const char *prefix) const; + llvm::Value *alignedArrayLoad(llvm::Value *val); + llvm::Module *currentModule() const; + llvm::Constant *createConstGlobalFloat(const float val); + llvm::Constant *createConstGlobalVector(const std::vector<float> &vec); + + std::vector<llvm::Value*> inputElement(llvm::Value *indIdx); + llvm::Value* unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx, int cc); + std::vector<llvm::Value*> constElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx); + std::vector<llvm::Value*> outputElement(llvm::Value *indIdx); + std::vector<llvm::Value*> tempElement(llvm::IRBuilder<>* m_builder, int idx); + std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx); +private: + llvm::BasicBlock *m_block; + + llvm::Value *m_input; + llvm::Value *m_output; + llvm::Value *m_consts; + std::map<int, llvm::Value*> m_temps; + llvm::GlobalVariable *m_immediates; + + std::map<int, llvm::Value*> m_addresses; + + std::vector<std::vector<float> > m_immediatesToFlush; + llvm::Value * allocaTemp(llvm::IRBuilder<>* m_builder); + + mutable std::map<int, llvm::ConstantInt*> m_constInts; + mutable char m_name[32]; + mutable int m_idx; +}; + +#endif diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp new file mode 100644 index 0000000000..c11b88af9e --- /dev/null +++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp @@ -0,0 +1,1164 @@ +#include "tgsitollvm.h" + +#include "gallivm.h" +#include "gallivm_p.h" + +#include "storage.h" +#include "instructions.h" +#include "storagesoa.h" +#include "instructionssoa.h" + +#include "pipe/p_shader_tokens.h" + +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_build.h" +#include "tgsi/tgsi_dump.h" + + +#include <llvm/Module.h> +#include <llvm/CallingConv.h> +#include <llvm/Constants.h> +#include <llvm/DerivedTypes.h> +#include <llvm/Instructions.h> +#include <llvm/ModuleProvider.h> +#include <llvm/Pass.h> +#include <llvm/PassManager.h> +#include <llvm/ParameterAttributes.h> +#include <llvm/Support/PatternMatch.h> +#include <llvm/ExecutionEngine/JIT.h> +#include <llvm/ExecutionEngine/Interpreter.h> +#include <llvm/ExecutionEngine/GenericValue.h> +#include <llvm/Support/MemoryBuffer.h> +#include <llvm/LinkAllPasses.h> 
+#include <llvm/Analysis/Verifier.h> +#include <llvm/Analysis/LoopPass.h> +#include <llvm/Target/TargetData.h> +#include <llvm/Bitcode/ReaderWriter.h> +#include <llvm/Transforms/Utils/Cloning.h> + + +#include <sstream> +#include <fstream> +#include <iostream> + +using namespace llvm; + +static inline FunctionType *vertexShaderFunctionType() +{ + //Function takes three arguments, + // the calling code has to make sure the types it will + // pass are castable to the following: + // [4 x <4 x float>] inputs, + // [4 x <4 x float>] output, + // [4 x [1 x float]] consts, + + std::vector<const Type*> funcArgs; + VectorType *vectorType = VectorType::get(Type::FloatTy, 4); + ArrayType *vectorArray = ArrayType::get(vectorType, 4); + PointerType *vectorArrayPtr = PointerType::get(vectorArray, 0); + + ArrayType *floatArray = ArrayType::get(Type::FloatTy, 4); + ArrayType *constsArray = ArrayType::get(floatArray, 1); + PointerType *constsArrayPtr = PointerType::get(constsArray, 0); + + funcArgs.push_back(vectorArrayPtr);//inputs + funcArgs.push_back(vectorArrayPtr);//output + funcArgs.push_back(constsArrayPtr);//consts + + FunctionType *functionType = FunctionType::get( + /*Result=*/Type::VoidTy, + /*Params=*/funcArgs, + /*isVarArg=*/false); + + return functionType; +} + +static inline void +add_interpolator(struct gallivm_ir *ir, + struct gallivm_interpolate *interp) +{ + ir->interpolators[ir->num_interp] = *interp; + ++ir->num_interp; +} + +static void +translate_declaration(struct gallivm_ir *prog, + llvm::Module *module, + Storage *storage, + struct tgsi_full_declaration *decl, + struct tgsi_full_declaration *fd) +{ + if (decl->Declaration.File == TGSI_FILE_INPUT) { + unsigned first, last, mask; + uint interp_method; + + first = decl->DeclarationRange.First; + last = decl->DeclarationRange.Last; + mask = decl->Declaration.UsageMask; + + /* Do not touch WPOS.xy */ + if (first == 0) { + mask &= ~TGSI_WRITEMASK_XY; + if (mask == TGSI_WRITEMASK_NONE) { + first++; + if (first > last) { + return; + } + } + } + + interp_method = decl->Declaration.Interpolate; + + if (mask == TGSI_WRITEMASK_XYZW) { + unsigned i, j; + + for (i = first; i <= last; i++) { + for (j = 0; j < NUM_CHANNELS; j++) { + //interp( mach, i, j ); + struct gallivm_interpolate interp; + interp.type = interp_method; + interp.attrib = i; + interp.chan = j; + add_interpolator(prog, &interp); + } + } + } else { + unsigned i, j; + for( j = 0; j < NUM_CHANNELS; j++ ) { + if( mask & (1 << j) ) { + for( i = first; i <= last; i++ ) { + struct gallivm_interpolate interp; + interp.type = interp_method; + interp.attrib = i; + interp.chan = j; + add_interpolator(prog, &interp); + } + } + } + } + } +} + +static void +translate_declarationir(struct gallivm_ir *, + llvm::Module *, + StorageSoa *storage, + struct tgsi_full_declaration *decl, + struct tgsi_full_declaration *) +{ + if (decl->Declaration.File == TGSI_FILE_ADDRESS) { + int idx = decl->DeclarationRange.First; + storage->addAddress(idx); + } +} + +static void +translate_immediate(Storage *storage, + struct tgsi_full_immediate *imm) +{ + float vec[4]; + int i; + for (i = 0; i < imm->Immediate.Size - 1; ++i) { + switch (imm->Immediate.DataType) { + case TGSI_IMM_FLOAT32: + vec[i] = imm->u.ImmediateFloat32[i].Float; + break; + default: + assert(0); + } + } + storage->addImmediate(vec); +} + + +static void +translate_immediateir(StorageSoa *storage, + struct tgsi_full_immediate *imm) +{ + float vec[4]; + int i; + for (i = 0; i < imm->Immediate.Size - 1; ++i) { + switch (imm->Immediate.DataType) { + 
case TGSI_IMM_FLOAT32: + vec[i] = imm->u.ImmediateFloat32[i].Float; + break; + default: + assert(0); + } + } + storage->addImmediate(vec); +} + +static inline int +swizzleInt(struct tgsi_full_src_register *src) +{ + int swizzle = 0; + int start = 1000; + + for (int k = 0; k < 4; ++k) { + swizzle += tgsi_util_get_full_src_register_extswizzle(src, k) * start; + start /= 10; + } + return swizzle; +} + +static inline llvm::Value * +swizzleVector(llvm::Value *val, struct tgsi_full_src_register *src, + Storage *storage) +{ + int swizzle = swizzleInt(src); + + if (gallivm_is_swizzle(swizzle)) { + /*fprintf(stderr, "XXXXXXXX swizzle = %d\n", swizzle);*/ + val = storage->shuffleVector(val, swizzle); + } + return val; +} + +static void +translate_instruction(llvm::Module *module, + Storage *storage, + Instructions *instr, + struct tgsi_full_instruction *inst, + struct tgsi_full_instruction *fi, + unsigned instno) +{ + llvm::Value *inputs[4]; + inputs[0] = 0; + inputs[1] = 0; + inputs[2] = 0; + inputs[3] = 0; + + for (int i = 0; i < inst->Instruction.NumSrcRegs; ++i) { + struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i]; + llvm::Value *val = 0; + llvm::Value *indIdx = 0; + + if (src->SrcRegister.Indirect) { + indIdx = storage->addrElement(src->SrcRegisterInd.Index); + indIdx = storage->extractIndex(indIdx); + } + if (src->SrcRegister.File == TGSI_FILE_CONSTANT) { + val = storage->constElement(src->SrcRegister.Index, indIdx); + } else if (src->SrcRegister.File == TGSI_FILE_INPUT) { + val = storage->inputElement(src->SrcRegister.Index, indIdx); + } else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) { + val = storage->tempElement(src->SrcRegister.Index); + } else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) { + val = storage->outputElement(src->SrcRegister.Index, indIdx); + } else if (src->SrcRegister.File == TGSI_FILE_IMMEDIATE) { + val = storage->immediateElement(src->SrcRegister.Index); + } else { + fprintf(stderr, "ERROR: not supported llvm source %d\n", src->SrcRegister.File); + return; + } + + inputs[i] = swizzleVector(val, src, storage); + } + + /*if (inputs[0]) + instr->printVector(inputs[0]); + if (inputs[1]) + instr->printVector(inputs[1]);*/ + llvm::Value *out = 0; + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ARL: { + out = instr->arl(inputs[0]); + } + break; + case TGSI_OPCODE_MOV: { + out = inputs[0]; + } + break; + case TGSI_OPCODE_LIT: { + out = instr->lit(inputs[0]); + } + break; + case TGSI_OPCODE_RCP: { + out = instr->rcp(inputs[0]); + } + break; + case TGSI_OPCODE_RSQ: { + out = instr->rsq(inputs[0]); + } + break; + case TGSI_OPCODE_EXP: { + out = instr->exp(inputs[0]); + } + break; + case TGSI_OPCODE_LOG: { + out = instr->log(inputs[0]); + } + break; + case TGSI_OPCODE_MUL: { + out = instr->mul(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_ADD: { + out = instr->add(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_DP3: { + out = instr->dp3(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_DP4: { + out = instr->dp4(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_DST: { + out = instr->dst(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_MIN: { + out = instr->min(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_MAX: { + out = instr->max(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_SLT: { + out = instr->slt(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_SGE: { + out = instr->sge(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_MAD: { + out = instr->madd(inputs[0], inputs[1], inputs[2]); + } + 
break; + case TGSI_OPCODE_SUB: { + out = instr->sub(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_LERP: { + out = instr->lerp(inputs[0], inputs[1], inputs[2]); + } + break; + case TGSI_OPCODE_CND: { + out = instr->cnd(inputs[0], inputs[1], inputs[2]); + } + break; + case TGSI_OPCODE_CND0: { + out = instr->cnd0(inputs[0], inputs[1], inputs[2]); + } + break; + case TGSI_OPCODE_DOT2ADD: { + out = instr->dot2add(inputs[0], inputs[1], inputs[2]); + } + break; + case TGSI_OPCODE_INDEX: + break; + case TGSI_OPCODE_NEGATE: { + out = instr->neg(inputs[0]); + } + break; + case TGSI_OPCODE_FRAC: { + out = instr->frc(inputs[0]); + } + break; + case TGSI_OPCODE_CLAMP: { + out = instr->clamp(inputs[0]); + } + break; + case TGSI_OPCODE_FLOOR: { + out = instr->floor(inputs[0]); + } + break; + case TGSI_OPCODE_ROUND: + break; + case TGSI_OPCODE_EXPBASE2: { + out = instr->ex2(inputs[0]); + } + break; + case TGSI_OPCODE_LOGBASE2: { + out = instr->lg2(inputs[0]); + } + break; + case TGSI_OPCODE_POWER: { + out = instr->pow(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_CROSSPRODUCT: { + out = instr->cross(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_MULTIPLYMATRIX: + break; + case TGSI_OPCODE_ABS: { + out = instr->abs(inputs[0]); + } + break; + case TGSI_OPCODE_RCC: + break; + case TGSI_OPCODE_DPH: { + out = instr->dph(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_COS: { + out = instr->cos(inputs[0]); + } + break; + case TGSI_OPCODE_DDX: { + out = instr->ddx(inputs[0]); + } + break; + case TGSI_OPCODE_DDY: { + out = instr->ddy(inputs[0]); + } + break; + case TGSI_OPCODE_KILP: + break; + case TGSI_OPCODE_PK2H: + break; + case TGSI_OPCODE_PK2US: + break; + case TGSI_OPCODE_PK4B: + break; + case TGSI_OPCODE_PK4UB: + break; + case TGSI_OPCODE_RFL: + break; + case TGSI_OPCODE_SEQ: { + out = instr->seq(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_SFL: { + out = instr->sfl(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_SGT: { + out = instr->sgt(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_SIN: { + out = instr->sin(inputs[0]); + } + break; + case TGSI_OPCODE_SLE: { + out = instr->sle(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_SNE: { + out = instr->sne(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_STR: { + out = instr->str(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_TEX: + break; + case TGSI_OPCODE_TXD: + break; + case TGSI_OPCODE_UP2H: + break; + case TGSI_OPCODE_UP2US: + break; + case TGSI_OPCODE_UP4B: + break; + case TGSI_OPCODE_UP4UB: + break; + case TGSI_OPCODE_X2D: { + out = instr->x2d(inputs[0], inputs[1], inputs[2]); + } + break; + case TGSI_OPCODE_ARA: + break; + case TGSI_OPCODE_ARR: + break; + case TGSI_OPCODE_BRA: + break; + case TGSI_OPCODE_CAL: { + instr->cal(inst->InstructionExtLabel.Label, storage->inputPtr()); + return; + } + break; + case TGSI_OPCODE_RET: { + instr->end(); + return; + } + break; + case TGSI_OPCODE_SSG: + break; + case TGSI_OPCODE_CMP: { + out = instr->cmp(inputs[0], inputs[1], inputs[2]); + } + break; + case TGSI_OPCODE_SCS: { + out = instr->scs(inputs[0]); + } + break; + case TGSI_OPCODE_TXB: + break; + case TGSI_OPCODE_NRM4: + case TGSI_OPCODE_NRM: { + out = instr->nrm(inputs[0]); + } + break; + case TGSI_OPCODE_DIV: { + out = instr->div(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_DP2: { + out = instr->dp2(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_TXL: + break; + case TGSI_OPCODE_BRK: { + instr->brk(); + return; + } + break; + case TGSI_OPCODE_IF: { + 
instr->ifop(inputs[0]); + storage->setCurrentBlock(instr->currentBlock()); + return; //just update the state + } + break; + case TGSI_OPCODE_LOOP: + break; + case TGSI_OPCODE_REP: + break; + case TGSI_OPCODE_ELSE: { + instr->elseop(); + storage->setCurrentBlock(instr->currentBlock()); + return; //only state update + } + break; + case TGSI_OPCODE_ENDIF: { + instr->endif(); + storage->setCurrentBlock(instr->currentBlock()); + return; //just update the state + } + break; + case TGSI_OPCODE_ENDLOOP: + break; + case TGSI_OPCODE_ENDREP: + break; + case TGSI_OPCODE_PUSHA: + break; + case TGSI_OPCODE_POPA: + break; + case TGSI_OPCODE_CEIL: + break; + case TGSI_OPCODE_I2F: + break; + case TGSI_OPCODE_NOT: + break; + case TGSI_OPCODE_TRUNC: { + out = instr->trunc(inputs[0]); + } + break; + case TGSI_OPCODE_SHL: + break; + case TGSI_OPCODE_SHR: + break; + case TGSI_OPCODE_AND: + break; + case TGSI_OPCODE_OR: + break; + case TGSI_OPCODE_MOD: + break; + case TGSI_OPCODE_XOR: + break; + case TGSI_OPCODE_SAD: + break; + case TGSI_OPCODE_TXF: + break; + case TGSI_OPCODE_TXQ: + break; + case TGSI_OPCODE_CONT: + break; + case TGSI_OPCODE_EMIT: + break; + case TGSI_OPCODE_ENDPRIM: + break; + case TGSI_OPCODE_BGNLOOP2: { + instr->beginLoop(); + storage->setCurrentBlock(instr->currentBlock()); + return; + } + break; + case TGSI_OPCODE_BGNSUB: { + instr->bgnSub(instno); + storage->setCurrentBlock(instr->currentBlock()); + storage->pushTemps(); + return; + } + break; + case TGSI_OPCODE_ENDLOOP2: { + instr->endLoop(); + storage->setCurrentBlock(instr->currentBlock()); + return; + } + break; + case TGSI_OPCODE_ENDSUB: { + instr->endSub(); + storage->setCurrentBlock(instr->currentBlock()); + storage->popArguments(); + storage->popTemps(); + return; + } + break; + case TGSI_OPCODE_NOISE1: + break; + case TGSI_OPCODE_NOISE2: + break; + case TGSI_OPCODE_NOISE3: + break; + case TGSI_OPCODE_NOISE4: + break; + case TGSI_OPCODE_NOP: + break; + case TGSI_OPCODE_M4X3: + break; + case TGSI_OPCODE_M3X4: + break; + case TGSI_OPCODE_M3X3: + break; + case TGSI_OPCODE_M3X2: + break; + case TGSI_OPCODE_CALLNZ: + break; + case TGSI_OPCODE_IFC: + break; + case TGSI_OPCODE_BREAKC: + break; + case TGSI_OPCODE_KIL: { + out = instr->kil(inputs[0]); + storage->setKilElement(out); + return; + } + break; + case TGSI_OPCODE_END: + instr->end(); + return; + break; + default: + fprintf(stderr, "ERROR: Unknown opcode %d\n", + inst->Instruction.Opcode); + assert(0); + break; + } + + if (!out) { + fprintf(stderr, "ERROR: unsupported opcode %d\n", + inst->Instruction.Opcode); + assert(!"Unsupported opcode"); + } + + /* # not sure if we need this */ + switch( inst->Instruction.Saturate ) { + case TGSI_SAT_NONE: + break; + case TGSI_SAT_ZERO_ONE: + /*TXT( "_SAT" );*/ + break; + case TGSI_SAT_MINUS_PLUS_ONE: + /*TXT( "_SAT[-1,1]" );*/ + break; + default: + assert( 0 ); + } + + /* store results */ + for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) { + struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i]; + + if (dst->DstRegister.File == TGSI_FILE_OUTPUT) { + storage->setOutputElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask); + } else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) { + storage->setTempElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask); + } else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) { + storage->setAddrElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask); + } else { + fprintf(stderr, "ERROR: unsupported LLVM destination!"); + assert(!"wrong destination"); + } + } +} 
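// A simplified, hypothetical model of the flow translate_instruction() above
// uses for every TGSI opcode: evaluate one result value per opcode, then route
// it to the destination register file named by the instruction (the gallivm
// code does this through setOutputElement()/setTempElement()/setAddrElement()).
// The Opcode/File enums and the tiny Machine struct are stand-ins, not actual
// TGSI or gallivm identifiers.

#include <cassert>
#include <cstdio>

struct vec4 { float v[4]; };

enum Opcode { OP_MOV, OP_ADD, OP_MUL };
enum File   { FILE_TEMPORARY, FILE_OUTPUT };

struct Machine {
    vec4 temps[8];
    vec4 outputs[8];
};

// Dispatch on the opcode and compute a single vec4 result.
static vec4 eval(Opcode op, const vec4 &a, const vec4 &b)
{
    vec4 r = {};
    for (int i = 0; i < 4; ++i) {
        switch (op) {
        case OP_MOV: r.v[i] = a.v[i];          break;
        case OP_ADD: r.v[i] = a.v[i] + b.v[i]; break;
        case OP_MUL: r.v[i] = a.v[i] * b.v[i]; break;
        default:     assert(!"unsupported opcode");
        }
    }
    return r;
}

// Mirror of the final store loop: pick the register file, then write the
// result (per-channel write masking as sketched earlier).
static void store(Machine &m, File file, int idx, const vec4 &val)
{
    switch (file) {
    case FILE_TEMPORARY: m.temps[idx] = val;   break;
    case FILE_OUTPUT:    m.outputs[idx] = val; break;
    default:             assert(!"unsupported destination file");
    }
}

int main()
{
    Machine m = {};
    vec4 a = { {1.0f, 2.0f, 3.0f, 4.0f} };
    vec4 b = { {10.0f, 20.0f, 30.0f, 40.0f} };

    store(m, FILE_TEMPORARY, 0, eval(OP_ADD, a, b));        // temp[0] = a + b
    store(m, FILE_OUTPUT, 0, eval(OP_MUL, a, m.temps[0]));  // out[0]  = a * temp[0]
    std::printf("%g %g %g %g\n", m.outputs[0].v[0], m.outputs[0].v[1],
                m.outputs[0].v[2], m.outputs[0].v[3]);      // prints: 11 44 99 176
    return 0;
}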
+ + +static void +translate_instructionir(llvm::Module *module, + StorageSoa *storage, + InstructionsSoa *instr, + struct tgsi_full_instruction *inst, + struct tgsi_full_instruction *fi, + unsigned instno) +{ + std::vector< std::vector<llvm::Value*> > inputs(inst->Instruction.NumSrcRegs); + + for (int i = 0; i < inst->Instruction.NumSrcRegs; ++i) { + struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i]; + std::vector<llvm::Value*> val; + llvm::Value *indIdx = 0; + int swizzle = swizzleInt(src); + + if (src->SrcRegister.Indirect) { + indIdx = storage->addrElement(src->SrcRegisterInd.Index); + } + val = storage->load((enum tgsi_file_type)src->SrcRegister.File, + src->SrcRegister.Index, swizzle, instr->getIRBuilder(), indIdx); + + inputs[i] = val; + } + + std::vector<llvm::Value*> out(4); + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ARL: { + out = instr->arl(inputs[0]); + } + break; + case TGSI_OPCODE_MOV: { + out = inputs[0]; + } + break; + case TGSI_OPCODE_LIT: { + out = instr->lit(inputs[0]); + } + break; + case TGSI_OPCODE_RCP: { + } + break; + case TGSI_OPCODE_RSQ: { + out = instr->rsq(inputs[0]); + } + break; + case TGSI_OPCODE_EXP: + break; + case TGSI_OPCODE_LOG: + break; + case TGSI_OPCODE_MUL: { + out = instr->mul(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_ADD: { + out = instr->add(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_DP3: { + out = instr->dp3(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_DP4: { + out = instr->dp4(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_DST: { + } + break; + case TGSI_OPCODE_MIN: { + out = instr->min(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_MAX: { + out = instr->max(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_SLT: { + out = instr->slt(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_SGE: { + } + break; + case TGSI_OPCODE_MAD: { + out = instr->madd(inputs[0], inputs[1], inputs[2]); + } + break; + case TGSI_OPCODE_SUB: { + out = instr->sub(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_LERP: { + } + break; + case TGSI_OPCODE_CND: + break; + case TGSI_OPCODE_CND0: + break; + case TGSI_OPCODE_DOT2ADD: + break; + case TGSI_OPCODE_INDEX: + break; + case TGSI_OPCODE_NEGATE: + break; + case TGSI_OPCODE_FRAC: { + } + break; + case TGSI_OPCODE_CLAMP: + break; + case TGSI_OPCODE_FLOOR: { + } + break; + case TGSI_OPCODE_ROUND: + break; + case TGSI_OPCODE_EXPBASE2: { + } + break; + case TGSI_OPCODE_LOGBASE2: { + } + break; + case TGSI_OPCODE_POWER: { + out = instr->pow(inputs[0], inputs[1]); + } + break; + case TGSI_OPCODE_CROSSPRODUCT: { + } + break; + case TGSI_OPCODE_MULTIPLYMATRIX: + break; + case TGSI_OPCODE_ABS: { + out = instr->abs(inputs[0]); + } + break; + case TGSI_OPCODE_RCC: + break; + case TGSI_OPCODE_DPH: { + } + break; + case TGSI_OPCODE_COS: { + } + break; + case TGSI_OPCODE_DDX: + break; + case TGSI_OPCODE_DDY: + break; + case TGSI_OPCODE_KILP: + break; + case TGSI_OPCODE_PK2H: + break; + case TGSI_OPCODE_PK2US: + break; + case TGSI_OPCODE_PK4B: + break; + case TGSI_OPCODE_PK4UB: + break; + case TGSI_OPCODE_RFL: + break; + case TGSI_OPCODE_SEQ: + break; + case TGSI_OPCODE_SFL: + break; + case TGSI_OPCODE_SGT: { + } + break; + case TGSI_OPCODE_SIN: { + } + break; + case TGSI_OPCODE_SLE: + break; + case TGSI_OPCODE_SNE: + break; + case TGSI_OPCODE_STR: + break; + case TGSI_OPCODE_TEX: + break; + case TGSI_OPCODE_TXD: + break; + case TGSI_OPCODE_UP2H: + break; + case TGSI_OPCODE_UP2US: + break; + case TGSI_OPCODE_UP4B: + break; + case 
TGSI_OPCODE_UP4UB: + break; + case TGSI_OPCODE_X2D: + break; + case TGSI_OPCODE_ARA: + break; + case TGSI_OPCODE_ARR: + break; + case TGSI_OPCODE_BRA: + break; + case TGSI_OPCODE_CAL: { + } + break; + case TGSI_OPCODE_RET: { + } + break; + case TGSI_OPCODE_SSG: + break; + case TGSI_OPCODE_CMP: { + } + break; + case TGSI_OPCODE_SCS: { + } + break; + case TGSI_OPCODE_TXB: + break; + case TGSI_OPCODE_NRM: + break; + case TGSI_OPCODE_DIV: + break; + case TGSI_OPCODE_DP2: + break; + case TGSI_OPCODE_TXL: + break; + case TGSI_OPCODE_BRK: { + } + break; + case TGSI_OPCODE_IF: { + } + break; + case TGSI_OPCODE_LOOP: + break; + case TGSI_OPCODE_REP: + break; + case TGSI_OPCODE_ELSE: { + } + break; + case TGSI_OPCODE_ENDIF: { + } + break; + case TGSI_OPCODE_ENDLOOP: + break; + case TGSI_OPCODE_ENDREP: + break; + case TGSI_OPCODE_PUSHA: + break; + case TGSI_OPCODE_POPA: + break; + case TGSI_OPCODE_CEIL: + break; + case TGSI_OPCODE_I2F: + break; + case TGSI_OPCODE_NOT: + break; + case TGSI_OPCODE_TRUNC: { + } + break; + case TGSI_OPCODE_SHL: + break; + case TGSI_OPCODE_SHR: + break; + case TGSI_OPCODE_AND: + break; + case TGSI_OPCODE_OR: + break; + case TGSI_OPCODE_MOD: + break; + case TGSI_OPCODE_XOR: + break; + case TGSI_OPCODE_SAD: + break; + case TGSI_OPCODE_TXF: + break; + case TGSI_OPCODE_TXQ: + break; + case TGSI_OPCODE_CONT: + break; + case TGSI_OPCODE_EMIT: + break; + case TGSI_OPCODE_ENDPRIM: + break; + case TGSI_OPCODE_BGNLOOP2: { + } + break; + case TGSI_OPCODE_BGNSUB: { + } + break; + case TGSI_OPCODE_ENDLOOP2: { + } + break; + case TGSI_OPCODE_ENDSUB: { + } + break; + case TGSI_OPCODE_NOISE1: + break; + case TGSI_OPCODE_NOISE2: + break; + case TGSI_OPCODE_NOISE3: + break; + case TGSI_OPCODE_NOISE4: + break; + case TGSI_OPCODE_NOP: + break; + case TGSI_OPCODE_M4X3: + break; + case TGSI_OPCODE_M3X4: + break; + case TGSI_OPCODE_M3X3: + break; + case TGSI_OPCODE_M3X2: + break; + case TGSI_OPCODE_NRM4: + break; + case TGSI_OPCODE_CALLNZ: + break; + case TGSI_OPCODE_IFC: + break; + case TGSI_OPCODE_BREAKC: + break; + case TGSI_OPCODE_KIL: { + } + break; + case TGSI_OPCODE_END: + instr->end(); + return; + break; + default: + fprintf(stderr, "ERROR: Unknown opcode %d\n", + inst->Instruction.Opcode); + assert(0); + break; + } + + if (!out[0]) { + fprintf(stderr, "ERROR: unsupported opcode %d\n", + inst->Instruction.Opcode); + assert(!"Unsupported opcode"); + } + + /* store results */ + for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) { + struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i]; + storage->store((enum tgsi_file_type)dst->DstRegister.File, + dst->DstRegister.Index, out, dst->DstRegister.WriteMask, + instr->getIRBuilder() ); + } +} + +llvm::Module * +tgsi_to_llvm(struct gallivm_ir *ir, const struct tgsi_token *tokens) +{ + llvm::Module *mod = new Module("shader"); + struct tgsi_parse_context parse; + struct tgsi_full_instruction fi; + struct tgsi_full_declaration fd; + unsigned instno = 0; + Function* shader = mod->getFunction("execute_shader"); + std::ostringstream stream; + if (ir->type == GALLIVM_VS) { + stream << "vs_shader"; + } else { + stream << "fs_shader"; + } + stream << ir->id; + std::string func_name = stream.str(); + shader->setName(func_name.c_str()); + + Function::arg_iterator args = shader->arg_begin(); + Value *ptr_INPUT = args++; + ptr_INPUT->setName("input"); + + BasicBlock *label_entry = BasicBlock::Create("entry", shader, 0); + + tgsi_parse_init(&parse, tokens); + + fi = tgsi_default_full_instruction(); + fd = tgsi_default_full_declaration(); + 
Storage storage(label_entry, ptr_INPUT); + Instructions instr(mod, shader, label_entry, &storage); + while(!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: + translate_declaration(ir, mod, &storage, + &parse.FullToken.FullDeclaration, + &fd); + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + translate_immediate(&storage, + &parse.FullToken.FullImmediate); + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + translate_instruction(mod, &storage, &instr, + &parse.FullToken.FullInstruction, + &fi, instno); + ++instno; + break; + + default: + assert(0); + } + } + + tgsi_parse_free(&parse); + + ir->num_consts = storage.numConsts(); + return mod; +} + +llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir, + const struct tgsi_token *tokens) +{ + llvm::Module *mod = new Module("shader"); + struct tgsi_parse_context parse; + struct tgsi_full_instruction fi; + struct tgsi_full_declaration fd; + unsigned instno = 0; + std::ostringstream stream; + if (ir->type == GALLIVM_VS) { + stream << "vs_shader"; + } else { + stream << "fs_shader"; + } + //stream << ir->id; + std::string func_name = stream.str(); + Function *shader = llvm::cast<Function>(mod->getOrInsertFunction( + func_name.c_str(), + vertexShaderFunctionType())); + + Function::arg_iterator args = shader->arg_begin(); + Value *input = args++; + input->setName("inputs"); + Value *output = args++; + output->setName("outputs"); + Value *consts = args++; + consts->setName("consts"); + + BasicBlock *label_entry = BasicBlock::Create("entry", shader, 0); + + tgsi_parse_init(&parse, tokens); + + fi = tgsi_default_full_instruction(); + fd = tgsi_default_full_declaration(); + + StorageSoa storage(label_entry, input, output, consts); + InstructionsSoa instr(mod, shader, label_entry, &storage); + + while(!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: + translate_declarationir(ir, mod, &storage, + &parse.FullToken.FullDeclaration, + &fd); + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + translate_immediateir(&storage, + &parse.FullToken.FullImmediate); + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + storage.declareImmediates(); + translate_instructionir(mod, &storage, &instr, + &parse.FullToken.FullInstruction, + &fi, instno); + ++instno; + break; + + default: + assert(0); + } + } + + tgsi_parse_free(&parse); + + return mod; +} diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.h b/src/gallium/auxiliary/gallivm/tgsitollvm.h new file mode 100644 index 0000000000..7ada04d629 --- /dev/null +++ b/src/gallium/auxiliary/gallivm/tgsitollvm.h @@ -0,0 +1,20 @@ +#ifndef TGSITOLLVM_H +#define TGSITOLLVM_H + + +namespace llvm { + class Module; +} + +struct gallivm_ir; +struct tgsi_token; + + +llvm::Module * tgsi_to_llvm(struct gallivm_ir *ir, + const struct tgsi_token *tokens); + + +llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir, + const struct tgsi_token *tokens); + +#endif diff --git a/src/gallium/auxiliary/pipebuffer/Makefile b/src/gallium/auxiliary/pipebuffer/Makefile new file mode 100644 index 0000000000..4bcf08f8a5 --- /dev/null +++ b/src/gallium/auxiliary/pipebuffer/Makefile @@ -0,0 +1,23 @@ +TOP = ../../../.. 
+include $(TOP)/configs/current + +LIBNAME = pipebuffer + +C_SOURCES = \ + pb_buffer_fenced.c \ + pb_buffer_malloc.c \ + pb_bufmgr_alt.c \ + pb_bufmgr_cache.c \ + pb_bufmgr_debug.c \ + pb_bufmgr_fenced.c \ + pb_bufmgr_mm.c \ + pb_bufmgr_ondemand.c \ + pb_bufmgr_pool.c \ + pb_bufmgr_slab.c \ + pb_validate.c \ + pb_winsys.c + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/auxiliary/pipebuffer/SConscript b/src/gallium/auxiliary/pipebuffer/SConscript new file mode 100644 index 0000000000..4acf721808 --- /dev/null +++ b/src/gallium/auxiliary/pipebuffer/SConscript @@ -0,0 +1,20 @@ +Import('*') + +pipebuffer = env.ConvenienceLibrary( + target = 'pipebuffer', + source = [ + 'pb_buffer_fenced.c', + 'pb_buffer_malloc.c', + 'pb_bufmgr_alt.c', + 'pb_bufmgr_cache.c', + 'pb_bufmgr_debug.c', + 'pb_bufmgr_fenced.c', + 'pb_bufmgr_mm.c', + 'pb_bufmgr_ondemand.c', + 'pb_bufmgr_pool.c', + 'pb_bufmgr_slab.c', + 'pb_validate.c', + 'pb_winsys.c', + ]) + +auxiliaries.insert(0, pipebuffer) diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h new file mode 100644 index 0000000000..dd22ef34ec --- /dev/null +++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h @@ -0,0 +1,285 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Generic code for buffers. + * + * Behind a pipe buffer handle there can be DMA buffers, client (or user) + * buffers, regular malloced buffers, etc. This file provides an abstract base + * buffer handle that allows the driver to cope with all those kinds of buffers + * in a more flexible way. + * + * A winsys driver is under no obligation to use this library, and a pipe + * driver should be completely agnostic about it. + * + * \author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + +#ifndef PB_BUFFER_H_ +#define PB_BUFFER_H_ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "pipe/p_error.h" +#include "pipe/p_state.h" +#include "pipe/p_inlines.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct pb_vtbl; +struct pb_validate; + + +/** + * Buffer description. + * + * Used when allocating the buffer.
+ */ +struct pb_desc +{ + unsigned alignment; + unsigned usage; +}; + + +/** + * Base class for all pb_* buffers. + */ +struct pb_buffer +{ + struct pipe_buffer base; + + /** + * Pointer to the virtual function table. + * + * Avoid accessing this table directly. Use the inline functions below + * instead to avoid mistakes. + */ + const struct pb_vtbl *vtbl; +}; + + +/** + * Virtual function table for the buffer storage operations. + * + * Note that creation is not done through this table. + */ +struct pb_vtbl +{ + void (*destroy)( struct pb_buffer *buf ); + + /** + * Map the entire data store of a buffer object into the client's address space. + * flags is a bitmask of PIPE_BUFFER_FLAG_READ/WRITE. + */ + void *(*map)( struct pb_buffer *buf, + unsigned flags ); + + void (*unmap)( struct pb_buffer *buf ); + + enum pipe_error (*validate)( struct pb_buffer *buf, + struct pb_validate *vl, + unsigned flags ); + + void (*fence)( struct pb_buffer *buf, + struct pipe_fence_handle *fence ); + + /** + * Get the base buffer and the offset. + * + * A buffer can be subdivided into smaller buffers. This method should return + * the underlying buffer, and the relative offset. + * + * Buffers without an underlying base buffer should return themselves, with + * a zero offset. + * + * Note that this will increase the reference count of the base buffer. + */ + void (*get_base_buffer)( struct pb_buffer *buf, + struct pb_buffer **base_buf, + unsigned *offset ); + +}; + + +static INLINE struct pipe_buffer * +pb_pipe_buffer( struct pb_buffer *pbuf ) +{ + assert(pbuf); + return &pbuf->base; +} + + +static INLINE struct pb_buffer * +pb_buffer( struct pipe_buffer *buf ) +{ + assert(buf); + /* Could add a magic cookie check on debug builds. + */ + return (struct pb_buffer *)buf; +} + + +/* Accessor functions for pb->vtbl: + */ +static INLINE void * +pb_map(struct pb_buffer *buf, + unsigned flags) +{ + assert(buf); + if(!buf) + return NULL; + return buf->vtbl->map(buf, flags); +} + + +static INLINE void +pb_unmap(struct pb_buffer *buf) +{ + assert(buf); + if(!buf) + return; + buf->vtbl->unmap(buf); +} + + +static INLINE void +pb_get_base_buffer( struct pb_buffer *buf, + struct pb_buffer **base_buf, + unsigned *offset ) +{ + assert(buf); + if(!buf) { + *base_buf = NULL; + *offset = 0; + return; + } + assert(buf->vtbl->get_base_buffer); + buf->vtbl->get_base_buffer(buf, base_buf, offset); +} + + +static INLINE enum pipe_error +pb_validate(struct pb_buffer *buf, struct pb_validate *vl, unsigned flags) +{ + assert(buf); + if(!buf) + return PIPE_ERROR; + assert(buf->vtbl->validate); + return buf->vtbl->validate(buf, vl, flags); +} + + +static INLINE void +pb_fence(struct pb_buffer *buf, struct pipe_fence_handle *fence) +{ + assert(buf); + if(!buf) + return; + assert(buf->vtbl->fence); + buf->vtbl->fence(buf, fence); +} + + +static INLINE void +pb_destroy(struct pb_buffer *buf) +{ + assert(buf); + if(!buf) + return; + assert(buf->base.refcount == 0); + buf->vtbl->destroy(buf); +} + + +/* XXX: thread safety issues! + */ +static INLINE void +pb_reference(struct pb_buffer **dst, + struct pb_buffer *src) +{ + if (src) { + assert(src->base.refcount); + src->base.refcount++; + } + + if (*dst) { + assert((*dst)->base.refcount); + if(--(*dst)->base.refcount == 0) + pb_destroy( *dst ); + } + + *dst = src; +} + + +/** + * Utility function to check whether the provided alignment is consistent with + * the requested alignment.
+ */ +static INLINE boolean +pb_check_alignment(size_t requested, size_t provided) +{ + return requested <= provided && (provided % requested) == 0 ? TRUE : FALSE; +} + + +/** + * Utility function to check whether the provided usage flags cover all of + * the requested usage flags. + */ +static INLINE boolean +pb_check_usage(unsigned requested, unsigned provided) +{ + return (requested & provided) == requested ? TRUE : FALSE; +} + + +/** + * Malloc-based buffer to store data that can't be used by the graphics + * hardware. + */ +struct pb_buffer * +pb_malloc_buffer_create(size_t size, + const struct pb_desc *desc); + + +void +pb_init_winsys(struct pipe_winsys *winsys); + + +#ifdef __cplusplus +} +#endif + +#endif /*PB_BUFFER_H_*/ diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c new file mode 100644 index 0000000000..17300cb553 --- /dev/null +++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c @@ -0,0 +1,508 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Implementation of fenced buffers. + * + * \author Jose Fonseca <jrfonseca-at-tungstengraphics-dot-com> + * \author Thomas Hellström <thomas-at-tungstengraphics-dot-com> + */ + + +#include "pipe/p_config.h" + +#if defined(PIPE_OS_LINUX) +#include <unistd.h> +#include <sched.h> +#endif + +#include "pipe/p_compiler.h" +#include "pipe/p_error.h" +#include "pipe/p_debug.h" +#include "pipe/p_winsys.h" +#include "pipe/p_thread.h" +#include "util/u_memory.h" +#include "util/u_double_list.h" + +#include "pb_buffer.h" +#include "pb_buffer_fenced.h" + + + +/** + * Convenience macro (type safe). + */ +#define SUPER(__derived) (&(__derived)->base) + + +struct fenced_buffer_list +{ + pipe_mutex mutex; + + struct pipe_winsys *winsys; + + size_t numDelayed; + + struct list_head delayed; +}; + + +/** + * Wrapper around a pipe buffer which adds fencing and reference counting. + */ +struct fenced_buffer +{ + struct pb_buffer base; + + struct pb_buffer *buffer; + + /* FIXME: protect access with mutex */ + + /** + * A bitmask of PIPE_BUFFER_USAGE_CPU/GPU_READ/WRITE describing the current + * buffer usage.
+ */ + unsigned flags; + + unsigned mapcount; + struct pb_validate *vl; + unsigned validation_flags; + struct pipe_fence_handle *fence; + + struct list_head head; + struct fenced_buffer_list *list; +}; + + +static INLINE struct fenced_buffer * +fenced_buffer(struct pb_buffer *buf) +{ + assert(buf); + return (struct fenced_buffer *)buf; +} + + +static INLINE void +_fenced_buffer_add(struct fenced_buffer *fenced_buf) +{ + struct fenced_buffer_list *fenced_list = fenced_buf->list; + + assert(fenced_buf->base.base.refcount); + assert(fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE); + assert(fenced_buf->fence); + + assert(!fenced_buf->head.prev); + assert(!fenced_buf->head.next); + LIST_ADDTAIL(&fenced_buf->head, &fenced_list->delayed); + ++fenced_list->numDelayed; +} + + +/** + * Actually destroy the buffer. + */ +static INLINE void +_fenced_buffer_destroy(struct fenced_buffer *fenced_buf) +{ + assert(!fenced_buf->base.base.refcount); + assert(!fenced_buf->fence); + pb_reference(&fenced_buf->buffer, NULL); + FREE(fenced_buf); +} + + +static INLINE void +_fenced_buffer_remove(struct fenced_buffer_list *fenced_list, + struct fenced_buffer *fenced_buf) +{ + struct pipe_winsys *winsys = fenced_list->winsys; + + assert(fenced_buf->fence); + assert(fenced_buf->list == fenced_list); + + winsys->fence_reference(winsys, &fenced_buf->fence, NULL); + fenced_buf->flags &= ~PIPE_BUFFER_USAGE_GPU_READ_WRITE; + + assert(fenced_buf->head.prev); + assert(fenced_buf->head.next); + LIST_DEL(&fenced_buf->head); +#ifdef DEBUG + fenced_buf->head.prev = NULL; + fenced_buf->head.next = NULL; +#endif + + assert(fenced_list->numDelayed); + --fenced_list->numDelayed; + + if(!fenced_buf->base.base.refcount) + _fenced_buffer_destroy(fenced_buf); +} + + +static INLINE enum pipe_error +_fenced_buffer_finish(struct fenced_buffer *fenced_buf) +{ + struct fenced_buffer_list *fenced_list = fenced_buf->list; + struct pipe_winsys *winsys = fenced_list->winsys; + +#if 0 + debug_warning("waiting for GPU"); +#endif + + assert(fenced_buf->fence); + if(fenced_buf->fence) { + if(winsys->fence_finish(winsys, fenced_buf->fence, 0) != 0) { + return PIPE_ERROR; + } + /* Remove from the fenced list */ + /* TODO: remove consequents */ + _fenced_buffer_remove(fenced_list, fenced_buf); + } + + fenced_buf->flags &= ~PIPE_BUFFER_USAGE_GPU_READ_WRITE; + return PIPE_OK; +} + + +/** + * Free as many fenced buffers from the list head as possible. 
+ */ +static void +_fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list, + int wait) +{ + struct pipe_winsys *winsys = fenced_list->winsys; + struct list_head *curr, *next; + struct fenced_buffer *fenced_buf; + struct pipe_fence_handle *prev_fence = NULL; + + curr = fenced_list->delayed.next; + next = curr->next; + while(curr != &fenced_list->delayed) { + fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head); + + if(fenced_buf->fence != prev_fence) { + int signaled; + if (wait) + signaled = winsys->fence_finish(winsys, fenced_buf->fence, 0); + else + signaled = winsys->fence_signalled(winsys, fenced_buf->fence, 0); + if (signaled != 0) + break; + prev_fence = fenced_buf->fence; + } + else { + assert(winsys->fence_signalled(winsys, fenced_buf->fence, 0) == 0); + } + + _fenced_buffer_remove(fenced_list, fenced_buf); + + curr = next; + next = curr->next; + } +} + + +static void +fenced_buffer_destroy(struct pb_buffer *buf) +{ + struct fenced_buffer *fenced_buf = fenced_buffer(buf); + struct fenced_buffer_list *fenced_list = fenced_buf->list; + + pipe_mutex_lock(fenced_list->mutex); + assert(fenced_buf->base.base.refcount == 0); + if (fenced_buf->fence) { + struct pipe_winsys *winsys = fenced_list->winsys; + if(winsys->fence_signalled(winsys, fenced_buf->fence, 0) == 0) { + struct list_head *curr, *prev; + curr = &fenced_buf->head; + prev = curr->prev; + do { + fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head); + assert(winsys->fence_signalled(winsys, fenced_buf->fence, 0) == 0); + _fenced_buffer_remove(fenced_list, fenced_buf); + curr = prev; + prev = curr->prev; + } while (curr != &fenced_list->delayed); + } + else { + /* delay destruction */ + } + } + else { + _fenced_buffer_destroy(fenced_buf); + } + pipe_mutex_unlock(fenced_list->mutex); +} + + +static void * +fenced_buffer_map(struct pb_buffer *buf, + unsigned flags) +{ + struct fenced_buffer *fenced_buf = fenced_buffer(buf); + void *map; + + assert(flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE); + assert(!(flags & ~PIPE_BUFFER_USAGE_CPU_READ_WRITE)); + flags &= PIPE_BUFFER_USAGE_CPU_READ_WRITE; + + /* Check for GPU read/write access */ + if(fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_WRITE) { + /* Wait for the GPU to finish writing */ + _fenced_buffer_finish(fenced_buf); + } + +#if 0 + /* Check for CPU write access (read is OK) */ + if(fenced_buf->flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE) { + /* this is legal -- just for debugging */ + debug_warning("concurrent CPU writes"); + } +#endif + + map = pb_map(fenced_buf->buffer, flags); + if(map) { + ++fenced_buf->mapcount; + fenced_buf->flags |= flags; + } + + return map; +} + + +static void +fenced_buffer_unmap(struct pb_buffer *buf) +{ + struct fenced_buffer *fenced_buf = fenced_buffer(buf); + assert(fenced_buf->mapcount); + if(fenced_buf->mapcount) { + pb_unmap(fenced_buf->buffer); + --fenced_buf->mapcount; + if(!fenced_buf->mapcount) + fenced_buf->flags &= ~PIPE_BUFFER_USAGE_CPU_READ_WRITE; + } +} + + +static enum pipe_error +fenced_buffer_validate(struct pb_buffer *buf, + struct pb_validate *vl, + unsigned flags) +{ + struct fenced_buffer *fenced_buf = fenced_buffer(buf); + enum pipe_error ret; + + if(!vl) { + /* invalidate */ + fenced_buf->vl = NULL; + fenced_buf->validation_flags = 0; + return PIPE_OK; + } + + assert(flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE); + assert(!(flags & ~PIPE_BUFFER_USAGE_GPU_READ_WRITE)); + flags &= PIPE_BUFFER_USAGE_GPU_READ_WRITE; + + /* Buffer cannot be validated in two different lists */ + if(fenced_buf->vl && fenced_buf->vl != vl) + 
return PIPE_ERROR_RETRY; + + /* Do not validate if buffer is still mapped */ + if(fenced_buf->flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE) { + /* TODO: wait for the thread that mapped the buffer to unmap it */ + return PIPE_ERROR_RETRY; + } + + /* Allow concurrent GPU reads, but serialize GPU writes */ + if(fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE) { + if((fenced_buf->flags | flags) & PIPE_BUFFER_USAGE_GPU_WRITE) { + _fenced_buffer_finish(fenced_buf); + } + } + + if(fenced_buf->vl == vl && + (fenced_buf->validation_flags & flags) == flags) { + /* Nothing to do -- buffer already validated */ + return PIPE_OK; + } + + /* Final sanity checking */ + assert(!(fenced_buf->flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE)); + assert(!(fenced_buf->flags & PIPE_BUFFER_USAGE_GPU_WRITE)); + assert(!fenced_buf->mapcount); + + ret = pb_validate(fenced_buf->buffer, vl, flags); + if (ret != PIPE_OK) + return ret; + + fenced_buf->vl = vl; + fenced_buf->validation_flags |= flags; + + return PIPE_OK; +} + + +static void +fenced_buffer_fence(struct pb_buffer *buf, + struct pipe_fence_handle *fence) +{ + struct fenced_buffer *fenced_buf; + struct fenced_buffer_list *fenced_list; + struct pipe_winsys *winsys; + + fenced_buf = fenced_buffer(buf); + fenced_list = fenced_buf->list; + winsys = fenced_list->winsys; + + if(fence == fenced_buf->fence) { + /* Nothing to do */ + return; + } + + assert(fenced_buf->vl); + assert(fenced_buf->validation_flags); + + pipe_mutex_lock(fenced_list->mutex); + if (fenced_buf->fence) + _fenced_buffer_remove(fenced_list, fenced_buf); + if (fence) { + winsys->fence_reference(winsys, &fenced_buf->fence, fence); + fenced_buf->flags |= fenced_buf->validation_flags; + _fenced_buffer_add(fenced_buf); + } + pipe_mutex_unlock(fenced_list->mutex); + + pb_fence(fenced_buf->buffer, fence); + + fenced_buf->vl = NULL; + fenced_buf->validation_flags = 0; +} + + +static void +fenced_buffer_get_base_buffer(struct pb_buffer *buf, + struct pb_buffer **base_buf, + unsigned *offset) +{ + struct fenced_buffer *fenced_buf = fenced_buffer(buf); + pb_get_base_buffer(fenced_buf->buffer, base_buf, offset); +} + + +static const struct pb_vtbl +fenced_buffer_vtbl = { + fenced_buffer_destroy, + fenced_buffer_map, + fenced_buffer_unmap, + fenced_buffer_validate, + fenced_buffer_fence, + fenced_buffer_get_base_buffer +}; + + +struct pb_buffer * +fenced_buffer_create(struct fenced_buffer_list *fenced_list, + struct pb_buffer *buffer) +{ + struct fenced_buffer *buf; + + if(!buffer) + return NULL; + + buf = CALLOC_STRUCT(fenced_buffer); + if(!buf) { + pb_reference(&buffer, NULL); + return NULL; + } + + buf->base.base.refcount = 1; + buf->base.base.alignment = buffer->base.alignment; + buf->base.base.usage = buffer->base.usage; + buf->base.base.size = buffer->base.size; + + buf->base.vtbl = &fenced_buffer_vtbl; + buf->buffer = buffer; + buf->list = fenced_list; + + return &buf->base; +} + + +struct fenced_buffer_list * +fenced_buffer_list_create(struct pipe_winsys *winsys) +{ + struct fenced_buffer_list *fenced_list; + + fenced_list = CALLOC_STRUCT(fenced_buffer_list); + if (!fenced_list) + return NULL; + + fenced_list->winsys = winsys; + + LIST_INITHEAD(&fenced_list->delayed); + + fenced_list->numDelayed = 0; + + pipe_mutex_init(fenced_list->mutex); + + return fenced_list; +} + + +void +fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list, + int wait) +{ + pipe_mutex_lock(fenced_list->mutex); + _fenced_buffer_list_check_free(fenced_list, wait); + pipe_mutex_unlock(fenced_list->mutex); +} + + +void 
+fenced_buffer_list_destroy(struct fenced_buffer_list *fenced_list)
+{
+   pipe_mutex_lock(fenced_list->mutex);
+
+   /* Wait on outstanding fences */
+   while (fenced_list->numDelayed) {
+      pipe_mutex_unlock(fenced_list->mutex);
+#if defined(PIPE_OS_LINUX)
+      sched_yield();
+#endif
+      _fenced_buffer_list_check_free(fenced_list, 1);
+      pipe_mutex_lock(fenced_list->mutex);
+   }
+
+   pipe_mutex_unlock(fenced_list->mutex);
+
+   FREE(fenced_list);
+}
+
+
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.h b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.h
new file mode 100644
index 0000000000..b15c676194
--- /dev/null
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.h
@@ -0,0 +1,107 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * Buffer fencing.
+ *
+ * "Fenced buffers" is actually a misnomer. They should be referred to as
+ * "fenceable buffers", i.e., buffers that can be fenced, but I couldn't find
+ * the word "fenceable" in the dictionary.
+ *
+ * A "fenced buffer" is a decorator around a normal buffer, which adds two
+ * special properties:
+ * - the ability for the destruction to be delayed by a fence;
+ * - reference counting.
+ *
+ * Usually DMA buffers have a life-time that extends beyond the life-time of
+ * their handles. The end-of-life is dictated by the fence signalling.
+ *
+ * Between the handle's destruction and the fence signalling, the buffer is
+ * stored in a fenced buffer list.
+ *
+ * \author Jose Fonseca <jrfonseca@tungstengraphics.com>
+ */
+
+#ifndef PB_BUFFER_FENCED_H_
+#define PB_BUFFER_FENCED_H_
+
+
+#include "pipe/p_debug.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+struct pipe_winsys;
+struct pipe_buffer;
+struct pipe_fence_handle;
+
+
+/**
+ * List of buffers which are awaiting fence signalling.
+ */
+struct fenced_buffer_list;
+
+
+/**
+ * Create a fenced buffer list.
+ *
+ * See also fenced_bufmgr_create for a more convenient way to use this.
+ */
+struct fenced_buffer_list *
+fenced_buffer_list_create(struct pipe_winsys *winsys);
+
+
+/**
+ * Walk the fenced buffer list to check and free signalled buffers.
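+ *
+ * \param wait  if non-zero, block until the outstanding fences signal;
+ *              otherwise only free the buffers whose fences have already
+ *              signalled (descriptive note, based on the implementation).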
+ */ +void +fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list, + int wait); + +void +fenced_buffer_list_destroy(struct fenced_buffer_list *fenced_list); + + +/** + * Wrap a buffer in a fenced buffer. + * + * NOTE: this will not increase the buffer reference count. + */ +struct pb_buffer * +fenced_buffer_create(struct fenced_buffer_list *fenced, + struct pb_buffer *buffer); + + +#ifdef __cplusplus +} +#endif + +#endif /*PB_BUFFER_FENCED_H_*/ diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c new file mode 100644 index 0000000000..53f497cfb0 --- /dev/null +++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c @@ -0,0 +1,186 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Implementation of malloc-based buffers to store data that can't be processed + * by the hardware. 
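+ *
+ * A minimal usage sketch (illustrative only; the desc values below are
+ * arbitrary and not part of the original code):
+ *
+ *   struct pb_desc desc;
+ *   memset(&desc, 0, sizeof desc);
+ *   desc.alignment = 64;
+ *   desc.usage = PIPE_BUFFER_USAGE_CPU_READ | PIPE_BUFFER_USAGE_CPU_WRITE;
+ *   buf = pb_malloc_buffer_create(size, &desc);
+ *   ...
+ *   pb_reference(&buf, NULL);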
+ * + * \author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + + +#include "pipe/p_debug.h" +#include "util/u_memory.h" +#include "pb_buffer.h" +#include "pb_bufmgr.h" + + +struct malloc_buffer +{ + struct pb_buffer base; + void *data; +}; + + +extern const struct pb_vtbl malloc_buffer_vtbl; + +static INLINE struct malloc_buffer * +malloc_buffer(struct pb_buffer *buf) +{ + assert(buf); + assert(buf->vtbl == &malloc_buffer_vtbl); + return (struct malloc_buffer *)buf; +} + + +static void +malloc_buffer_destroy(struct pb_buffer *buf) +{ + align_free(malloc_buffer(buf)->data); + FREE(buf); +} + + +static void * +malloc_buffer_map(struct pb_buffer *buf, + unsigned flags) +{ + return malloc_buffer(buf)->data; +} + + +static void +malloc_buffer_unmap(struct pb_buffer *buf) +{ + /* No-op */ +} + + +static enum pipe_error +malloc_buffer_validate(struct pb_buffer *buf, + struct pb_validate *vl, + unsigned flags) +{ + assert(0); + return PIPE_ERROR; +} + + +static void +malloc_buffer_fence(struct pb_buffer *buf, + struct pipe_fence_handle *fence) +{ + assert(0); +} + + +static void +malloc_buffer_get_base_buffer(struct pb_buffer *buf, + struct pb_buffer **base_buf, + unsigned *offset) +{ + *base_buf = buf; + *offset = 0; +} + + +const struct pb_vtbl +malloc_buffer_vtbl = { + malloc_buffer_destroy, + malloc_buffer_map, + malloc_buffer_unmap, + malloc_buffer_validate, + malloc_buffer_fence, + malloc_buffer_get_base_buffer +}; + + +struct pb_buffer * +pb_malloc_buffer_create(size_t size, + const struct pb_desc *desc) +{ + struct malloc_buffer *buf; + + /* TODO: do a single allocation */ + + buf = CALLOC_STRUCT(malloc_buffer); + if(!buf) + return NULL; + + buf->base.base.refcount = 1; + buf->base.base.alignment = desc->alignment; + buf->base.base.usage = desc->usage; + buf->base.base.size = size; + buf->base.vtbl = &malloc_buffer_vtbl; + + buf->data = align_malloc(size, desc->alignment < sizeof(void*) ? sizeof(void*) : desc->alignment); + if(!buf->data) { + FREE(buf); + return NULL; + } + + return &buf->base; +} + + +static struct pb_buffer * +pb_malloc_bufmgr_create_buffer(struct pb_manager *mgr, + size_t size, + const struct pb_desc *desc) +{ + return pb_malloc_buffer_create(size, desc); +} + + +static void +pb_malloc_bufmgr_flush(struct pb_manager *mgr) +{ + /* No-op */ +} + + +static void +pb_malloc_bufmgr_destroy(struct pb_manager *mgr) +{ + /* No-op */ +} + + +static struct pb_manager +pb_malloc_bufmgr = { + pb_malloc_bufmgr_destroy, + pb_malloc_bufmgr_create_buffer, + pb_malloc_bufmgr_flush +}; + + +struct pb_manager * +pb_malloc_bufmgr_create(void) +{ + return &pb_malloc_bufmgr; +} diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h new file mode 100644 index 0000000000..0a8264a924 --- /dev/null +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h @@ -0,0 +1,212 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * Buffer management.
+ *
+ * A buffer manager does only one basic thing: it creates buffers. Actually,
+ * "buffer factory" would probably be a more accurate description.
+ *
+ * You can chain buffer managers so that you can have finer-grained memory
+ * management and pooling.
+ *
+ * For example, for a simple batch buffer manager you would chain:
+ * - the native buffer manager, which provides DMA memory from the graphics
+ * memory space;
+ * - the pool buffer manager, which keeps around a pool of equally sized buffers
+ * to avoid the latency associated with the native buffer manager;
+ * - the fenced buffer manager, which will delay buffer destruction until the
+ * moment the card finishes processing it.
+ *
+ * \author Jose Fonseca <jrfonseca@tungstengraphics.com>
+ */
+
+#ifndef PB_BUFMGR_H_
+#define PB_BUFMGR_H_
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_error.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+struct pb_desc;
+struct pipe_buffer;
+struct pipe_winsys;
+
+
+/**
+ * Abstract base class for all buffer managers.
+ */
+struct pb_manager
+{
+   void
+   (*destroy)( struct pb_manager *mgr );
+
+   struct pb_buffer *
+   (*create_buffer)( struct pb_manager *mgr,
+                     size_t size,
+                     const struct pb_desc *desc);
+
+   /**
+    * Flush all temporary-held buffers.
+    *
+    * Used mostly to aid debugging memory issues or to clean up resources when
+    * the drivers are long-lived.
+    */
+   void
+   (*flush)( struct pb_manager *mgr );
+};
+
+
+/**
+ * Malloc buffer provider.
+ *
+ * Simple wrapper around pb_malloc_buffer_create for convenience.
+ */
+struct pb_manager *
+pb_malloc_bufmgr_create(void);
+
+
+/**
+ * Static buffer pool sub-allocator.
+ *
+ * Manages the allocation of equally sized buffers. It does so by allocating
+ * a single big buffer and dividing it into equally sized buffers.
+ *
+ * It is meant to manage the allocation of batch buffer pools.
+ */
+struct pb_manager *
+pool_bufmgr_create(struct pb_manager *provider,
+                   size_t n, size_t size,
+                   const struct pb_desc *desc);
+
+
+/**
+ * Static sub-allocator based on the old memory manager.
+ *
+ * It manages buffers of different sizes. It does so by allocating a buffer
+ * with the size of the heap, and then using the old mm memory manager to manage
+ * that heap.
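+ *
+ * For example (a sketch only; 'provider' stands for any other pb_manager):
+ *
+ *   mgr = mm_bufmgr_create(provider, 1024*1024, 6);
+ *
+ * would sub-allocate from a 1 MiB heap with 64-byte alignment, align2 being
+ * the log2 of the alignment.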
+ */
+struct pb_manager *
+mm_bufmgr_create(struct pb_manager *provider,
+                 size_t size, size_t align2);
+
+/**
+ * Same as mm_bufmgr_create.
+ *
+ * The buffer will be released when the manager is destroyed.
+ */
+struct pb_manager *
+mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
+                             size_t size, size_t align2);
+
+
+/**
+ * Slab sub-allocator.
+ */
+struct pb_manager *
+pb_slab_manager_create(struct pb_manager *provider,
+                       size_t bufSize,
+                       size_t slabSize,
+                       const struct pb_desc *desc);
+
+/**
+ * Allows a range of buffer sizes by aggregating multiple slab sub-allocators
+ * with different bucket sizes.
+ */
+struct pb_manager *
+pb_slab_range_manager_create(struct pb_manager *provider,
+                             size_t minBufSize,
+                             size_t maxBufSize,
+                             size_t slabSize,
+                             const struct pb_desc *desc);
+
+
+/**
+ * Time-based buffer cache.
+ *
+ * This manager keeps a cache of destroyed buffers for a given time interval.
+ */
+struct pb_manager *
+pb_cache_manager_create(struct pb_manager *provider,
+                        unsigned usecs);
+
+
+/**
+ * Fenced buffer manager.
+ *
+ * This manager is just meant for convenience. It wraps the buffers returned
+ * by another manager in fenced buffers, so that their destruction can be
+ * delayed until the fence attached to them is signalled.
+ *
+ * NOTE: the buffer manager that provides the buffers will be destroyed
+ * at the same time.
+ */
+struct pb_manager *
+fenced_bufmgr_create(struct pb_manager *provider,
+                     struct pipe_winsys *winsys);
+
+
+struct pb_manager *
+pb_alt_manager_create(struct pb_manager *provider1,
+                      struct pb_manager *provider2);
+
+
+/**
+ * Ondemand buffer manager.
+ *
+ * Buffers are created in malloc'ed memory (fast and cached), and the contents
+ * are transferred to a buffer from the provider (typically in slow, uncached
+ * memory) when there is an attempt to validate the buffer.
+ *
+ * Ideal for situations where one does not know beforehand whether a given
+ * buffer will actually be used by the hardware or not.
+ */
+struct pb_manager *
+pb_ondemand_manager_create(struct pb_manager *provider);
+
+
+/**
+ * Debug buffer manager to detect buffer under- and overflows.
+ *
+ * The band size should be a multiple of the largest alignment.
+ */
+struct pb_manager *
+pb_debug_manager_create(struct pb_manager *provider, size_t band_size);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*PB_BUFMGR_H_*/
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
new file mode 100644
index 0000000000..c956924cc7
--- /dev/null
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
@@ -0,0 +1,120 @@
+/**************************************************************************
+ *
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Allocate buffers from two alternative buffer providers. + * + * \author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "util/u_memory.h" + +#include "pb_buffer.h" +#include "pb_bufmgr.h" + + +struct pb_alt_manager +{ + struct pb_manager base; + + struct pb_manager *provider1; + struct pb_manager *provider2; +}; + + +static INLINE struct pb_alt_manager * +pb_alt_manager(struct pb_manager *mgr) +{ + assert(mgr); + return (struct pb_alt_manager *)mgr; +} + + +static struct pb_buffer * +pb_alt_manager_create_buffer(struct pb_manager *_mgr, + size_t size, + const struct pb_desc *desc) +{ + struct pb_alt_manager *mgr = pb_alt_manager(_mgr); + struct pb_buffer *buf; + + buf = mgr->provider1->create_buffer(mgr->provider1, size, desc); + if(buf) + return buf; + + buf = mgr->provider2->create_buffer(mgr->provider2, size, desc); + return buf; +} + + +static void +pb_alt_manager_flush(struct pb_manager *_mgr) +{ + struct pb_alt_manager *mgr = pb_alt_manager(_mgr); + + assert(mgr->provider1->flush); + if(mgr->provider1->flush) + mgr->provider1->flush(mgr->provider1); + + assert(mgr->provider2->flush); + if(mgr->provider2->flush) + mgr->provider2->flush(mgr->provider2); +} + + +static void +pb_alt_manager_destroy(struct pb_manager *mgr) +{ + FREE(mgr); +} + + +struct pb_manager * +pb_alt_manager_create(struct pb_manager *provider1, + struct pb_manager *provider2) +{ + struct pb_alt_manager *mgr; + + if(!provider1 || !provider2) + return NULL; + + mgr = CALLOC_STRUCT(pb_alt_manager); + if (!mgr) + return NULL; + + mgr->base.destroy = pb_alt_manager_destroy; + mgr->base.create_buffer = pb_alt_manager_create_buffer; + mgr->base.flush = pb_alt_manager_flush; + mgr->provider1 = provider1; + mgr->provider2 = provider2; + + return &mgr->base; +} diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c new file mode 100644 index 0000000000..f57a7bffd7 --- /dev/null +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c @@ -0,0 +1,385 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Buffer cache. + * + * \author Jose Fonseca <jrfonseca-at-tungstengraphics-dot-com> + * \author Thomas Hellström <thomas-at-tungstengraphics-dot-com> + */ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "pipe/p_winsys.h" +#include "pipe/p_thread.h" +#include "util/u_memory.h" +#include "util/u_double_list.h" +#include "util/u_time.h" + +#include "pb_buffer.h" +#include "pb_bufmgr.h" + + +/** + * Convenience macro (type safe). + */ +#define SUPER(__derived) (&(__derived)->base) + + +struct pb_cache_manager; + + +/** + * Wrapper around a pipe buffer which adds delayed destruction. + */ +struct pb_cache_buffer +{ + struct pb_buffer base; + + struct pb_buffer *buffer; + struct pb_cache_manager *mgr; + + /** Caching time interval */ + struct util_time start, end; + + struct list_head head; +}; + + +struct pb_cache_manager +{ + struct pb_manager base; + + struct pb_manager *provider; + unsigned usecs; + + pipe_mutex mutex; + + struct list_head delayed; + size_t numDelayed; +}; + + +static INLINE struct pb_cache_buffer * +pb_cache_buffer(struct pb_buffer *buf) +{ + assert(buf); + return (struct pb_cache_buffer *)buf; +} + + +static INLINE struct pb_cache_manager * +pb_cache_manager(struct pb_manager *mgr) +{ + assert(mgr); + return (struct pb_cache_manager *)mgr; +} + + +/** + * Actually destroy the buffer. + */ +static INLINE void +_pb_cache_buffer_destroy(struct pb_cache_buffer *buf) +{ + struct pb_cache_manager *mgr = buf->mgr; + + LIST_DEL(&buf->head); + assert(mgr->numDelayed); + --mgr->numDelayed; + assert(!buf->base.base.refcount); + pb_reference(&buf->buffer, NULL); + FREE(buf); +} + + +/** + * Free as many cache buffers from the list head as possible. 
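+ *
+ * The delayed list is kept in insertion order, oldest first, so the walk can
+ * stop at the first buffer whose cache interval has not expired yet.
+ * (Descriptive note.)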
+ */ +static void +_pb_cache_buffer_list_check_free(struct pb_cache_manager *mgr) +{ + struct list_head *curr, *next; + struct pb_cache_buffer *buf; + struct util_time now; + + util_time_get(&now); + + curr = mgr->delayed.next; + next = curr->next; + while(curr != &mgr->delayed) { + buf = LIST_ENTRY(struct pb_cache_buffer, curr, head); + + if(!util_time_timeout(&buf->start, &buf->end, &now)) + break; + + _pb_cache_buffer_destroy(buf); + + curr = next; + next = curr->next; + } +} + + +static void +pb_cache_buffer_destroy(struct pb_buffer *_buf) +{ + struct pb_cache_buffer *buf = pb_cache_buffer(_buf); + struct pb_cache_manager *mgr = buf->mgr; + + pipe_mutex_lock(mgr->mutex); + assert(buf->base.base.refcount == 0); + + _pb_cache_buffer_list_check_free(mgr); + + util_time_get(&buf->start); + util_time_add(&buf->start, mgr->usecs, &buf->end); + LIST_ADDTAIL(&buf->head, &mgr->delayed); + ++mgr->numDelayed; + pipe_mutex_unlock(mgr->mutex); +} + + +static void * +pb_cache_buffer_map(struct pb_buffer *_buf, + unsigned flags) +{ + struct pb_cache_buffer *buf = pb_cache_buffer(_buf); + return pb_map(buf->buffer, flags); +} + + +static void +pb_cache_buffer_unmap(struct pb_buffer *_buf) +{ + struct pb_cache_buffer *buf = pb_cache_buffer(_buf); + pb_unmap(buf->buffer); +} + + +static enum pipe_error +pb_cache_buffer_validate(struct pb_buffer *_buf, + struct pb_validate *vl, + unsigned flags) +{ + struct pb_cache_buffer *buf = pb_cache_buffer(_buf); + return pb_validate(buf->buffer, vl, flags); +} + + +static void +pb_cache_buffer_fence(struct pb_buffer *_buf, + struct pipe_fence_handle *fence) +{ + struct pb_cache_buffer *buf = pb_cache_buffer(_buf); + pb_fence(buf->buffer, fence); +} + + +static void +pb_cache_buffer_get_base_buffer(struct pb_buffer *_buf, + struct pb_buffer **base_buf, + unsigned *offset) +{ + struct pb_cache_buffer *buf = pb_cache_buffer(_buf); + pb_get_base_buffer(buf->buffer, base_buf, offset); +} + + +const struct pb_vtbl +pb_cache_buffer_vtbl = { + pb_cache_buffer_destroy, + pb_cache_buffer_map, + pb_cache_buffer_unmap, + pb_cache_buffer_validate, + pb_cache_buffer_fence, + pb_cache_buffer_get_base_buffer +}; + + +static INLINE boolean +pb_cache_is_buffer_compat(struct pb_cache_buffer *buf, + size_t size, + const struct pb_desc *desc) +{ + if(buf->base.base.size < size) + return FALSE; + + /* be lenient with size */ + if(buf->base.base.size >= 2*size) + return FALSE; + + if(!pb_check_alignment(desc->alignment, buf->base.base.alignment)) + return FALSE; + + if(!pb_check_usage(desc->usage, buf->base.base.usage)) + return FALSE; + + return TRUE; +} + + +static struct pb_buffer * +pb_cache_manager_create_buffer(struct pb_manager *_mgr, + size_t size, + const struct pb_desc *desc) +{ + struct pb_cache_manager *mgr = pb_cache_manager(_mgr); + struct pb_cache_buffer *buf; + struct pb_cache_buffer *curr_buf; + struct list_head *curr, *next; + struct util_time now; + + pipe_mutex_lock(mgr->mutex); + + buf = NULL; + curr = mgr->delayed.next; + next = curr->next; + + /* search in the expired buffers, freeing them in the process */ + util_time_get(&now); + while(curr != &mgr->delayed) { + curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head); + if(!buf && pb_cache_is_buffer_compat(curr_buf, size, desc)) + buf = curr_buf; + else if(util_time_timeout(&curr_buf->start, &curr_buf->end, &now)) + _pb_cache_buffer_destroy(curr_buf); + else + /* This buffer (and all hereafter) are still hot in cache */ + break; + curr = next; + next = curr->next; + } + + /* keep searching in the hot buffers 
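+    * (these are buffers whose cache interval has not expired yet; they are
+    * only reused if compatible, never freed here)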
*/ + if(!buf) { + while(curr != &mgr->delayed) { + curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head); + if(pb_cache_is_buffer_compat(curr_buf, size, desc)) { + buf = curr_buf; + break; + } + /* no need to check the timeout here */ + curr = next; + next = curr->next; + } + } + + if(buf) { + LIST_DEL(&buf->head); + pipe_mutex_unlock(mgr->mutex); + ++buf->base.base.refcount; + return &buf->base; + } + + pipe_mutex_unlock(mgr->mutex); + + buf = CALLOC_STRUCT(pb_cache_buffer); + if(!buf) + return NULL; + + buf->buffer = mgr->provider->create_buffer(mgr->provider, size, desc); + if(!buf->buffer) { + FREE(buf); + return NULL; + } + + assert(buf->buffer->base.refcount >= 1); + assert(pb_check_alignment(desc->alignment, buf->buffer->base.alignment)); + assert(pb_check_usage(desc->usage, buf->buffer->base.usage)); + assert(buf->buffer->base.size >= size); + + buf->base.base.refcount = 1; + buf->base.base.alignment = buf->buffer->base.alignment; + buf->base.base.usage = buf->buffer->base.usage; + buf->base.base.size = buf->buffer->base.size; + + buf->base.vtbl = &pb_cache_buffer_vtbl; + buf->mgr = mgr; + + return &buf->base; +} + + +static void +pb_cache_manager_flush(struct pb_manager *_mgr) +{ + struct pb_cache_manager *mgr = pb_cache_manager(_mgr); + struct list_head *curr, *next; + struct pb_cache_buffer *buf; + + pipe_mutex_lock(mgr->mutex); + curr = mgr->delayed.next; + next = curr->next; + while(curr != &mgr->delayed) { + buf = LIST_ENTRY(struct pb_cache_buffer, curr, head); + _pb_cache_buffer_destroy(buf); + curr = next; + next = curr->next; + } + pipe_mutex_unlock(mgr->mutex); + + assert(mgr->provider->flush); + if(mgr->provider->flush) + mgr->provider->flush(mgr->provider); +} + + +static void +pb_cache_manager_destroy(struct pb_manager *mgr) +{ + pb_cache_manager_flush(mgr); + FREE(mgr); +} + + +struct pb_manager * +pb_cache_manager_create(struct pb_manager *provider, + unsigned usecs) +{ + struct pb_cache_manager *mgr; + + if(!provider) + return NULL; + + mgr = CALLOC_STRUCT(pb_cache_manager); + if (!mgr) + return NULL; + + mgr->base.destroy = pb_cache_manager_destroy; + mgr->base.create_buffer = pb_cache_manager_create_buffer; + mgr->base.flush = pb_cache_manager_flush; + mgr->provider = provider; + mgr->usecs = usecs; + LIST_INITHEAD(&mgr->delayed); + mgr->numDelayed = 0; + pipe_mutex_init(mgr->mutex); + + return &mgr->base; +} diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c new file mode 100644 index 0000000000..62639fe1c8 --- /dev/null +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c @@ -0,0 +1,391 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Debug buffer manager to detect buffer under- and overflows. + * + * \author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "pipe/p_winsys.h" +#include "pipe/p_thread.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_double_list.h" +#include "util/u_time.h" + +#include "pb_buffer.h" +#include "pb_bufmgr.h" + + +#ifdef DEBUG + + +/** + * Convenience macro (type safe). + */ +#define SUPER(__derived) (&(__derived)->base) + + +struct pb_debug_manager; + + +/** + * Wrapper around a pipe buffer which adds delayed destruction. + */ +struct pb_debug_buffer +{ + struct pb_buffer base; + + struct pb_buffer *buffer; + struct pb_debug_manager *mgr; + + size_t underflow_size; + size_t overflow_size; +}; + + +struct pb_debug_manager +{ + struct pb_manager base; + + struct pb_manager *provider; + + size_t band_size; +}; + + +static INLINE struct pb_debug_buffer * +pb_debug_buffer(struct pb_buffer *buf) +{ + assert(buf); + return (struct pb_debug_buffer *)buf; +} + + +static INLINE struct pb_debug_manager * +pb_debug_manager(struct pb_manager *mgr) +{ + assert(mgr); + return (struct pb_debug_manager *)mgr; +} + + +static const uint8_t random_pattern[32] = { + 0xaf, 0xcf, 0xa5, 0xa2, 0xc2, 0x63, 0x15, 0x1a, + 0x7e, 0xe2, 0x7e, 0x84, 0x15, 0x49, 0xa2, 0x1e, + 0x49, 0x63, 0xf5, 0x52, 0x74, 0x66, 0x9e, 0xc4, + 0x6d, 0xcf, 0x2c, 0x4a, 0x74, 0xe6, 0xfd, 0x94 +}; + + +static INLINE void +fill_random_pattern(uint8_t *dst, size_t size) +{ + size_t i = 0; + while(size--) { + *dst++ = random_pattern[i++]; + i &= sizeof(random_pattern) - 1; + } +} + + +static INLINE boolean +check_random_pattern(const uint8_t *dst, size_t size, + size_t *min_ofs, size_t *max_ofs) +{ + boolean result = TRUE; + size_t i; + *min_ofs = size; + *max_ofs = 0; + for(i = 0; i < size; ++i) { + if(*dst++ != random_pattern[i % sizeof(random_pattern)]) { + *min_ofs = MIN2(*min_ofs, i); + *max_ofs = MAX2(*max_ofs, i); + result = FALSE; + } + } + return result; +} + + +static void +pb_debug_buffer_fill(struct pb_debug_buffer *buf) +{ + uint8_t *map; + + map = pb_map(buf->buffer, PIPE_BUFFER_USAGE_CPU_WRITE); + assert(map); + if(map) { + fill_random_pattern(map, buf->underflow_size); + fill_random_pattern(map + buf->underflow_size + buf->base.base.size, + buf->overflow_size); + pb_unmap(buf->buffer); + } +} + + +/** + * Check for under/over flows. + * + * Should be called with the buffer unmaped. + */ +static void +pb_debug_buffer_check(struct pb_debug_buffer *buf) +{ + uint8_t *map; + + map = pb_map(buf->buffer, PIPE_BUFFER_USAGE_CPU_READ); + assert(map); + if(map) { + boolean underflow, overflow; + size_t min_ofs, max_ofs; + + underflow = !check_random_pattern(map, buf->underflow_size, + &min_ofs, &max_ofs); + if(underflow) { + debug_printf("buffer underflow (offset -%u%s to -%u bytes) detected\n", + buf->underflow_size - min_ofs, + min_ofs == 0 ? 
"+" : "", + buf->underflow_size - max_ofs); + } + + overflow = !check_random_pattern(map + buf->underflow_size + buf->base.base.size, + buf->overflow_size, + &min_ofs, &max_ofs); + if(overflow) { + debug_printf("buffer overflow (size %u plus offset %u to %u%s bytes) detected\n", + buf->base.base.size, + min_ofs, + max_ofs, + max_ofs == buf->overflow_size - 1 ? "+" : ""); + } + + debug_assert(!underflow && !overflow); + + /* re-fill if not aborted */ + if(underflow) + fill_random_pattern(map, buf->underflow_size); + if(overflow) + fill_random_pattern(map + buf->underflow_size + buf->base.base.size, + buf->overflow_size); + + pb_unmap(buf->buffer); + } +} + + +static void +pb_debug_buffer_destroy(struct pb_buffer *_buf) +{ + struct pb_debug_buffer *buf = pb_debug_buffer(_buf); + + assert(!buf->base.base.refcount); + + pb_debug_buffer_check(buf); + + pb_reference(&buf->buffer, NULL); + FREE(buf); +} + + +static void * +pb_debug_buffer_map(struct pb_buffer *_buf, + unsigned flags) +{ + struct pb_debug_buffer *buf = pb_debug_buffer(_buf); + void *map; + + pb_debug_buffer_check(buf); + + map = pb_map(buf->buffer, flags); + if(!map) + return NULL; + + return (uint8_t *)map + buf->underflow_size; +} + + +static void +pb_debug_buffer_unmap(struct pb_buffer *_buf) +{ + struct pb_debug_buffer *buf = pb_debug_buffer(_buf); + pb_unmap(buf->buffer); + + pb_debug_buffer_check(buf); +} + + +static void +pb_debug_buffer_get_base_buffer(struct pb_buffer *_buf, + struct pb_buffer **base_buf, + unsigned *offset) +{ + struct pb_debug_buffer *buf = pb_debug_buffer(_buf); + pb_get_base_buffer(buf->buffer, base_buf, offset); + *offset += buf->underflow_size; +} + + +static enum pipe_error +pb_debug_buffer_validate(struct pb_buffer *_buf, + struct pb_validate *vl, + unsigned flags) +{ + struct pb_debug_buffer *buf = pb_debug_buffer(_buf); + + pb_debug_buffer_check(buf); + + return pb_validate(buf->buffer, vl, flags); +} + + +static void +pb_debug_buffer_fence(struct pb_buffer *_buf, + struct pipe_fence_handle *fence) +{ + struct pb_debug_buffer *buf = pb_debug_buffer(_buf); + pb_fence(buf->buffer, fence); +} + + +const struct pb_vtbl +pb_debug_buffer_vtbl = { + pb_debug_buffer_destroy, + pb_debug_buffer_map, + pb_debug_buffer_unmap, + pb_debug_buffer_validate, + pb_debug_buffer_fence, + pb_debug_buffer_get_base_buffer +}; + + +static struct pb_buffer * +pb_debug_manager_create_buffer(struct pb_manager *_mgr, + size_t size, + const struct pb_desc *desc) +{ + struct pb_debug_manager *mgr = pb_debug_manager(_mgr); + struct pb_debug_buffer *buf; + struct pb_desc real_desc; + size_t real_size; + + buf = CALLOC_STRUCT(pb_debug_buffer); + if(!buf) + return NULL; + + real_size = size + 2*mgr->band_size; + real_desc = *desc; + real_desc.usage |= PIPE_BUFFER_USAGE_CPU_WRITE; + real_desc.usage |= PIPE_BUFFER_USAGE_CPU_READ; + + buf->buffer = mgr->provider->create_buffer(mgr->provider, + real_size, + &real_desc); + if(!buf->buffer) { + FREE(buf); + return NULL; + } + + assert(buf->buffer->base.refcount >= 1); + assert(pb_check_alignment(real_desc.alignment, buf->buffer->base.alignment)); + assert(pb_check_usage(real_desc.usage, buf->buffer->base.usage)); + assert(buf->buffer->base.size >= real_size); + + buf->base.base.refcount = 1; + buf->base.base.alignment = desc->alignment; + buf->base.base.usage = desc->usage; + buf->base.base.size = size; + + buf->base.vtbl = &pb_debug_buffer_vtbl; + buf->mgr = mgr; + + buf->underflow_size = mgr->band_size; + buf->overflow_size = buf->buffer->base.size - buf->underflow_size - size; + + 
pb_debug_buffer_fill(buf); + + return &buf->base; +} + + +static void +pb_debug_manager_flush(struct pb_manager *_mgr) +{ + struct pb_debug_manager *mgr = pb_debug_manager(_mgr); + assert(mgr->provider->flush); + if(mgr->provider->flush) + mgr->provider->flush(mgr->provider); +} + + +static void +pb_debug_manager_destroy(struct pb_manager *_mgr) +{ + struct pb_debug_manager *mgr = pb_debug_manager(_mgr); + mgr->provider->destroy(mgr->provider); + FREE(mgr); +} + + +struct pb_manager * +pb_debug_manager_create(struct pb_manager *provider, size_t band_size) +{ + struct pb_debug_manager *mgr; + + if(!provider) + return NULL; + + mgr = CALLOC_STRUCT(pb_debug_manager); + if (!mgr) + return NULL; + + mgr->base.destroy = pb_debug_manager_destroy; + mgr->base.create_buffer = pb_debug_manager_create_buffer; + mgr->base.flush = pb_debug_manager_flush; + mgr->provider = provider; + mgr->band_size = band_size; + + return &mgr->base; +} + + +#else /* !DEBUG */ + + +struct pb_manager * +pb_debug_manager_create(struct pb_manager *provider, size_t band_size) +{ + return provider; +} + + +#endif /* !DEBUG */ diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c new file mode 100644 index 0000000000..513ed28ca6 --- /dev/null +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c @@ -0,0 +1,148 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * + **************************************************************************/ + +/** + * \file + * A buffer manager that wraps buffers in fenced buffers. 
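+ *
+ * A typical setup is sketched below ('provider' and 'winsys' are assumed to
+ * come from the winsys layer; they are not defined here):
+ *
+ *   mgr = fenced_bufmgr_create(provider, winsys);
+ *   buf = mgr->create_buffer(mgr, size, &desc);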
+ * + * \author Jose Fonseca <jrfonseca@tungstengraphics.dot.com> + */ + + +#include "pipe/p_debug.h" +#include "util/u_memory.h" + +#include "pb_buffer.h" +#include "pb_buffer_fenced.h" +#include "pb_bufmgr.h" + + +struct fenced_pb_manager +{ + struct pb_manager base; + + struct pb_manager *provider; + + struct fenced_buffer_list *fenced_list; +}; + + +static INLINE struct fenced_pb_manager * +fenced_pb_manager(struct pb_manager *mgr) +{ + assert(mgr); + return (struct fenced_pb_manager *)mgr; +} + + +static struct pb_buffer * +fenced_bufmgr_create_buffer(struct pb_manager *mgr, + size_t size, + const struct pb_desc *desc) +{ + struct fenced_pb_manager *fenced_mgr = fenced_pb_manager(mgr); + struct pb_buffer *buf; + struct pb_buffer *fenced_buf; + + /* check for free buffers before allocating new ones */ + fenced_buffer_list_check_free(fenced_mgr->fenced_list, 0); + + buf = fenced_mgr->provider->create_buffer(fenced_mgr->provider, size, desc); + if(!buf) { + /* try harder to get a buffer */ + fenced_buffer_list_check_free(fenced_mgr->fenced_list, 1); + + buf = fenced_mgr->provider->create_buffer(fenced_mgr->provider, size, desc); + if(!buf) { + /* give up */ + return NULL; + } + } + + fenced_buf = fenced_buffer_create(fenced_mgr->fenced_list, buf); + if(!fenced_buf) { + pb_reference(&buf, NULL); + } + + return fenced_buf; +} + + +static void +fenced_bufmgr_flush(struct pb_manager *mgr) +{ + struct fenced_pb_manager *fenced_mgr = fenced_pb_manager(mgr); + + fenced_buffer_list_check_free(fenced_mgr->fenced_list, TRUE); + + assert(fenced_mgr->provider->flush); + if(fenced_mgr->provider->flush) + fenced_mgr->provider->flush(fenced_mgr->provider); +} + + +static void +fenced_bufmgr_destroy(struct pb_manager *mgr) +{ + struct fenced_pb_manager *fenced_mgr = fenced_pb_manager(mgr); + + fenced_buffer_list_destroy(fenced_mgr->fenced_list); + + if(fenced_mgr->provider) + fenced_mgr->provider->destroy(fenced_mgr->provider); + + FREE(fenced_mgr); +} + + +struct pb_manager * +fenced_bufmgr_create(struct pb_manager *provider, + struct pipe_winsys *winsys) +{ + struct fenced_pb_manager *fenced_mgr; + + if(!provider) + return NULL; + + fenced_mgr = CALLOC_STRUCT(fenced_pb_manager); + if (!fenced_mgr) + return NULL; + + fenced_mgr->base.destroy = fenced_bufmgr_destroy; + fenced_mgr->base.create_buffer = fenced_bufmgr_create_buffer; + fenced_mgr->base.flush = fenced_bufmgr_flush; + + fenced_mgr->provider = provider; + fenced_mgr->fenced_list = fenced_buffer_list_create(winsys); + if(!fenced_mgr->fenced_list) { + FREE(fenced_mgr); + return NULL; + } + + return &fenced_mgr->base; +} diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c new file mode 100644 index 0000000000..fd39838bfd --- /dev/null +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c @@ -0,0 +1,322 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Buffer manager using the old texture memory manager. + * + * \author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_debug.h" +#include "pipe/p_thread.h" +#include "util/u_memory.h" +#include "util/u_double_list.h" +#include "util/u_mm.h" +#include "pb_buffer.h" +#include "pb_bufmgr.h" + + +/** + * Convenience macro (type safe). + */ +#define SUPER(__derived) (&(__derived)->base) + + +struct mm_pb_manager +{ + struct pb_manager base; + + pipe_mutex mutex; + + size_t size; + struct mem_block *heap; + + size_t align2; + + struct pb_buffer *buffer; + void *map; +}; + + +static INLINE struct mm_pb_manager * +mm_pb_manager(struct pb_manager *mgr) +{ + assert(mgr); + return (struct mm_pb_manager *)mgr; +} + + +struct mm_buffer +{ + struct pb_buffer base; + + struct mm_pb_manager *mgr; + + struct mem_block *block; +}; + + +static INLINE struct mm_buffer * +mm_buffer(struct pb_buffer *buf) +{ + assert(buf); + return (struct mm_buffer *)buf; +} + + +static void +mm_buffer_destroy(struct pb_buffer *buf) +{ + struct mm_buffer *mm_buf = mm_buffer(buf); + struct mm_pb_manager *mm = mm_buf->mgr; + + assert(buf->base.refcount == 0); + + pipe_mutex_lock(mm->mutex); + u_mmFreeMem(mm_buf->block); + FREE(buf); + pipe_mutex_unlock(mm->mutex); +} + + +static void * +mm_buffer_map(struct pb_buffer *buf, + unsigned flags) +{ + struct mm_buffer *mm_buf = mm_buffer(buf); + struct mm_pb_manager *mm = mm_buf->mgr; + + return (unsigned char *) mm->map + mm_buf->block->ofs; +} + + +static void +mm_buffer_unmap(struct pb_buffer *buf) +{ + /* No-op */ +} + + +static enum pipe_error +mm_buffer_validate(struct pb_buffer *buf, + struct pb_validate *vl, + unsigned flags) +{ + struct mm_buffer *mm_buf = mm_buffer(buf); + struct mm_pb_manager *mm = mm_buf->mgr; + return pb_validate(mm->buffer, vl, flags); +} + + +static void +mm_buffer_fence(struct pb_buffer *buf, + struct pipe_fence_handle *fence) +{ + struct mm_buffer *mm_buf = mm_buffer(buf); + struct mm_pb_manager *mm = mm_buf->mgr; + pb_fence(mm->buffer, fence); +} + + +static void +mm_buffer_get_base_buffer(struct pb_buffer *buf, + struct pb_buffer **base_buf, + unsigned *offset) +{ + struct mm_buffer *mm_buf = mm_buffer(buf); + struct mm_pb_manager *mm = mm_buf->mgr; + 
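+   /* The base buffer is the manager's single heap buffer; the block offset
+    * locates this sub-allocation within it. */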
pb_get_base_buffer(mm->buffer, base_buf, offset); + *offset += mm_buf->block->ofs; +} + + +static const struct pb_vtbl +mm_buffer_vtbl = { + mm_buffer_destroy, + mm_buffer_map, + mm_buffer_unmap, + mm_buffer_validate, + mm_buffer_fence, + mm_buffer_get_base_buffer +}; + + +static struct pb_buffer * +mm_bufmgr_create_buffer(struct pb_manager *mgr, + size_t size, + const struct pb_desc *desc) +{ + struct mm_pb_manager *mm = mm_pb_manager(mgr); + struct mm_buffer *mm_buf; + + /* We don't handle alignments larger then the one initially setup */ + assert(desc->alignment % (1 << mm->align2) == 0); + if(desc->alignment % (1 << mm->align2)) + return NULL; + + pipe_mutex_lock(mm->mutex); + + mm_buf = CALLOC_STRUCT(mm_buffer); + if (!mm_buf) { + pipe_mutex_unlock(mm->mutex); + return NULL; + } + + mm_buf->base.base.refcount = 1; + mm_buf->base.base.alignment = desc->alignment; + mm_buf->base.base.usage = desc->usage; + mm_buf->base.base.size = size; + + mm_buf->base.vtbl = &mm_buffer_vtbl; + + mm_buf->mgr = mm; + + mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0); + if(!mm_buf->block) { + debug_printf("warning: heap full\n"); +#if 0 + mmDumpMemInfo(mm->heap); +#endif + + mm_buf->block = u_mmAllocMem(mm->heap, size, mm->align2, 0); + if(!mm_buf->block) { + FREE(mm_buf); + pipe_mutex_unlock(mm->mutex); + return NULL; + } + } + + /* Some sanity checks */ + assert(0 <= (unsigned)mm_buf->block->ofs && (unsigned)mm_buf->block->ofs < mm->size); + assert(size <= (unsigned)mm_buf->block->size && (unsigned)mm_buf->block->ofs + (unsigned)mm_buf->block->size <= mm->size); + + pipe_mutex_unlock(mm->mutex); + return SUPER(mm_buf); +} + + +static void +mm_bufmgr_flush(struct pb_manager *mgr) +{ + /* No-op */ +} + + +static void +mm_bufmgr_destroy(struct pb_manager *mgr) +{ + struct mm_pb_manager *mm = mm_pb_manager(mgr); + + pipe_mutex_lock(mm->mutex); + + u_mmDestroy(mm->heap); + + pb_unmap(mm->buffer); + pb_reference(&mm->buffer, NULL); + + pipe_mutex_unlock(mm->mutex); + + FREE(mgr); +} + + +struct pb_manager * +mm_bufmgr_create_from_buffer(struct pb_buffer *buffer, + size_t size, size_t align2) +{ + struct mm_pb_manager *mm; + + if(!buffer) + return NULL; + + mm = CALLOC_STRUCT(mm_pb_manager); + if (!mm) + return NULL; + + mm->base.destroy = mm_bufmgr_destroy; + mm->base.create_buffer = mm_bufmgr_create_buffer; + mm->base.flush = mm_bufmgr_flush; + + mm->size = size; + mm->align2 = align2; /* 64-byte alignment */ + + pipe_mutex_init(mm->mutex); + + mm->buffer = buffer; + + mm->map = pb_map(mm->buffer, + PIPE_BUFFER_USAGE_CPU_READ | + PIPE_BUFFER_USAGE_CPU_WRITE); + if(!mm->map) + goto failure; + + mm->heap = u_mmInit(0, size); + if (!mm->heap) + goto failure; + + return SUPER(mm); + +failure: +if(mm->heap) + u_mmDestroy(mm->heap); + if(mm->map) + pb_unmap(mm->buffer); + if(mm) + FREE(mm); + return NULL; +} + + +struct pb_manager * +mm_bufmgr_create(struct pb_manager *provider, + size_t size, size_t align2) +{ + struct pb_buffer *buffer; + struct pb_manager *mgr; + struct pb_desc desc; + + if(!provider) + return NULL; + + memset(&desc, 0, sizeof(desc)); + desc.alignment = 1 << align2; + + buffer = provider->create_buffer(provider, size, &desc); + if (!buffer) + return NULL; + + mgr = mm_bufmgr_create_from_buffer(buffer, size, align2); + if (!mgr) { + pb_reference(&buffer, NULL); + return NULL; + } + + return mgr; +} diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c new file mode 100644 index 0000000000..ba02a84e62 --- /dev/null 
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_ondemand.c @@ -0,0 +1,303 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * A variation of malloc buffers which get transferred to real graphics memory + * when there is an attempt to validate them. + * + * @author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + + +#include "pipe/p_debug.h" +#include "util/u_memory.h" +#include "pb_buffer.h" +#include "pb_bufmgr.h" + + +struct pb_ondemand_manager; + + +struct pb_ondemand_buffer +{ + struct pb_buffer base; + + struct pb_ondemand_manager *mgr; + + /** Regular malloc'ed memory */ + void *data; + unsigned mapcount; + + /** Real buffer */ + struct pb_buffer *buffer; + size_t size; + struct pb_desc desc; +}; + + +struct pb_ondemand_manager +{ + struct pb_manager base; + + struct pb_manager *provider; +}; + + +extern const struct pb_vtbl pb_ondemand_buffer_vtbl; + +static INLINE struct pb_ondemand_buffer * +pb_ondemand_buffer(struct pb_buffer *buf) +{ + assert(buf); + assert(buf->vtbl == &pb_ondemand_buffer_vtbl); + return (struct pb_ondemand_buffer *)buf; +} + +static INLINE struct pb_ondemand_manager * +pb_ondemand_manager(struct pb_manager *mgr) +{ + assert(mgr); + return (struct pb_ondemand_manager *)mgr; +} + + +static void +pb_ondemand_buffer_destroy(struct pb_buffer *_buf) +{ + struct pb_ondemand_buffer *buf = pb_ondemand_buffer(_buf); + + pb_reference(&buf->buffer, NULL); + + align_free(buf->data); + + FREE(buf); +} + + +static void * +pb_ondemand_buffer_map(struct pb_buffer *_buf, + unsigned flags) +{ + struct pb_ondemand_buffer *buf = pb_ondemand_buffer(_buf); + + if(buf->buffer) { + assert(!buf->data); + return pb_map(buf->buffer, flags); + } + else { + assert(buf->data); + ++buf->mapcount; + return buf->data; + } +} + + +static void +pb_ondemand_buffer_unmap(struct pb_buffer *_buf) +{ + struct pb_ondemand_buffer *buf = pb_ondemand_buffer(_buf); + + if(buf->buffer) { + assert(!buf->data); + pb_unmap(buf->buffer); + } + else { + assert(buf->data); + assert(buf->mapcount); + if(buf->mapcount) + --buf->mapcount; + } +} + + +static enum pipe_error +pb_ondemand_buffer_instantiate(struct pb_ondemand_buffer *buf) +{ + if(!buf->buffer) { + struct 
pb_manager *provider = buf->mgr->provider; + uint8_t *map; + + assert(!buf->mapcount); + + buf->buffer = provider->create_buffer(provider, buf->size, &buf->desc); + if(!buf->buffer) + return PIPE_ERROR_OUT_OF_MEMORY; + + map = pb_map(buf->buffer, PIPE_BUFFER_USAGE_CPU_READ); + if(!map) { + pb_reference(&buf->buffer, NULL); + return PIPE_ERROR; + } + + memcpy(map, buf->data, buf->size); + + pb_unmap(buf->buffer); + + if(!buf->mapcount) { + FREE(buf->data); + buf->data = NULL; + } + } + + return PIPE_OK; +} + +static enum pipe_error +pb_ondemand_buffer_validate(struct pb_buffer *_buf, + struct pb_validate *vl, + unsigned flags) +{ + struct pb_ondemand_buffer *buf = pb_ondemand_buffer(_buf); + enum pipe_error ret; + + assert(!buf->mapcount); + if(buf->mapcount) + return PIPE_ERROR; + + ret = pb_ondemand_buffer_instantiate(buf); + if(ret != PIPE_OK) + return ret; + + return pb_validate(buf->buffer, vl, flags); +} + + +static void +pb_ondemand_buffer_fence(struct pb_buffer *_buf, + struct pipe_fence_handle *fence) +{ + struct pb_ondemand_buffer *buf = pb_ondemand_buffer(_buf); + + assert(buf->buffer); + if(!buf->buffer) + return; + + pb_fence(buf->buffer, fence); +} + + +static void +pb_ondemand_buffer_get_base_buffer(struct pb_buffer *_buf, + struct pb_buffer **base_buf, + unsigned *offset) +{ + struct pb_ondemand_buffer *buf = pb_ondemand_buffer(_buf); + + if(pb_ondemand_buffer_instantiate(buf) != PIPE_OK) { + assert(0); + *base_buf = &buf->base; + *offset = 0; + return; + } + + pb_get_base_buffer(buf->buffer, base_buf, offset); +} + + +const struct pb_vtbl +pb_ondemand_buffer_vtbl = { + pb_ondemand_buffer_destroy, + pb_ondemand_buffer_map, + pb_ondemand_buffer_unmap, + pb_ondemand_buffer_validate, + pb_ondemand_buffer_fence, + pb_ondemand_buffer_get_base_buffer +}; + + +static struct pb_buffer * +pb_ondemand_manager_create_buffer(struct pb_manager *_mgr, + size_t size, + const struct pb_desc *desc) +{ + struct pb_ondemand_manager *mgr = pb_ondemand_manager(_mgr); + struct pb_ondemand_buffer *buf; + + buf = CALLOC_STRUCT(pb_ondemand_buffer); + if(!buf) + return NULL; + + buf->base.base.refcount = 1; + buf->base.base.alignment = desc->alignment; + buf->base.base.usage = desc->usage; + buf->base.base.size = size; + buf->base.vtbl = &pb_ondemand_buffer_vtbl; + + buf->mgr = mgr; + + buf->data = align_malloc(size, desc->alignment < sizeof(void*) ? 
sizeof(void*) : desc->alignment); + if(!buf->data) { + FREE(buf); + return NULL; + } + + buf->size = size; + buf->desc = *desc; + + return &buf->base; +} + + +static void +pb_ondemand_manager_flush(struct pb_manager *_mgr) +{ + struct pb_ondemand_manager *mgr = pb_ondemand_manager(_mgr); + + mgr->provider->flush(mgr->provider); +} + + +static void +pb_ondemand_manager_destroy(struct pb_manager *_mgr) +{ + struct pb_ondemand_manager *mgr = pb_ondemand_manager(_mgr); + + FREE(mgr); +} + + +struct pb_manager * +pb_ondemand_manager_create(struct pb_manager *provider) +{ + struct pb_ondemand_manager *mgr; + + if(!provider) + return NULL; + + mgr = CALLOC_STRUCT(pb_ondemand_manager); + if(!mgr) + return NULL; + + mgr->base.destroy = pb_ondemand_manager_destroy; + mgr->base.create_buffer = pb_ondemand_manager_create_buffer; + mgr->base.flush = pb_ondemand_manager_flush; + + mgr->provider = provider; + + return &mgr->base; +} diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c new file mode 100644 index 0000000000..a6ff37653e --- /dev/null +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c @@ -0,0 +1,321 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * + **************************************************************************/ + +/** + * \file + * Batch buffer pool management. + * + * \author Jose Fonseca <jrfonseca-at-tungstengraphics-dot-com> + * \author Thomas Hellström <thomas-at-tungstengraphics-dot-com> + */ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "pipe/p_thread.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "util/u_double_list.h" + +#include "pb_buffer.h" +#include "pb_bufmgr.h" + + +/** + * Convenience macro (type safe). 
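+ * Given a pointer to a derived object (e.g. a struct pool_buffer), yields the address of its embedded base member, + * so callers obtain the struct pb_buffer / struct pb_manager view without an explicit cast.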
+ */ +#define SUPER(__derived) (&(__derived)->base) + + +struct pool_pb_manager +{ + struct pb_manager base; + + pipe_mutex mutex; + + size_t bufSize; + size_t bufAlign; + + size_t numFree; + size_t numTot; + + struct list_head free; + + struct pb_buffer *buffer; + void *map; + + struct pool_buffer *bufs; +}; + + +static INLINE struct pool_pb_manager * +pool_pb_manager(struct pb_manager *mgr) +{ + assert(mgr); + return (struct pool_pb_manager *)mgr; +} + + +struct pool_buffer +{ + struct pb_buffer base; + + struct pool_pb_manager *mgr; + + struct list_head head; + + size_t start; +}; + + +static INLINE struct pool_buffer * +pool_buffer(struct pb_buffer *buf) +{ + assert(buf); + return (struct pool_buffer *)buf; +} + + + +static void +pool_buffer_destroy(struct pb_buffer *buf) +{ + struct pool_buffer *pool_buf = pool_buffer(buf); + struct pool_pb_manager *pool = pool_buf->mgr; + + assert(pool_buf->base.base.refcount == 0); + + pipe_mutex_lock(pool->mutex); + LIST_ADD(&pool_buf->head, &pool->free); + pool->numFree++; + pipe_mutex_unlock(pool->mutex); +} + + +static void * +pool_buffer_map(struct pb_buffer *buf, unsigned flags) +{ + struct pool_buffer *pool_buf = pool_buffer(buf); + struct pool_pb_manager *pool = pool_buf->mgr; + void *map; + + pipe_mutex_lock(pool->mutex); + map = (unsigned char *) pool->map + pool_buf->start; + pipe_mutex_unlock(pool->mutex); + return map; +} + + +static void +pool_buffer_unmap(struct pb_buffer *buf) +{ + /* No-op */ +} + + +static enum pipe_error +pool_buffer_validate(struct pb_buffer *buf, + struct pb_validate *vl, + unsigned flags) +{ + struct pool_buffer *pool_buf = pool_buffer(buf); + struct pool_pb_manager *pool = pool_buf->mgr; + return pb_validate(pool->buffer, vl, flags); +} + + +static void +pool_buffer_fence(struct pb_buffer *buf, + struct pipe_fence_handle *fence) +{ + struct pool_buffer *pool_buf = pool_buffer(buf); + struct pool_pb_manager *pool = pool_buf->mgr; + pb_fence(pool->buffer, fence); +} + + +static void +pool_buffer_get_base_buffer(struct pb_buffer *buf, + struct pb_buffer **base_buf, + unsigned *offset) +{ + struct pool_buffer *pool_buf = pool_buffer(buf); + struct pool_pb_manager *pool = pool_buf->mgr; + pb_get_base_buffer(pool->buffer, base_buf, offset); + *offset += pool_buf->start; +} + + +static const struct pb_vtbl +pool_buffer_vtbl = { + pool_buffer_destroy, + pool_buffer_map, + pool_buffer_unmap, + pool_buffer_validate, + pool_buffer_fence, + pool_buffer_get_base_buffer +}; + + +static struct pb_buffer * +pool_bufmgr_create_buffer(struct pb_manager *mgr, + size_t size, + const struct pb_desc *desc) +{ + struct pool_pb_manager *pool = pool_pb_manager(mgr); + struct pool_buffer *pool_buf; + struct list_head *item; + + assert(size == pool->bufSize); + assert(pool->bufAlign % desc->alignment == 0); + + pipe_mutex_lock(pool->mutex); + + if (pool->numFree == 0) { + pipe_mutex_unlock(pool->mutex); + debug_printf("warning: out of fixed size buffer objects\n"); + return NULL; + } + + item = pool->free.next; + + if (item == &pool->free) { + pipe_mutex_unlock(pool->mutex); + debug_printf("error: fixed size buffer pool corruption\n"); + return NULL; + } + + LIST_DEL(item); + --pool->numFree; + + pipe_mutex_unlock(pool->mutex); + + pool_buf = LIST_ENTRY(struct pool_buffer, item, head); + assert(pool_buf->base.base.refcount == 0); + pool_buf->base.base.refcount = 1; + pool_buf->base.base.alignment = desc->alignment; + pool_buf->base.base.usage = desc->usage; + + return SUPER(pool_buf); +} + + +static void +pool_bufmgr_flush(struct 
pb_manager *mgr) +{ + /* No-op */ +} + + +static void +pool_bufmgr_destroy(struct pb_manager *mgr) +{ + struct pool_pb_manager *pool = pool_pb_manager(mgr); + pipe_mutex_lock(pool->mutex); + + FREE(pool->bufs); + + pb_unmap(pool->buffer); + pb_reference(&pool->buffer, NULL); + + pipe_mutex_unlock(pool->mutex); + + FREE(mgr); +} + + +struct pb_manager * +pool_bufmgr_create(struct pb_manager *provider, + size_t numBufs, + size_t bufSize, + const struct pb_desc *desc) +{ + struct pool_pb_manager *pool; + struct pool_buffer *pool_buf; + size_t i; + + if(!provider) + return NULL; + + pool = CALLOC_STRUCT(pool_pb_manager); + if (!pool) + return NULL; + + pool->base.destroy = pool_bufmgr_destroy; + pool->base.create_buffer = pool_bufmgr_create_buffer; + pool->base.flush = pool_bufmgr_flush; + + LIST_INITHEAD(&pool->free); + + pool->numTot = numBufs; + pool->numFree = numBufs; + pool->bufSize = bufSize; + pool->bufAlign = desc->alignment; + + pipe_mutex_init(pool->mutex); + + pool->buffer = provider->create_buffer(provider, numBufs*bufSize, desc); + if (!pool->buffer) + goto failure; + + pool->map = pb_map(pool->buffer, + PIPE_BUFFER_USAGE_CPU_READ | + PIPE_BUFFER_USAGE_CPU_WRITE); + if(!pool->map) + goto failure; + + pool->bufs = (struct pool_buffer *)CALLOC(numBufs, sizeof(*pool->bufs)); + if (!pool->bufs) + goto failure; + + pool_buf = pool->bufs; + for (i = 0; i < numBufs; ++i) { + pool_buf->base.base.refcount = 0; + pool_buf->base.base.alignment = 0; + pool_buf->base.base.usage = 0; + pool_buf->base.base.size = bufSize; + pool_buf->base.vtbl = &pool_buffer_vtbl; + pool_buf->mgr = pool; + pool_buf->start = i * bufSize; + LIST_ADDTAIL(&pool_buf->head, &pool->free); + pool_buf++; + } + + return SUPER(pool); + +failure: + if(pool->bufs) + FREE(pool->bufs); + if(pool->map) + pb_unmap(pool->buffer); + if(pool->buffer) + pb_reference(&pool->buffer, NULL); + if(pool) + FREE(pool); + return NULL; +} diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c new file mode 100644 index 0000000000..9b9fedccb4 --- /dev/null +++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c @@ -0,0 +1,584 @@ +/************************************************************************** + * + * Copyright 2006-2008 Tungsten Graphics, Inc., Cedar Park, TX., USA + * All Rights Reserved. + * + * Permission is hereby granted, FREE of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * + **************************************************************************/ + +/** + * @file + * S-lab pool implementation. + * + * @sa http://en.wikipedia.org/wiki/Slab_allocation + * + * @author Thomas Hellstrom <thomas-at-tungstengraphics-dot-com> + * @author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + +#include "pipe/p_compiler.h" +#include "pipe/p_error.h" +#include "pipe/p_debug.h" +#include "pipe/p_thread.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "util/u_double_list.h" +#include "util/u_time.h" + +#include "pb_buffer.h" +#include "pb_bufmgr.h" + + +struct pb_slab; + + +/** + * Buffer in a slab. + * + * Sub-allocation of a contiguous buffer. + */ +struct pb_slab_buffer +{ + struct pb_buffer base; + + struct pb_slab *slab; + + struct list_head head; + + unsigned mapCount; + + /** Offset relative to the start of the slab buffer. */ + size_t start; + + /** Use when validating, to signal that all mappings are finished */ + /* TODO: Actually validation does not reach this stage yet */ + pipe_condvar event; +}; + + +/** + * Slab -- a contiguous piece of memory. + */ +struct pb_slab +{ + struct list_head head; + struct list_head freeBuffers; + size_t numBuffers; + size_t numFree; + + struct pb_slab_buffer *buffers; + struct pb_slab_manager *mgr; + + /** Buffer from the provider */ + struct pb_buffer *bo; + + void *virtual; +}; + + +/** + * It adds/removes slabs as needed in order to meet the allocation/destruction + * of individual buffers. + */ +struct pb_slab_manager +{ + struct pb_manager base; + + /** From where we get our buffers */ + struct pb_manager *provider; + + /** Size of the buffers we hand on downstream */ + size_t bufSize; + + /** Size of the buffers we request upstream */ + size_t slabSize; + + /** + * Alignment, usage to be used to allocate the slab buffers. + * + * We can only provide buffers which are consistent (in alignment, usage) + * with this description. + */ + struct pb_desc desc; + + /** + * Partial slabs + * + * Full slabs are not stored in any list. Empty slabs are destroyed + * immediatly. + */ + struct list_head slabs; + + pipe_mutex mutex; +}; + + +/** + * Wrapper around several slabs, therefore capable of handling buffers of + * multiple sizes. + * + * This buffer manager just dispatches buffer allocations to the appropriate slab + * manager, according to the requested buffer size, or by passes the slab + * managers altogether for even greater sizes. + * + * The data of this structure remains constant after + * initialization and thus needs no mutex protection. + */ +struct pb_slab_range_manager +{ + struct pb_manager base; + + struct pb_manager *provider; + + size_t minBufSize; + size_t maxBufSize; + + /** @sa pb_slab_manager::desc */ + struct pb_desc desc; + + unsigned numBuckets; + size_t *bucketSizes; + + /** Array of pb_slab_manager, one for each bucket size */ + struct pb_manager **buckets; +}; + + +static INLINE struct pb_slab_buffer * +pb_slab_buffer(struct pb_buffer *buf) +{ + assert(buf); + return (struct pb_slab_buffer *)buf; +} + + +static INLINE struct pb_slab_manager * +pb_slab_manager(struct pb_manager *mgr) +{ + assert(mgr); + return (struct pb_slab_manager *)mgr; +} + + +static INLINE struct pb_slab_range_manager * +pb_slab_range_manager(struct pb_manager *mgr) +{ + assert(mgr); + return (struct pb_slab_range_manager *)mgr; +} + + +/** + * Delete a buffer from the slab delayed list and put + * it on the slab FREE list. 
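+ * When the last buffer of a slab is returned this way, the slab itself is destroyed and its backing buffer + * is released back to the provider.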
+ */ +static void +pb_slab_buffer_destroy(struct pb_buffer *_buf) +{ + struct pb_slab_buffer *buf = pb_slab_buffer(_buf); + struct pb_slab *slab = buf->slab; + struct pb_slab_manager *mgr = slab->mgr; + struct list_head *list = &buf->head; + + pipe_mutex_lock(mgr->mutex); + + assert(buf->base.base.refcount == 0); + + buf->mapCount = 0; + + LIST_DEL(list); + LIST_ADDTAIL(list, &slab->freeBuffers); + slab->numFree++; + + if (slab->head.next == &slab->head) + LIST_ADDTAIL(&slab->head, &mgr->slabs); + + /* If the slab becomes totally empty, free it */ + if (slab->numFree == slab->numBuffers) { + list = &slab->head; + LIST_DELINIT(list); + pb_reference(&slab->bo, NULL); + FREE(slab->buffers); + FREE(slab); + } + + pipe_mutex_unlock(mgr->mutex); +} + + +static void * +pb_slab_buffer_map(struct pb_buffer *_buf, + unsigned flags) +{ + struct pb_slab_buffer *buf = pb_slab_buffer(_buf); + + ++buf->mapCount; + return (void *) ((uint8_t *) buf->slab->virtual + buf->start); +} + + +static void +pb_slab_buffer_unmap(struct pb_buffer *_buf) +{ + struct pb_slab_buffer *buf = pb_slab_buffer(_buf); + + --buf->mapCount; + if (buf->mapCount == 0) + pipe_condvar_broadcast(buf->event); +} + + +static enum pipe_error +pb_slab_buffer_validate(struct pb_buffer *_buf, + struct pb_validate *vl, + unsigned flags) +{ + struct pb_slab_buffer *buf = pb_slab_buffer(_buf); + return pb_validate(buf->slab->bo, vl, flags); +} + + +static void +pb_slab_buffer_fence(struct pb_buffer *_buf, + struct pipe_fence_handle *fence) +{ + struct pb_slab_buffer *buf = pb_slab_buffer(_buf); + pb_fence(buf->slab->bo, fence); +} + + +static void +pb_slab_buffer_get_base_buffer(struct pb_buffer *_buf, + struct pb_buffer **base_buf, + unsigned *offset) +{ + struct pb_slab_buffer *buf = pb_slab_buffer(_buf); + pb_get_base_buffer(buf->slab->bo, base_buf, offset); + *offset += buf->start; +} + + +static const struct pb_vtbl +pb_slab_buffer_vtbl = { + pb_slab_buffer_destroy, + pb_slab_buffer_map, + pb_slab_buffer_unmap, + pb_slab_buffer_validate, + pb_slab_buffer_fence, + pb_slab_buffer_get_base_buffer +}; + + +/** + * Create a new slab. + * + * Called when we ran out of free slabs. + */ +static enum pipe_error +pb_slab_create(struct pb_slab_manager *mgr) +{ + struct pb_slab *slab; + struct pb_slab_buffer *buf; + unsigned numBuffers; + unsigned i; + enum pipe_error ret; + + slab = CALLOC_STRUCT(pb_slab); + if (!slab) + return PIPE_ERROR_OUT_OF_MEMORY; + + slab->bo = mgr->provider->create_buffer(mgr->provider, mgr->slabSize, &mgr->desc); + if(!slab->bo) { + ret = PIPE_ERROR_OUT_OF_MEMORY; + goto out_err0; + } + + /* Note down the slab virtual address. All mappings are accessed directly + * through this address so it is required that the buffer is pinned. 
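+ * The pointer stays valid after the pb_unmap() below only because the provider buffer is expected to remain + * pinned; sub-buffer maps simply add their start offset to this cached address.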
*/ + slab->virtual = pb_map(slab->bo, + PIPE_BUFFER_USAGE_CPU_READ | + PIPE_BUFFER_USAGE_CPU_WRITE); + if(!slab->virtual) { + ret = PIPE_ERROR_OUT_OF_MEMORY; + goto out_err1; + } + pb_unmap(slab->bo); + + numBuffers = slab->bo->base.size / mgr->bufSize; + + slab->buffers = CALLOC(numBuffers, sizeof(*slab->buffers)); + if (!slab->buffers) { + ret = PIPE_ERROR_OUT_OF_MEMORY; + goto out_err1; + } + + LIST_INITHEAD(&slab->head); + LIST_INITHEAD(&slab->freeBuffers); + slab->numBuffers = numBuffers; + slab->numFree = 0; + slab->mgr = mgr; + + buf = slab->buffers; + for (i=0; i < numBuffers; ++i) { + buf->base.base.refcount = 0; + buf->base.base.size = mgr->bufSize; + buf->base.base.alignment = 0; + buf->base.base.usage = 0; + buf->base.vtbl = &pb_slab_buffer_vtbl; + buf->slab = slab; + buf->start = i* mgr->bufSize; + buf->mapCount = 0; + pipe_condvar_init(buf->event); + LIST_ADDTAIL(&buf->head, &slab->freeBuffers); + slab->numFree++; + buf++; + } + + /* Add this slab to the list of partial slabs */ + LIST_ADDTAIL(&slab->head, &mgr->slabs); + + return PIPE_OK; + +out_err1: + pb_reference(&slab->bo, NULL); +out_err0: + FREE(slab); + return ret; +} + + +static struct pb_buffer * +pb_slab_manager_create_buffer(struct pb_manager *_mgr, + size_t size, + const struct pb_desc *desc) +{ + struct pb_slab_manager *mgr = pb_slab_manager(_mgr); + static struct pb_slab_buffer *buf; + struct pb_slab *slab; + struct list_head *list; + + /* check size */ + assert(size <= mgr->bufSize); + if(size > mgr->bufSize) + return NULL; + + /* check if we can provide the requested alignment */ + assert(pb_check_alignment(desc->alignment, mgr->desc.alignment)); + if(!pb_check_alignment(desc->alignment, mgr->desc.alignment)) + return NULL; + assert(pb_check_alignment(desc->alignment, mgr->bufSize)); + if(!pb_check_alignment(desc->alignment, mgr->bufSize)) + return NULL; + + assert(pb_check_usage(desc->usage, mgr->desc.usage)); + if(!pb_check_usage(desc->usage, mgr->desc.usage)) + return NULL; + + pipe_mutex_lock(mgr->mutex); + + /* Create a new slab, if we run out of partial slabs */ + if (mgr->slabs.next == &mgr->slabs) { + (void) pb_slab_create(mgr); + if (mgr->slabs.next == &mgr->slabs) { + pipe_mutex_unlock(mgr->mutex); + return NULL; + } + } + + /* Allocate the buffer from a partial (or just created) slab */ + list = mgr->slabs.next; + slab = LIST_ENTRY(struct pb_slab, list, head); + + /* If totally full remove from the partial slab list */ + if (--slab->numFree == 0) + LIST_DELINIT(list); + + list = slab->freeBuffers.next; + LIST_DELINIT(list); + + pipe_mutex_unlock(mgr->mutex); + buf = LIST_ENTRY(struct pb_slab_buffer, list, head); + + ++buf->base.base.refcount; + buf->base.base.alignment = desc->alignment; + buf->base.base.usage = desc->usage; + + return &buf->base; +} + + +static void +pb_slab_manager_flush(struct pb_manager *_mgr) +{ + struct pb_slab_manager *mgr = pb_slab_manager(_mgr); + + assert(mgr->provider->flush); + if(mgr->provider->flush) + mgr->provider->flush(mgr->provider); +} + + +static void +pb_slab_manager_destroy(struct pb_manager *_mgr) +{ + struct pb_slab_manager *mgr = pb_slab_manager(_mgr); + + /* TODO: cleanup all allocated buffers */ + FREE(mgr); +} + + +struct pb_manager * +pb_slab_manager_create(struct pb_manager *provider, + size_t bufSize, + size_t slabSize, + const struct pb_desc *desc) +{ + struct pb_slab_manager *mgr; + + mgr = CALLOC_STRUCT(pb_slab_manager); + if (!mgr) + return NULL; + + mgr->base.destroy = pb_slab_manager_destroy; + mgr->base.create_buffer = 
pb_slab_manager_create_buffer; + mgr->base.flush = pb_slab_manager_flush; + + mgr->provider = provider; + mgr->bufSize = bufSize; + mgr->slabSize = slabSize; + mgr->desc = *desc; + + LIST_INITHEAD(&mgr->slabs); + + pipe_mutex_init(mgr->mutex); + + return &mgr->base; +} + + +static struct pb_buffer * +pb_slab_range_manager_create_buffer(struct pb_manager *_mgr, + size_t size, + const struct pb_desc *desc) +{ + struct pb_slab_range_manager *mgr = pb_slab_range_manager(_mgr); + size_t bufSize; + unsigned i; + + bufSize = mgr->minBufSize; + for (i = 0; i < mgr->numBuckets; ++i) { + if(bufSize >= size) + return mgr->buckets[i]->create_buffer(mgr->buckets[i], size, desc); + bufSize *= 2; + } + + /* Fall back to allocate a buffer object directly from the provider. */ + return mgr->provider->create_buffer(mgr->provider, size, desc); +} + + +static void +pb_slab_range_manager_flush(struct pb_manager *_mgr) +{ + struct pb_slab_range_manager *mgr = pb_slab_range_manager(_mgr); + + /* Individual slabs don't hold any temporary buffers so no need to call them */ + + assert(mgr->provider->flush); + if(mgr->provider->flush) + mgr->provider->flush(mgr->provider); +} + + +static void +pb_slab_range_manager_destroy(struct pb_manager *_mgr) +{ + struct pb_slab_range_manager *mgr = pb_slab_range_manager(_mgr); + unsigned i; + + for (i = 0; i < mgr->numBuckets; ++i) + mgr->buckets[i]->destroy(mgr->buckets[i]); + FREE(mgr->buckets); + FREE(mgr->bucketSizes); + FREE(mgr); +} + + +struct pb_manager * +pb_slab_range_manager_create(struct pb_manager *provider, + size_t minBufSize, + size_t maxBufSize, + size_t slabSize, + const struct pb_desc *desc) +{ + struct pb_slab_range_manager *mgr; + size_t bufSize; + unsigned i; + + if(!provider) + return NULL; + + mgr = CALLOC_STRUCT(pb_slab_range_manager); + if (!mgr) + goto out_err0; + + mgr->base.destroy = pb_slab_range_manager_destroy; + mgr->base.create_buffer = pb_slab_range_manager_create_buffer; + mgr->base.flush = pb_slab_range_manager_flush; + + mgr->provider = provider; + mgr->minBufSize = minBufSize; + mgr->maxBufSize = maxBufSize; + + mgr->numBuckets = 1; + bufSize = minBufSize; + while(bufSize < maxBufSize) { + bufSize *= 2; + ++mgr->numBuckets; + } + + mgr->buckets = CALLOC(mgr->numBuckets, sizeof(*mgr->buckets)); + if (!mgr->buckets) + goto out_err1; + + bufSize = minBufSize; + for (i = 0; i < mgr->numBuckets; ++i) { + mgr->buckets[i] = pb_slab_manager_create(provider, bufSize, slabSize, desc); + if(!mgr->buckets[i]) + goto out_err2; + bufSize *= 2; + } + + return &mgr->base; + +out_err2: + for (i = 0; i < mgr->numBuckets; ++i) + if(mgr->buckets[i]) + mgr->buckets[i]->destroy(mgr->buckets[i]); + FREE(mgr->buckets); +out_err1: + FREE(mgr); +out_err0: + return NULL; +} diff --git a/src/gallium/auxiliary/pipebuffer/pb_validate.c b/src/gallium/auxiliary/pipebuffer/pb_validate.c new file mode 100644 index 0000000000..94532bb4ce --- /dev/null +++ b/src/gallium/auxiliary/pipebuffer/pb_validate.c @@ -0,0 +1,193 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Buffer validation. + * + * @author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + + +#include "pipe/p_compiler.h" +#include "pipe/p_error.h" +#include "util/u_memory.h" +#include "pipe/p_debug.h" + +#include "pb_buffer.h" +#include "pb_buffer_fenced.h" +#include "pb_validate.h" + + +#define PB_VALIDATE_INITIAL_SIZE 1 /* 512 */ + + +struct pb_validate_entry +{ + struct pb_buffer *buf; + unsigned flags; +}; + + +struct pb_validate +{ + struct pb_validate_entry *entries; + unsigned used; + unsigned size; +}; + + +enum pipe_error +pb_validate_add_buffer(struct pb_validate *vl, + struct pb_buffer *buf, + unsigned flags) +{ + assert(buf); + if(!buf) + return PIPE_ERROR; + + assert(flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE); + assert(!(flags & ~PIPE_BUFFER_USAGE_GPU_READ_WRITE)); + flags &= PIPE_BUFFER_USAGE_GPU_READ_WRITE; + + /* We only need to store one reference for each buffer, so avoid storing + * consecutive references for the same buffer. It might not be the most + * common pattern, but it is easy to implement. 
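+ * Non-adjacent duplicates are still stored twice; a full scan of the list would avoid that at a higher + * per-insertion cost.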
+ */ + if(vl->used && vl->entries[vl->used - 1].buf == buf) { + vl->entries[vl->used - 1].flags |= flags; + return PIPE_OK; + } + + /* Grow the table */ + if(vl->used == vl->size) { + unsigned new_size; + struct pb_validate_entry *new_entries; + + new_size = vl->size * 2; + if(!new_size) + return PIPE_ERROR_OUT_OF_MEMORY; + + new_entries = (struct pb_validate_entry *)REALLOC(vl->entries, + vl->size*sizeof(struct pb_validate_entry), + new_size*sizeof(struct pb_validate_entry)); + if(!new_entries) + return PIPE_ERROR_OUT_OF_MEMORY; + + memset(new_entries + vl->size, 0, (new_size - vl->size)*sizeof(struct pb_validate_entry)); + + vl->size = new_size; + vl->entries = new_entries; + } + + assert(!vl->entries[vl->used].buf); + pb_reference(&vl->entries[vl->used].buf, buf); + vl->entries[vl->used].flags = flags; + ++vl->used; + + return PIPE_OK; +} + + +enum pipe_error +pb_validate_foreach(struct pb_validate *vl, + enum pipe_error (*callback)(struct pb_buffer *buf, void *data), + void *data) +{ + unsigned i; + for(i = 0; i < vl->used; ++i) { + enum pipe_error ret; + ret = callback(vl->entries[i].buf, data); + if(ret != PIPE_OK) + return ret; + } + return PIPE_OK; +} + + +enum pipe_error +pb_validate_validate(struct pb_validate *vl) +{ + unsigned i; + + for(i = 0; i < vl->used; ++i) { + enum pipe_error ret; + ret = pb_validate(vl->entries[i].buf, vl, vl->entries[i].flags); + if(ret != PIPE_OK) { + while(i--) + pb_validate(vl->entries[i].buf, NULL, 0); + return ret; + } + } + + return PIPE_OK; +} + + +void +pb_validate_fence(struct pb_validate *vl, + struct pipe_fence_handle *fence) +{ + unsigned i; + for(i = 0; i < vl->used; ++i) { + pb_fence(vl->entries[i].buf, fence); + pb_reference(&vl->entries[i].buf, NULL); + } + vl->used = 0; +} + + +void +pb_validate_destroy(struct pb_validate *vl) +{ + unsigned i; + for(i = 0; i < vl->used; ++i) + pb_reference(&vl->entries[i].buf, NULL); + FREE(vl->entries); + FREE(vl); +} + + +struct pb_validate * +pb_validate_create() +{ + struct pb_validate *vl; + + vl = CALLOC_STRUCT(pb_validate); + if(!vl) + return NULL; + + vl->size = PB_VALIDATE_INITIAL_SIZE; + vl->entries = (struct pb_validate_entry *)CALLOC(vl->size, sizeof(struct pb_validate_entry)); + if(!vl->entries) { + FREE(vl); + return NULL; + } + + return vl; +} + diff --git a/src/gallium/auxiliary/pipebuffer/pb_validate.h b/src/gallium/auxiliary/pipebuffer/pb_validate.h new file mode 100644 index 0000000000..dfb84df1ce --- /dev/null +++ b/src/gallium/auxiliary/pipebuffer/pb_validate.h @@ -0,0 +1,97 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Buffer validation. + * + * @author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + +#ifndef PB_VALIDATE_H_ +#define PB_VALIDATE_H_ + + +#include "pipe/p_compiler.h" +#include "pipe/p_error.h" + +#ifdef __cplusplus +extern "C" { +#endif + + +struct pb_buffer; +struct pipe_fence_handle; + + +/** + * Buffer validation list. + * + * It holds a list of buffers to be validated and fenced when flushing. + */ +struct pb_validate; + + +enum pipe_error +pb_validate_add_buffer(struct pb_validate *vl, + struct pb_buffer *buf, + unsigned flags); + +enum pipe_error +pb_validate_foreach(struct pb_validate *vl, + enum pipe_error (*callback)(struct pb_buffer *buf, void *data), + void *data); + +/** + * Validate all buffers for hardware access. + * + * Should be called right before issuing commands to the hardware. + */ +enum pipe_error +pb_validate_validate(struct pb_validate *vl); + +/** + * Fence all buffers and clear the list. + * + * Should be called right after issuing commands to the hardware. + */ +void +pb_validate_fence(struct pb_validate *vl, + struct pipe_fence_handle *fence); + +struct pb_validate * +pb_validate_create(void); + +void +pb_validate_destroy(struct pb_validate *vl); + + +#ifdef __cplusplus +} +#endif + +#endif /*PB_VALIDATE_H_*/ diff --git a/src/gallium/auxiliary/pipebuffer/pb_winsys.c b/src/gallium/auxiliary/pipebuffer/pb_winsys.c new file mode 100644 index 0000000000..45a883e532 --- /dev/null +++ b/src/gallium/auxiliary/pipebuffer/pb_winsys.c @@ -0,0 +1,190 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/** + * \file + * Implementation of client buffer (also designated as "user buffers"), which + * are just state-tracker owned data masqueraded as buffers. + * + * \author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + + +#include "pipe/p_winsys.h" +#include "util/u_memory.h" + +#include "pb_buffer.h" + + +/** + * User buffers are special buffers that initially reference memory + * held by the user but which may if necessary copy that memory into + * device memory behind the scenes, for submission to hardware. + * + * These are particularly useful when the referenced data is never + * submitted to hardware at all, in the particular case of software + * vertex processing. + */ +struct pb_user_buffer +{ + struct pb_buffer base; + void *data; +}; + + +extern const struct pb_vtbl pb_user_buffer_vtbl; + + +static INLINE struct pb_user_buffer * +pb_user_buffer(struct pb_buffer *buf) +{ + assert(buf); + assert(buf->vtbl == &pb_user_buffer_vtbl); + return (struct pb_user_buffer *)buf; +} + + +static void +pb_user_buffer_destroy(struct pb_buffer *buf) +{ + assert(buf); + FREE(buf); +} + + +static void * +pb_user_buffer_map(struct pb_buffer *buf, + unsigned flags) +{ + return pb_user_buffer(buf)->data; +} + + +static void +pb_user_buffer_unmap(struct pb_buffer *buf) +{ + /* No-op */ +} + + +static enum pipe_error +pb_user_buffer_validate(struct pb_buffer *buf, + struct pb_validate *vl, + unsigned flags) +{ + assert(0); + return PIPE_ERROR; +} + + +static void +pb_user_buffer_fence(struct pb_buffer *buf, + struct pipe_fence_handle *fence) +{ + assert(0); +} + + +static void +pb_user_buffer_get_base_buffer(struct pb_buffer *buf, + struct pb_buffer **base_buf, + unsigned *offset) +{ + *base_buf = buf; + *offset = 0; +} + + +const struct pb_vtbl +pb_user_buffer_vtbl = { + pb_user_buffer_destroy, + pb_user_buffer_map, + pb_user_buffer_unmap, + pb_user_buffer_validate, + pb_user_buffer_fence, + pb_user_buffer_get_base_buffer +}; + + +static struct pipe_buffer * +pb_winsys_user_buffer_create(struct pipe_winsys *winsys, + void *data, + unsigned bytes) +{ + struct pb_user_buffer *buf = CALLOC_STRUCT(pb_user_buffer); + + if(!buf) + return NULL; + + buf->base.base.refcount = 1; + buf->base.base.size = bytes; + buf->base.base.alignment = 0; + buf->base.base.usage = 0; + + buf->base.vtbl = &pb_user_buffer_vtbl; + buf->data = data; + + return &buf->base.base; +} + + +static void * +pb_winsys_buffer_map(struct pipe_winsys *winsys, + struct pipe_buffer *buf, + unsigned flags) +{ + (void)winsys; + return pb_map(pb_buffer(buf), flags); +} + + +static void +pb_winsys_buffer_unmap(struct pipe_winsys *winsys, + struct pipe_buffer *buf) +{ + (void)winsys; + pb_unmap(pb_buffer(buf)); +} + + +static void +pb_winsys_buffer_destroy(struct pipe_winsys *winsys, + struct pipe_buffer *buf) +{ + (void)winsys; + pb_destroy(pb_buffer(buf)); +} + + +void +pb_init_winsys(struct pipe_winsys *winsys) +{ + winsys->user_buffer_create = pb_winsys_user_buffer_create; + winsys->buffer_map = pb_winsys_buffer_map; + winsys->buffer_unmap = pb_winsys_buffer_unmap; + winsys->buffer_destroy = pb_winsys_buffer_destroy; +} diff --git a/src/gallium/auxiliary/rtasm/Makefile b/src/gallium/auxiliary/rtasm/Makefile new file mode 100644 index 0000000000..252dc5274a --- /dev/null +++ b/src/gallium/auxiliary/rtasm/Makefile @@ -0,0 +1,16 @@ +TOP = ../../../.. 
+include $(TOP)/configs/current + +LIBNAME = rtasm + +C_SOURCES = \ + rtasm_cpu.c \ + rtasm_execmem.c \ + rtasm_x86sse.c \ + rtasm_ppc.c \ + rtasm_ppc_spe.c + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/auxiliary/rtasm/SConscript b/src/gallium/auxiliary/rtasm/SConscript new file mode 100644 index 0000000000..eb48368acc --- /dev/null +++ b/src/gallium/auxiliary/rtasm/SConscript @@ -0,0 +1,13 @@ +Import('*') + +rtasm = env.ConvenienceLibrary( + target = 'rtasm', + source = [ + 'rtasm_cpu.c', + 'rtasm_execmem.c', + 'rtasm_x86sse.c', + 'rtasm_ppc.c', + 'rtasm_ppc_spe.c', + ]) + +auxiliaries.insert(0, rtasm) diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c new file mode 100644 index 0000000000..5499018b21 --- /dev/null +++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c @@ -0,0 +1,65 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "pipe/p_debug.h" +#include "rtasm_cpu.h" + + +static boolean rtasm_sse_enabled(void) +{ + static boolean firsttime = 1; + static boolean enabled; + + /* This gets called quite often at the moment: + */ + if (firsttime) { + enabled = !debug_get_bool_option("GALLIUM_NOSSE", FALSE); + firsttime = FALSE; + } + return enabled; +} + +int rtasm_cpu_has_sse(void) +{ + /* FIXME: actually detect this at run-time */ +#if defined(PIPE_ARCH_X86) + return rtasm_sse_enabled(); +#else + return 0; +#endif +} + +int rtasm_cpu_has_sse2(void) +{ + /* FIXME: actually detect this at run-time */ +#if defined(PIPE_ARCH_X86) + return rtasm_sse_enabled(); +#else + return 0; +#endif +} diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.h b/src/gallium/auxiliary/rtasm/rtasm_cpu.h new file mode 100644 index 0000000000..ebc71634fd --- /dev/null +++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.h @@ -0,0 +1,42 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Runtime detection of CPU capabilities. + */ + +#ifndef _RTASM_CPU_H_ +#define _RTASM_CPU_H_ + + +int rtasm_cpu_has_sse(void); + +int rtasm_cpu_has_sse2(void); + + +#endif /* _RTASM_CPU_H_ */ diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c new file mode 100644 index 0000000000..be7433baf8 --- /dev/null +++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c @@ -0,0 +1,137 @@ +/************************************************************************** + * + * Copyright (C) 1999-2005 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * \file exemem.c + * Functions for allocating executable memory. + * + * \author Keith Whitwell + */ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "pipe/p_thread.h" +#include "util/u_memory.h" + +#include "rtasm_execmem.h" + + +#if defined(PIPE_OS_LINUX) + + +/* + * Allocate a large block of memory which can hold code then dole it out + * in pieces by means of the generic memory manager code. 
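+ * The heap is a single anonymous mmap() region created with PROT_EXEC | PROT_READ | PROT_WRITE; + * u_mmAllocMem() then hands out 32-byte aligned blocks from it under the exec_mutex lock.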
+ */ + +#include <unistd.h> +#include <sys/mman.h> +#include "pipe/p_thread.h" +#include "util/u_mm.h" + +#define EXEC_HEAP_SIZE (10*1024*1024) + +pipe_static_mutex(exec_mutex); + +static struct mem_block *exec_heap = NULL; +static unsigned char *exec_mem = NULL; + + +static void +init_heap(void) +{ + if (!exec_heap) + exec_heap = u_mmInit( 0, EXEC_HEAP_SIZE ); + + if (!exec_mem) + exec_mem = (unsigned char *) mmap(0, EXEC_HEAP_SIZE, + PROT_EXEC | PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +} + + +void * +rtasm_exec_malloc(size_t size) +{ + struct mem_block *block = NULL; + void *addr = NULL; + + pipe_mutex_lock(exec_mutex); + + init_heap(); + + if (exec_heap) { + size = (size + 31) & ~31; /* next multiple of 32 bytes */ + block = u_mmAllocMem( exec_heap, size, 5, 0 ); /* 5 -> 32-byte alignment */ + } + + if (block) + addr = exec_mem + block->ofs; + else + debug_printf("rtasm_exec_malloc failed\n"); + + pipe_mutex_unlock(exec_mutex); + + return addr; +} + + +void +rtasm_exec_free(void *addr) +{ + pipe_mutex_lock(exec_mutex); + + if (exec_heap) { + struct mem_block *block = u_mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem); + + if (block) + u_mmFreeMem(block); + } + + pipe_mutex_unlock(exec_mutex); +} + + +#else /* PIPE_OS_LINUX */ + +/* + * Just use regular memory. + */ + +void * +rtasm_exec_malloc(size_t size) +{ + return MALLOC( size ); +} + + +void +rtasm_exec_free(void *addr) +{ + FREE(addr); +} + + +#endif /* PIPE_OS_LINUX */ diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.h b/src/gallium/auxiliary/rtasm/rtasm_execmem.h new file mode 100644 index 0000000000..155c6d34e0 --- /dev/null +++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.h @@ -0,0 +1,45 @@ +/************************************************************************** + * + * Copyright (C) 1999-2005 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file exemem.c + * Functions for allocating executable memory. 
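+ * A minimal usage sketch (the code buffer and code_size below are hypothetical, supplied by the caller): + *    void *buf = rtasm_exec_malloc(code_size); + *    memcpy(buf, code, code_size); + *    ((void (*)(void)) buf)();   /* jump into the generated code */ + *    rtasm_exec_free(buf);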
+ * + * \author Keith Whitwell + */ + +#ifndef _RTASM_EXECMEM_H_ +#define _RTASM_EXECMEM_H_ + +#include "pipe/p_compiler.h" + + +extern void * +rtasm_exec_malloc( size_t size ); + + +extern void +rtasm_exec_free( void *addr ); + + +#endif diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c new file mode 100644 index 0000000000..b65bfa7bbd --- /dev/null +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c @@ -0,0 +1,959 @@ +/************************************************************************** + * + * Copyright (C) 2008 Tungsten Graphics, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * PPC code generation. 
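+ * Emits PowerPC/AltiVec instructions into an executable buffer obtained from rtasm_execmem, growing the + * buffer as needed.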
+ * For reference, see http://www.power.org/resources/reading/PowerISA_V2.05.pdf + * ABI info: http://www.cs.utsa.edu/~whaley/teach/cs6463FHPO/LEC/lec12_ho.pdf + * + * Other PPC refs: + * http://www-01.ibm.com/chips/techlib/techlib.nsf/techdocs/852569B20050FF778525699600719DF2 + * http://www.ibm.com/developerworks/eserver/library/es-archguide-v2.html + * http://www.freescale.com/files/product/doc/MPCFPE32B.pdf + * + * \author Brian Paul + */ + + +#include <stdio.h> +#include "util/u_memory.h" +#include "pipe/p_debug.h" +#include "rtasm_execmem.h" +#include "rtasm_ppc.h" + + +void +ppc_init_func(struct ppc_function *p) +{ + uint i; + + p->num_inst = 0; + p->max_inst = 100; /* first guess at buffer size */ + p->store = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE); + p->reg_used = 0x0; + p->fp_used = 0x0; + p->vec_used = 0x0; + + /* only allow using gp registers 3..12 for now */ + for (i = 0; i < 3; i++) + ppc_reserve_register(p, i); + for (i = 12; i < PPC_NUM_REGS; i++) + ppc_reserve_register(p, i); +} + + +void +ppc_release_func(struct ppc_function *p) +{ + assert(p->num_inst <= p->max_inst); + if (p->store != NULL) { + rtasm_exec_free(p->store); + } + p->store = NULL; +} + + +uint +ppc_num_instructions(const struct ppc_function *p) +{ + return p->num_inst; +} + + +void (*ppc_get_func(struct ppc_function *p))(void) +{ +#if 0 + DUMP_END(); + if (DISASSEM && p->store) + debug_printf("disassemble %p %p\n", p->store, p->csr); + + if (p->store == p->error_overflow) + return (void (*)(void)) NULL; + else +#endif + return (void (*)(void)) p->store; +} + + +void +ppc_dump_func(const struct ppc_function *p) +{ + uint i; + for (i = 0; i < p->num_inst; i++) { + debug_printf("%3u: 0x%08x\n", i, p->store[i]); + } +} + + +/** + * Mark a register as being unavailable. + */ +int +ppc_reserve_register(struct ppc_function *p, int reg) +{ + assert(reg < PPC_NUM_REGS); + p->reg_used |= (1 << reg); + return reg; +} + + +/** + * Allocate a general purpose register. + * \return register index or -1 if none left. + */ +int +ppc_allocate_register(struct ppc_function *p) +{ + unsigned i; + for (i = 0; i < PPC_NUM_REGS; i++) { + const uint64_t mask = 1 << i; + if ((p->reg_used & mask) == 0) { + p->reg_used |= mask; + return i; + } + } + return -1; +} + + +/** + * Mark the given general purpose register as "unallocated". + */ +void +ppc_release_register(struct ppc_function *p, int reg) +{ + assert(reg < PPC_NUM_REGS); + assert(p->reg_used & (1 << reg)); + p->reg_used &= ~(1 << reg); +} + + +/** + * Allocate a floating point register. + * \return register index or -1 if none left. + */ +int +ppc_allocate_fp_register(struct ppc_function *p) +{ + unsigned i; + for (i = 0; i < PPC_NUM_FP_REGS; i++) { + const uint64_t mask = 1 << i; + if ((p->fp_used & mask) == 0) { + p->fp_used |= mask; + return i; + } + } + return -1; +} + + +/** + * Mark the given floating point register as "unallocated". + */ +void +ppc_release_fp_register(struct ppc_function *p, int reg) +{ + assert(reg < PPC_NUM_FP_REGS); + assert(p->fp_used & (1 << reg)); + p->fp_used &= ~(1 << reg); +} + + +/** + * Allocate a vector register. + * \return register index or -1 if none left. + */ +int +ppc_allocate_vec_register(struct ppc_function *p) +{ + unsigned i; + for (i = 0; i < PPC_NUM_VEC_REGS; i++) { + const uint64_t mask = 1 << i; + if ((p->vec_used & mask) == 0) { + p->vec_used |= mask; + return i; + } + } + return -1; +} + + +/** + * Mark the given vector register as "unallocated". 
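+ * Asserts that the register is currently marked as allocated.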
+ */ +void +ppc_release_vec_register(struct ppc_function *p, int reg) +{ + assert(reg < PPC_NUM_VEC_REGS); + assert(p->vec_used & (1 << reg)); + p->vec_used &= ~(1 << reg); +} + + +/** + * Append instruction to instruction buffer. Grow buffer if out of room. + */ +static void +emit_instruction(struct ppc_function *p, uint32_t inst_bits) +{ + if (!p->store) + return; /* out of memory, drop the instruction */ + + if (p->num_inst == p->max_inst) { + /* allocate larger buffer */ + uint32_t *newbuf; + p->max_inst *= 2; /* 2x larger */ + newbuf = rtasm_exec_malloc(p->max_inst * PPC_INST_SIZE); + if (newbuf) { + memcpy(newbuf, p->store, p->num_inst * PPC_INST_SIZE); + } + rtasm_exec_free(p->store); + p->store = newbuf; + if (!p->store) { + /* out of memory */ + p->num_inst = 0; + return; + } + } + + p->store[p->num_inst++] = inst_bits; +} + + +union vx_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned vD:5; + unsigned vA:5; + unsigned vB:5; + unsigned op2:11; + } inst; +}; + +static INLINE void +emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB) +{ + union vx_inst inst; + inst.inst.op = 4; + inst.inst.vD = vD; + inst.inst.vA = vA; + inst.inst.vB = vB; + inst.inst.op2 = op2; + emit_instruction(p, inst.bits); +}; + + +union vxr_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned vD:5; + unsigned vA:5; + unsigned vB:5; + unsigned rC:1; + unsigned op2:10; + } inst; +}; + +static INLINE void +emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB) +{ + union vxr_inst inst; + inst.inst.op = 4; + inst.inst.vD = vD; + inst.inst.vA = vA; + inst.inst.vB = vB; + inst.inst.rC = 0; + inst.inst.op2 = op2; + emit_instruction(p, inst.bits); +}; + + +union va_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned vD:5; + unsigned vA:5; + unsigned vB:5; + unsigned vC:5; + unsigned op2:6; + } inst; +}; + +static INLINE void +emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC) +{ + union va_inst inst; + inst.inst.op = 4; + inst.inst.vD = vD; + inst.inst.vA = vA; + inst.inst.vB = vB; + inst.inst.vC = vC; + inst.inst.op2 = op2; + emit_instruction(p, inst.bits); +}; + + +union i_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned li:24; + unsigned aa:1; + unsigned lk:1; + } inst; +}; + +static INLINE void +emit_i(struct ppc_function *p, uint op, uint li, uint aa, uint lk) +{ + union i_inst inst; + inst.inst.op = op; + inst.inst.li = li; + inst.inst.aa = aa; + inst.inst.lk = lk; + emit_instruction(p, inst.bits); +} + + +union xl_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned bo:5; + unsigned bi:5; + unsigned unused:3; + unsigned bh:2; + unsigned op2:10; + unsigned lk:1; + } inst; +}; + +static INLINE void +emit_xl(struct ppc_function *p, uint op, uint bo, uint bi, uint bh, + uint op2, uint lk) +{ + union xl_inst inst; + inst.inst.op = op; + inst.inst.bo = bo; + inst.inst.bi = bi; + inst.inst.unused = 0x0; + inst.inst.bh = bh; + inst.inst.op2 = op2; + inst.inst.lk = lk; + emit_instruction(p, inst.bits); +} + +static INLINE void +dump_xl(const char *name, uint inst) +{ + union xl_inst i; + + i.bits = inst; + debug_printf("%s = 0x%08x\n", name, inst); + debug_printf(" op: %d 0x%x\n", i.inst.op, i.inst.op); + debug_printf(" bo: %d 0x%x\n", i.inst.bo, i.inst.bo); + debug_printf(" bi: %d 0x%x\n", i.inst.bi, i.inst.bi); + debug_printf(" unused: %d 0x%x\n", i.inst.unused, i.inst.unused); + debug_printf(" bh: %d 0x%x\n", i.inst.bh, i.inst.bh); + debug_printf(" op2: %d 0x%x\n", i.inst.op2, i.inst.op2); + 
debug_printf(" lk: %d 0x%x\n", i.inst.lk, i.inst.lk); +} + + +union x_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned vrs:5; + unsigned ra:5; + unsigned rb:5; + unsigned op2:10; + unsigned unused:1; + } inst; +}; + +static INLINE void +emit_x(struct ppc_function *p, uint op, uint vrs, uint ra, uint rb, uint op2) +{ + union x_inst inst; + inst.inst.op = op; + inst.inst.vrs = vrs; + inst.inst.ra = ra; + inst.inst.rb = rb; + inst.inst.op2 = op2; + inst.inst.unused = 0x0; + emit_instruction(p, inst.bits); +} + + +union d_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned rt:5; + unsigned ra:5; + unsigned si:16; + } inst; +}; + +static INLINE void +emit_d(struct ppc_function *p, uint op, uint rt, uint ra, int si) +{ + union d_inst inst; + assert(si >= -32768); + assert(si <= 32767); + inst.inst.op = op; + inst.inst.rt = rt; + inst.inst.ra = ra; + inst.inst.si = (unsigned) (si & 0xffff); + emit_instruction(p, inst.bits); +}; + + +union a_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned frt:5; + unsigned fra:5; + unsigned frb:5; + unsigned unused:5; + unsigned op2:5; + unsigned rc:1; + } inst; +}; + +static INLINE void +emit_a(struct ppc_function *p, uint op, uint frt, uint fra, uint frb, uint op2, + uint rc) +{ + union a_inst inst; + inst.inst.op = op; + inst.inst.frt = frt; + inst.inst.fra = fra; + inst.inst.frb = frb; + inst.inst.unused = 0x0; + inst.inst.op2 = op2; + inst.inst.rc = rc; + emit_instruction(p, inst.bits); +}; + + +union xo_inst { + uint32_t bits; + struct { + unsigned op:6; + unsigned rt:5; + unsigned ra:5; + unsigned rb:5; + unsigned oe:1; + unsigned op2:9; + unsigned rc:1; + } inst; +}; + +static INLINE void +emit_xo(struct ppc_function *p, uint op, uint rt, uint ra, uint rb, uint oe, + uint op2, uint rc) +{ + union xo_inst inst; + inst.inst.op = op; + inst.inst.rt = rt; + inst.inst.ra = ra; + inst.inst.rb = rb; + inst.inst.oe = oe; + inst.inst.op2 = op2; + inst.inst.rc = rc; + emit_instruction(p, inst.bits); +} + + + + + +/** + ** float vector arithmetic + **/ + +/** vector float add */ +void +ppc_vaddfp(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 10, vD, vA, vB); +} + +/** vector float substract */ +void +ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 74, vD, vA, vB); +} + +/** vector float min */ +void +ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1098, vD, vA, vB); +} + +/** vector float max */ +void +ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1034, vD, vA, vB); +} + +/** vector float mult add: vD = vA * vB + vC */ +void +ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC) +{ + emit_va(p, 46, vD, vA, vC, vB); /* note arg order */ +} + +/** vector float negative mult subtract: vD = vA - vB * vC */ +void +ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC) +{ + emit_va(p, 47, vD, vB, vA, vC); /* note arg order */ +} + +/** vector float compare greater than */ +void +ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vxr(p, 710, vD, vA, vB); +} + +/** vector float compare greater than or equal to */ +void +ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vxr(p, 454, vD, vA, vB); +} + +/** vector float compare equal */ +void +ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vxr(p, 198, vD, vA, vB); +} + +/** vector float 2^x */ +void +ppc_vexptefp(struct ppc_function *p, uint vD, uint vB) +{ + 
emit_vx(p, 394, vD, 0, vB); +} + +/** vector float log2(x) */ +void +ppc_vlogefp(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 458, vD, 0, vB); +} + +/** vector float reciprocol */ +void +ppc_vrefp(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 266, vD, 0, vB); +} + +/** vector float reciprocol sqrt estimate */ +void +ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 330, vD, 0, vB); +} + +/** vector float round to negative infinity */ +void +ppc_vrfim(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 714, vD, 0, vB); +} + +/** vector float round to positive infinity */ +void +ppc_vrfip(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 650, vD, 0, vB); +} + +/** vector float round to nearest int */ +void +ppc_vrfin(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 522, vD, 0, vB); +} + +/** vector float round to int toward zero */ +void +ppc_vrfiz(struct ppc_function *p, uint vD, uint vB) +{ + emit_vx(p, 586, vD, 0, vB); +} + +/** vector store: store vR at mem[vA+vB] */ +void +ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB) +{ + emit_x(p, 31, vR, vA, vB, 231); +} + +/** vector load: vR = mem[vA+vB] */ +void +ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB) +{ + emit_x(p, 31, vR, vA, vB, 103); +} + +/** load vector element word: vR = mem_word[ra+rb] */ +void +ppc_lvewx(struct ppc_function *p, uint vr, uint ra, uint rb) +{ + emit_x(p, 31, vr, ra, rb, 71); +} + + + + +/** + ** vector bitwise operations + **/ + +/** vector and */ +void +ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1028, vD, vA, vB); +} + +/** vector and complement */ +void +ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1092, vD, vA, vB); +} + +/** vector or */ +void +ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1156, vD, vA, vB); +} + +/** vector nor */ +void +ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1284, vD, vA, vB); +} + +/** vector xor */ +void +ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 1220, vD, vA, vB); +} + +/** Pseudo-instruction: vector move */ +void +ppc_vmove(struct ppc_function *p, uint vD, uint vA) +{ + ppc_vor(p, vD, vA, vA); +} + +/** Set vector register to {0,0,0,0} */ +void +ppc_vzero(struct ppc_function *p, uint vr) +{ + ppc_vxor(p, vr, vr, vr); +} + + + + +/** + ** Vector shuffle / select / splat / etc + **/ + +/** vector permute */ +void +ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC) +{ + emit_va(p, 43, vD, vA, vB, vC); +} + +/** vector select */ +void +ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC) +{ + emit_va(p, 42, vD, vA, vB, vC); +} + +/** vector splat byte */ +void +ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm) +{ + emit_vx(p, 42, vD, imm, vB); +} + +/** vector splat half word */ +void +ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm) +{ + emit_vx(p, 588, vD, imm, vB); +} + +/** vector splat word */ +void +ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm) +{ + emit_vx(p, 652, vD, imm, vB); +} + +/** vector splat signed immediate word */ +void +ppc_vspltisw(struct ppc_function *p, uint vD, int imm) +{ + assert(imm >= -16); + assert(imm < 15); + emit_vx(p, 908, vD, imm, 0); +} + +/** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */ +void +ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB) +{ + emit_vx(p, 388, vD, vA, vB); +} + + + + +/** 
+ ** integer arithmetic + **/ + +/** rt = ra + imm */ +void +ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm) +{ + emit_d(p, 14, rt, ra, imm); +} + +/** rt = ra + (imm << 16) */ +void +ppc_addis(struct ppc_function *p, uint rt, uint ra, int imm) +{ + emit_d(p, 15, rt, ra, imm); +} + +/** rt = ra + rb */ +void +ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb) +{ + emit_xo(p, 31, rt, ra, rb, 0, 266, 0); +} + +/** rt = ra AND ra */ +void +ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb) +{ + emit_x(p, 31, ra, rt, rb, 28); /* note argument order */ +} + +/** rt = ra AND imm */ +void +ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm) +{ + emit_d(p, 28, ra, rt, imm); /* note argument order */ +} + +/** rt = ra OR ra */ +void +ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb) +{ + emit_x(p, 31, ra, rt, rb, 444); /* note argument order */ +} + +/** rt = ra OR imm */ +void +ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm) +{ + emit_d(p, 24, ra, rt, imm); /* note argument order */ +} + +/** rt = ra XOR ra */ +void +ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb) +{ + emit_x(p, 31, ra, rt, rb, 316); /* note argument order */ +} + +/** rt = ra XOR imm */ +void +ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm) +{ + emit_d(p, 26, ra, rt, imm); /* note argument order */ +} + +/** pseudo instruction: move: rt = ra */ +void +ppc_mr(struct ppc_function *p, uint rt, uint ra) +{ + ppc_or(p, rt, ra, ra); +} + +/** pseudo instruction: load immediate: rt = imm */ +void +ppc_li(struct ppc_function *p, uint rt, int imm) +{ + ppc_addi(p, rt, 0, imm); +} + +/** rt = imm << 16 */ +void +ppc_lis(struct ppc_function *p, uint rt, int imm) +{ + ppc_addis(p, rt, 0, imm); +} + +/** rt = imm */ +void +ppc_load_int(struct ppc_function *p, uint rt, int imm) +{ + ppc_lis(p, rt, (imm >> 16)); /* rt = imm >> 16 */ + ppc_ori(p, rt, rt, (imm & 0xffff)); /* rt = rt | (imm & 0xffff) */ +} + + + + +/** + ** integer load/store + **/ + +/** store rs at memory[(ra)+d], + * then update ra = (ra)+d + */ +void +ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d) +{ + emit_d(p, 37, rs, ra, d); +} + +/** store rs at memory[(ra)+d] */ +void +ppc_stw(struct ppc_function *p, uint rs, uint ra, int d) +{ + emit_d(p, 36, rs, ra, d); +} + +/** Load rt = mem[(ra)+d]; then zero set high 32 bits to zero. */ +void +ppc_lwz(struct ppc_function *p, uint rt, uint ra, int d) +{ + emit_d(p, 32, rt, ra, d); +} + + + +/** + ** Float (non-vector) arithmetic + **/ + +/** add: frt = fra + frb */ +void +ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb) +{ + emit_a(p, 63, frt, fra, frb, 21, 0); +} + +/** sub: frt = fra - frb */ +void +ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb) +{ + emit_a(p, 63, frt, fra, frb, 20, 0); +} + +/** convert to int: rt = (int) ra */ +void +ppc_fctiwz(struct ppc_function *p, uint rt, uint fra) +{ + emit_x(p, 63, rt, 0, fra, 15); +} + +/** store frs at mem[(ra)+offset] */ +void +ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset) +{ + emit_d(p, 52, frs, ra, offset); +} + +/** store frs at mem[(ra)+(rb)] */ +void +ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb) +{ + emit_x(p, 31, frs, ra, rb, 983); +} + +/** load frt = mem[(ra)+offset] */ +void +ppc_lfs(struct ppc_function *p, uint frt, uint ra, int offset) +{ + emit_d(p, 48, frt, ra, offset); +} + + + + + +/** + ** branch instructions + **/ + +/** BLR: Branch to link register (p. 
35) */ +void +ppc_blr(struct ppc_function *p) +{ + emit_i(p, 18, 0, 0, 1); +} + +/** Branch Conditional to Link Register (p. 36) */ +void +ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg) +{ + emit_xl(p, 19, condOp, condReg, branchHint, 16, 0); +} + +/** Pseudo instruction: return from subroutine */ +void +ppc_return(struct ppc_function *p) +{ + ppc_bclr(p, BRANCH_COND_ALWAYS, BRANCH_HINT_SUB_RETURN, 0); +} diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h new file mode 100644 index 0000000000..08212a2a25 --- /dev/null +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h @@ -0,0 +1,335 @@ +/************************************************************************** + * + * Copyright (C) 2008 Tungsten Graphics, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * PPC code generation. 
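+ * Declarations for the PowerPC/AltiVec code generator implemented in rtasm_ppc.c.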
+ * \author Brian Paul + */ + + +#ifndef RTASM_PPC_H +#define RTASM_PPC_H + + +#include "pipe/p_compiler.h" + + +#define PPC_INST_SIZE 4 /**< 4 bytes / instruction */ + +#define PPC_NUM_REGS 32 +#define PPC_NUM_FP_REGS 32 +#define PPC_NUM_VEC_REGS 32 + +/** Stack pointer register */ +#define PPC_REG_SP 1 + +/** Branch conditions */ +#define BRANCH_COND_ALWAYS 0x14 /* binary 1z1zz (z=ignored) */ + +/** Branch hints */ +#define BRANCH_HINT_SUB_RETURN 0x0 /* binary 00 */ + + +struct ppc_function +{ + uint32_t *store; /**< instruction buffer */ + uint num_inst; + uint max_inst; + uint32_t reg_used; /** used/free general-purpose registers bitmask */ + uint32_t fp_used; /** used/free floating point registers bitmask */ + uint32_t vec_used; /** used/free vector registers bitmask */ +}; + + + +extern void ppc_init_func(struct ppc_function *p); +extern void ppc_release_func(struct ppc_function *p); +extern uint ppc_num_instructions(const struct ppc_function *p); +extern void (*ppc_get_func( struct ppc_function *p ))( void ); +extern void ppc_dump_func(const struct ppc_function *p); + +extern int ppc_reserve_register(struct ppc_function *p, int reg); +extern int ppc_allocate_register(struct ppc_function *p); +extern void ppc_release_register(struct ppc_function *p, int reg); +extern int ppc_allocate_fp_register(struct ppc_function *p); +extern void ppc_release_fp_register(struct ppc_function *p, int reg); +extern int ppc_allocate_vec_register(struct ppc_function *p); +extern void ppc_release_vec_register(struct ppc_function *p, int reg); + + + +/** + ** float vector arithmetic + **/ + +/** vector float add */ +extern void +ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB); + +/** vector float substract */ +extern void +ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float min */ +extern void +ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float max */ +extern void +ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float mult add: vD = vA * vB + vC */ +extern void +ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC); + +/** vector float negative mult subtract: vD = vA - vB * vC */ +extern void +ppc_vnmsubfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC); + +/** vector float compare greater than */ +extern void +ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float compare greater than or equal to */ +extern void +ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float compare equal */ +extern void +ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector float 2^x */ +extern void +ppc_vexptefp(struct ppc_function *p, uint vD, uint vB); + +/** vector float log2(x) */ +extern void +ppc_vlogefp(struct ppc_function *p, uint vD, uint vB); + +/** vector float reciprocol */ +extern void +ppc_vrefp(struct ppc_function *p, uint vD, uint vB); + +/** vector float reciprocol sqrt estimate */ +extern void +ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB); + +/** vector float round to negative infinity */ +extern void +ppc_vrfim(struct ppc_function *p, uint vD, uint vB); + +/** vector float round to positive infinity */ +extern void +ppc_vrfip(struct ppc_function *p, uint vD, uint vB); + +/** vector float round to nearest int */ +extern void +ppc_vrfin(struct ppc_function *p, uint vD, uint vB); + +/** vector float round to int toward zero */ +extern void +ppc_vrfiz(struct ppc_function *p, uint 
vD, uint vB); + + +/** vector store: store vR at mem[vA+vB] */ +extern void +ppc_stvx(struct ppc_function *p, uint vR, uint vA, uint vB); + +/** vector load: vR = mem[vA+vB] */ +extern void +ppc_lvx(struct ppc_function *p, uint vR, uint vA, uint vB); + +/** load vector element word: vR = mem_word[vA+vB] */ +extern void +ppc_lvewx(struct ppc_function *p, uint vR, uint vA, uint vB); + + + +/** + ** vector bitwise operations + **/ + + +/** vector and */ +extern void +ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector and complement */ +extern void +ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector or */ +extern void +ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector nor */ +extern void +ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** vector xor */ +extern void +ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB); + +/** Pseudo-instruction: vector move */ +extern void +ppc_vmove(struct ppc_function *p, uint vD, uint vA); + +/** Set vector register to {0,0,0,0} */ +extern void +ppc_vzero(struct ppc_function *p, uint vr); + + + +/** + ** Vector shuffle / select / splat / etc + **/ + +/** vector permute */ +extern void +ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC); + +/** vector select */ +extern void +ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC); + +/** vector splat byte */ +extern void +ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm); + +/** vector splat half word */ +extern void +ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm); + +/** vector splat word */ +extern void +ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm); + +/** vector splat signed immediate word */ +extern void +ppc_vspltisw(struct ppc_function *p, uint vD, int imm); + +/** vector shift left word: vD[word] = vA[word] << (vB[word] & 0x1f) */ +extern void +ppc_vslw(struct ppc_function *p, uint vD, uint vA, uint vB); + + + +/** + ** scalar arithmetic + **/ + +extern void +ppc_add(struct ppc_function *p, uint rt, uint ra, uint rb); + +extern void +ppc_addi(struct ppc_function *p, uint rt, uint ra, int imm); + +extern void +ppc_addis(struct ppc_function *p, uint rt, uint ra, int imm); + +extern void +ppc_and(struct ppc_function *p, uint rt, uint ra, uint rb); + +extern void +ppc_andi(struct ppc_function *p, uint rt, uint ra, int imm); + +extern void +ppc_or(struct ppc_function *p, uint rt, uint ra, uint rb); + +extern void +ppc_ori(struct ppc_function *p, uint rt, uint ra, int imm); + +extern void +ppc_xor(struct ppc_function *p, uint rt, uint ra, uint rb); + +extern void +ppc_xori(struct ppc_function *p, uint rt, uint ra, int imm); + +extern void +ppc_mr(struct ppc_function *p, uint rt, uint ra); + +extern void +ppc_li(struct ppc_function *p, uint rt, int imm); + +extern void +ppc_lis(struct ppc_function *p, uint rt, int imm); + +extern void +ppc_load_int(struct ppc_function *p, uint rt, int imm); + + + +/** + ** scalar load/store + **/ + +extern void +ppc_stwu(struct ppc_function *p, uint rs, uint ra, int d); + +extern void +ppc_stw(struct ppc_function *p, uint rs, uint ra, int d); + +extern void +ppc_lwz(struct ppc_function *p, uint rs, uint ra, int d); + + + +/** + ** Float (non-vector) arithmetic + **/ + +extern void +ppc_fadd(struct ppc_function *p, uint frt, uint fra, uint frb); + +extern void +ppc_fsub(struct ppc_function *p, uint frt, uint fra, uint frb); + +extern void +ppc_fctiwz(struct ppc_function *p, uint rt, uint ra); + +extern 
void +ppc_stfs(struct ppc_function *p, uint frs, uint ra, int offset); + +extern void +ppc_stfiwx(struct ppc_function *p, uint frs, uint ra, uint rb); + +extern void +ppc_lfs(struct ppc_function *p, uint frt, uint ra, int offset); + + + +/** + ** branch instructions + **/ + +extern void +ppc_blr(struct ppc_function *p); + +void +ppc_bclr(struct ppc_function *p, uint condOp, uint branchHint, uint condReg); + +extern void +ppc_return(struct ppc_function *p); + + +#endif /* RTASM_PPC_H */ diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c new file mode 100644 index 0000000000..b9a75ae559 --- /dev/null +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c @@ -0,0 +1,1066 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file + * Real-time assembly generation interface for Cell B.E. SPEs. + * + * \author Ian Romanick <idr@us.ibm.com> + * \author Brian Paul + */ + + +#include <stdio.h> +#include "pipe/p_compiler.h" +#include "util/u_memory.h" +#include "rtasm_ppc_spe.h" + + +#ifdef GALLIUM_CELL +/** + * SPE instruction types + * + * There are 6 primary instruction encodings used on the Cell's SPEs. Each of + * the following unions encodes one type. + * + * \bug + * If, at some point, we start generating SPE code from a little-endian host + * these unions will not work. + */ +/*@{*/ +/** + * Encode one output register with two input registers + */ +union spe_inst_RR { + uint32_t bits; + struct { + unsigned op:11; + unsigned rB:7; + unsigned rA:7; + unsigned rT:7; + } inst; +}; + + +/** + * Encode one output register with three input registers + */ +union spe_inst_RRR { + uint32_t bits; + struct { + unsigned op:4; + unsigned rT:7; + unsigned rB:7; + unsigned rA:7; + unsigned rC:7; + } inst; +}; + + +/** + * Encode one output register with one input reg. and a 7-bit signed immed + */ +union spe_inst_RI7 { + uint32_t bits; + struct { + unsigned op:11; + unsigned i7:7; + unsigned rA:7; + unsigned rT:7; + } inst; +}; + + +/** + * Encode one output register with one input reg. and an 8-bit signed immed + */ +union spe_inst_RI8 { + uint32_t bits; + struct { + unsigned op:10; + unsigned i8:8; + unsigned rA:7; + unsigned rT:7; + } inst; +}; + + +/** + * Encode one output register with one input reg. 
and a 10-bit signed immed + */ +union spe_inst_RI10 { + uint32_t bits; + struct { + unsigned op:8; + unsigned i10:10; + unsigned rA:7; + unsigned rT:7; + } inst; +}; + + +/** + * Encode one output register with a 16-bit signed immediate + */ +union spe_inst_RI16 { + uint32_t bits; + struct { + unsigned op:9; + unsigned i16:16; + unsigned rT:7; + } inst; +}; + + +/** + * Encode one output register with a 18-bit signed immediate + */ +union spe_inst_RI18 { + uint32_t bits; + struct { + unsigned op:7; + unsigned i18:18; + unsigned rT:7; + } inst; +}; +/*@}*/ + + +static void +indent(const struct spe_function *p) +{ + int i; + for (i = 0; i < p->indent; i++) { + putchar(' '); + } +} + + +static const char * +rem_prefix(const char *longname) +{ + return longname + 4; +} + + +static const char * +reg_name(int reg) +{ + switch (reg) { + case SPE_REG_SP: + return "$sp"; + case SPE_REG_RA: + return "$lr"; + default: + { + /* cycle through four buffers to handle multiple calls per printf */ + static char buf[4][10]; + static int b = 0; + b = (b + 1) % 4; + sprintf(buf[b], "$%d", reg); + return buf[b]; + } + } +} + + +static void +emit_instruction(struct spe_function *p, uint32_t inst_bits) +{ + if (!p->store) + return; /* out of memory, drop the instruction */ + + if (p->num_inst == p->max_inst) { + /* allocate larger buffer */ + uint32_t *newbuf; + p->max_inst *= 2; /* 2x larger */ + newbuf = align_malloc(p->max_inst * SPE_INST_SIZE, 16); + if (newbuf) { + memcpy(newbuf, p->store, p->num_inst * SPE_INST_SIZE); + } + align_free(p->store); + p->store = newbuf; + if (!p->store) { + /* out of memory */ + p->num_inst = 0; + return; + } + } + + p->store[p->num_inst++] = inst_bits; +} + + + +static void emit_RR(struct spe_function *p, unsigned op, unsigned rT, + unsigned rA, unsigned rB, const char *name) +{ + union spe_inst_RR inst; + inst.inst.op = op; + inst.inst.rB = rB; + inst.inst.rA = rA; + inst.inst.rT = rT; + emit_instruction(p, inst.bits); + if (p->print) { + indent(p); + printf("%s\t%s, %s, %s\n", + rem_prefix(name), reg_name(rT), reg_name(rA), reg_name(rB)); + } +} + + +static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT, + unsigned rA, unsigned rB, unsigned rC, const char *name) +{ + union spe_inst_RRR inst; + inst.inst.op = op; + inst.inst.rT = rT; + inst.inst.rB = rB; + inst.inst.rA = rA; + inst.inst.rC = rC; + emit_instruction(p, inst.bits); + if (p->print) { + indent(p); + printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT), + reg_name(rA), reg_name(rB), reg_name(rC)); + } +} + + +static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT, + unsigned rA, int imm, const char *name) +{ + union spe_inst_RI7 inst; + inst.inst.op = op; + inst.inst.i7 = imm; + inst.inst.rA = rA; + inst.inst.rT = rT; + emit_instruction(p, inst.bits); + if (p->print) { + indent(p); + printf("%s\t%s, %s, 0x%x\n", + rem_prefix(name), reg_name(rT), reg_name(rA), imm); + } +} + + + +static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT, + unsigned rA, int imm, const char *name) +{ + union spe_inst_RI8 inst; + inst.inst.op = op; + inst.inst.i8 = imm; + inst.inst.rA = rA; + inst.inst.rT = rT; + emit_instruction(p, inst.bits); + if (p->print) { + indent(p); + printf("%s\t%s, %s, 0x%x\n", + rem_prefix(name), reg_name(rT), reg_name(rA), imm); + } +} + + + +static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT, + unsigned rA, int imm, const char *name) +{ + union spe_inst_RI10 inst; + inst.inst.op = op; + inst.inst.i10 = imm; + inst.inst.rA = rA; + 
inst.inst.rT = rT; + emit_instruction(p, inst.bits); + if (p->print) { + indent(p); + printf("%s\t%s, %s, 0x%x\n", + rem_prefix(name), reg_name(rT), reg_name(rA), imm); + } +} + + +/** As above, but do range checking on signed immediate value */ +static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT, + unsigned rA, int imm, const char *name) +{ + assert(imm <= 511); + assert(imm >= -512); + emit_RI10(p, op, rT, rA, imm, name); +} + + +static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT, + int imm, const char *name) +{ + union spe_inst_RI16 inst; + inst.inst.op = op; + inst.inst.i16 = imm; + inst.inst.rT = rT; + emit_instruction(p, inst.bits); + if (p->print) { + indent(p); + printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm); + } +} + + +static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT, + int imm, const char *name) +{ + union spe_inst_RI18 inst; + inst.inst.op = op; + inst.inst.i18 = imm; + inst.inst.rT = rT; + emit_instruction(p, inst.bits); + if (p->print) { + indent(p); + printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm); + } +} + + +#define EMIT(_name, _op) \ +void _name (struct spe_function *p) \ +{ \ + emit_RR(p, _op, 0, 0, 0, __FUNCTION__); \ +} + +#define EMIT_(_name, _op) \ +void _name (struct spe_function *p, unsigned rT) \ +{ \ + emit_RR(p, _op, rT, 0, 0, __FUNCTION__); \ +} + +#define EMIT_R(_name, _op) \ +void _name (struct spe_function *p, unsigned rT, unsigned rA) \ +{ \ + emit_RR(p, _op, rT, rA, 0, __FUNCTION__); \ +} + +#define EMIT_RR(_name, _op) \ +void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \ +{ \ + emit_RR(p, _op, rT, rA, rB, __FUNCTION__); \ +} + +#define EMIT_RRR(_name, _op) \ +void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \ +{ \ + emit_RRR(p, _op, rT, rA, rB, rC, __FUNCTION__); \ +} + +#define EMIT_RI7(_name, _op) \ +void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \ +{ \ + emit_RI7(p, _op, rT, rA, imm, __FUNCTION__); \ +} + +#define EMIT_RI8(_name, _op, bias) \ +void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \ +{ \ + emit_RI8(p, _op, rT, rA, bias - imm, __FUNCTION__); \ +} + +#define EMIT_RI10(_name, _op) \ +void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \ +{ \ + emit_RI10(p, _op, rT, rA, imm, __FUNCTION__); \ +} + +#define EMIT_RI10s(_name, _op) \ +void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \ +{ \ + emit_RI10s(p, _op, rT, rA, imm, __FUNCTION__); \ +} + +#define EMIT_RI16(_name, _op) \ +void _name (struct spe_function *p, unsigned rT, int imm) \ +{ \ + emit_RI16(p, _op, rT, imm, __FUNCTION__); \ +} + +#define EMIT_RI18(_name, _op) \ +void _name (struct spe_function *p, unsigned rT, int imm) \ +{ \ + emit_RI18(p, _op, rT, imm, __FUNCTION__); \ +} + +#define EMIT_I16(_name, _op) \ +void _name (struct spe_function *p, int imm) \ +{ \ + emit_RI16(p, _op, 0, imm, __FUNCTION__); \ +} + +#include "rtasm_ppc_spe.h" + + + +/** + * Initialize an spe_function. + * \param code_size initial size of instruction buffer to allocate, in bytes. + * If zero, use a default. 
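+ * The buffer is enlarged automatically (doubled) by emit_instruction() if it fills up.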
+ */ +void spe_init_func(struct spe_function *p, unsigned code_size) +{ + unsigned int i; + + if (!code_size) + code_size = 64; + + p->num_inst = 0; + p->max_inst = code_size / SPE_INST_SIZE; + p->store = align_malloc(code_size, 16); + + p->set_count = 0; + memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0])); + + /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile. + */ + p->regs[0] = p->regs[1] = p->regs[2] = 1; + for (i = 80; i <= 127; i++) { + p->regs[i] = 1; + } + + p->print = false; + p->indent = 0; +} + + +void spe_release_func(struct spe_function *p) +{ + assert(p->num_inst <= p->max_inst); + if (p->store != NULL) { + align_free(p->store); + } + p->store = NULL; +} + + +/** Return current code size in bytes. */ +unsigned spe_code_size(const struct spe_function *p) +{ + return p->num_inst * SPE_INST_SIZE; +} + + +/** + * Allocate a SPE register. + * \return register index or -1 if none left. + */ +int spe_allocate_available_register(struct spe_function *p) +{ + unsigned i; + for (i = 0; i < SPE_NUM_REGS; i++) { + if (p->regs[i] == 0) { + p->regs[i] = 1; + return i; + } + } + + return -1; +} + + +/** + * Mark the given SPE register as "allocated". + */ +int spe_allocate_register(struct spe_function *p, int reg) +{ + assert(reg < SPE_NUM_REGS); + assert(p->regs[reg] == 0); + p->regs[reg] = 1; + return reg; +} + + +/** + * Mark the given SPE register as "unallocated". Note that this should + * only be used on registers allocated in the current register set; an + * assertion will fail if an attempt is made to deallocate a register + * allocated in an earlier register set. + */ +void spe_release_register(struct spe_function *p, int reg) +{ + assert(reg < SPE_NUM_REGS); + assert(p->regs[reg] == 1); + + p->regs[reg] = 0; +} + +/** + * Start a new set of registers. This can be called if + * it will be difficult later to determine exactly what + * registers were actually allocated during a code generation + * sequence, and you really just want to deallocate all of them. + */ +void spe_allocate_register_set(struct spe_function *p) +{ + unsigned int i; + + /* Keep track of the set count. If it ever wraps around to 0, + * we're in trouble. + */ + p->set_count++; + assert(p->set_count > 0); + + /* Increment the allocation count of all registers currently + * allocated. Then any registers that are allocated in this set + * will be the only ones with a count of 1; they'll all be released + * when the register set is released. + */ + for (i = 0; i < SPE_NUM_REGS; i++) { + if (p->regs[i] > 0) + p->regs[i]++; + } +} + +void spe_release_register_set(struct spe_function *p) +{ + unsigned int i; + + /* If the set count drops below zero, we're in trouble. */ + assert(p->set_count > 0); + p->set_count--; + + /* Drop the allocation level of all registers. Any allocated + * during this register set will drop to 0 and then become + * available. 
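+    * Registers allocated in enclosing sets keep a nonzero count and remain reserved.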
+ */ + for (i = 0; i < SPE_NUM_REGS; i++) { + if (p->regs[i] > 0) + p->regs[i]--; + } +} + + +unsigned +spe_get_registers_used(const struct spe_function *p, ubyte used[]) +{ + unsigned i, num = 0; + /* only count registers in the range available to callers */ + for (i = 2; i < 80; i++) { + if (p->regs[i]) { + used[num++] = i; + } + } + return num; +} + + +void +spe_print_code(struct spe_function *p, boolean enable) +{ + p->print = enable; +} + + +void +spe_indent(struct spe_function *p, int spaces) +{ + p->indent += spaces; +} + + +void +spe_comment(struct spe_function *p, int rel_indent, const char *s) +{ + if (p->print) { + p->indent += rel_indent; + indent(p); + p->indent -= rel_indent; + printf("# %s\n", s); + } +} + + +/** + * Load quad word. + * NOTE: offset is in bytes and the least significant 4 bits must be zero! + */ +void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset) +{ + const boolean pSave = p->print; + + /* offset must be a multiple of 16 */ + assert(offset % 16 == 0); + /* offset must fit in 10-bit signed int field, after shifting */ + assert((offset >> 4) <= 511); + assert((offset >> 4) >= -512); + + p->print = FALSE; + emit_RI10(p, 0x034, rT, rA, offset >> 4, "spe_lqd"); + p->print = pSave; + + if (p->print) { + indent(p); + printf("lqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA)); + } +} + + +/** + * Store quad word. + * NOTE: offset is in bytes and the least significant 4 bits must be zero! + */ +void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset) +{ + const boolean pSave = p->print; + + /* offset must be a multiple of 16 */ + assert(offset % 16 == 0); + /* offset must fit in 10-bit signed int field, after shifting */ + assert((offset >> 4) <= 511); + assert((offset >> 4) >= -512); + + p->print = FALSE; + emit_RI10(p, 0x024, rT, rA, offset >> 4, "spe_stqd"); + p->print = pSave; + + if (p->print) { + indent(p); + printf("stqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA)); + } +} + + +/** + * For branch instructions: + * \param d if 1, disable interupts if branch is taken + * \param e if 1, enable interupts if branch is taken + * If d and e are both zero, don't change interupt status (right?) + */ + +/** Branch Indirect to address in rA */ +void spe_bi(struct spe_function *p, unsigned rA, int d, int e) +{ + emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4), __FUNCTION__); +} + +/** Interupt Return */ +void spe_iret(struct spe_function *p, unsigned rA, int d, int e) +{ + emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4), __FUNCTION__); +} + +/** Branch indirect and set link on external data */ +void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d, + int e) +{ + emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4), __FUNCTION__); +} + +/** Branch indirect and set link. Save PC in rT, jump to rA. */ +void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d, + int e) +{ + emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4), __FUNCTION__); +} + +/** Branch indirect if zero word. If rT.word[0]==0, jump to rA. */ +void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) +{ + emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4), __FUNCTION__); +} + +/** Branch indirect if non-zero word. If rT.word[0]!=0, jump to rA. */ +void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) +{ + emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4), __FUNCTION__); +} + +/** Branch indirect if zero halfword. If rT.halfword[1]==0, jump to rA. 
*/ +void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) +{ + emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4), __FUNCTION__); +} + +/** Branch indirect if non-zero halfword. If rT.halfword[1]!=0, jump to rA. */ +void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e) +{ + emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4), __FUNCTION__); +} + + +/* Hint-for-branch instructions + */ +#if 0 +hbr; +hbra; +hbrr; +#endif + + +/* Control instructions + */ +#if 0 +stop; +EMIT_RR (spe_stopd, 0x140); +EMIT_ (spe_nop, 0x201); +sync; +EMIT_ (spe_dsync, 0x003); +EMIT_R (spe_mfspr, 0x00c); +EMIT_R (spe_mtspr, 0x10c); +#endif + + +/** + ** Helper / "macro" instructions. + ** Use somewhat verbose names as a reminder that these aren't native + ** SPE instructions. + **/ + + +void +spe_load_float(struct spe_function *p, unsigned rT, float x) +{ + if (x == 0.0f) { + spe_il(p, rT, 0x0); + } + else if (x == 0.5f) { + spe_ilhu(p, rT, 0x3f00); + } + else if (x == 1.0f) { + spe_ilhu(p, rT, 0x3f80); + } + else if (x == -1.0f) { + spe_ilhu(p, rT, 0xbf80); + } + else { + union { + float f; + unsigned u; + } bits; + bits.f = x; + spe_ilhu(p, rT, bits.u >> 16); + spe_iohl(p, rT, bits.u & 0xffff); + } +} + + +void +spe_load_int(struct spe_function *p, unsigned rT, int i) +{ + if (-32768 <= i && i <= 32767) { + spe_il(p, rT, i); + } + else { + spe_ilhu(p, rT, i >> 16); + if (i & 0xffff) + spe_iohl(p, rT, i & 0xffff); + } +} + +void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui) +{ + /* If the whole value is in the lower 18 bits, use ila, which + * doesn't sign-extend. Otherwise, if the two halfwords of + * the constant are identical, use ilh. Otherwise, if every byte of + * the desired value is 0x00 or 0xff, we can use Form Select Mask for + * Bytes Immediate (fsmbi) to load the value in a single instruction. + * Otherwise, in the general case, we have to use ilhu followed by iohl. + */ + if ((ui & 0x0003ffff) == ui) { + spe_ila(p, rT, ui); + } + else if ((ui >> 16) == (ui & 0xffff)) { + spe_ilh(p, rT, ui & 0xffff); + } + else if ( + ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) && + ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) && + ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) && + ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000) + ) { + unsigned int mask = 0; + /* fsmbi duplicates each bit in the given mask eight times, + * using a 16-bit value to initialize a 16-byte quadword. + * Each 4-bit nybble of the mask corresponds to a full word + * of the result; look at the value and figure out the mask + * (replicated for each word in the quadword), and then + * form the "select mask" to get the value. + */ + if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111; + if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222; + if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444; + if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888; + spe_fsmbi(p, rT, mask); + } + else { + /* The general case: this usually uses two instructions, but + * may use only one if the low-order 16 bits of each word are 0. + */ + spe_ilhu(p, rT, ui >> 16); + if (ui & 0xffff) + spe_iohl(p, rT, ui & 0xffff); + } +} + +/** + * This function is constructed identically to spe_xor_uint() below. + * Changes to one should be made in the other. 
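+ * For example, spe_and_uint(p, rT, rA, 0x0f0f0f0f) folds to a single andbi instruction.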
+ */ +void +spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If we can, emit a single instruction, either And Byte Immediate + * (which uses the same constant across each byte), And Halfword Immediate + * (which sign-extends a 10-bit immediate to 16 bits and uses that + * across each halfword), or And Word Immediate (which sign-extends + * a 10-bit immediate to 32 bits). + * + * Otherwise, we'll need to use a temporary register. + */ + unsigned int tmp; + + /* If the upper 23 bits are all 0s or all 1s, sign extension + * will work and we can use And Word Immediate + */ + tmp = ui & 0xfffffe00; + if (tmp == 0xfffffe00 || tmp == 0) { + spe_andi(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric along halfword boundaries and + * the upper 7 bits of each halfword are all 0s or 1s, we + * can use And Halfword Immediate + */ + tmp = ui & 0xfe00fe00; + if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) { + spe_andhi(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric in each byte, then we can use + * the And Byte Immediate instruction. + */ + tmp = ui & 0x000000ff; + if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) { + spe_andbi(p, rT, rA, tmp); + return; + } + + /* Otherwise, we'll have to use a temporary register. */ + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_and(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); +} + + +/** + * This function is constructed identically to spe_and_uint() above. + * Changes to one should be made in the other. + */ +void +spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If we can, emit a single instruction, either Exclusive Or Byte + * Immediate (which uses the same constant across each byte), Exclusive + * Or Halfword Immediate (which sign-extends a 10-bit immediate to + * 16 bits and uses that across each halfword), or Exclusive Or Word + * Immediate (which sign-extends a 10-bit immediate to 32 bits). + * + * Otherwise, we'll need to use a temporary register. + */ + unsigned int tmp; + + /* If the upper 23 bits are all 0s or all 1s, sign extension + * will work and we can use Exclusive Or Word Immediate + */ + tmp = ui & 0xfffffe00; + if (tmp == 0xfffffe00 || tmp == 0) { + spe_xori(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric along halfword boundaries and + * the upper 7 bits of each halfword are all 0s or 1s, we + * can use Exclusive Or Halfword Immediate + */ + tmp = ui & 0xfe00fe00; + if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) { + spe_xorhi(p, rT, rA, ui & 0x000003ff); + return; + } + + /* If the ui field is symmetric in each byte, then we can use + * the Exclusive Or Byte Immediate instruction. + */ + tmp = ui & 0x000000ff; + if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) { + spe_xorbi(p, rT, rA, tmp); + return; + } + + /* Otherwise, we'll have to use a temporary register. */ + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_xor(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); +} + +void +spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If the comparison value is 9 bits or less, it fits inside a + * Compare Equal Word Immediate instruction. 
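+ * (The 9-bit limit keeps the value non-negative after ceqi's 10-bit immediate field is sign-extended.)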
+ */ + if ((ui & 0x000001ff) == ui) { + spe_ceqi(p, rT, rA, ui); + } + /* Otherwise, we're going to have to load a word first. */ + else { + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_ceq(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); + } +} + +void +spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui) +{ + /* If the comparison value is 10 bits or less, it fits inside a + * Compare Logical Greater Than Word Immediate instruction. + */ + if ((ui & 0x000003ff) == ui) { + spe_clgti(p, rT, rA, ui); + } + /* Otherwise, we're going to have to load a word first. */ + else { + unsigned int tmp_reg = spe_allocate_available_register(p); + spe_load_uint(p, tmp_reg, ui); + spe_clgt(p, rT, rA, tmp_reg); + spe_release_register(p, tmp_reg); + } +} + +void +spe_splat(struct spe_function *p, unsigned rT, unsigned rA) +{ + /* Use a temporary, just in case rT == rA */ + unsigned int tmp_reg = spe_allocate_available_register(p); + /* Duplicate bytes 0, 1, 2, and 3 across the whole register */ + spe_ila(p, tmp_reg, 0x00010203); + spe_shufb(p, rT, rA, rA, tmp_reg); + spe_release_register(p, tmp_reg); +} + + +void +spe_complement(struct spe_function *p, unsigned rT, unsigned rA) +{ + spe_nor(p, rT, rA, rA); +} + + +void +spe_move(struct spe_function *p, unsigned rT, unsigned rA) +{ + /* Use different instructions depending on the instruction address + * to take advantage of the dual pipelines. + */ + if (p->num_inst & 1) + spe_shlqbyi(p, rT, rA, 0); /* odd pipe */ + else + spe_ori(p, rT, rA, 0); /* even pipe */ +} + + +void +spe_zero(struct spe_function *p, unsigned rT) +{ + spe_xor(p, rT, rT, rT); +} + + +void +spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word) +{ + assert(word >= 0); + assert(word <= 3); + + if (word == 0) { + int tmp1 = rT; + spe_ila(p, tmp1, 66051); + spe_shufb(p, rT, rA, rA, tmp1); + } + else { + /* XXX review this, we may not need the rotqbyi instruction */ + int tmp1 = rT; + int tmp2 = spe_allocate_available_register(p); + + spe_ila(p, tmp1, 66051); + spe_rotqbyi(p, tmp2, rA, 4 * word); + spe_shufb(p, rT, tmp2, tmp2, tmp1); + + spe_release_register(p, tmp2); + } +} + +/** + * For each 32-bit float element of rA and rB, choose the smaller of the + * two, compositing them into the rT register. + * + * The Float Compare Greater Than (fcgt) instruction will put 1s into + * compare_reg where rA > rB, and 0s where rA <= rB. + * + * Then the Select Bits (selb) instruction will take bits from rA where + * compare_reg is 0, and from rB where compare_reg is 1; i.e., from rA + * where rA <= rB and from rB where rB > rA, which is exactly the + * "min" operation. + * + * The compare_reg could in many cases be the same as rT, unless + * rT == rA || rt == rB. But since this is common in constructions + * like "x = min(x, a)", we always allocate a new register to be safe. + */ +void +spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) +{ + unsigned int compare_reg = spe_allocate_available_register(p); + spe_fcgt(p, compare_reg, rA, rB); + spe_selb(p, rT, rA, rB, compare_reg); + spe_release_register(p, compare_reg); +} + +/** + * For each 32-bit float element of rA and rB, choose the greater of the + * two, compositing them into the rT register. + * + * The logic is similar to that of spe_float_min() above; the only + * difference is that the registers on spe_selb() have been reversed, + * so that the larger of the two is selected instead of the smaller. 
+ */ +void +spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) +{ + unsigned int compare_reg = spe_allocate_available_register(p); + spe_fcgt(p, compare_reg, rA, rB); + spe_selb(p, rT, rB, rA, compare_reg); + spe_release_register(p, compare_reg); +} + +#endif /* GALLIUM_CELL */ diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h new file mode 100644 index 0000000000..f9ad2acacd --- /dev/null +++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h @@ -0,0 +1,440 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file + * Real-time assembly generation interface for Cell B.E. SPEs. + * For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf + * + * \author Ian Romanick <idr@us.ibm.com> + * \author Brian Paul + */ + +#ifndef RTASM_PPC_SPE_H +#define RTASM_PPC_SPE_H + +/** 4 bytes per instruction */ +#define SPE_INST_SIZE 4 + +/** number of general-purpose SIMD registers */ +#define SPE_NUM_REGS 128 + +/** Return Address register (aka $lr / Link Register) */ +#define SPE_REG_RA 0 + +/** Stack Pointer register (aka $sp) */ +#define SPE_REG_SP 1 + + +struct spe_function +{ + uint32_t *store; /**< instruction buffer */ + uint num_inst; + uint max_inst; + + /** + * The "set count" reflects the number of nested register sets + * are allowed. In the unlikely case that we exceed the set count, + * register allocation will start to be confused, which is critical + * enough that we check for it. + */ + unsigned char set_count; + + /** + * Flags for used and unused registers. Each byte corresponds to a + * register; a 0 in that byte means that the register is available. + * A value of 1 means that the register was allocated in the current + * register set. Any other value N means that the register was allocated + * N register sets ago. + * + * \sa + * spe_allocate_register, spe_allocate_available_register, + * spe_allocate_register_set, spe_release_register_set, spe_release_register, + */ + unsigned char regs[SPE_NUM_REGS]; + + boolean print; /**< print/dump instructions as they're emitted? 
*/ + int indent; /**< number of spaces to indent */ +}; + + +extern void spe_init_func(struct spe_function *p, unsigned code_size); +extern void spe_release_func(struct spe_function *p); +extern unsigned spe_code_size(const struct spe_function *p); + +extern int spe_allocate_available_register(struct spe_function *p); +extern int spe_allocate_register(struct spe_function *p, int reg); +extern void spe_release_register(struct spe_function *p, int reg); +extern void spe_allocate_register_set(struct spe_function *p); +extern void spe_release_register_set(struct spe_function *p); + +extern unsigned +spe_get_registers_used(const struct spe_function *p, ubyte used[]); + +extern void spe_print_code(struct spe_function *p, boolean enable); +extern void spe_indent(struct spe_function *p, int spaces); +extern void spe_comment(struct spe_function *p, int rel_indent, const char *s); + + +#endif /* RTASM_PPC_SPE_H */ + +#ifndef EMIT +#define EMIT(_name, _op) \ + extern void _name (struct spe_function *p); +#define EMIT_(_name, _op) \ + extern void _name (struct spe_function *p, unsigned rT); +#define EMIT_R(_name, _op) \ + extern void _name (struct spe_function *p, unsigned rT, unsigned rA); +#define EMIT_RR(_name, _op) \ + extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \ + unsigned rB); +#define EMIT_RRR(_name, _op) \ + extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \ + unsigned rB, unsigned rC); +#define EMIT_RI7(_name, _op) \ + extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \ + int imm); +#define EMIT_RI8(_name, _op, bias) \ + extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \ + int imm); +#define EMIT_RI10(_name, _op) \ + extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \ + int imm); +#define EMIT_RI10s(_name, _op) \ + extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \ + int imm); +#define EMIT_RI16(_name, _op) \ + extern void _name (struct spe_function *p, unsigned rT, int imm); +#define EMIT_RI18(_name, _op) \ + extern void _name (struct spe_function *p, unsigned rT, int imm); +#define EMIT_I16(_name, _op) \ + extern void _name (struct spe_function *p, int imm); +#define UNDEF_EMIT_MACROS +#endif /* EMIT */ + + +/* Memory load / store instructions + */ +EMIT_RR (spe_lqx, 0x1c4) +EMIT_RI16(spe_lqa, 0x061) +EMIT_RI16(spe_lqr, 0x067) +EMIT_RR (spe_stqx, 0x144) +EMIT_RI16(spe_stqa, 0x041) +EMIT_RI16(spe_stqr, 0x047) +EMIT_RI7 (spe_cbd, 0x1f4) +EMIT_RR (spe_cbx, 0x1d4) +EMIT_RI7 (spe_chd, 0x1f5) +EMIT_RI7 (spe_chx, 0x1d5) +EMIT_RI7 (spe_cwd, 0x1f6) +EMIT_RI7 (spe_cwx, 0x1d6) +EMIT_RI7 (spe_cdd, 0x1f7) +EMIT_RI7 (spe_cdx, 0x1d7) + + +/* Constant formation instructions + */ +EMIT_RI16(spe_ilh, 0x083) +EMIT_RI16(spe_ilhu, 0x082) +EMIT_RI16(spe_il, 0x081) +EMIT_RI18(spe_ila, 0x021) +EMIT_RI16(spe_iohl, 0x0c1) +EMIT_RI16(spe_fsmbi, 0x065) + + + +/* Integer and logical instructions + */ +EMIT_RR (spe_ah, 0x0c8) +EMIT_RI10(spe_ahi, 0x01d) +EMIT_RR (spe_a, 0x0c0) +EMIT_RI10s(spe_ai, 0x01c) +EMIT_RR (spe_sfh, 0x048) +EMIT_RI10(spe_sfhi, 0x00d) +EMIT_RR (spe_sf, 0x040) +EMIT_RI10(spe_sfi, 0x00c) +EMIT_RR (spe_addx, 0x340) +EMIT_RR (spe_cg, 0x0c2) +EMIT_RR (spe_cgx, 0x342) +EMIT_RR (spe_sfx, 0x341) +EMIT_RR (spe_bg, 0x042) +EMIT_RR (spe_bgx, 0x343) +EMIT_RR (spe_mpy, 0x3c4) +EMIT_RR (spe_mpyu, 0x3cc) +EMIT_RI10(spe_mpyi, 0x074) +EMIT_RI10(spe_mpyui, 0x075) +EMIT_RRR (spe_mpya, 0x00c) +EMIT_RR (spe_mpyh, 0x3c5) +EMIT_RR (spe_mpys, 0x3c7) +EMIT_RR (spe_mpyhh, 0x3c6) +EMIT_RR (spe_mpyhha, 
0x346) +EMIT_RR (spe_mpyhhu, 0x3ce) +EMIT_RR (spe_mpyhhau, 0x34e) +EMIT_R (spe_clz, 0x2a5) +EMIT_R (spe_cntb, 0x2b4) +EMIT_R (spe_fsmb, 0x1b6) +EMIT_R (spe_fsmh, 0x1b5) +EMIT_R (spe_fsm, 0x1b4) +EMIT_R (spe_gbb, 0x1b2) +EMIT_R (spe_gbh, 0x1b1) +EMIT_R (spe_gb, 0x1b0) +EMIT_RR (spe_avgb, 0x0d3) +EMIT_RR (spe_absdb, 0x053) +EMIT_RR (spe_sumb, 0x253) +EMIT_R (spe_xsbh, 0x2b6) +EMIT_R (spe_xshw, 0x2ae) +EMIT_R (spe_xswd, 0x2a6) +EMIT_RR (spe_and, 0x0c1) +EMIT_RR (spe_andc, 0x2c1) +EMIT_RI10s(spe_andbi, 0x016) +EMIT_RI10s(spe_andhi, 0x015) +EMIT_RI10s(spe_andi, 0x014) +EMIT_RR (spe_or, 0x041) +EMIT_RR (spe_orc, 0x2c9) +EMIT_RI10s(spe_orbi, 0x006) +EMIT_RI10s(spe_orhi, 0x005) +EMIT_RI10s(spe_ori, 0x004) +EMIT_R (spe_orx, 0x1f0) +EMIT_RR (spe_xor, 0x241) +EMIT_RI10s(spe_xorbi, 0x046) +EMIT_RI10s(spe_xorhi, 0x045) +EMIT_RI10s(spe_xori, 0x044) +EMIT_RR (spe_nand, 0x0c9) +EMIT_RR (spe_nor, 0x049) +EMIT_RR (spe_eqv, 0x249) +EMIT_RRR (spe_selb, 0x008) +EMIT_RRR (spe_shufb, 0x00b) + + +/* Shift and rotate instructions + */ +EMIT_RR (spe_shlh, 0x05f) +EMIT_RI7 (spe_shlhi, 0x07f) +EMIT_RR (spe_shl, 0x05b) +EMIT_RI7 (spe_shli, 0x07b) +EMIT_RR (spe_shlqbi, 0x1db) +EMIT_RI7 (spe_shlqbii, 0x1fb) +EMIT_RR (spe_shlqby, 0x1df) +EMIT_RI7 (spe_shlqbyi, 0x1ff) +EMIT_RR (spe_shlqbybi, 0x1cf) +EMIT_RR (spe_roth, 0x05c) +EMIT_RI7 (spe_rothi, 0x07c) +EMIT_RR (spe_rot, 0x058) +EMIT_RI7 (spe_roti, 0x078) +EMIT_RR (spe_rotqby, 0x1dc) +EMIT_RI7 (spe_rotqbyi, 0x1fc) +EMIT_RR (spe_rotqbybi, 0x1cc) +EMIT_RR (spe_rotqbi, 0x1d8) +EMIT_RI7 (spe_rotqbii, 0x1f8) +EMIT_RR (spe_rothm, 0x05d) +EMIT_RI7 (spe_rothmi, 0x07d) +EMIT_RR (spe_rotm, 0x059) +EMIT_RI7 (spe_rotmi, 0x079) +EMIT_RR (spe_rotqmby, 0x1dd) +EMIT_RI7 (spe_rotqmbyi, 0x1fd) +EMIT_RR (spe_rotqmbybi, 0x1cd) +EMIT_RR (spe_rotqmbi, 0x1c9) +EMIT_RI7 (spe_rotqmbii, 0x1f9) +EMIT_RR (spe_rotmah, 0x05e) +EMIT_RI7 (spe_rotmahi, 0x07e) +EMIT_RR (spe_rotma, 0x05a) +EMIT_RI7 (spe_rotmai, 0x07a) + + +/* Compare, branch, and halt instructions + */ +EMIT_RR (spe_heq, 0x3d8) +EMIT_RI10(spe_heqi, 0x07f) +EMIT_RR (spe_hgt, 0x258) +EMIT_RI10(spe_hgti, 0x04f) +EMIT_RR (spe_hlgt, 0x2d8) +EMIT_RI10(spe_hlgti, 0x05f) +EMIT_RR (spe_ceqb, 0x3d0) +EMIT_RI10(spe_ceqbi, 0x07e) +EMIT_RR (spe_ceqh, 0x3c8) +EMIT_RI10(spe_ceqhi, 0x07d) +EMIT_RR (spe_ceq, 0x3c0) +EMIT_RI10(spe_ceqi, 0x07c) +EMIT_RR (spe_cgtb, 0x250) +EMIT_RI10(spe_cgtbi, 0x04e) +EMIT_RR (spe_cgth, 0x248) +EMIT_RI10(spe_cgthi, 0x04d) +EMIT_RR (spe_cgt, 0x240) +EMIT_RI10(spe_cgti, 0x04c) +EMIT_RR (spe_clgtb, 0x2d0) +EMIT_RI10(spe_clgtbi, 0x05e) +EMIT_RR (spe_clgth, 0x2c8) +EMIT_RI10(spe_clgthi, 0x05d) +EMIT_RR (spe_clgt, 0x2c0) +EMIT_RI10(spe_clgti, 0x05c) +EMIT_I16 (spe_br, 0x064) +EMIT_I16 (spe_bra, 0x060) +EMIT_RI16(spe_brsl, 0x066) +EMIT_RI16(spe_brasl, 0x062) +EMIT_RI16(spe_brnz, 0x042) +EMIT_RI16(spe_brz, 0x040) +EMIT_RI16(spe_brhnz, 0x046) +EMIT_RI16(spe_brhz, 0x044) + +/* Control instructions + */ +EMIT (spe_lnop, 0x001) + +extern void +spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset); + +extern void +spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset); + +extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e); +extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e); +extern void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, + int d, int e); +extern void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, + int d, int e); +extern void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, + int d, int e); +extern void spe_binz(struct 
spe_function *p, unsigned rT, unsigned rA, + int d, int e); +extern void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, + int d, int e); +extern void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, + int d, int e); + + +/** Load/splat immediate float into rT. */ +extern void +spe_load_float(struct spe_function *p, unsigned rT, float x); + +/** Load/splat immediate int into rT. */ +extern void +spe_load_int(struct spe_function *p, unsigned rT, int i); + +/** Load/splat immediate unsigned int into rT. */ +extern void +spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui); + +/** And immediate value into rT. */ +extern void +spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + +/** Xor immediate value into rT. */ +extern void +spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + +/** Compare equal with immediate value. */ +extern void +spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + +/** Compare greater with immediate value. */ +extern void +spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui); + +/** Replicate word 0 of rA across rT. */ +extern void +spe_splat(struct spe_function *p, unsigned rT, unsigned rA); + +/** rT = complement_all_bits(rA). */ +extern void +spe_complement(struct spe_function *p, unsigned rT, unsigned rA); + +/** rT = rA. */ +extern void +spe_move(struct spe_function *p, unsigned rT, unsigned rA); + +/** rT = {0,0,0,0}. */ +extern void +spe_zero(struct spe_function *p, unsigned rT); + +/** rT = splat(rA, word) */ +extern void +spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word); + +/** rT = float min(rA, rB) */ +extern void +spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB); + +/** rT = float max(rA, rB) */ +extern void +spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB); + + +/* Floating-point instructions + */ +EMIT_RR (spe_fa, 0x2c4) +EMIT_RR (spe_dfa, 0x2cc) +EMIT_RR (spe_fs, 0x2c5) +EMIT_RR (spe_dfs, 0x2cd) +EMIT_RR (spe_fm, 0x2c6) +EMIT_RR (spe_dfm, 0x2ce) +EMIT_RRR (spe_fma, 0x00e) +EMIT_RR (spe_dfma, 0x35c) +EMIT_RRR (spe_fnms, 0x00d) +EMIT_RR (spe_dfnms, 0x35e) +EMIT_RRR (spe_fms, 0x00f) +EMIT_RR (spe_dfms, 0x35d) +EMIT_RR (spe_dfnma, 0x35f) +EMIT_R (spe_frest, 0x1b8) +EMIT_R (spe_frsqest, 0x1b9) +EMIT_RR (spe_fi, 0x3d4) +EMIT_RI8 (spe_csflt, 0x1da, 155) +EMIT_RI8 (spe_cflts, 0x1d8, 173) +EMIT_RI8 (spe_cuflt, 0x1db, 155) +EMIT_RI8 (spe_cfltu, 0x1d9, 173) +EMIT_R (spe_frds, 0x3b9) +EMIT_R (spe_fesd, 0x3b8) +EMIT_RR (spe_dfceq, 0x3c3) +EMIT_RR (spe_dfcmeq, 0x3cb) +EMIT_RR (spe_dfcgt, 0x2c3) +EMIT_RR (spe_dfcmgt, 0x2cb) +EMIT_RI7 (spe_dftsv, 0x3bf) +EMIT_RR (spe_fceq, 0x3c2) +EMIT_RR (spe_fcmeq, 0x3ca) +EMIT_RR (spe_fcgt, 0x2c2) +EMIT_RR (spe_fcmgt, 0x2ca) +EMIT_R (spe_fscrwr, 0x3ba) +EMIT_ (spe_fscrrd, 0x398) + + +/* Channel instructions + */ +EMIT_R (spe_rdch, 0x00d) +EMIT_R (spe_rdchcnt, 0x00f) +EMIT_R (spe_wrch, 0x10d) + + +#ifdef UNDEF_EMIT_MACROS +#undef EMIT +#undef EMIT_ +#undef EMIT_R +#undef EMIT_RR +#undef EMIT_RRR +#undef EMIT_RI7 +#undef EMIT_RI8 +#undef EMIT_RI10 +#undef EMIT_RI10s +#undef EMIT_RI16 +#undef EMIT_RI18 +#undef EMIT_I16 +#undef UNDEF_EMIT_MACROS +#endif /* EMIT_ */ diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c new file mode 100644 index 0000000000..99ee74cf14 --- /dev/null +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c @@ -0,0 
+1,1748 @@ +/************************************************************************** + * + * Copyright (C) 1999-2005 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_config.h" + +#if defined(PIPE_ARCH_X86) + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "util/u_pointer.h" + +#include "rtasm_execmem.h" +#include "rtasm_x86sse.h" + +#define DISASSEM 0 +#define X86_TWOB 0x0f + + +#define DUMP_SSE 0 + + +void x86_print_reg( struct x86_reg reg ) +{ + if (reg.mod != mod_REG) + debug_printf( "[" ); + + switch( reg.file ) { + case file_REG32: + switch( reg.idx ) { + case reg_AX: debug_printf( "EAX" ); break; + case reg_CX: debug_printf( "ECX" ); break; + case reg_DX: debug_printf( "EDX" ); break; + case reg_BX: debug_printf( "EBX" ); break; + case reg_SP: debug_printf( "ESP" ); break; + case reg_BP: debug_printf( "EBP" ); break; + case reg_SI: debug_printf( "ESI" ); break; + case reg_DI: debug_printf( "EDI" ); break; + } + break; + case file_MMX: + debug_printf( "MMX%u", reg.idx ); + break; + case file_XMM: + debug_printf( "XMM%u", reg.idx ); + break; + case file_x87: + debug_printf( "fp%u", reg.idx ); + break; + } + + if (reg.mod == mod_DISP8 || + reg.mod == mod_DISP32) + debug_printf("+%d", reg.disp); + + if (reg.mod != mod_REG) + debug_printf( "]" ); +} + +#if DUMP_SSE + +#define DUMP_START() debug_printf( "\n" ) +#define DUMP_END() debug_printf( "\n" ) + +#define DUMP() do { \ + const char *foo = __FUNCTION__; \ + while (*foo && *foo != '_') \ + foo++; \ + if (*foo) \ + foo++; \ + debug_printf( "\n% 4x% 15s ", p->csr - p->store, foo ); \ +} while (0) + +#define DUMP_I( I ) do { \ + DUMP(); \ + debug_printf( "%u", I ); \ +} while( 0 ) + +#define DUMP_R( R0 ) do { \ + DUMP(); \ + x86_print_reg( R0 ); \ +} while( 0 ) + +#define DUMP_RR( R0, R1 ) do { \ + DUMP(); \ + x86_print_reg( R0 ); \ + debug_printf( ", " ); \ + x86_print_reg( R1 ); \ +} while( 0 ) + +#define DUMP_RI( R0, I ) do { \ + DUMP(); \ + x86_print_reg( R0 ); \ + debug_printf( ", %u", I ); \ +} while( 0 ) + +#define DUMP_RRI( R0, R1, I ) do { \ + DUMP(); \ + x86_print_reg( R0 ); \ + debug_printf( ", " ); \ + x86_print_reg( R1 ); \ + debug_printf( ", %u", I ); \ +} while( 0 ) + +#else + +#define DUMP_START() +#define DUMP_END() +#define DUMP( ) +#define DUMP_I( I ) +#define DUMP_R( R0 ) +#define DUMP_RR( R0, R1 ) +#define DUMP_RI( R0, I ) +#define DUMP_RRI( R0, R1, I ) + +#endif + + 
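+/* Code store management: the emit_* helpers below append bytes at
+ * p->csr within p->store.  When the buffer fills up, do_realloc()
+ * doubles its size and copies over what has already been emitted;
+ * if executable memory cannot be (re)allocated, emission falls back
+ * to the small error_overflow scratch buffer so subsequent emits stay
+ * harmless and x86_get_func() can report the failure by returning NULL.
+ */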
+static void do_realloc( struct x86_function *p ) +{ + if (p->store == p->error_overflow) { + p->csr = p->store; + } + else if (p->size == 0) { + p->size = 1024; + p->store = rtasm_exec_malloc(p->size); + p->csr = p->store; + } + else { + uintptr_t used = pointer_to_uintptr( p->csr ) - pointer_to_uintptr( p->store ); + unsigned char *tmp = p->store; + p->size *= 2; + p->store = rtasm_exec_malloc(p->size); + + if (p->store) { + memcpy(p->store, tmp, used); + p->csr = p->store + used; + } + else { + p->csr = p->store; + } + + rtasm_exec_free(tmp); + } + + if (p->store == NULL) { + p->store = p->csr = p->error_overflow; + p->size = sizeof(p->error_overflow); + } +} + +/* Emit bytes to the instruction stream: + */ +static unsigned char *reserve( struct x86_function *p, int bytes ) +{ + if (p->csr + bytes - p->store > (int) p->size) + do_realloc(p); + + { + unsigned char *csr = p->csr; + p->csr += bytes; + return csr; + } +} + + + +static void emit_1b( struct x86_function *p, char b0 ) +{ + char *csr = (char *)reserve(p, 1); + *csr = b0; +} + +static void emit_1i( struct x86_function *p, int i0 ) +{ + int *icsr = (int *)reserve(p, sizeof(i0)); + *icsr = i0; +} + +static void emit_1ub( struct x86_function *p, unsigned char b0 ) +{ + unsigned char *csr = reserve(p, 1); + *csr++ = b0; +} + +static void emit_2ub( struct x86_function *p, unsigned char b0, unsigned char b1 ) +{ + unsigned char *csr = reserve(p, 2); + *csr++ = b0; + *csr++ = b1; +} + +static void emit_3ub( struct x86_function *p, unsigned char b0, unsigned char b1, unsigned char b2 ) +{ + unsigned char *csr = reserve(p, 3); + *csr++ = b0; + *csr++ = b1; + *csr++ = b2; +} + + +/* Build a modRM byte + possible displacement. No treatment of SIB + * indexing. BZZT - no way to encode an absolute address. + * + * This is the "/r" field in the x86 manuals... + */ +static void emit_modrm( struct x86_function *p, + struct x86_reg reg, + struct x86_reg regmem ) +{ + unsigned char val = 0; + + assert(reg.mod == mod_REG); + + val |= regmem.mod << 6; /* mod field */ + val |= reg.idx << 3; /* reg field */ + val |= regmem.idx; /* r/m field */ + + emit_1ub(p, val); + + /* Oh-oh we've stumbled into the SIB thing. + */ + if (regmem.file == file_REG32 && + regmem.idx == reg_SP && + regmem.mod != mod_REG) { + emit_1ub(p, 0x24); /* simplistic! */ + } + + switch (regmem.mod) { + case mod_REG: + case mod_INDIRECT: + break; + case mod_DISP8: + emit_1b(p, (char) regmem.disp); + break; + case mod_DISP32: + emit_1i(p, regmem.disp); + break; + default: + assert(0); + break; + } +} + +/* Emits the "/0".."/7" specialized versions of the modrm ("/r") bytes. + */ +static void emit_modrm_noreg( struct x86_function *p, + unsigned op, + struct x86_reg regmem ) +{ + struct x86_reg dummy = x86_make_reg(file_REG32, op); + emit_modrm(p, dummy, regmem); +} + +/* Many x86 instructions have two opcodes to cope with the situations + * where the destination is a register or memory reference + * respectively. This function selects the correct opcode based on + * the arguments presented. 
+ */ +static void emit_op_modrm( struct x86_function *p, + unsigned char op_dst_is_reg, + unsigned char op_dst_is_mem, + struct x86_reg dst, + struct x86_reg src ) +{ + switch (dst.mod) { + case mod_REG: + emit_1ub(p, op_dst_is_reg); + emit_modrm(p, dst, src); + break; + case mod_INDIRECT: + case mod_DISP32: + case mod_DISP8: + assert(src.mod == mod_REG); + emit_1ub(p, op_dst_is_mem); + emit_modrm(p, src, dst); + break; + default: + assert(0); + break; + } +} + + + + + + + +/* Create and manipulate registers and regmem values: + */ +struct x86_reg x86_make_reg( enum x86_reg_file file, + enum x86_reg_name idx ) +{ + struct x86_reg reg; + + reg.file = file; + reg.idx = idx; + reg.mod = mod_REG; + reg.disp = 0; + + return reg; +} + +struct x86_reg x86_make_disp( struct x86_reg reg, + int disp ) +{ + assert(reg.file == file_REG32); + + if (reg.mod == mod_REG) + reg.disp = disp; + else + reg.disp += disp; + + if (reg.disp == 0 && reg.idx != reg_BP) + reg.mod = mod_INDIRECT; + else if (reg.disp <= 127 && reg.disp >= -128) + reg.mod = mod_DISP8; + else + reg.mod = mod_DISP32; + + return reg; +} + +struct x86_reg x86_deref( struct x86_reg reg ) +{ + return x86_make_disp(reg, 0); +} + +struct x86_reg x86_get_base_reg( struct x86_reg reg ) +{ + return x86_make_reg( reg.file, reg.idx ); +} + +int x86_get_label( struct x86_function *p ) +{ + return p->csr - p->store; +} + + + +/*********************************************************************** + * x86 instructions + */ + + +void x86_jcc( struct x86_function *p, + enum x86_cc cc, + int label ) +{ + int offset = label - (x86_get_label(p) + 2); + DUMP_I(cc); + + if (offset < 0) { + /*assert(p->csr - p->store > -offset);*/ + if (p->csr - p->store <= -offset) { + /* probably out of memory (using the error_overflow buffer) */ + return; + } + } + + if (offset <= 127 && offset >= -128) { + emit_1ub(p, 0x70 + cc); + emit_1b(p, (char) offset); + } + else { + offset = label - (x86_get_label(p) + 6); + emit_2ub(p, 0x0f, 0x80 + cc); + emit_1i(p, offset); + } +} + +/* Always use a 32bit offset for forward jumps: + */ +int x86_jcc_forward( struct x86_function *p, + enum x86_cc cc ) +{ + DUMP_I(cc); + emit_2ub(p, 0x0f, 0x80 + cc); + emit_1i(p, 0); + return x86_get_label(p); +} + +int x86_jmp_forward( struct x86_function *p) +{ + DUMP(); + emit_1ub(p, 0xe9); + emit_1i(p, 0); + return x86_get_label(p); +} + +int x86_call_forward( struct x86_function *p) +{ + DUMP(); + + emit_1ub(p, 0xe8); + emit_1i(p, 0); + return x86_get_label(p); +} + +/* Fixup offset from forward jump: + */ +void x86_fixup_fwd_jump( struct x86_function *p, + int fixup ) +{ + *(int *)(p->store + fixup - 4) = x86_get_label(p) - fixup; +} + +void x86_jmp( struct x86_function *p, int label) +{ + DUMP_I( label ); + emit_1ub(p, 0xe9); + emit_1i(p, label - x86_get_label(p) - 4); +} + +void x86_call( struct x86_function *p, struct x86_reg reg) +{ + DUMP_R( reg ); + emit_1ub(p, 0xff); + emit_modrm_noreg(p, 2, reg); +} + + +void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + assert(dst.file == file_REG32); + assert(dst.mod == mod_REG); + emit_1ub(p, 0xb8 + dst.idx); + emit_1i(p, imm); +} + +/** + * Immediate group 1 instructions. 
+ */ +static INLINE void +x86_group1_imm( struct x86_function *p, + unsigned op, struct x86_reg dst, int imm ) +{ + assert(dst.file == file_REG32); + assert(dst.mod == mod_REG); + if(-0x80 <= imm && imm < 0x80) { + emit_1ub(p, 0x83); + emit_modrm_noreg(p, op, dst); + emit_1b(p, (char)imm); + } + else { + emit_1ub(p, 0x81); + emit_modrm_noreg(p, op, dst); + emit_1i(p, imm); + } +} + +void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 0, dst, imm); +} + +void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 1, dst, imm); +} + +void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 4, dst, imm); +} + +void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 5, dst, imm); +} + +void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 6, dst, imm); +} + +void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm ) +{ + DUMP_RI( dst, imm ); + x86_group1_imm(p, 7, dst, imm); +} + + +void x86_push( struct x86_function *p, + struct x86_reg reg ) +{ + DUMP_R( reg ); + if (reg.mod == mod_REG) + emit_1ub(p, 0x50 + reg.idx); + else + { + emit_1ub(p, 0xff); + emit_modrm_noreg(p, 6, reg); + } + + + p->stack_offset += 4; +} + +void x86_push_imm32( struct x86_function *p, + int imm32 ) +{ + DUMP_I( imm32 ); + emit_1ub(p, 0x68); + emit_1i(p, imm32); + + p->stack_offset += 4; +} + + +void x86_pop( struct x86_function *p, + struct x86_reg reg ) +{ + DUMP_R( reg ); + assert(reg.mod == mod_REG); + emit_1ub(p, 0x58 + reg.idx); + p->stack_offset -= 4; +} + +void x86_inc( struct x86_function *p, + struct x86_reg reg ) +{ + DUMP_R( reg ); + assert(reg.mod == mod_REG); + emit_1ub(p, 0x40 + reg.idx); +} + +void x86_dec( struct x86_function *p, + struct x86_reg reg ) +{ + DUMP_R( reg ); + assert(reg.mod == mod_REG); + emit_1ub(p, 0x48 + reg.idx); +} + +void x86_ret( struct x86_function *p ) +{ + DUMP(); + assert(p->stack_offset == 0); + emit_1ub(p, 0xc3); +} + +void x86_retw( struct x86_function *p, unsigned short imm ) +{ + DUMP(); + emit_3ub(p, 0xc2, imm & 0xff, (imm >> 8) & 0xff); +} + +void x86_sahf( struct x86_function *p ) +{ + DUMP(); + emit_1ub(p, 0x9e); +} + +void x86_mov( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm( p, 0x8b, 0x89, dst, src ); +} + +void x86_xor( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm( p, 0x33, 0x31, dst, src ); +} + +void x86_cmp( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm( p, 0x3b, 0x39, dst, src ); +} + +void x86_lea( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_1ub(p, 0x8d); + emit_modrm( p, dst, src ); +} + +void x86_test( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_1ub(p, 0x85); + emit_modrm( p, dst, src ); +} + +void x86_add( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm(p, 0x03, 0x01, dst, src ); +} + +/* Calculate EAX * src, results in EDX:EAX. 
+ */ +void x86_mul( struct x86_function *p, + struct x86_reg src ) +{ + DUMP_R( src ); + emit_1ub(p, 0xf7); + emit_modrm_noreg(p, 4, src ); +} + + +void x86_imul( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0xAF); + emit_modrm(p, dst, src); +} + + +void x86_sub( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm(p, 0x2b, 0x29, dst, src ); +} + +void x86_or( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm( p, 0x0b, 0x09, dst, src ); +} + +void x86_and( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_op_modrm( p, 0x23, 0x21, dst, src ); +} + + + +/*********************************************************************** + * SSE instructions + */ + +void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr) +{ + DUMP_R( ptr ); + assert(ptr.mod != mod_REG); + emit_2ub(p, 0x0f, 0x18); + emit_modrm_noreg(p, 0, ptr); +} + +void sse_prefetch0( struct x86_function *p, struct x86_reg ptr) +{ + DUMP_R( ptr ); + assert(ptr.mod != mod_REG); + emit_2ub(p, 0x0f, 0x18); + emit_modrm_noreg(p, 1, ptr); +} + +void sse_prefetch1( struct x86_function *p, struct x86_reg ptr) +{ + DUMP_R( ptr ); + assert(ptr.mod != mod_REG); + emit_2ub(p, 0x0f, 0x18); + emit_modrm_noreg(p, 2, ptr); +} + +void sse_movntps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src) +{ + DUMP_RR( dst, src ); + + assert(dst.mod != mod_REG); + assert(src.mod == mod_REG); + emit_2ub(p, 0x0f, 0x2b); + emit_modrm(p, src, dst); +} + + + + +void sse_movss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, 0xF3, X86_TWOB); + emit_op_modrm( p, 0x10, 0x11, dst, src ); +} + +void sse_movaps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x28, 0x29, dst, src ); +} + +void sse_movups( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x10, 0x11, dst, src ); +} + +void sse_movhps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + assert(dst.mod != mod_REG || src.mod != mod_REG); + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x16, 0x17, dst, src ); /* cf movlhps */ +} + +void sse_movlps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + assert(dst.mod != mod_REG || src.mod != mod_REG); + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x12, 0x13, dst, src ); /* cf movhlps */ +} + +void sse_maxps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x5F); + emit_modrm( p, dst, src ); +} + +void sse_maxss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xF3, X86_TWOB, 0x5F); + emit_modrm( p, dst, src ); +} + +void sse_divss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xF3, X86_TWOB, 0x5E); + emit_modrm( p, dst, src ); +} + +void sse_minps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x5D); + emit_modrm( p, dst, src ); +} + +void sse_subps( struct x86_function *p, + struct x86_reg dst, + struct 
x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x5C); + emit_modrm( p, dst, src ); +} + +void sse_mulps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x59); + emit_modrm( p, dst, src ); +} + +void sse_mulss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xF3, X86_TWOB, 0x59); + emit_modrm( p, dst, src ); +} + +void sse_addps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x58); + emit_modrm( p, dst, src ); +} + +void sse_addss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xF3, X86_TWOB, 0x58); + emit_modrm( p, dst, src ); +} + +void sse_andnps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x55); + emit_modrm( p, dst, src ); +} + +void sse_andps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x54); + emit_modrm( p, dst, src ); +} + +void sse_rsqrtps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x52); + emit_modrm( p, dst, src ); +} + +void sse_rsqrtss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xF3, X86_TWOB, 0x52); + emit_modrm( p, dst, src ); + +} + +void sse_movhlps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + assert(dst.mod == mod_REG && src.mod == mod_REG); + emit_2ub(p, X86_TWOB, 0x12); + emit_modrm( p, dst, src ); +} + +void sse_movlhps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + assert(dst.mod == mod_REG && src.mod == mod_REG); + emit_2ub(p, X86_TWOB, 0x16); + emit_modrm( p, dst, src ); +} + +void sse_orps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x56); + emit_modrm( p, dst, src ); +} + +void sse_xorps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x57); + emit_modrm( p, dst, src ); +} + +void sse_cvtps2pi( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + assert(dst.file == file_MMX && + (src.file == file_XMM || src.mod != mod_REG)); + + p->need_emms = 1; + + emit_2ub(p, X86_TWOB, 0x2d); + emit_modrm( p, dst, src ); +} + +void sse2_cvtdq2ps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x5b); + emit_modrm( p, dst, src ); +} + + +/* Shufps can also be used to implement a reduced swizzle when dest == + * arg0. 
+ */ +void sse_shufps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src, + unsigned char shuf) +{ + DUMP_RRI( dst, src, shuf ); + emit_2ub(p, X86_TWOB, 0xC6); + emit_modrm(p, dst, src); + emit_1ub(p, shuf); +} + +void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub( p, X86_TWOB, 0x15 ); + emit_modrm( p, dst, src ); +} + +void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub( p, X86_TWOB, 0x14 ); + emit_modrm( p, dst, src ); +} + +void sse_cmpps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src, + enum sse_cc cc) +{ + DUMP_RRI( dst, src, cc ); + emit_2ub(p, X86_TWOB, 0xC2); + emit_modrm(p, dst, src); + emit_1ub(p, cc); +} + +void sse_pmovmskb( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, X86_TWOB, 0xD7); + emit_modrm(p, dst, src); +} + +/*********************************************************************** + * SSE2 instructions + */ + +/** + * Perform a reduced swizzle: + */ +void sse2_pshufd( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src, + unsigned char shuf) +{ + DUMP_RRI( dst, src, shuf ); + emit_3ub(p, 0x66, X86_TWOB, 0x70); + emit_modrm(p, dst, src); + emit_1ub(p, shuf); +} + +void sse2_cvttps2dq( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub( p, 0xF3, X86_TWOB, 0x5B ); + emit_modrm( p, dst, src ); +} + +void sse2_cvtps2dq( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, X86_TWOB, 0x5B); + emit_modrm( p, dst, src ); +} + +void sse2_packssdw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, X86_TWOB, 0x6B); + emit_modrm( p, dst, src ); +} + +void sse2_packsswb( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, X86_TWOB, 0x63); + emit_modrm( p, dst, src ); +} + +void sse2_packuswb( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, X86_TWOB, 0x67); + emit_modrm( p, dst, src ); +} + +void sse2_punpcklbw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0x66, X86_TWOB, 0x60); + emit_modrm( p, dst, src ); +} + + +void sse2_rcpps( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, X86_TWOB, 0x53); + emit_modrm( p, dst, src ); +} + +void sse2_rcpss( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_3ub(p, 0xF3, X86_TWOB, 0x53); + emit_modrm( p, dst, src ); +} + +void sse2_movd( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + emit_2ub(p, 0x66, X86_TWOB); + emit_op_modrm( p, 0x6e, 0x7e, dst, src ); +} + + + + +/*********************************************************************** + * x87 instructions + */ +static void note_x87_pop( struct x86_function *p ) +{ + p->x87_stack--; + assert(p->x87_stack >= 0); +} + +static void note_x87_push( struct x86_function *p ) +{ + p->x87_stack++; + assert(p->x87_stack <= 7); +} + +void x87_assert_stack_empty( struct x86_function *p ) +{ + assert (p->x87_stack == 0); +} + + +void x87_fist( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + emit_1ub(p, 
0xdb); + emit_modrm_noreg(p, 2, dst); +} + +void x87_fistp( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + emit_1ub(p, 0xdb); + emit_modrm_noreg(p, 3, dst); + note_x87_pop(p); +} + +void x87_fild( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + emit_1ub(p, 0xdf); + emit_modrm_noreg(p, 0, arg); + note_x87_push(p); +} + +void x87_fldz( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xee); + note_x87_push(p); +} + + +void x87_fldcw( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + assert(arg.file == file_REG32); + assert(arg.mod != mod_REG); + emit_1ub(p, 0xd9); + emit_modrm_noreg(p, 5, arg); +} + +void x87_fld1( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xe8); + note_x87_push(p); +} + +void x87_fldl2e( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xea); + note_x87_push(p); +} + +void x87_fldln2( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xed); + note_x87_push(p); +} + +void x87_fwait( struct x86_function *p ) +{ + DUMP(); + emit_1ub(p, 0x9b); +} + +void x87_fnclex( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xdb, 0xe2); +} + +void x87_fclex( struct x86_function *p ) +{ + x87_fwait(p); + x87_fnclex(p); +} + +void x87_fcmovb( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + assert(arg.file == file_x87); + emit_2ub(p, 0xda, 0xc0+arg.idx); +} + +void x87_fcmove( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + assert(arg.file == file_x87); + emit_2ub(p, 0xda, 0xc8+arg.idx); +} + +void x87_fcmovbe( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + assert(arg.file == file_x87); + emit_2ub(p, 0xda, 0xd0+arg.idx); +} + +void x87_fcmovnb( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + assert(arg.file == file_x87); + emit_2ub(p, 0xdb, 0xc0+arg.idx); +} + +void x87_fcmovne( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + assert(arg.file == file_x87); + emit_2ub(p, 0xdb, 0xc8+arg.idx); +} + +void x87_fcmovnbe( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + assert(arg.file == file_x87); + emit_2ub(p, 0xdb, 0xd0+arg.idx); +} + + + +static void x87_arith_op( struct x86_function *p, struct x86_reg dst, struct x86_reg arg, + unsigned char dst0ub0, + unsigned char dst0ub1, + unsigned char arg0ub0, + unsigned char arg0ub1, + unsigned char argmem_noreg) +{ + assert(dst.file == file_x87); + + if (arg.file == file_x87) { + if (dst.idx == 0) + emit_2ub(p, dst0ub0, dst0ub1+arg.idx); + else if (arg.idx == 0) + emit_2ub(p, arg0ub0, arg0ub1+arg.idx); + else + assert(0); + } + else if (dst.idx == 0) { + assert(arg.file == file_REG32); + emit_1ub(p, 0xd8); + emit_modrm_noreg(p, argmem_noreg, arg); + } + else + assert(0); +} + +void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + x87_arith_op(p, dst, src, + 0xd8, 0xc8, + 0xdc, 0xc8, + 4); +} + +void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + x87_arith_op(p, dst, src, + 0xd8, 0xe0, + 0xdc, 0xe8, + 4); +} + +void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + x87_arith_op(p, dst, src, + 0xd8, 0xe8, + 0xdc, 0xe0, + 5); +} + +void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + x87_arith_op(p, dst, src, + 0xd8, 0xc0, + 0xdc, 0xc0, + 0); +} + +void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) 
+{ + DUMP_RR( dst, src ); + x87_arith_op(p, dst, src, + 0xd8, 0xf0, + 0xdc, 0xf8, + 6); +} + +void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg src ) +{ + DUMP_RR( dst, src ); + x87_arith_op(p, dst, src, + 0xd8, 0xf8, + 0xdc, 0xf0, + 7); +} + +void x87_fmulp( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + assert(dst.file == file_x87); + assert(dst.idx >= 1); + emit_2ub(p, 0xde, 0xc8+dst.idx); + note_x87_pop(p); +} + +void x87_fsubp( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + assert(dst.file == file_x87); + assert(dst.idx >= 1); + emit_2ub(p, 0xde, 0xe8+dst.idx); + note_x87_pop(p); +} + +void x87_fsubrp( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + assert(dst.file == file_x87); + assert(dst.idx >= 1); + emit_2ub(p, 0xde, 0xe0+dst.idx); + note_x87_pop(p); +} + +void x87_faddp( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + assert(dst.file == file_x87); + assert(dst.idx >= 1); + emit_2ub(p, 0xde, 0xc0+dst.idx); + note_x87_pop(p); +} + +void x87_fdivp( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + assert(dst.file == file_x87); + assert(dst.idx >= 1); + emit_2ub(p, 0xde, 0xf8+dst.idx); + note_x87_pop(p); +} + +void x87_fdivrp( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + assert(dst.file == file_x87); + assert(dst.idx >= 1); + emit_2ub(p, 0xde, 0xf0+dst.idx); + note_x87_pop(p); +} + +void x87_ftst( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xe4); +} + +void x87_fucom( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + assert(arg.file == file_x87); + emit_2ub(p, 0xdd, 0xe0+arg.idx); +} + +void x87_fucomp( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + assert(arg.file == file_x87); + emit_2ub(p, 0xdd, 0xe8+arg.idx); + note_x87_pop(p); +} + +void x87_fucompp( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xda, 0xe9); + note_x87_pop(p); /* pop twice */ + note_x87_pop(p); /* pop twice */ +} + +void x87_fxch( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + assert(arg.file == file_x87); + emit_2ub(p, 0xd9, 0xc8+arg.idx); +} + +void x87_fabs( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xe1); +} + +void x87_fchs( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xe0); +} + +void x87_fcos( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xff); +} + + +void x87_fprndint( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xfc); +} + +void x87_fscale( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xfd); +} + +void x87_fsin( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xfe); +} + +void x87_fsincos( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xfb); +} + +void x87_fsqrt( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xfa); +} + +void x87_fxtract( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xf4); +} + +/* st0 = (2^st0)-1 + * + * Restrictions: -1.0 <= st0 <= 1.0 + */ +void x87_f2xm1( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xf0); +} + +/* st1 = st1 * log2(st0); + * pop_stack; + */ +void x87_fyl2x( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xf1); + note_x87_pop(p); +} + +/* st1 = st1 * log2(st0 + 1.0); + * pop_stack; + * + * A fast operation, with restrictions: -.29 < st0 < .29 + */ +void x87_fyl2xp1( struct x86_function *p ) +{ + DUMP(); + emit_2ub(p, 0xd9, 0xf9); + note_x87_pop(p); +} + + +void x87_fld( struct x86_function *p, struct x86_reg 
arg ) +{ + DUMP_R( arg ); + if (arg.file == file_x87) + emit_2ub(p, 0xd9, 0xc0 + arg.idx); + else { + emit_1ub(p, 0xd9); + emit_modrm_noreg(p, 0, arg); + } + note_x87_push(p); +} + +void x87_fst( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + if (dst.file == file_x87) + emit_2ub(p, 0xdd, 0xd0 + dst.idx); + else { + emit_1ub(p, 0xd9); + emit_modrm_noreg(p, 2, dst); + } +} + +void x87_fstp( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + if (dst.file == file_x87) + emit_2ub(p, 0xdd, 0xd8 + dst.idx); + else { + emit_1ub(p, 0xd9); + emit_modrm_noreg(p, 3, dst); + } + note_x87_pop(p); +} + +void x87_fpop( struct x86_function *p ) +{ + x87_fstp( p, x86_make_reg( file_x87, 0 )); +} + + +void x87_fcom( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + if (dst.file == file_x87) + emit_2ub(p, 0xd8, 0xd0 + dst.idx); + else { + emit_1ub(p, 0xd8); + emit_modrm_noreg(p, 2, dst); + } +} + + +void x87_fcomp( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + if (dst.file == file_x87) + emit_2ub(p, 0xd8, 0xd8 + dst.idx); + else { + emit_1ub(p, 0xd8); + emit_modrm_noreg(p, 3, dst); + } + note_x87_pop(p); +} + +void x87_fcomi( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + emit_2ub(p, 0xdb, 0xf0+arg.idx); +} + +void x87_fcomip( struct x86_function *p, struct x86_reg arg ) +{ + DUMP_R( arg ); + emit_2ub(p, 0xdb, 0xf0+arg.idx); + note_x87_pop(p); +} + + +void x87_fnstsw( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + assert(dst.file == file_REG32); + + if (dst.idx == reg_AX && + dst.mod == mod_REG) + emit_2ub(p, 0xdf, 0xe0); + else { + emit_1ub(p, 0xdd); + emit_modrm_noreg(p, 7, dst); + } +} + + +void x87_fnstcw( struct x86_function *p, struct x86_reg dst ) +{ + DUMP_R( dst ); + assert(dst.file == file_REG32); + + emit_1ub(p, 0x9b); /* WAIT -- needed? 
*/ + emit_1ub(p, 0xd9); + emit_modrm_noreg(p, 7, dst); +} + + + + +/*********************************************************************** + * MMX instructions + */ + +void mmx_emms( struct x86_function *p ) +{ + DUMP(); + assert(p->need_emms); + emit_2ub(p, 0x0f, 0x77); + p->need_emms = 0; +} + +void mmx_packssdw( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + assert(dst.file == file_MMX && + (src.file == file_MMX || src.mod != mod_REG)); + + p->need_emms = 1; + + emit_2ub(p, X86_TWOB, 0x6b); + emit_modrm( p, dst, src ); +} + +void mmx_packuswb( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + assert(dst.file == file_MMX && + (src.file == file_MMX || src.mod != mod_REG)); + + p->need_emms = 1; + + emit_2ub(p, X86_TWOB, 0x67); + emit_modrm( p, dst, src ); +} + +void mmx_movd( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + p->need_emms = 1; + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x6e, 0x7e, dst, src ); +} + +void mmx_movq( struct x86_function *p, + struct x86_reg dst, + struct x86_reg src ) +{ + DUMP_RR( dst, src ); + p->need_emms = 1; + emit_1ub(p, X86_TWOB); + emit_op_modrm( p, 0x6f, 0x7f, dst, src ); +} + + +/*********************************************************************** + * Helper functions + */ + + +void x86_cdecl_caller_push_regs( struct x86_function *p ) +{ + x86_push(p, x86_make_reg(file_REG32, reg_AX)); + x86_push(p, x86_make_reg(file_REG32, reg_CX)); + x86_push(p, x86_make_reg(file_REG32, reg_DX)); +} + +void x86_cdecl_caller_pop_regs( struct x86_function *p ) +{ + x86_pop(p, x86_make_reg(file_REG32, reg_DX)); + x86_pop(p, x86_make_reg(file_REG32, reg_CX)); + x86_pop(p, x86_make_reg(file_REG32, reg_AX)); +} + + +/* Retreive a reference to one of the function arguments, taking into + * account any push/pop activity: + */ +struct x86_reg x86_fn_arg( struct x86_function *p, + unsigned arg ) +{ + return x86_make_disp(x86_make_reg(file_REG32, reg_SP), + p->stack_offset + arg * 4); /* ??? */ +} + + +void x86_init_func( struct x86_function *p ) +{ + p->size = 0; + p->store = NULL; + p->csr = p->store; + DUMP_START(); +} + +void x86_init_func_size( struct x86_function *p, unsigned code_size ) +{ + p->size = code_size; + p->store = rtasm_exec_malloc(code_size); + if (p->store == NULL) { + p->store = p->error_overflow; + } + p->csr = p->store; + DUMP_START(); +} + +void x86_release_func( struct x86_function *p ) +{ + if (p->store && p->store != p->error_overflow) + rtasm_exec_free(p->store); + + p->store = NULL; + p->csr = NULL; + p->size = 0; +} + + +void (*x86_get_func( struct x86_function *p ))(void) +{ + DUMP_END(); + if (DISASSEM && p->store) + debug_printf("disassemble %p %p\n", p->store, p->csr); + + if (p->store == p->error_overflow) + return (void (*)(void)) NULL; + else + return (void (*)(void)) p->store; +} + +#else + +void x86sse_dummy( void ) +{ +} + +#endif diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h new file mode 100644 index 0000000000..1b5eaaca85 --- /dev/null +++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h @@ -0,0 +1,319 @@ +/************************************************************************** + * + * Copyright (C) 1999-2005 Brian Paul All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef _RTASM_X86SSE_H_ +#define _RTASM_X86SSE_H_ + +#include "pipe/p_config.h" + +#if defined(PIPE_ARCH_X86) + +/* It is up to the caller to ensure that instructions issued are + * suitable for the host cpu. There are no checks made in this module + * for mmx/sse/sse2 support on the cpu. + */ +struct x86_reg { + unsigned file:3; + unsigned idx:3; + unsigned mod:2; /* mod_REG if this is just a register */ + int disp:24; /* only +/- 23bits of offset - should be enough... */ +}; + +struct x86_function { + unsigned size; + unsigned char *store; + unsigned char *csr; + + unsigned stack_offset:16; + unsigned need_emms:8; + int x87_stack:8; + + unsigned char error_overflow[4]; +}; + +enum x86_reg_file { + file_REG32, + file_MMX, + file_XMM, + file_x87 +}; + +/* Values for mod field of modr/m byte + */ +enum x86_reg_mod { + mod_INDIRECT, + mod_DISP8, + mod_DISP32, + mod_REG +}; + +enum x86_reg_name { + reg_AX, + reg_CX, + reg_DX, + reg_BX, + reg_SP, + reg_BP, + reg_SI, + reg_DI +}; + + +enum x86_cc { + cc_O, /* overflow */ + cc_NO, /* not overflow */ + cc_NAE, /* not above or equal / carry */ + cc_AE, /* above or equal / not carry */ + cc_E, /* equal / zero */ + cc_NE /* not equal / not zero */ +}; + +enum sse_cc { + cc_Equal, + cc_LessThan, + cc_LessThanEqual, + cc_Unordered, + cc_NotEqual, + cc_NotLessThan, + cc_NotLessThanEqual, + cc_Ordered +}; + +#define cc_Z cc_E +#define cc_NZ cc_NE + +/* Begin/end/retreive function creation: + */ + + +void x86_init_func( struct x86_function *p ); +void x86_init_func_size( struct x86_function *p, unsigned code_size ); +void x86_release_func( struct x86_function *p ); +void (*x86_get_func( struct x86_function *p ))( void ); + +/* Debugging: + */ +void x86_print_reg( struct x86_reg reg ); + + +/* Create and manipulate registers and regmem values: + */ +struct x86_reg x86_make_reg( enum x86_reg_file file, + enum x86_reg_name idx ); + +struct x86_reg x86_make_disp( struct x86_reg reg, + int disp ); + +struct x86_reg x86_deref( struct x86_reg reg ); + +struct x86_reg x86_get_base_reg( struct x86_reg reg ); + + +/* Labels, jumps and fixup: + */ +int x86_get_label( struct x86_function *p ); + +void x86_jcc( struct x86_function *p, + enum x86_cc cc, + int label ); + +int x86_jcc_forward( struct x86_function *p, + enum x86_cc cc ); + +int x86_jmp_forward( struct x86_function *p); + +int x86_call_forward( struct x86_function *p); + 
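+/* Forward branches are emitted with a 32bit displacement placeholder
+ * and return the code offset just past that placeholder.  Typical
+ * usage (illustrative sketch):
+ *
+ *    int fixup = x86_jcc_forward( p, cc_E );
+ *    ... emit the code that is skipped when the condition holds ...
+ *    x86_fixup_fwd_jump( p, fixup );
+ *
+ * x86_fixup_fwd_jump() rewrites the placeholder so the branch lands
+ * at the current end of the code store.
+ */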
+void x86_fixup_fwd_jump( struct x86_function *p, + int fixup ); + +void x86_jmp( struct x86_function *p, int label ); + +/* void x86_call( struct x86_function *p, void (*label)() ); */ +void x86_call( struct x86_function *p, struct x86_reg reg); + +void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm ); +void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm ); + + +/* Macro for sse_shufps() and sse2_pshufd(): + */ +#define SHUF(_x,_y,_z,_w) (((_x)<<0) | ((_y)<<2) | ((_z)<<4) | ((_w)<<6)) +#define SHUF_NOOP RSW(0,1,2,3) +#define GET_SHUF(swz, idx) (((swz) >> ((idx)*2)) & 0x3) + +void mmx_emms( struct x86_function *p ); +void mmx_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void mmx_movq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void mmx_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_cvtdq2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); +void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + + +void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr); +void sse_prefetch0( struct x86_function *p, struct x86_reg ptr); +void sse_prefetch1( struct x86_function *p, struct x86_reg ptr); + +void sse_movntps( struct x86_function *p, struct x86_reg dst, struct x86_reg src); + +void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_divss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_andnps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_andps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_cmpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src, + enum sse_cc cc ); +void sse_maxps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_maxss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_minps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movaps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movhlps( struct x86_function *p, struct x86_reg dst, struct 
x86_reg src ); +void sse_movhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movlhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movlps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_movups( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_mulps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_mulss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_orps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_xorps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_subps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_rsqrtps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0, + unsigned char shuf ); +void sse_unpckhps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_unpcklps( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src ); +void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); + +void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_and( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_cmp( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_dec( struct x86_function *p, struct x86_reg reg ); +void x86_inc( struct x86_function *p, struct x86_reg reg ); +void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_mul( struct x86_function *p, struct x86_reg src ); +void x86_imul( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_pop( struct x86_function *p, struct x86_reg reg ); +void x86_push( struct x86_function *p, struct x86_reg reg ); +void x86_push_imm32( struct x86_function *p, int imm ); +void x86_ret( struct x86_function *p ); +void x86_retw( struct x86_function *p, unsigned short imm ); +void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src ); +void x86_sahf( struct x86_function *p ); + + +void x86_cdecl_caller_push_regs( struct x86_function *p ); +void x86_cdecl_caller_pop_regs( struct x86_function *p ); + +void x87_assert_stack_empty( struct x86_function *p ); + +void x87_f2xm1( struct x86_function *p ); +void x87_fabs( struct x86_function *p ); +void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_faddp( struct x86_function *p, struct x86_reg dst ); +void x87_fchs( struct x86_function *p ); +void x87_fclex( struct x86_function *p ); +void x87_fcmovb( struct x86_function *p, struct x86_reg src ); +void x87_fcmovbe( struct x86_function *p, struct x86_reg src ); +void x87_fcmove( struct x86_function *p, struct x86_reg src ); +void x87_fcmovnb( struct x86_function *p, struct x86_reg src ); 
+void x87_fcmovnbe( struct x86_function *p, struct x86_reg src ); +void x87_fcmovne( struct x86_function *p, struct x86_reg src ); +void x87_fcom( struct x86_function *p, struct x86_reg dst ); +void x87_fcomi( struct x86_function *p, struct x86_reg dst ); +void x87_fcomip( struct x86_function *p, struct x86_reg dst ); +void x87_fcomp( struct x86_function *p, struct x86_reg dst ); +void x87_fcos( struct x86_function *p ); +void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_fdivp( struct x86_function *p, struct x86_reg dst ); +void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_fdivrp( struct x86_function *p, struct x86_reg dst ); +void x87_fild( struct x86_function *p, struct x86_reg arg ); +void x87_fist( struct x86_function *p, struct x86_reg dst ); +void x87_fistp( struct x86_function *p, struct x86_reg dst ); +void x87_fld( struct x86_function *p, struct x86_reg arg ); +void x87_fld1( struct x86_function *p ); +void x87_fldcw( struct x86_function *p, struct x86_reg arg ); +void x87_fldl2e( struct x86_function *p ); +void x87_fldln2( struct x86_function *p ); +void x87_fldz( struct x86_function *p ); +void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_fmulp( struct x86_function *p, struct x86_reg dst ); +void x87_fnclex( struct x86_function *p ); +void x87_fprndint( struct x86_function *p ); +void x87_fpop( struct x86_function *p ); +void x87_fscale( struct x86_function *p ); +void x87_fsin( struct x86_function *p ); +void x87_fsincos( struct x86_function *p ); +void x87_fsqrt( struct x86_function *p ); +void x87_fst( struct x86_function *p, struct x86_reg dst ); +void x87_fstp( struct x86_function *p, struct x86_reg dst ); +void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_fsubp( struct x86_function *p, struct x86_reg dst ); +void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg ); +void x87_fsubrp( struct x86_function *p, struct x86_reg dst ); +void x87_ftst( struct x86_function *p ); +void x87_fxch( struct x86_function *p, struct x86_reg dst ); +void x87_fxtract( struct x86_function *p ); +void x87_fyl2x( struct x86_function *p ); +void x87_fyl2xp1( struct x86_function *p ); +void x87_fwait( struct x86_function *p ); +void x87_fnstcw( struct x86_function *p, struct x86_reg dst ); +void x87_fnstsw( struct x86_function *p, struct x86_reg dst ); +void x87_fucompp( struct x86_function *p ); +void x87_fucomp( struct x86_function *p, struct x86_reg arg ); +void x87_fucom( struct x86_function *p, struct x86_reg arg ); + + + +/* Retreive a reference to one of the function arguments, taking into + * account any push/pop activity. Note - doesn't track explict + * manipulation of ESP by other instructions. + */ +struct x86_reg x86_fn_arg( struct x86_function *p, unsigned arg ); + +#endif +#endif diff --git a/src/gallium/auxiliary/sct/Makefile b/src/gallium/auxiliary/sct/Makefile new file mode 100644 index 0000000000..516d1756cf --- /dev/null +++ b/src/gallium/auxiliary/sct/Makefile @@ -0,0 +1,12 @@ +TOP = ../../../.. 
+include $(TOP)/configs/current + +LIBNAME = sct + +C_SOURCES = \ + sct.c + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/auxiliary/sct/SConscript b/src/gallium/auxiliary/sct/SConscript new file mode 100644 index 0000000000..76927d973f --- /dev/null +++ b/src/gallium/auxiliary/sct/SConscript @@ -0,0 +1,9 @@ +Import('*') + +sct = env.ConvenienceLibrary( + target = 'sct', + source = [ + 'sct.c' + ]) + +auxiliaries.insert(0, sct) diff --git a/src/gallium/auxiliary/sct/sct.c b/src/gallium/auxiliary/sct/sct.c new file mode 100644 index 0000000000..49bb7ea92e --- /dev/null +++ b/src/gallium/auxiliary/sct/sct.c @@ -0,0 +1,454 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "util/u_memory.h" +#include "pipe/p_state.h" +#include "pipe/p_inlines.h" +#include "sct.h" + + +struct texture_list +{ + struct pipe_texture *texture; + struct texture_list *next; +}; + + + +#define MAX_SURFACES ((PIPE_MAX_COLOR_BUFS) + 1) + +struct sct_context +{ + const struct pipe_context *context; + + /** surfaces the context is drawing into */ + struct pipe_surface *surfaces[MAX_SURFACES]; + + /** currently bound textures */ + struct pipe_texture *textures[PIPE_MAX_SAMPLERS]; + + /** previously bound textures, used but not flushed */ + struct texture_list *textures_used; + + boolean needs_flush; + + struct sct_context *next; +}; + + + +struct sct_surface +{ + const struct pipe_surface *surface; + + /** list of contexts drawing to this surface */ + struct sct_context_list *contexts; + + struct sct_surface *next; +}; + + + +/** + * Find the surface_info for the given pipe_surface + */ +static struct sct_surface * +find_surface_info(struct surface_context_tracker *sct, + const struct pipe_surface *surface) +{ + struct sct_surface *si; + for (si = sct->surfaces; si; si = si->next) + if (si->surface == surface) + return si; + return NULL; +} + + +/** + * As above, but create new surface_info if surface is new. 
+ */ +static struct sct_surface * +find_create_surface_info(struct surface_context_tracker *sct, + const struct pipe_surface *surface) +{ + struct sct_surface *si = find_surface_info(sct, surface); + if (si) + return si; + + /* alloc new */ + si = CALLOC_STRUCT(sct_surface); + if (si) { + si->surface = surface; + + /* insert at head */ + si->next = sct->surfaces; + sct->surfaces = si; + } + + return si; +} + + +/** + * Find a context_info for the given context. + */ +static struct sct_context * +find_context_info(struct surface_context_tracker *sct, + const struct pipe_context *context) +{ + struct sct_context *ci; + for (ci = sct->contexts; ci; ci = ci->next) + if (ci->context == context) + return ci; + return NULL; +} + + +/** + * As above, but create new context_info if context is new. + */ +static struct sct_context * +find_create_context_info(struct surface_context_tracker *sct, + const struct pipe_context *context) +{ + struct sct_context *ci = find_context_info(sct, context); + if (ci) + return ci; + + /* alloc new */ + ci = CALLOC_STRUCT(sct_context); + if (ci) { + ci->context = context; + + /* insert at head */ + ci->next = sct->contexts; + sct->contexts = ci; + } + + return ci; +} + + +/** + * Is the context already bound to the surface? + */ +static boolean +find_surface_context(const struct sct_surface *si, + const struct pipe_context *context) +{ + const struct sct_context_list *cl; + for (cl = si->contexts; cl; cl = cl->next) { + if (cl->context == context) { + return TRUE; + } + } + return FALSE; +} + + +/** + * Add a context to the list of contexts associated with a surface. + */ +static void +add_context_to_surface(struct sct_surface *si, + const struct pipe_context *context) +{ + struct sct_context_list *cl = CALLOC_STRUCT(sct_context_list); + if (cl) { + cl->context = context; + /* insert at head of list of contexts */ + cl->next = si->contexts; + si->contexts = cl; + } +} + + +/** + * Remove a context from the list of contexts associated with a surface. + */ +static void +remove_context_from_surface(struct sct_surface *si, + const struct pipe_context *context) +{ + struct sct_context_list *prev = NULL, *curr, *next; + + for (curr = si->contexts; curr; curr = next) { + if (curr->context == context) { + /* remove */ + if (prev) + prev->next = curr->next; + else + si->contexts = curr->next; + next = curr->next; + FREE(curr); + } + else { + prev = curr; + next = curr->next; + } + } +} + + +/** + * Unbind context from surface. + */ +static void +unbind_context_surface(struct surface_context_tracker *sct, + struct pipe_context *context, + struct pipe_surface *surface) +{ + struct sct_surface *si = find_surface_info(sct, surface); + if (si) { + remove_context_from_surface(si, context); + } +} + + +/** + * Bind context to a set of surfaces (color + Z). + * Like MakeCurrent(). 
+ */ +void +sct_bind_surfaces(struct surface_context_tracker *sct, + struct pipe_context *context, + uint num_surf, + struct pipe_surface **surfaces) +{ + struct sct_context *ci = find_create_context_info(sct, context); + uint i; + + if (!ci) { + return; /* out of memory */ + } + + /* unbind currently bound surfaces */ + for (i = 0; i < MAX_SURFACES; i++) { + if (ci->surfaces[i]) { + unbind_context_surface(sct, context, ci->surfaces[i]); + } + } + + /* bind new surfaces */ + for (i = 0; i < num_surf; i++) { + struct sct_surface *si = find_create_surface_info(sct, surfaces[i]); + if (!find_surface_context(si, context)) { + add_context_to_surface(si, context); + } + } +} + + +/** + * Return list of contexts bound to a surface. + */ +const struct sct_context_list * +sct_get_surface_contexts(struct surface_context_tracker *sct, + const struct pipe_surface *surface) +{ + const struct sct_surface *si = find_surface_info(sct, surface); + return si->contexts; +} + + + +static boolean +find_texture(const struct sct_context *ci, + const struct pipe_texture *texture) +{ + const struct texture_list *tl; + + for (tl = ci->textures_used; tl; tl = tl->next) { + if (tl->texture == texture) { + return TRUE; + } + } + return FALSE; +} + + +/** + * Add the given texture to the context's list of used textures. + */ +static void +add_texture_used(struct sct_context *ci, + struct pipe_texture *texture) +{ + if (!find_texture(ci, texture)) { + /* add to list */ + struct texture_list *tl = CALLOC_STRUCT(texture_list); + if (tl) { + pipe_texture_reference(&tl->texture, texture); + /* insert at head */ + tl->next = ci->textures_used; + ci->textures_used = tl; + } + } +} + + +/** + * Bind a texture to a rendering context. + */ +void +sct_bind_texture(struct surface_context_tracker *sct, + struct pipe_context *context, + uint unit, + struct pipe_texture *tex) +{ + struct sct_context *ci = find_context_info(sct, context); + + if (ci->textures[unit] != tex) { + /* put texture on the 'used' list */ + add_texture_used(ci, tex); + /* bind new */ + pipe_texture_reference(&ci->textures[unit], tex); + } +} + + +/** + * Check if the given texture has been used by the rendering context + * since the last call to sct_flush_textures(). + */ +boolean +sct_is_texture_used(struct surface_context_tracker *sct, + const struct pipe_context *context, + const struct pipe_texture *texture) +{ + const struct sct_context *ci = find_context_info(sct, context); + return find_texture(ci, texture); +} + + +/** + * To be called when the image contents of a texture are changed, such + * as for gl[Copy]TexSubImage(). + * XXX this may not be needed + */ +void +sct_update_texture(struct pipe_texture *tex) +{ + +} + + +/** + * When a scene is flushed/rendered we can release the list of + * used textures. + */ +void +sct_flush_textures(struct surface_context_tracker *sct, + struct pipe_context *context) +{ + struct sct_context *ci = find_context_info(sct, context); + struct texture_list *tl, *next; + uint i; + + for (tl = ci->textures_used; tl; tl = next) { + next = tl->next; + pipe_texture_release(&tl->texture); + FREE(tl); + } + ci->textures_used = NULL; + + /* put the currently bound textures on the 'used' list */ + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { + add_texture_used(ci, ci->textures[i]); + } +} + + + +void +sct_destroy_context(struct surface_context_tracker *sct, + struct pipe_context *context) +{ + /* XXX should we require an unbinding first? 
*/ + { + struct sct_surface *si; + for (si = sct->surfaces; si; si = si->next) { + remove_context_from_surface(si, context); + } + } + + /* remove context from context_info list */ + { + struct sct_context *ci, *next, *prev = NULL; + for (ci = sct->contexts; ci; ci = next) { + next = ci->next; + if (ci->context == context) { + if (prev) + prev->next = ci->next; + else + sct->contexts = ci->next; + FREE(ci); + } + else { + prev = ci; + } + } + } + +} + + +void +sct_destroy_surface(struct surface_context_tracker *sct, + struct pipe_surface *surface) +{ + if (1) { + /* debug/sanity: no context should be bound to surface */ + struct sct_context *ci; + uint i; + for (ci = sct->contexts; ci; ci = ci->next) { + for (i = 0; i < MAX_SURFACES; i++) { + assert(ci->surfaces[i] != surface); + } + } + } + + /* remove surface from sct_surface list */ + { + struct sct_surface *si, *next, *prev = NULL; + for (si = sct->surfaces; si; si = next) { + next = si->next; + if (si->surface == surface) { + /* unlink */ + if (prev) + prev->next = si->next; + else + sct->surfaces = si->next; + FREE(si); + } + else { + prev = si; + } + } + } +} diff --git a/src/gallium/auxiliary/sct/sct.h b/src/gallium/auxiliary/sct/sct.h new file mode 100644 index 0000000000..cf7c4d3bdf --- /dev/null +++ b/src/gallium/auxiliary/sct/sct.h @@ -0,0 +1,123 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Surface/Context Tracking + * + * For some drivers, we need to monitor the binding between contexts and + * surfaces/textures. + * This code may evolve quite a bit... 
+ */ + + +#ifndef SCT_H +#define SCT_H + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct pipe_context; +struct pipe_surface; + +struct sct_context; +struct sct_surface; + + +/** + * Per-device info, basically + */ +struct surface_context_tracker +{ + struct sct_context *contexts; + struct sct_surface *surfaces; +}; + + + +/** + * Simple linked list of contexts + */ +struct sct_context_list +{ + const struct pipe_context *context; + struct sct_context_list *next; +}; + + + +extern void +sct_bind_surfaces(struct surface_context_tracker *sct, + struct pipe_context *context, + uint num_surf, + struct pipe_surface **surfaces); + + +extern void +sct_bind_texture(struct surface_context_tracker *sct, + struct pipe_context *context, + uint unit, + struct pipe_texture *texture); + + +extern void +sct_update_texture(struct pipe_texture *tex); + + +extern boolean +sct_is_texture_used(struct surface_context_tracker *sct, + const struct pipe_context *context, + const struct pipe_texture *texture); + +extern void +sct_flush_textures(struct surface_context_tracker *sct, + struct pipe_context *context); + + +extern const struct sct_context_list * +sct_get_surface_contexts(struct surface_context_tracker *sct, + const struct pipe_surface *surf); + + +extern void +sct_destroy_context(struct surface_context_tracker *sct, + struct pipe_context *context); + + +extern void +sct_destroy_surface(struct surface_context_tracker *sct, + struct pipe_surface *surface); + + + +#ifdef __cplusplus +} +#endif + +#endif /* SCT_H */ diff --git a/src/gallium/auxiliary/sct/usage.c b/src/gallium/auxiliary/sct/usage.c new file mode 100644 index 0000000000..6227f19962 --- /dev/null +++ b/src/gallium/auxiliary/sct/usage.c @@ -0,0 +1,61 @@ +/* surface / context tracking */ + + +/* + +context A: + render to texture T + +context B: + texture from T + +----------------------- + +flush surface: + which contexts are bound to the surface? + +----------------------- + +glTexSubImage(): + which contexts need to be flushed? + + */ + + +/* + +in MakeCurrent(): + + call sct_bind_surfaces(context, list of surfaces) to update the + dependencies between context and surfaces + + +in SurfaceFlush(), or whatever it is in D3D: + + call sct_get_surface_contexts(surface) to get a list of contexts + which are currently bound to the surface. + + + +in BindTexture(): + + call sct_bind_texture(context, texture) to indicate that the texture + is used in the scene. + + +in glTexSubImage() or RenderToTexture(): + + call sct_is_texture_used(context, texture) to determine if the texture + has been used in the scene, but the scene's not flushed. If TRUE is + returned it means the scene has to be rendered/flushed before the contents + of the texture can be changed. + + +in psb_scene_flush/terminate(): + + call sct_flush_textures(context) to tell the SCT that the textures which + were used in the scene can be released. + + + +*/ diff --git a/src/gallium/auxiliary/tgsi/Makefile b/src/gallium/auxiliary/tgsi/Makefile new file mode 100644 index 0000000000..d7df9490cf --- /dev/null +++ b/src/gallium/auxiliary/tgsi/Makefile @@ -0,0 +1,24 @@ +TOP = ../../../.. 
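The usage notes in usage.c above describe the intended call pattern only in prose; the following is a minimal, hypothetical sketch of how a driver might wire those calls up. Only the sct_*() functions and types come from sct.h as added in this change; the example_* wrappers, the tracker/context/surface pointers, and the assumption of exactly two bound surfaces (color + Z) are illustrative placeholders, not part of the patch.

/*
 * Hypothetical driver glue for the surface/context tracker.
 * Only the sct_*() calls are real; everything else is a placeholder.
 */
static void
example_make_current(struct surface_context_tracker *tracker,
                     struct pipe_context *ctx,
                     struct pipe_surface *color,
                     struct pipe_surface *zstencil)
{
   struct pipe_surface *surfs[2];
   surfs[0] = color;
   surfs[1] = zstencil;
   /* like MakeCurrent(): record which surfaces this context renders to */
   sct_bind_surfaces(tracker, ctx, 2, surfs);
}

static void
example_tex_sub_image(struct surface_context_tracker *tracker,
                      struct pipe_context *ctx,
                      struct pipe_texture *tex)
{
   /* if the texture was referenced in the not-yet-flushed scene,
    * the scene must be rendered/flushed before changing the texture
    */
   if (sct_is_texture_used(tracker, ctx, tex)) {
      /* driver-specific scene flush would go here */
      sct_flush_textures(tracker, ctx);
   }
   /* now it is safe to overwrite the texture contents */
}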
+include $(TOP)/configs/current + +LIBNAME = tgsi + +C_SOURCES = \ + tgsi_sanity.c \ + tgsi_build.c \ + tgsi_dump.c \ + tgsi_exec.c \ + tgsi_info.c \ + tgsi_iterate.c \ + tgsi_parse.c \ + tgsi_ppc.c \ + tgsi_scan.c \ + tgsi_sse2.c \ + tgsi_text.c \ + tgsi_transform.c \ + tgsi_util.c + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/auxiliary/tgsi/SConscript b/src/gallium/auxiliary/tgsi/SConscript new file mode 100644 index 0000000000..8200cce42f --- /dev/null +++ b/src/gallium/auxiliary/tgsi/SConscript @@ -0,0 +1,22 @@ +Import('*') + +tgsi = env.ConvenienceLibrary( + target = 'tgsi', + source = [ + 'tgsi_build.c', + 'tgsi_dump.c', + 'tgsi_dump_c.c', + 'tgsi_exec.c', + 'tgsi_info.c', + 'tgsi_iterate.c', + 'tgsi_parse.c', + 'tgsi_sanity.c', + 'tgsi_scan.c', + 'tgsi_ppc.c', + 'tgsi_sse2.c', + 'tgsi_text.c', + 'tgsi_transform.c', + 'tgsi_util.c', + ]) + +auxiliaries.insert(0, tgsi) diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c new file mode 100644 index 0000000000..fd02c2c87c --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_build.c @@ -0,0 +1,1335 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "pipe/p_debug.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi_build.h" +#include "tgsi_parse.h" + +/* + * version + */ + +struct tgsi_version +tgsi_build_version( void ) +{ + struct tgsi_version version; + + version.MajorVersion = 1; + version.MinorVersion = 1; + version.Padding = 0; + + return version; +} + +/* + * header + */ + +struct tgsi_header +tgsi_build_header( void ) +{ + struct tgsi_header header; + + header.HeaderSize = 1; + header.BodySize = 0; + + return header; +} + +static void +header_headersize_grow( struct tgsi_header *header ) +{ + assert( header->HeaderSize < 0xFF ); + assert( header->BodySize == 0 ); + + header->HeaderSize++; +} + +static void +header_bodysize_grow( struct tgsi_header *header ) +{ + assert( header->BodySize < 0xFFFFFF ); + + header->BodySize++; +} + +struct tgsi_processor +tgsi_default_processor( void ) +{ + struct tgsi_processor processor; + + processor.Processor = TGSI_PROCESSOR_FRAGMENT; + processor.Padding = 0; + + return processor; +} + +struct tgsi_processor +tgsi_build_processor( + unsigned type, + struct tgsi_header *header ) +{ + struct tgsi_processor processor; + + processor = tgsi_default_processor(); + processor.Processor = type; + + header_headersize_grow( header ); + + return processor; +} + +/* + * declaration + */ + +struct tgsi_declaration +tgsi_default_declaration( void ) +{ + struct tgsi_declaration declaration; + + declaration.Type = TGSI_TOKEN_TYPE_DECLARATION; + declaration.Size = 1; + declaration.File = TGSI_FILE_NULL; + declaration.UsageMask = TGSI_WRITEMASK_XYZW; + declaration.Interpolate = TGSI_INTERPOLATE_CONSTANT; + declaration.Semantic = 0; + declaration.Centroid = 0; + declaration.Invariant = 0; + declaration.Padding = 0; + declaration.Extended = 0; + + return declaration; +} + +struct tgsi_declaration +tgsi_build_declaration( + unsigned file, + unsigned usage_mask, + unsigned interpolate, + unsigned semantic, + unsigned centroid, + unsigned invariant, + struct tgsi_header *header ) +{ + struct tgsi_declaration declaration; + + assert( file <= TGSI_FILE_IMMEDIATE ); + assert( interpolate <= TGSI_INTERPOLATE_PERSPECTIVE ); + + declaration = tgsi_default_declaration(); + declaration.File = file; + declaration.UsageMask = usage_mask; + declaration.Interpolate = interpolate; + declaration.Semantic = semantic; + declaration.Centroid = centroid; + declaration.Invariant = invariant; + + header_bodysize_grow( header ); + + return declaration; +} + +static void +declaration_grow( + struct tgsi_declaration *declaration, + struct tgsi_header *header ) +{ + assert( declaration->Size < 0xFF ); + + declaration->Size++; + + header_bodysize_grow( header ); +} + +struct tgsi_full_declaration +tgsi_default_full_declaration( void ) +{ + struct tgsi_full_declaration full_declaration; + + full_declaration.Declaration = tgsi_default_declaration(); + full_declaration.DeclarationRange = tgsi_default_declaration_range(); + full_declaration.Semantic = tgsi_default_declaration_semantic(); + + return full_declaration; +} + +unsigned +tgsi_build_full_declaration( + const struct tgsi_full_declaration *full_decl, + struct tgsi_token *tokens, + struct tgsi_header *header, + unsigned maxsize ) +{ + unsigned size = 0; + struct tgsi_declaration *declaration; + struct tgsi_declaration_range *dr; + + if( maxsize <= size ) + return 0; + declaration = (struct tgsi_declaration *) &tokens[size]; + size++; + + *declaration = tgsi_build_declaration( + 
full_decl->Declaration.File, + full_decl->Declaration.UsageMask, + full_decl->Declaration.Interpolate, + full_decl->Declaration.Semantic, + full_decl->Declaration.Centroid, + full_decl->Declaration.Invariant, + header ); + + if (maxsize <= size) + return 0; + dr = (struct tgsi_declaration_range *) &tokens[size]; + size++; + + *dr = tgsi_build_declaration_range( + full_decl->DeclarationRange.First, + full_decl->DeclarationRange.Last, + declaration, + header ); + + if( full_decl->Declaration.Semantic ) { + struct tgsi_declaration_semantic *ds; + + if( maxsize <= size ) + return 0; + ds = (struct tgsi_declaration_semantic *) &tokens[size]; + size++; + + *ds = tgsi_build_declaration_semantic( + full_decl->Semantic.SemanticName, + full_decl->Semantic.SemanticIndex, + declaration, + header ); + } + + return size; +} + +struct tgsi_declaration_range +tgsi_default_declaration_range( void ) +{ + struct tgsi_declaration_range dr; + + dr.First = 0; + dr.Last = 0; + + return dr; +} + +struct tgsi_declaration_range +tgsi_build_declaration_range( + unsigned first, + unsigned last, + struct tgsi_declaration *declaration, + struct tgsi_header *header ) +{ + struct tgsi_declaration_range declaration_range; + + assert( last >= first ); + assert( last <= 0xFFFF ); + + declaration_range = tgsi_default_declaration_range(); + declaration_range.First = first; + declaration_range.Last = last; + + declaration_grow( declaration, header ); + + return declaration_range; +} + +struct tgsi_declaration_semantic +tgsi_default_declaration_semantic( void ) +{ + struct tgsi_declaration_semantic ds; + + ds.SemanticName = TGSI_SEMANTIC_POSITION; + ds.SemanticIndex = 0; + ds.Padding = 0; + + return ds; +} + +struct tgsi_declaration_semantic +tgsi_build_declaration_semantic( + unsigned semantic_name, + unsigned semantic_index, + struct tgsi_declaration *declaration, + struct tgsi_header *header ) +{ + struct tgsi_declaration_semantic ds; + + assert( semantic_name <= TGSI_SEMANTIC_COUNT ); + assert( semantic_index <= 0xFFFF ); + + ds = tgsi_default_declaration_semantic(); + ds.SemanticName = semantic_name; + ds.SemanticIndex = semantic_index; + + declaration_grow( declaration, header ); + + return ds; +} + +/* + * immediate + */ + +struct tgsi_immediate +tgsi_default_immediate( void ) +{ + struct tgsi_immediate immediate; + + immediate.Type = TGSI_TOKEN_TYPE_IMMEDIATE; + immediate.Size = 1; + immediate.DataType = TGSI_IMM_FLOAT32; + immediate.Padding = 0; + immediate.Extended = 0; + + return immediate; +} + +struct tgsi_immediate +tgsi_build_immediate( + struct tgsi_header *header ) +{ + struct tgsi_immediate immediate; + + immediate = tgsi_default_immediate(); + + header_bodysize_grow( header ); + + return immediate; +} + +struct tgsi_full_immediate +tgsi_default_full_immediate( void ) +{ + struct tgsi_full_immediate fullimm; + + fullimm.Immediate = tgsi_default_immediate(); + fullimm.u.Pointer = (void *) 0; + + return fullimm; +} + +static void +immediate_grow( + struct tgsi_immediate *immediate, + struct tgsi_header *header ) +{ + assert( immediate->Size < 0xFF ); + + immediate->Size++; + + header_bodysize_grow( header ); +} + +struct tgsi_immediate_float32 +tgsi_build_immediate_float32( + float value, + struct tgsi_immediate *immediate, + struct tgsi_header *header ) +{ + struct tgsi_immediate_float32 immediate_float32; + + immediate_float32.Float = value; + + immediate_grow( immediate, header ); + + return immediate_float32; +} + +unsigned +tgsi_build_full_immediate( + const struct tgsi_full_immediate *full_imm, + struct 
tgsi_token *tokens, + struct tgsi_header *header, + unsigned maxsize ) +{ + unsigned size = 0, i; + struct tgsi_immediate *immediate; + + if( maxsize <= size ) + return 0; + immediate = (struct tgsi_immediate *) &tokens[size]; + size++; + + *immediate = tgsi_build_immediate( header ); + + for( i = 0; i < full_imm->Immediate.Size - 1; i++ ) { + struct tgsi_immediate_float32 *if32; + + if( maxsize <= size ) + return 0; + if32 = (struct tgsi_immediate_float32 *) &tokens[size]; + size++; + + *if32 = tgsi_build_immediate_float32( + full_imm->u.ImmediateFloat32[i].Float, + immediate, + header ); + } + + return size; +} + +/* + * instruction + */ + +struct tgsi_instruction +tgsi_default_instruction( void ) +{ + struct tgsi_instruction instruction; + + instruction.Type = TGSI_TOKEN_TYPE_INSTRUCTION; + instruction.Size = 1; + instruction.Opcode = TGSI_OPCODE_MOV; + instruction.Saturate = TGSI_SAT_NONE; + instruction.NumDstRegs = 1; + instruction.NumSrcRegs = 1; + instruction.Padding = 0; + instruction.Extended = 0; + + return instruction; +} + +struct tgsi_instruction +tgsi_build_instruction( + unsigned opcode, + unsigned saturate, + unsigned num_dst_regs, + unsigned num_src_regs, + struct tgsi_header *header ) +{ + struct tgsi_instruction instruction; + + assert (opcode <= TGSI_OPCODE_LAST); + assert (saturate <= TGSI_SAT_MINUS_PLUS_ONE); + assert (num_dst_regs <= 3); + assert (num_src_regs <= 15); + + instruction = tgsi_default_instruction(); + instruction.Opcode = opcode; + instruction.Saturate = saturate; + instruction.NumDstRegs = num_dst_regs; + instruction.NumSrcRegs = num_src_regs; + + header_bodysize_grow( header ); + + return instruction; +} + +static void +instruction_grow( + struct tgsi_instruction *instruction, + struct tgsi_header *header ) +{ + assert (instruction->Size < 0xFF); + + instruction->Size++; + + header_bodysize_grow( header ); +} + +struct tgsi_full_instruction +tgsi_default_full_instruction( void ) +{ + struct tgsi_full_instruction full_instruction; + unsigned i; + + full_instruction.Instruction = tgsi_default_instruction(); + full_instruction.InstructionExtNv = tgsi_default_instruction_ext_nv(); + full_instruction.InstructionExtLabel = tgsi_default_instruction_ext_label(); + full_instruction.InstructionExtTexture = tgsi_default_instruction_ext_texture(); + for( i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++ ) { + full_instruction.FullDstRegisters[i] = tgsi_default_full_dst_register(); + } + for( i = 0; i < TGSI_FULL_MAX_SRC_REGISTERS; i++ ) { + full_instruction.FullSrcRegisters[i] = tgsi_default_full_src_register(); + } + + return full_instruction; +} + +unsigned +tgsi_build_full_instruction( + const struct tgsi_full_instruction *full_inst, + struct tgsi_token *tokens, + struct tgsi_header *header, + unsigned maxsize ) +{ + unsigned size = 0; + unsigned i; + struct tgsi_instruction *instruction; + struct tgsi_token *prev_token; + + if( maxsize <= size ) + return 0; + instruction = (struct tgsi_instruction *) &tokens[size]; + size++; + + *instruction = tgsi_build_instruction( + full_inst->Instruction.Opcode, + full_inst->Instruction.Saturate, + full_inst->Instruction.NumDstRegs, + full_inst->Instruction.NumSrcRegs, + header ); + prev_token = (struct tgsi_token *) instruction; + + if( tgsi_compare_instruction_ext_nv( + full_inst->InstructionExtNv, + tgsi_default_instruction_ext_nv() ) ) { + struct tgsi_instruction_ext_nv *instruction_ext_nv; + + if( maxsize <= size ) + return 0; + instruction_ext_nv = + (struct tgsi_instruction_ext_nv *) &tokens[size]; + size++; + + 
*instruction_ext_nv = tgsi_build_instruction_ext_nv( + full_inst->InstructionExtNv.Precision, + full_inst->InstructionExtNv.CondDstIndex, + full_inst->InstructionExtNv.CondFlowIndex, + full_inst->InstructionExtNv.CondMask, + full_inst->InstructionExtNv.CondSwizzleX, + full_inst->InstructionExtNv.CondSwizzleY, + full_inst->InstructionExtNv.CondSwizzleZ, + full_inst->InstructionExtNv.CondSwizzleW, + full_inst->InstructionExtNv.CondDstUpdate, + full_inst->InstructionExtNv.CondFlowEnable, + prev_token, + instruction, + header ); + prev_token = (struct tgsi_token *) instruction_ext_nv; + } + + if( tgsi_compare_instruction_ext_label( + full_inst->InstructionExtLabel, + tgsi_default_instruction_ext_label() ) ) { + struct tgsi_instruction_ext_label *instruction_ext_label; + + if( maxsize <= size ) + return 0; + instruction_ext_label = + (struct tgsi_instruction_ext_label *) &tokens[size]; + size++; + + *instruction_ext_label = tgsi_build_instruction_ext_label( + full_inst->InstructionExtLabel.Label, + prev_token, + instruction, + header ); + prev_token = (struct tgsi_token *) instruction_ext_label; + } + + if( tgsi_compare_instruction_ext_texture( + full_inst->InstructionExtTexture, + tgsi_default_instruction_ext_texture() ) ) { + struct tgsi_instruction_ext_texture *instruction_ext_texture; + + if( maxsize <= size ) + return 0; + instruction_ext_texture = + (struct tgsi_instruction_ext_texture *) &tokens[size]; + size++; + + *instruction_ext_texture = tgsi_build_instruction_ext_texture( + full_inst->InstructionExtTexture.Texture, + prev_token, + instruction, + header ); + prev_token = (struct tgsi_token *) instruction_ext_texture; + } + + for( i = 0; i < full_inst->Instruction.NumDstRegs; i++ ) { + const struct tgsi_full_dst_register *reg = &full_inst->FullDstRegisters[i]; + struct tgsi_dst_register *dst_register; + struct tgsi_token *prev_token; + + if( maxsize <= size ) + return 0; + dst_register = (struct tgsi_dst_register *) &tokens[size]; + size++; + + *dst_register = tgsi_build_dst_register( + reg->DstRegister.File, + reg->DstRegister.WriteMask, + reg->DstRegister.Index, + instruction, + header ); + prev_token = (struct tgsi_token *) dst_register; + + if( tgsi_compare_dst_register_ext_concode( + reg->DstRegisterExtConcode, + tgsi_default_dst_register_ext_concode() ) ) { + struct tgsi_dst_register_ext_concode *dst_register_ext_concode; + + if( maxsize <= size ) + return 0; + dst_register_ext_concode = + (struct tgsi_dst_register_ext_concode *) &tokens[size]; + size++; + + *dst_register_ext_concode = tgsi_build_dst_register_ext_concode( + reg->DstRegisterExtConcode.CondMask, + reg->DstRegisterExtConcode.CondSwizzleX, + reg->DstRegisterExtConcode.CondSwizzleY, + reg->DstRegisterExtConcode.CondSwizzleZ, + reg->DstRegisterExtConcode.CondSwizzleW, + reg->DstRegisterExtConcode.CondSrcIndex, + prev_token, + instruction, + header ); + prev_token = (struct tgsi_token *) dst_register_ext_concode; + } + + if( tgsi_compare_dst_register_ext_modulate( + reg->DstRegisterExtModulate, + tgsi_default_dst_register_ext_modulate() ) ) { + struct tgsi_dst_register_ext_modulate *dst_register_ext_modulate; + + if( maxsize <= size ) + return 0; + dst_register_ext_modulate = + (struct tgsi_dst_register_ext_modulate *) &tokens[size]; + size++; + + *dst_register_ext_modulate = tgsi_build_dst_register_ext_modulate( + reg->DstRegisterExtModulate.Modulate, + prev_token, + instruction, + header ); + prev_token = (struct tgsi_token *) dst_register_ext_modulate; + } + } + + for( i = 0; i < full_inst->Instruction.NumSrcRegs; 
i++ ) { + const struct tgsi_full_src_register *reg = &full_inst->FullSrcRegisters[i]; + struct tgsi_src_register *src_register; + struct tgsi_token *prev_token; + + if( maxsize <= size ) + return 0; + src_register = (struct tgsi_src_register *) &tokens[size]; + size++; + + *src_register = tgsi_build_src_register( + reg->SrcRegister.File, + reg->SrcRegister.SwizzleX, + reg->SrcRegister.SwizzleY, + reg->SrcRegister.SwizzleZ, + reg->SrcRegister.SwizzleW, + reg->SrcRegister.Negate, + reg->SrcRegister.Indirect, + reg->SrcRegister.Dimension, + reg->SrcRegister.Index, + instruction, + header ); + prev_token = (struct tgsi_token *) src_register; + + if( tgsi_compare_src_register_ext_swz( + reg->SrcRegisterExtSwz, + tgsi_default_src_register_ext_swz() ) ) { + struct tgsi_src_register_ext_swz *src_register_ext_swz; + + /* Use of the extended swizzle requires the simple swizzle to be identity. + */ + assert( reg->SrcRegister.SwizzleX == TGSI_SWIZZLE_X ); + assert( reg->SrcRegister.SwizzleY == TGSI_SWIZZLE_Y ); + assert( reg->SrcRegister.SwizzleZ == TGSI_SWIZZLE_Z ); + assert( reg->SrcRegister.SwizzleW == TGSI_SWIZZLE_W ); + assert( reg->SrcRegister.Negate == FALSE ); + + if( maxsize <= size ) + return 0; + src_register_ext_swz = + (struct tgsi_src_register_ext_swz *) &tokens[size]; + size++; + + *src_register_ext_swz = tgsi_build_src_register_ext_swz( + reg->SrcRegisterExtSwz.ExtSwizzleX, + reg->SrcRegisterExtSwz.ExtSwizzleY, + reg->SrcRegisterExtSwz.ExtSwizzleZ, + reg->SrcRegisterExtSwz.ExtSwizzleW, + reg->SrcRegisterExtSwz.NegateX, + reg->SrcRegisterExtSwz.NegateY, + reg->SrcRegisterExtSwz.NegateZ, + reg->SrcRegisterExtSwz.NegateW, + prev_token, + instruction, + header ); + prev_token = (struct tgsi_token *) src_register_ext_swz; + } + + if( tgsi_compare_src_register_ext_mod( + reg->SrcRegisterExtMod, + tgsi_default_src_register_ext_mod() ) ) { + struct tgsi_src_register_ext_mod *src_register_ext_mod; + + if( maxsize <= size ) + return 0; + src_register_ext_mod = + (struct tgsi_src_register_ext_mod *) &tokens[size]; + size++; + + *src_register_ext_mod = tgsi_build_src_register_ext_mod( + reg->SrcRegisterExtMod.Complement, + reg->SrcRegisterExtMod.Bias, + reg->SrcRegisterExtMod.Scale2X, + reg->SrcRegisterExtMod.Absolute, + reg->SrcRegisterExtMod.Negate, + prev_token, + instruction, + header ); + prev_token = (struct tgsi_token *) src_register_ext_mod; + } + + if( reg->SrcRegister.Indirect ) { + struct tgsi_src_register *ind; + + if( maxsize <= size ) + return 0; + ind = (struct tgsi_src_register *) &tokens[size]; + size++; + + *ind = tgsi_build_src_register( + reg->SrcRegisterInd.File, + reg->SrcRegisterInd.SwizzleX, + reg->SrcRegisterInd.SwizzleY, + reg->SrcRegisterInd.SwizzleZ, + reg->SrcRegisterInd.SwizzleW, + reg->SrcRegisterInd.Negate, + reg->SrcRegisterInd.Indirect, + reg->SrcRegisterInd.Dimension, + reg->SrcRegisterInd.Index, + instruction, + header ); + } + + if( reg->SrcRegister.Dimension ) { + struct tgsi_dimension *dim; + + assert( !reg->SrcRegisterDim.Dimension ); + + if( maxsize <= size ) + return 0; + dim = (struct tgsi_dimension *) &tokens[size]; + size++; + + *dim = tgsi_build_dimension( + reg->SrcRegisterDim.Indirect, + reg->SrcRegisterDim.Index, + instruction, + header ); + + if( reg->SrcRegisterDim.Indirect ) { + struct tgsi_src_register *ind; + + if( maxsize <= size ) + return 0; + ind = (struct tgsi_src_register *) &tokens[size]; + size++; + + *ind = tgsi_build_src_register( + reg->SrcRegisterDimInd.File, + reg->SrcRegisterDimInd.SwizzleX, + reg->SrcRegisterDimInd.SwizzleY, + 
reg->SrcRegisterDimInd.SwizzleZ, + reg->SrcRegisterDimInd.SwizzleW, + reg->SrcRegisterDimInd.Negate, + reg->SrcRegisterDimInd.Indirect, + reg->SrcRegisterDimInd.Dimension, + reg->SrcRegisterDimInd.Index, + instruction, + header ); + } + } + } + + return size; +} + +struct tgsi_instruction_ext_nv +tgsi_default_instruction_ext_nv( void ) +{ + struct tgsi_instruction_ext_nv instruction_ext_nv; + + instruction_ext_nv.Type = TGSI_INSTRUCTION_EXT_TYPE_NV; + instruction_ext_nv.Precision = TGSI_PRECISION_DEFAULT; + instruction_ext_nv.CondDstIndex = 0; + instruction_ext_nv.CondFlowIndex = 0; + instruction_ext_nv.CondMask = TGSI_CC_TR; + instruction_ext_nv.CondSwizzleX = TGSI_SWIZZLE_X; + instruction_ext_nv.CondSwizzleY = TGSI_SWIZZLE_Y; + instruction_ext_nv.CondSwizzleZ = TGSI_SWIZZLE_Z; + instruction_ext_nv.CondSwizzleW = TGSI_SWIZZLE_W; + instruction_ext_nv.CondDstUpdate = 0; + instruction_ext_nv.CondFlowEnable = 0; + instruction_ext_nv.Padding = 0; + instruction_ext_nv.Extended = 0; + + return instruction_ext_nv; +} + + +/** test for inequality of 32-bit values pointed to by a and b */ +static INLINE boolean +compare32(const void *a, const void *b) +{ + return *((uint32_t *) a) != *((uint32_t *) b); +} + + +unsigned +tgsi_compare_instruction_ext_nv( + struct tgsi_instruction_ext_nv a, + struct tgsi_instruction_ext_nv b ) +{ + a.Padding = b.Padding = 0; + a.Extended = b.Extended = 0; + return compare32(&a, &b); +} + +struct tgsi_instruction_ext_nv +tgsi_build_instruction_ext_nv( + unsigned precision, + unsigned cond_dst_index, + unsigned cond_flow_index, + unsigned cond_mask, + unsigned cond_swizzle_x, + unsigned cond_swizzle_y, + unsigned cond_swizzle_z, + unsigned cond_swizzle_w, + unsigned cond_dst_update, + unsigned cond_flow_enable, + struct tgsi_token *prev_token, + struct tgsi_instruction *instruction, + struct tgsi_header *header ) +{ + struct tgsi_instruction_ext_nv instruction_ext_nv; + + instruction_ext_nv = tgsi_default_instruction_ext_nv(); + instruction_ext_nv.Precision = precision; + instruction_ext_nv.CondDstIndex = cond_dst_index; + instruction_ext_nv.CondFlowIndex = cond_flow_index; + instruction_ext_nv.CondMask = cond_mask; + instruction_ext_nv.CondSwizzleX = cond_swizzle_x; + instruction_ext_nv.CondSwizzleY = cond_swizzle_y; + instruction_ext_nv.CondSwizzleZ = cond_swizzle_z; + instruction_ext_nv.CondSwizzleW = cond_swizzle_w; + instruction_ext_nv.CondDstUpdate = cond_dst_update; + instruction_ext_nv.CondFlowEnable = cond_flow_enable; + + prev_token->Extended = 1; + instruction_grow( instruction, header ); + + return instruction_ext_nv; +} + +struct tgsi_instruction_ext_label +tgsi_default_instruction_ext_label( void ) +{ + struct tgsi_instruction_ext_label instruction_ext_label; + + instruction_ext_label.Type = TGSI_INSTRUCTION_EXT_TYPE_LABEL; + instruction_ext_label.Label = 0; + instruction_ext_label.Padding = 0; + instruction_ext_label.Extended = 0; + + return instruction_ext_label; +} + +unsigned +tgsi_compare_instruction_ext_label( + struct tgsi_instruction_ext_label a, + struct tgsi_instruction_ext_label b ) +{ + a.Padding = b.Padding = 0; + a.Extended = b.Extended = 0; + return compare32(&a, &b); +} + +struct tgsi_instruction_ext_label +tgsi_build_instruction_ext_label( + unsigned label, + struct tgsi_token *prev_token, + struct tgsi_instruction *instruction, + struct tgsi_header *header ) +{ + struct tgsi_instruction_ext_label instruction_ext_label; + + instruction_ext_label = tgsi_default_instruction_ext_label(); + instruction_ext_label.Label = label; + + 
prev_token->Extended = 1; + instruction_grow( instruction, header ); + + return instruction_ext_label; +} + +struct tgsi_instruction_ext_texture +tgsi_default_instruction_ext_texture( void ) +{ + struct tgsi_instruction_ext_texture instruction_ext_texture; + + instruction_ext_texture.Type = TGSI_INSTRUCTION_EXT_TYPE_TEXTURE; + instruction_ext_texture.Texture = TGSI_TEXTURE_UNKNOWN; + instruction_ext_texture.Padding = 0; + instruction_ext_texture.Extended = 0; + + return instruction_ext_texture; +} + +unsigned +tgsi_compare_instruction_ext_texture( + struct tgsi_instruction_ext_texture a, + struct tgsi_instruction_ext_texture b ) +{ + a.Padding = b.Padding = 0; + a.Extended = b.Extended = 0; + return compare32(&a, &b); +} + +struct tgsi_instruction_ext_texture +tgsi_build_instruction_ext_texture( + unsigned texture, + struct tgsi_token *prev_token, + struct tgsi_instruction *instruction, + struct tgsi_header *header ) +{ + struct tgsi_instruction_ext_texture instruction_ext_texture; + + instruction_ext_texture = tgsi_default_instruction_ext_texture(); + instruction_ext_texture.Texture = texture; + + prev_token->Extended = 1; + instruction_grow( instruction, header ); + + return instruction_ext_texture; +} + +struct tgsi_src_register +tgsi_default_src_register( void ) +{ + struct tgsi_src_register src_register; + + src_register.File = TGSI_FILE_NULL; + src_register.SwizzleX = TGSI_SWIZZLE_X; + src_register.SwizzleY = TGSI_SWIZZLE_Y; + src_register.SwizzleZ = TGSI_SWIZZLE_Z; + src_register.SwizzleW = TGSI_SWIZZLE_W; + src_register.Negate = 0; + src_register.Indirect = 0; + src_register.Dimension = 0; + src_register.Index = 0; + src_register.Extended = 0; + + return src_register; +} + +struct tgsi_src_register +tgsi_build_src_register( + unsigned file, + unsigned swizzle_x, + unsigned swizzle_y, + unsigned swizzle_z, + unsigned swizzle_w, + unsigned negate, + unsigned indirect, + unsigned dimension, + int index, + struct tgsi_instruction *instruction, + struct tgsi_header *header ) +{ + struct tgsi_src_register src_register; + + assert( file <= TGSI_FILE_IMMEDIATE ); + assert( swizzle_x <= TGSI_SWIZZLE_W ); + assert( swizzle_y <= TGSI_SWIZZLE_W ); + assert( swizzle_z <= TGSI_SWIZZLE_W ); + assert( swizzle_w <= TGSI_SWIZZLE_W ); + assert( negate <= 1 ); + assert( index >= -0x8000 && index <= 0x7FFF ); + + src_register = tgsi_default_src_register(); + src_register.File = file; + src_register.SwizzleX = swizzle_x; + src_register.SwizzleY = swizzle_y; + src_register.SwizzleZ = swizzle_z; + src_register.SwizzleW = swizzle_w; + src_register.Negate = negate; + src_register.Indirect = indirect; + src_register.Dimension = dimension; + src_register.Index = index; + + instruction_grow( instruction, header ); + + return src_register; +} + +struct tgsi_full_src_register +tgsi_default_full_src_register( void ) +{ + struct tgsi_full_src_register full_src_register; + + full_src_register.SrcRegister = tgsi_default_src_register(); + full_src_register.SrcRegisterExtSwz = tgsi_default_src_register_ext_swz(); + full_src_register.SrcRegisterExtMod = tgsi_default_src_register_ext_mod(); + full_src_register.SrcRegisterInd = tgsi_default_src_register(); + full_src_register.SrcRegisterDim = tgsi_default_dimension(); + full_src_register.SrcRegisterDimInd = tgsi_default_src_register(); + + return full_src_register; +} + +struct tgsi_src_register_ext_swz +tgsi_default_src_register_ext_swz( void ) +{ + struct tgsi_src_register_ext_swz src_register_ext_swz; + + src_register_ext_swz.Type = TGSI_SRC_REGISTER_EXT_TYPE_SWZ; + 
src_register_ext_swz.ExtSwizzleX = TGSI_EXTSWIZZLE_X; + src_register_ext_swz.ExtSwizzleY = TGSI_EXTSWIZZLE_Y; + src_register_ext_swz.ExtSwizzleZ = TGSI_EXTSWIZZLE_Z; + src_register_ext_swz.ExtSwizzleW = TGSI_EXTSWIZZLE_W; + src_register_ext_swz.NegateX = 0; + src_register_ext_swz.NegateY = 0; + src_register_ext_swz.NegateZ = 0; + src_register_ext_swz.NegateW = 0; + src_register_ext_swz.Padding = 0; + src_register_ext_swz.Extended = 0; + + return src_register_ext_swz; +} + +unsigned +tgsi_compare_src_register_ext_swz( + struct tgsi_src_register_ext_swz a, + struct tgsi_src_register_ext_swz b ) +{ + a.Padding = b.Padding = 0; + a.Extended = b.Extended = 0; + return compare32(&a, &b); +} + +struct tgsi_src_register_ext_swz +tgsi_build_src_register_ext_swz( + unsigned ext_swizzle_x, + unsigned ext_swizzle_y, + unsigned ext_swizzle_z, + unsigned ext_swizzle_w, + unsigned negate_x, + unsigned negate_y, + unsigned negate_z, + unsigned negate_w, + struct tgsi_token *prev_token, + struct tgsi_instruction *instruction, + struct tgsi_header *header ) +{ + struct tgsi_src_register_ext_swz src_register_ext_swz; + + assert( ext_swizzle_x <= TGSI_EXTSWIZZLE_ONE ); + assert( ext_swizzle_y <= TGSI_EXTSWIZZLE_ONE ); + assert( ext_swizzle_z <= TGSI_EXTSWIZZLE_ONE ); + assert( ext_swizzle_w <= TGSI_EXTSWIZZLE_ONE ); + assert( negate_x <= 1 ); + assert( negate_y <= 1 ); + assert( negate_z <= 1 ); + assert( negate_w <= 1 ); + + src_register_ext_swz = tgsi_default_src_register_ext_swz(); + src_register_ext_swz.ExtSwizzleX = ext_swizzle_x; + src_register_ext_swz.ExtSwizzleY = ext_swizzle_y; + src_register_ext_swz.ExtSwizzleZ = ext_swizzle_z; + src_register_ext_swz.ExtSwizzleW = ext_swizzle_w; + src_register_ext_swz.NegateX = negate_x; + src_register_ext_swz.NegateY = negate_y; + src_register_ext_swz.NegateZ = negate_z; + src_register_ext_swz.NegateW = negate_w; + + prev_token->Extended = 1; + instruction_grow( instruction, header ); + + return src_register_ext_swz; +} + +struct tgsi_src_register_ext_mod +tgsi_default_src_register_ext_mod( void ) +{ + struct tgsi_src_register_ext_mod src_register_ext_mod; + + src_register_ext_mod.Type = TGSI_SRC_REGISTER_EXT_TYPE_MOD; + src_register_ext_mod.Complement = 0; + src_register_ext_mod.Bias = 0; + src_register_ext_mod.Scale2X = 0; + src_register_ext_mod.Absolute = 0; + src_register_ext_mod.Negate = 0; + src_register_ext_mod.Padding = 0; + src_register_ext_mod.Extended = 0; + + return src_register_ext_mod; +} + +unsigned +tgsi_compare_src_register_ext_mod( + struct tgsi_src_register_ext_mod a, + struct tgsi_src_register_ext_mod b ) +{ + a.Padding = b.Padding = 0; + a.Extended = b.Extended = 0; + return compare32(&a, &b); +} + +struct tgsi_src_register_ext_mod +tgsi_build_src_register_ext_mod( + unsigned complement, + unsigned bias, + unsigned scale_2x, + unsigned absolute, + unsigned negate, + struct tgsi_token *prev_token, + struct tgsi_instruction *instruction, + struct tgsi_header *header ) +{ + struct tgsi_src_register_ext_mod src_register_ext_mod; + + assert( complement <= 1 ); + assert( bias <= 1 ); + assert( scale_2x <= 1 ); + assert( absolute <= 1 ); + assert( negate <= 1 ); + + src_register_ext_mod = tgsi_default_src_register_ext_mod(); + src_register_ext_mod.Complement = complement; + src_register_ext_mod.Bias = bias; + src_register_ext_mod.Scale2X = scale_2x; + src_register_ext_mod.Absolute = absolute; + src_register_ext_mod.Negate = negate; + + prev_token->Extended = 1; + instruction_grow( instruction, header ); + + return src_register_ext_mod; +} + +struct 
tgsi_dimension +tgsi_default_dimension( void ) +{ + struct tgsi_dimension dimension; + + dimension.Indirect = 0; + dimension.Dimension = 0; + dimension.Padding = 0; + dimension.Index = 0; + dimension.Extended = 0; + + return dimension; +} + +struct tgsi_dimension +tgsi_build_dimension( + unsigned indirect, + unsigned index, + struct tgsi_instruction *instruction, + struct tgsi_header *header ) +{ + struct tgsi_dimension dimension; + + dimension = tgsi_default_dimension(); + dimension.Indirect = indirect; + dimension.Index = index; + + instruction_grow( instruction, header ); + + return dimension; +} + +struct tgsi_dst_register +tgsi_default_dst_register( void ) +{ + struct tgsi_dst_register dst_register; + + dst_register.File = TGSI_FILE_NULL; + dst_register.WriteMask = TGSI_WRITEMASK_XYZW; + dst_register.Indirect = 0; + dst_register.Dimension = 0; + dst_register.Index = 0; + dst_register.Padding = 0; + dst_register.Extended = 0; + + return dst_register; +} + +struct tgsi_dst_register +tgsi_build_dst_register( + unsigned file, + unsigned mask, + int index, + struct tgsi_instruction *instruction, + struct tgsi_header *header ) +{ + struct tgsi_dst_register dst_register; + + assert( file <= TGSI_FILE_IMMEDIATE ); + assert( mask <= TGSI_WRITEMASK_XYZW ); + assert( index >= -32768 && index <= 32767 ); + + dst_register = tgsi_default_dst_register(); + dst_register.File = file; + dst_register.WriteMask = mask; + dst_register.Index = index; + + instruction_grow( instruction, header ); + + return dst_register; +} + +struct tgsi_full_dst_register +tgsi_default_full_dst_register( void ) +{ + struct tgsi_full_dst_register full_dst_register; + + full_dst_register.DstRegister = tgsi_default_dst_register(); + full_dst_register.DstRegisterExtConcode = + tgsi_default_dst_register_ext_concode(); + full_dst_register.DstRegisterExtModulate = + tgsi_default_dst_register_ext_modulate(); + + return full_dst_register; +} + +struct tgsi_dst_register_ext_concode +tgsi_default_dst_register_ext_concode( void ) +{ + struct tgsi_dst_register_ext_concode dst_register_ext_concode; + + dst_register_ext_concode.Type = TGSI_DST_REGISTER_EXT_TYPE_CONDCODE; + dst_register_ext_concode.CondMask = TGSI_CC_TR; + dst_register_ext_concode.CondSwizzleX = TGSI_SWIZZLE_X; + dst_register_ext_concode.CondSwizzleY = TGSI_SWIZZLE_Y; + dst_register_ext_concode.CondSwizzleZ = TGSI_SWIZZLE_Z; + dst_register_ext_concode.CondSwizzleW = TGSI_SWIZZLE_W; + dst_register_ext_concode.CondSrcIndex = 0; + dst_register_ext_concode.Padding = 0; + dst_register_ext_concode.Extended = 0; + + return dst_register_ext_concode; +} + +unsigned +tgsi_compare_dst_register_ext_concode( + struct tgsi_dst_register_ext_concode a, + struct tgsi_dst_register_ext_concode b ) +{ + a.Padding = b.Padding = 0; + a.Extended = b.Extended = 0; + return compare32(&a, &b); +} + +struct tgsi_dst_register_ext_concode +tgsi_build_dst_register_ext_concode( + unsigned cc, + unsigned swizzle_x, + unsigned swizzle_y, + unsigned swizzle_z, + unsigned swizzle_w, + int index, + struct tgsi_token *prev_token, + struct tgsi_instruction *instruction, + struct tgsi_header *header ) +{ + struct tgsi_dst_register_ext_concode dst_register_ext_concode; + + assert( cc <= TGSI_CC_FL ); + assert( swizzle_x <= TGSI_SWIZZLE_W ); + assert( swizzle_y <= TGSI_SWIZZLE_W ); + assert( swizzle_z <= TGSI_SWIZZLE_W ); + assert( swizzle_w <= TGSI_SWIZZLE_W ); + assert( index >= -32768 && index <= 32767 ); + + dst_register_ext_concode = tgsi_default_dst_register_ext_concode(); + 
dst_register_ext_concode.CondMask = cc; + dst_register_ext_concode.CondSwizzleX = swizzle_x; + dst_register_ext_concode.CondSwizzleY = swizzle_y; + dst_register_ext_concode.CondSwizzleZ = swizzle_z; + dst_register_ext_concode.CondSwizzleW = swizzle_w; + dst_register_ext_concode.CondSrcIndex = index; + + prev_token->Extended = 1; + instruction_grow( instruction, header ); + + return dst_register_ext_concode; +} + +struct tgsi_dst_register_ext_modulate +tgsi_default_dst_register_ext_modulate( void ) +{ + struct tgsi_dst_register_ext_modulate dst_register_ext_modulate; + + dst_register_ext_modulate.Type = TGSI_DST_REGISTER_EXT_TYPE_MODULATE; + dst_register_ext_modulate.Modulate = TGSI_MODULATE_1X; + dst_register_ext_modulate.Padding = 0; + dst_register_ext_modulate.Extended = 0; + + return dst_register_ext_modulate; +} + +unsigned +tgsi_compare_dst_register_ext_modulate( + struct tgsi_dst_register_ext_modulate a, + struct tgsi_dst_register_ext_modulate b ) +{ + a.Padding = b.Padding = 0; + a.Extended = b.Extended = 0; + return compare32(&a, &b); +} + +struct tgsi_dst_register_ext_modulate +tgsi_build_dst_register_ext_modulate( + unsigned modulate, + struct tgsi_token *prev_token, + struct tgsi_instruction *instruction, + struct tgsi_header *header ) +{ + struct tgsi_dst_register_ext_modulate dst_register_ext_modulate; + + assert( modulate <= TGSI_MODULATE_EIGHTH ); + + dst_register_ext_modulate = tgsi_default_dst_register_ext_modulate(); + dst_register_ext_modulate.Modulate = modulate; + + prev_token->Extended = 1; + instruction_grow( instruction, header ); + + return dst_register_ext_modulate; +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.h b/src/gallium/auxiliary/tgsi/tgsi_build.h new file mode 100644 index 0000000000..0fd6fabd83 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_build.h @@ -0,0 +1,338 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef TGSI_BUILD_H +#define TGSI_BUILD_H + + +struct tgsi_token; + + +#if defined __cplusplus +extern "C" { +#endif + +/* + * version + */ + +struct tgsi_version +tgsi_build_version( void ); + +/* + * header + */ + +struct tgsi_header +tgsi_build_header( void ); + +struct tgsi_processor +tgsi_default_processor( void ); + +struct tgsi_processor +tgsi_build_processor( + unsigned processor, + struct tgsi_header *header ); + +/* + * declaration + */ + +struct tgsi_declaration +tgsi_default_declaration( void ); + +struct tgsi_declaration +tgsi_build_declaration( + unsigned file, + unsigned usage_mask, + unsigned interpolate, + unsigned semantic, + unsigned centroid, + unsigned invariant, + struct tgsi_header *header ); + +struct tgsi_full_declaration +tgsi_default_full_declaration( void ); + +unsigned +tgsi_build_full_declaration( + const struct tgsi_full_declaration *full_decl, + struct tgsi_token *tokens, + struct tgsi_header *header, + unsigned maxsize ); + +struct tgsi_declaration_range +tgsi_default_declaration_range( void ); + +struct tgsi_declaration_range +tgsi_build_declaration_range( + unsigned first, + unsigned last, + struct tgsi_declaration *declaration, + struct tgsi_header *header ); + +struct tgsi_declaration_semantic +tgsi_default_declaration_semantic( void ); + +struct tgsi_declaration_semantic +tgsi_build_declaration_semantic( + unsigned semantic_name, + unsigned semantic_index, + struct tgsi_declaration *declaration, + struct tgsi_header *header ); + +/* + * immediate + */ + +struct tgsi_immediate +tgsi_default_immediate( void ); + +struct tgsi_immediate +tgsi_build_immediate( + struct tgsi_header *header ); + +struct tgsi_full_immediate +tgsi_default_full_immediate( void ); + +struct tgsi_immediate_float32 +tgsi_build_immediate_float32( + float value, + struct tgsi_immediate *immediate, + struct tgsi_header *header ); + +unsigned +tgsi_build_full_immediate( + const struct tgsi_full_immediate *full_imm, + struct tgsi_token *tokens, + struct tgsi_header *header, + unsigned maxsize ); + +/* + * instruction + */ + +struct tgsi_instruction +tgsi_default_instruction( void ); + +struct tgsi_instruction +tgsi_build_instruction( + unsigned opcode, + unsigned saturate, + unsigned num_dst_regs, + unsigned num_src_regs, + struct tgsi_header *header ); + +struct tgsi_full_instruction +tgsi_default_full_instruction( void ); + +unsigned +tgsi_build_full_instruction( + const struct tgsi_full_instruction *full_inst, + struct tgsi_token *tokens, + struct tgsi_header *header, + unsigned maxsize ); + +struct tgsi_instruction_ext_nv +tgsi_default_instruction_ext_nv( void ); + +unsigned +tgsi_compare_instruction_ext_nv( + struct tgsi_instruction_ext_nv a, + struct tgsi_instruction_ext_nv b ); + +struct tgsi_instruction_ext_nv +tgsi_build_instruction_ext_nv( + unsigned precision, + unsigned cond_dst_index, + unsigned cond_flow_index, + unsigned cond_mask, + unsigned cond_swizzle_x, + unsigned cond_swizzle_y, + unsigned cond_swizzle_z, + unsigned cond_swizzle_w, + unsigned cond_dst_update, + unsigned cond_flow_enable, + struct tgsi_token *prev_token, + struct tgsi_instruction *instruction, + struct tgsi_header *header ); + +struct tgsi_instruction_ext_label +tgsi_default_instruction_ext_label( void ); + +unsigned +tgsi_compare_instruction_ext_label( + struct tgsi_instruction_ext_label a, + struct tgsi_instruction_ext_label b ); + +struct tgsi_instruction_ext_label +tgsi_build_instruction_ext_label( + unsigned 
label, + struct tgsi_token *prev_token, + struct tgsi_instruction *instruction, + struct tgsi_header *header ); + +struct tgsi_instruction_ext_texture +tgsi_default_instruction_ext_texture( void ); + +unsigned +tgsi_compare_instruction_ext_texture( + struct tgsi_instruction_ext_texture a, + struct tgsi_instruction_ext_texture b ); + +struct tgsi_instruction_ext_texture +tgsi_build_instruction_ext_texture( + unsigned texture, + struct tgsi_token *prev_token, + struct tgsi_instruction *instruction, + struct tgsi_header *header ); + +struct tgsi_src_register +tgsi_default_src_register( void ); + +struct tgsi_src_register +tgsi_build_src_register( + unsigned file, + unsigned swizzle_x, + unsigned swizzle_y, + unsigned swizzle_z, + unsigned swizzle_w, + unsigned negate, + unsigned indirect, + unsigned dimension, + int index, + struct tgsi_instruction *instruction, + struct tgsi_header *header ); + +struct tgsi_full_src_register +tgsi_default_full_src_register( void ); + +struct tgsi_src_register_ext_swz +tgsi_default_src_register_ext_swz( void ); + +unsigned +tgsi_compare_src_register_ext_swz( + struct tgsi_src_register_ext_swz a, + struct tgsi_src_register_ext_swz b ); + +struct tgsi_src_register_ext_swz +tgsi_build_src_register_ext_swz( + unsigned ext_swizzle_x, + unsigned ext_swizzle_y, + unsigned ext_swizzle_z, + unsigned ext_swizzle_w, + unsigned negate_x, + unsigned negate_y, + unsigned negate_z, + unsigned negate_w, + struct tgsi_token *prev_token, + struct tgsi_instruction *instruction, + struct tgsi_header *header ); + +struct tgsi_src_register_ext_mod +tgsi_default_src_register_ext_mod( void ); + +unsigned +tgsi_compare_src_register_ext_mod( + struct tgsi_src_register_ext_mod a, + struct tgsi_src_register_ext_mod b ); + +struct tgsi_src_register_ext_mod +tgsi_build_src_register_ext_mod( + unsigned complement, + unsigned bias, + unsigned scale_2x, + unsigned absolute, + unsigned negate, + struct tgsi_token *prev_token, + struct tgsi_instruction *instruction, + struct tgsi_header *header ); + +struct tgsi_dimension +tgsi_default_dimension( void ); + +struct tgsi_dimension +tgsi_build_dimension( + unsigned indirect, + unsigned index, + struct tgsi_instruction *instruction, + struct tgsi_header *header ); + +struct tgsi_dst_register +tgsi_default_dst_register( void ); + +struct tgsi_dst_register +tgsi_build_dst_register( + unsigned file, + unsigned mask, + int index, + struct tgsi_instruction *instruction, + struct tgsi_header *header ); + +struct tgsi_full_dst_register +tgsi_default_full_dst_register( void ); + +struct tgsi_dst_register_ext_concode +tgsi_default_dst_register_ext_concode( void ); + +unsigned +tgsi_compare_dst_register_ext_concode( + struct tgsi_dst_register_ext_concode a, + struct tgsi_dst_register_ext_concode b ); + +struct tgsi_dst_register_ext_concode +tgsi_build_dst_register_ext_concode( + unsigned cc, + unsigned swizzle_x, + unsigned swizzle_y, + unsigned swizzle_z, + unsigned swizzle_w, + int index, + struct tgsi_token *prev_token, + struct tgsi_instruction *instruction, + struct tgsi_header *header ); + +struct tgsi_dst_register_ext_modulate +tgsi_default_dst_register_ext_modulate( void ); + +unsigned +tgsi_compare_dst_register_ext_modulate( + struct tgsi_dst_register_ext_modulate a, + struct tgsi_dst_register_ext_modulate b ); + +struct tgsi_dst_register_ext_modulate +tgsi_build_dst_register_ext_modulate( + unsigned modulate, + struct tgsi_token *prev_token, + struct tgsi_instruction *instruction, + struct tgsi_header *header ); + +#if defined __cplusplus +} 
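For orientation, here is a hedged sketch of how a caller might drive the builders declared above: the version/header/processor tokens are laid out first (which is what the grow helpers expect), then full declarations and instructions are appended with the remaining buffer space. The passthrough MOV shader, the maxTokens parameter, and the use of TGSI_FILE_INPUT/TGSI_FILE_OUTPUT from p_shader_tokens.h (not shown in this diff) are illustrative assumptions, not a complete or validated shader.

/*
 * Hypothetical use of the tgsi_build_*() helpers: emit the
 * version/header/processor prefix, one input declaration and one MOV.
 */
static unsigned
example_build_passthrough(struct tgsi_token *tokens, unsigned maxTokens)
{
   struct tgsi_header *header;
   struct tgsi_full_declaration decl;
   struct tgsi_full_instruction inst;
   unsigned ti;

   *(struct tgsi_version *) &tokens[0] = tgsi_build_version();

   header = (struct tgsi_header *) &tokens[1];
   *header = tgsi_build_header();

   *(struct tgsi_processor *) &tokens[2] =
      tgsi_build_processor( TGSI_PROCESSOR_FRAGMENT, header );

   ti = 3;

   /* DCL IN[0] */
   decl = tgsi_default_full_declaration();
   decl.Declaration.File = TGSI_FILE_INPUT;
   decl.DeclarationRange.First = 0;
   decl.DeclarationRange.Last = 0;
   ti += tgsi_build_full_declaration( &decl, &tokens[ti], header,
                                      maxTokens - ti );

   /* MOV OUT[0], IN[0] */
   inst = tgsi_default_full_instruction();
   inst.Instruction.Opcode = TGSI_OPCODE_MOV;
   inst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_OUTPUT;
   inst.FullDstRegisters[0].DstRegister.Index = 0;
   inst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_INPUT;
   inst.FullSrcRegisters[0].SrcRegister.Index = 0;
   ti += tgsi_build_full_instruction( &inst, &tokens[ti], header,
                                      maxTokens - ti );

   return ti;   /* number of tokens written */
}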
+#endif + +#endif /* TGSI_BUILD_H */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c new file mode 100644 index 0000000000..c2a0ac5aff --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c @@ -0,0 +1,555 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_debug.h" +#include "util/u_string.h" +#include "tgsi_dump.h" +#include "tgsi_info.h" +#include "tgsi_iterate.h" + +struct dump_ctx +{ + struct tgsi_iterate_context iter; + + uint instno; + + void (*printf)(struct dump_ctx *ctx, const char *format, ...); +}; + +static void +dump_ctx_printf(struct dump_ctx *ctx, const char *format, ...) 
+{ + va_list ap; + (void)ctx; + va_start(ap, format); + debug_vprintf(format, ap); + va_end(ap); +} + +static void +dump_enum( + struct dump_ctx *ctx, + uint e, + const char **enums, + uint enum_count ) +{ + if (e >= enum_count) + ctx->printf( ctx, "%u", e ); + else + ctx->printf( ctx, "%s", enums[e] ); +} + +#define EOL() ctx->printf( ctx, "\n" ) +#define TXT(S) ctx->printf( ctx, "%s", S ) +#define CHR(C) ctx->printf( ctx, "%c", C ) +#define UIX(I) ctx->printf( ctx, "0x%x", I ) +#define UID(I) ctx->printf( ctx, "%u", I ) +#define INSTID(I) ctx->printf( ctx, "% 3u", I ) +#define SID(I) ctx->printf( ctx, "%d", I ) +#define FLT(F) ctx->printf( ctx, "%10.4f", F ) +#define ENM(E,ENUMS) dump_enum( ctx, E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) ) + +static const char *processor_type_names[] = +{ + "FRAG", + "VERT", + "GEOM" +}; + +static const char *file_names[] = +{ + "NULL", + "CONST", + "IN", + "OUT", + "TEMP", + "SAMP", + "ADDR", + "IMM" +}; + +static const char *interpolate_names[] = +{ + "CONSTANT", + "LINEAR", + "PERSPECTIVE" +}; + +static const char *semantic_names[] = +{ + "POSITION", + "COLOR", + "BCOLOR", + "FOG", + "PSIZE", + "GENERIC", + "NORMAL" +}; + +static const char *immediate_type_names[] = +{ + "FLT32" +}; + +static const char *swizzle_names[] = +{ + "x", + "y", + "z", + "w" +}; + +static const char *texture_names[] = +{ + "UNKNOWN", + "1D", + "2D", + "3D", + "CUBE", + "RECT", + "SHADOW1D", + "SHADOW2D", + "SHADOWRECT" +}; + +static const char *extswizzle_names[] = +{ + "x", + "y", + "z", + "w", + "0", + "1" +}; + +static const char *modulate_names[TGSI_MODULATE_COUNT] = +{ + "", + "_2X", + "_4X", + "_8X", + "_D2", + "_D4", + "_D8" +}; + +static void +_dump_register( + struct dump_ctx *ctx, + uint file, + int first, + int last ) +{ + ENM( file, file_names ); + CHR( '[' ); + SID( first ); + if (first != last) { + TXT( ".." ); + SID( last ); + } + CHR( ']' ); +} + +static void +_dump_register_ind( + struct dump_ctx *ctx, + uint file, + int index, + uint ind_file, + int ind_index ) +{ + ENM( file, file_names ); + CHR( '[' ); + ENM( ind_file, file_names ); + CHR( '[' ); + SID( ind_index ); + CHR( ']' ); + if (index != 0) { + if (index > 0) + CHR( '+' ); + SID( index ); + } + CHR( ']' ); +} + +static void +_dump_writemask( + struct dump_ctx *ctx, + uint writemask ) +{ + if (writemask != TGSI_WRITEMASK_XYZW) { + CHR( '.' 
); + if (writemask & TGSI_WRITEMASK_X) + CHR( 'x' ); + if (writemask & TGSI_WRITEMASK_Y) + CHR( 'y' ); + if (writemask & TGSI_WRITEMASK_Z) + CHR( 'z' ); + if (writemask & TGSI_WRITEMASK_W) + CHR( 'w' ); + } +} + +static boolean +iter_declaration( + struct tgsi_iterate_context *iter, + struct tgsi_full_declaration *decl ) +{ + struct dump_ctx *ctx = (struct dump_ctx *)iter; + + TXT( "DCL " ); + + _dump_register( + ctx, + decl->Declaration.File, + decl->DeclarationRange.First, + decl->DeclarationRange.Last ); + _dump_writemask( + ctx, + decl->Declaration.UsageMask ); + + if (decl->Declaration.Semantic) { + TXT( ", " ); + ENM( decl->Semantic.SemanticName, semantic_names ); + if (decl->Semantic.SemanticIndex != 0 || + decl->Semantic.SemanticName == TGSI_SEMANTIC_GENERIC) { + CHR( '[' ); + UID( decl->Semantic.SemanticIndex ); + CHR( ']' ); + } + } + + TXT( ", " ); + ENM( decl->Declaration.Interpolate, interpolate_names ); + + if (decl->Declaration.Centroid) { + TXT( ", CENTROID" ); + } + + if (decl->Declaration.Invariant) { + TXT( ", INVARIANT" ); + } + + EOL(); + + return TRUE; +} + +void +tgsi_dump_declaration( + const struct tgsi_full_declaration *decl ) +{ + struct dump_ctx ctx; + + ctx.printf = dump_ctx_printf; + + iter_declaration( &ctx.iter, (struct tgsi_full_declaration *)decl ); +} + +static boolean +iter_immediate( + struct tgsi_iterate_context *iter, + struct tgsi_full_immediate *imm ) +{ + struct dump_ctx *ctx = (struct dump_ctx *) iter; + + uint i; + + TXT( "IMM " ); + ENM( imm->Immediate.DataType, immediate_type_names ); + + TXT( " { " ); + for (i = 0; i < imm->Immediate.Size - 1; i++) { + switch (imm->Immediate.DataType) { + case TGSI_IMM_FLOAT32: + FLT( imm->u.ImmediateFloat32[i].Float ); + break; + default: + assert( 0 ); + } + + if (i < imm->Immediate.Size - 2) + TXT( ", " ); + } + TXT( " }" ); + + EOL(); + + return TRUE; +} + +void +tgsi_dump_immediate( + const struct tgsi_full_immediate *imm ) +{ + struct dump_ctx ctx; + + ctx.printf = dump_ctx_printf; + + iter_immediate( &ctx.iter, (struct tgsi_full_immediate *)imm ); +} + +static boolean +iter_instruction( + struct tgsi_iterate_context *iter, + struct tgsi_full_instruction *inst ) +{ + struct dump_ctx *ctx = (struct dump_ctx *) iter; + uint instno = ctx->instno++; + + uint i; + boolean first_reg = TRUE; + + INSTID( instno ); + TXT( ": " ); + TXT( tgsi_get_opcode_info( inst->Instruction.Opcode )->mnemonic ); + + switch (inst->Instruction.Saturate) { + case TGSI_SAT_NONE: + break; + case TGSI_SAT_ZERO_ONE: + TXT( "_SAT" ); + break; + case TGSI_SAT_MINUS_PLUS_ONE: + TXT( "_SATNV" ); + break; + default: + assert( 0 ); + } + + for (i = 0; i < inst->Instruction.NumDstRegs; i++) { + const struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i]; + + if (!first_reg) + CHR( ',' ); + CHR( ' ' ); + + _dump_register( + ctx, + dst->DstRegister.File, + dst->DstRegister.Index, + dst->DstRegister.Index ); + ENM( dst->DstRegisterExtModulate.Modulate, modulate_names ); + _dump_writemask( ctx, dst->DstRegister.WriteMask ); + + first_reg = FALSE; + } + + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i]; + + if (!first_reg) + CHR( ',' ); + CHR( ' ' ); + + if (src->SrcRegisterExtMod.Negate) + TXT( "-(" ); + if (src->SrcRegisterExtMod.Absolute) + CHR( '|' ); + if (src->SrcRegisterExtMod.Scale2X) + TXT( "2*(" ); + if (src->SrcRegisterExtMod.Bias) + CHR( '(' ); + if (src->SrcRegisterExtMod.Complement) + TXT( "1-(" ); + if (src->SrcRegister.Negate) + CHR( '-' ); + + 
if (src->SrcRegister.Indirect) { + _dump_register_ind( + ctx, + src->SrcRegister.File, + src->SrcRegister.Index, + src->SrcRegisterInd.File, + src->SrcRegisterInd.Index ); + } + else { + _dump_register( + ctx, + src->SrcRegister.File, + src->SrcRegister.Index, + src->SrcRegister.Index ); + } + + if (src->SrcRegister.SwizzleX != TGSI_SWIZZLE_X || + src->SrcRegister.SwizzleY != TGSI_SWIZZLE_Y || + src->SrcRegister.SwizzleZ != TGSI_SWIZZLE_Z || + src->SrcRegister.SwizzleW != TGSI_SWIZZLE_W) { + CHR( '.' ); + ENM( src->SrcRegister.SwizzleX, swizzle_names ); + ENM( src->SrcRegister.SwizzleY, swizzle_names ); + ENM( src->SrcRegister.SwizzleZ, swizzle_names ); + ENM( src->SrcRegister.SwizzleW, swizzle_names ); + } + if (src->SrcRegisterExtSwz.ExtSwizzleX != TGSI_EXTSWIZZLE_X || + src->SrcRegisterExtSwz.ExtSwizzleY != TGSI_EXTSWIZZLE_Y || + src->SrcRegisterExtSwz.ExtSwizzleZ != TGSI_EXTSWIZZLE_Z || + src->SrcRegisterExtSwz.ExtSwizzleW != TGSI_EXTSWIZZLE_W) { + CHR( '.' ); + if (src->SrcRegisterExtSwz.NegateX) + TXT("-"); + ENM( src->SrcRegisterExtSwz.ExtSwizzleX, extswizzle_names ); + if (src->SrcRegisterExtSwz.NegateY) + TXT("-"); + ENM( src->SrcRegisterExtSwz.ExtSwizzleY, extswizzle_names ); + if (src->SrcRegisterExtSwz.NegateZ) + TXT("-"); + ENM( src->SrcRegisterExtSwz.ExtSwizzleZ, extswizzle_names ); + if (src->SrcRegisterExtSwz.NegateW) + TXT("-"); + ENM( src->SrcRegisterExtSwz.ExtSwizzleW, extswizzle_names ); + } + + if (src->SrcRegisterExtMod.Complement) + CHR( ')' ); + if (src->SrcRegisterExtMod.Bias) + TXT( ")-.5" ); + if (src->SrcRegisterExtMod.Scale2X) + CHR( ')' ); + if (src->SrcRegisterExtMod.Absolute) + CHR( '|' ); + if (src->SrcRegisterExtMod.Negate) + CHR( ')' ); + + first_reg = FALSE; + } + + if (inst->InstructionExtTexture.Texture != TGSI_TEXTURE_UNKNOWN) { + TXT( ", " ); + ENM( inst->InstructionExtTexture.Texture, texture_names ); + } + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_IF: + case TGSI_OPCODE_ELSE: + case TGSI_OPCODE_BGNLOOP2: + case TGSI_OPCODE_ENDLOOP2: + case TGSI_OPCODE_CAL: + TXT( " :" ); + UID( inst->InstructionExtLabel.Label ); + break; + } + + EOL(); + + return TRUE; +} + +void +tgsi_dump_instruction( + const struct tgsi_full_instruction *inst, + uint instno ) +{ + struct dump_ctx ctx; + + ctx.instno = instno; + ctx.printf = dump_ctx_printf; + + iter_instruction( &ctx.iter, (struct tgsi_full_instruction *)inst ); +} + +static boolean +prolog( + struct tgsi_iterate_context *iter ) +{ + struct dump_ctx *ctx = (struct dump_ctx *) iter; + ENM( iter->processor.Processor, processor_type_names ); + UID( iter->version.MajorVersion ); + CHR( '.' ); + UID( iter->version.MinorVersion ); + EOL(); + return TRUE; +} + +void +tgsi_dump( + const struct tgsi_token *tokens, + uint flags ) +{ + struct dump_ctx ctx; + + ctx.iter.prolog = prolog; + ctx.iter.iterate_instruction = iter_instruction; + ctx.iter.iterate_declaration = iter_declaration; + ctx.iter.iterate_immediate = iter_immediate; + ctx.iter.epilog = NULL; + + ctx.instno = 0; + ctx.printf = dump_ctx_printf; + + tgsi_iterate_shader( tokens, &ctx.iter ); +} + +struct str_dump_ctx +{ + struct dump_ctx base; + char *str; + char *ptr; + size_t left; +}; + +static void +str_dump_ctx_printf(struct dump_ctx *ctx, const char *format, ...) 
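+ /* printf-style callback installed by tgsi_dump_str(): appends the formatted text to the caller-supplied buffer via util_vsnprintf(), writing at most 'left' bytes (including the terminating NUL) per call. */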
+{ + struct str_dump_ctx *sctx = (struct str_dump_ctx *)ctx; + + if(sctx->left > 1) { + size_t written; + va_list ap; + va_start(ap, format); + written = util_vsnprintf(sctx->ptr, sctx->left, format, ap); + va_end(ap); + sctx->ptr += written; + sctx->left -= written; + } +} + +void +tgsi_dump_str( + const struct tgsi_token *tokens, + uint flags, + char *str, + size_t size) +{ + struct str_dump_ctx ctx; + + ctx.base.iter.prolog = prolog; + ctx.base.iter.iterate_instruction = iter_instruction; + ctx.base.iter.iterate_declaration = iter_declaration; + ctx.base.iter.iterate_immediate = iter_immediate; + ctx.base.iter.epilog = NULL; + + ctx.base.instno = 0; + ctx.base.printf = &str_dump_ctx_printf; + + ctx.str = str; + ctx.str[0] = 0; + ctx.ptr = str; + ctx.left = size; + + tgsi_iterate_shader( tokens, &ctx.base.iter ); +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.h b/src/gallium/auxiliary/tgsi/tgsi_dump.h new file mode 100644 index 0000000000..ad1e647ec9 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.h @@ -0,0 +1,70 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TGSI_DUMP_H +#define TGSI_DUMP_H + +#include "pipe/p_shader_tokens.h" + +#if defined __cplusplus +extern "C" { +#endif + +void +tgsi_dump_str( + const struct tgsi_token *tokens, + uint flags, + char *str, + size_t size); + +void +tgsi_dump( + const struct tgsi_token *tokens, + uint flags ); + +struct tgsi_full_immediate; +struct tgsi_full_instruction; +struct tgsi_full_declaration; + +void +tgsi_dump_immediate( + const struct tgsi_full_immediate *imm ); + +void +tgsi_dump_instruction( + const struct tgsi_full_instruction *inst, + uint instno ); + +void +tgsi_dump_declaration( + const struct tgsi_full_declaration *decl ); + +#if defined __cplusplus +} +#endif + +#endif /* TGSI_DUMP_H */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump_c.c b/src/gallium/auxiliary/tgsi/tgsi_dump_c.c new file mode 100644 index 0000000000..be25cb45a0 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_dump_c.c @@ -0,0 +1,719 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_debug.h" +#include "util/u_string.h" +#include "tgsi_dump_c.h" +#include "tgsi_build.h" +#include "tgsi_info.h" +#include "tgsi_parse.h" + +static void +dump_enum( + const unsigned e, + const char **enums, + const unsigned enums_count ) +{ + if (e >= enums_count) { + debug_printf( "%u", e ); + } + else { + debug_printf( "%s", enums[e] ); + } +} + +#define EOL() debug_printf( "\n" ) +#define TXT(S) debug_printf( "%s", S ) +#define CHR(C) debug_printf( "%c", C ) +#define UIX(I) debug_printf( "0x%x", I ) +#define UID(I) debug_printf( "%u", I ) +#define SID(I) debug_printf( "%d", I ) +#define FLT(F) debug_printf( "%10.4f", F ) +#define ENM(E,ENUMS) dump_enum( E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) ) + +static const char *TGSI_PROCESSOR_TYPES[] = +{ + "PROCESSOR_FRAGMENT", + "PROCESSOR_VERTEX", + "PROCESSOR_GEOMETRY" +}; + +static const char *TGSI_TOKEN_TYPES[] = +{ + "TOKEN_TYPE_DECLARATION", + "TOKEN_TYPE_IMMEDIATE", + "TOKEN_TYPE_INSTRUCTION" +}; + +static const char *TGSI_FILES[] = +{ + "FILE_NULL", + "FILE_CONSTANT", + "FILE_INPUT", + "FILE_OUTPUT", + "FILE_TEMPORARY", + "FILE_SAMPLER", + "FILE_ADDRESS", + "FILE_IMMEDIATE" +}; + +static const char *TGSI_INTERPOLATES[] = +{ + "INTERPOLATE_CONSTANT", + "INTERPOLATE_LINEAR", + "INTERPOLATE_PERSPECTIVE" +}; + +static const char *TGSI_SEMANTICS[] = +{ + "SEMANTIC_POSITION", + "SEMANTIC_COLOR", + "SEMANTIC_BCOLOR", + "SEMANTIC_FOG", + "SEMANTIC_PSIZE", + "SEMANTIC_GENERIC", + "SEMANTIC_NORMAL" +}; + +static const char *TGSI_IMMS[] = +{ + "IMM_FLOAT32" +}; + +static const char *TGSI_SATS[] = +{ + "SAT_NONE", + "SAT_ZERO_ONE", + "SAT_MINUS_PLUS_ONE" +}; + +static const char *TGSI_INSTRUCTION_EXTS[] = +{ + "INSTRUCTION_EXT_TYPE_NV", + "INSTRUCTION_EXT_TYPE_LABEL", + "INSTRUCTION_EXT_TYPE_TEXTURE" +}; + +static const char *TGSI_PRECISIONS[] = +{ + "PRECISION_DEFAULT", + "PRECISION_FLOAT32", + "PRECISION_FLOAT16", + "PRECISION_FIXED12" +}; + +static const char *TGSI_CCS[] = +{ + "CC_GT", + "CC_EQ", + "CC_LT", + "CC_UN", + "CC_GE", + "CC_LE", + "CC_NE", + "CC_TR", + "CC_FL" +}; + +static const char *TGSI_SWIZZLES[] = +{ + "SWIZZLE_X", + "SWIZZLE_Y", + "SWIZZLE_Z", + "SWIZZLE_W" +}; + +static const char *TGSI_TEXTURES[] = +{ + "TEXTURE_UNKNOWN", + "TEXTURE_1D", + "TEXTURE_2D", + "TEXTURE_3D", + "TEXTURE_CUBE", + "TEXTURE_RECT", 
+ "TEXTURE_SHADOW1D", + "TEXTURE_SHADOW2D", + "TEXTURE_SHADOWRECT" +}; + +static const char *TGSI_SRC_REGISTER_EXTS[] = +{ + "SRC_REGISTER_EXT_TYPE_SWZ", + "SRC_REGISTER_EXT_TYPE_MOD" +}; + +static const char *TGSI_EXTSWIZZLES[] = +{ + "EXTSWIZZLE_X", + "EXTSWIZZLE_Y", + "EXTSWIZZLE_Z", + "EXTSWIZZLE_W", + "EXTSWIZZLE_ZERO", + "EXTSWIZZLE_ONE" +}; + +static const char *TGSI_WRITEMASKS[] = +{ + "0", + "WRITEMASK_X", + "WRITEMASK_Y", + "WRITEMASK_XY", + "WRITEMASK_Z", + "WRITEMASK_XZ", + "WRITEMASK_YZ", + "WRITEMASK_XYZ", + "WRITEMASK_W", + "WRITEMASK_XW", + "WRITEMASK_YW", + "WRITEMASK_XYW", + "WRITEMASK_ZW", + "WRITEMASK_XZW", + "WRITEMASK_YZW", + "WRITEMASK_XYZW" +}; + +static const char *TGSI_DST_REGISTER_EXTS[] = +{ + "DST_REGISTER_EXT_TYPE_CONDCODE", + "DST_REGISTER_EXT_TYPE_MODULATE" +}; + +static const char *TGSI_MODULATES[] = +{ + "MODULATE_1X", + "MODULATE_2X", + "MODULATE_4X", + "MODULATE_8X", + "MODULATE_HALF", + "MODULATE_QUARTER", + "MODULATE_EIGHTH" +}; + +static void +dump_declaration_verbose( + struct tgsi_full_declaration *decl, + unsigned ignored, + unsigned deflt, + struct tgsi_full_declaration *fd ) +{ + TXT( "\nFile : " ); + ENM( decl->Declaration.File, TGSI_FILES ); + if( deflt || fd->Declaration.UsageMask != decl->Declaration.UsageMask ) { + TXT( "\nUsageMask : " ); + if( decl->Declaration.UsageMask & TGSI_WRITEMASK_X ) { + CHR( 'X' ); + } + if( decl->Declaration.UsageMask & TGSI_WRITEMASK_Y ) { + CHR( 'Y' ); + } + if( decl->Declaration.UsageMask & TGSI_WRITEMASK_Z ) { + CHR( 'Z' ); + } + if( decl->Declaration.UsageMask & TGSI_WRITEMASK_W ) { + CHR( 'W' ); + } + } + if( deflt || fd->Declaration.Interpolate != decl->Declaration.Interpolate ) { + TXT( "\nInterpolate: " ); + ENM( decl->Declaration.Interpolate, TGSI_INTERPOLATES ); + } + if( deflt || fd->Declaration.Semantic != decl->Declaration.Semantic ) { + TXT( "\nSemantic : " ); + UID( decl->Declaration.Semantic ); + } + if( ignored ) { + TXT( "\nPadding : " ); + UIX( decl->Declaration.Padding ); + } + + EOL(); + TXT( "\nFirst: " ); + UID( decl->DeclarationRange.First ); + TXT( "\nLast : " ); + UID( decl->DeclarationRange.Last ); + + if( decl->Declaration.Semantic ) { + EOL(); + TXT( "\nSemanticName : " ); + ENM( decl->Semantic.SemanticName, TGSI_SEMANTICS ); + TXT( "\nSemanticIndex: " ); + UID( decl->Semantic.SemanticIndex ); + if( ignored ) { + TXT( "\nPadding : " ); + UIX( decl->Semantic.Padding ); + } + } +} + +static void +dump_immediate_verbose( + struct tgsi_full_immediate *imm, + unsigned ignored ) +{ + unsigned i; + + TXT( "\nDataType : " ); + ENM( imm->Immediate.DataType, TGSI_IMMS ); + if( ignored ) { + TXT( "\nPadding : " ); + UIX( imm->Immediate.Padding ); + } + + for( i = 0; i < imm->Immediate.Size - 1; i++ ) { + EOL(); + switch( imm->Immediate.DataType ) { + case TGSI_IMM_FLOAT32: + TXT( "\nFloat: " ); + FLT( imm->u.ImmediateFloat32[i].Float ); + break; + + default: + assert( 0 ); + } + } +} + +static void +dump_instruction_verbose( + struct tgsi_full_instruction *inst, + unsigned ignored, + unsigned deflt, + struct tgsi_full_instruction *fi ) +{ + unsigned i; + + TXT( "\nOpcode : OPCODE_" ); + TXT( tgsi_get_opcode_info( inst->Instruction.Opcode )->mnemonic ); + if( deflt || fi->Instruction.Saturate != inst->Instruction.Saturate ) { + TXT( "\nSaturate : " ); + ENM( inst->Instruction.Saturate, TGSI_SATS ); + } + if( deflt || fi->Instruction.NumDstRegs != inst->Instruction.NumDstRegs ) { + TXT( "\nNumDstRegs : " ); + UID( inst->Instruction.NumDstRegs ); + } + if( deflt || fi->Instruction.NumSrcRegs != 
inst->Instruction.NumSrcRegs ) { + TXT( "\nNumSrcRegs : " ); + UID( inst->Instruction.NumSrcRegs ); + } + if( ignored ) { + TXT( "\nPadding : " ); + UIX( inst->Instruction.Padding ); + } + + if( deflt || tgsi_compare_instruction_ext_nv( inst->InstructionExtNv, fi->InstructionExtNv ) ) { + EOL(); + TXT( "\nType : " ); + ENM( inst->InstructionExtNv.Type, TGSI_INSTRUCTION_EXTS ); + if( deflt || fi->InstructionExtNv.Precision != inst->InstructionExtNv.Precision ) { + TXT( "\nPrecision : " ); + ENM( inst->InstructionExtNv.Precision, TGSI_PRECISIONS ); + } + if( deflt || fi->InstructionExtNv.CondDstIndex != inst->InstructionExtNv.CondDstIndex ) { + TXT( "\nCondDstIndex : " ); + UID( inst->InstructionExtNv.CondDstIndex ); + } + if( deflt || fi->InstructionExtNv.CondFlowIndex != inst->InstructionExtNv.CondFlowIndex ) { + TXT( "\nCondFlowIndex : " ); + UID( inst->InstructionExtNv.CondFlowIndex ); + } + if( deflt || fi->InstructionExtNv.CondMask != inst->InstructionExtNv.CondMask ) { + TXT( "\nCondMask : " ); + ENM( inst->InstructionExtNv.CondMask, TGSI_CCS ); + } + if( deflt || fi->InstructionExtNv.CondSwizzleX != inst->InstructionExtNv.CondSwizzleX ) { + TXT( "\nCondSwizzleX : " ); + ENM( inst->InstructionExtNv.CondSwizzleX, TGSI_SWIZZLES ); + } + if( deflt || fi->InstructionExtNv.CondSwizzleY != inst->InstructionExtNv.CondSwizzleY ) { + TXT( "\nCondSwizzleY : " ); + ENM( inst->InstructionExtNv.CondSwizzleY, TGSI_SWIZZLES ); + } + if( deflt || fi->InstructionExtNv.CondSwizzleZ != inst->InstructionExtNv.CondSwizzleZ ) { + TXT( "\nCondSwizzleZ : " ); + ENM( inst->InstructionExtNv.CondSwizzleZ, TGSI_SWIZZLES ); + } + if( deflt || fi->InstructionExtNv.CondSwizzleW != inst->InstructionExtNv.CondSwizzleW ) { + TXT( "\nCondSwizzleW : " ); + ENM( inst->InstructionExtNv.CondSwizzleW, TGSI_SWIZZLES ); + } + if( deflt || fi->InstructionExtNv.CondDstUpdate != inst->InstructionExtNv.CondDstUpdate ) { + TXT( "\nCondDstUpdate : " ); + UID( inst->InstructionExtNv.CondDstUpdate ); + } + if( deflt || fi->InstructionExtNv.CondFlowEnable != inst->InstructionExtNv.CondFlowEnable ) { + TXT( "\nCondFlowEnable: " ); + UID( inst->InstructionExtNv.CondFlowEnable ); + } + if( ignored ) { + TXT( "\nPadding : " ); + UIX( inst->InstructionExtNv.Padding ); + if( deflt || fi->InstructionExtNv.Extended != inst->InstructionExtNv.Extended ) { + TXT( "\nExtended : " ); + UID( inst->InstructionExtNv.Extended ); + } + } + } + + if( deflt || tgsi_compare_instruction_ext_label( inst->InstructionExtLabel, fi->InstructionExtLabel ) ) { + EOL(); + TXT( "\nType : " ); + ENM( inst->InstructionExtLabel.Type, TGSI_INSTRUCTION_EXTS ); + if( deflt || fi->InstructionExtLabel.Label != inst->InstructionExtLabel.Label ) { + TXT( "\nLabel : " ); + UID( inst->InstructionExtLabel.Label ); + } + if( ignored ) { + TXT( "\nPadding : " ); + UIX( inst->InstructionExtLabel.Padding ); + if( deflt || fi->InstructionExtLabel.Extended != inst->InstructionExtLabel.Extended ) { + TXT( "\nExtended: " ); + UID( inst->InstructionExtLabel.Extended ); + } + } + } + + if( deflt || tgsi_compare_instruction_ext_texture( inst->InstructionExtTexture, fi->InstructionExtTexture ) ) { + EOL(); + TXT( "\nType : " ); + ENM( inst->InstructionExtTexture.Type, TGSI_INSTRUCTION_EXTS ); + if( deflt || fi->InstructionExtTexture.Texture != inst->InstructionExtTexture.Texture ) { + TXT( "\nTexture : " ); + ENM( inst->InstructionExtTexture.Texture, TGSI_TEXTURES ); + } + if( ignored ) { + TXT( "\nPadding : " ); + UIX( inst->InstructionExtTexture.Padding ); + if( deflt || 
fi->InstructionExtTexture.Extended != inst->InstructionExtTexture.Extended ) { + TXT( "\nExtended: " ); + UID( inst->InstructionExtTexture.Extended ); + } + } + } + + for( i = 0; i < inst->Instruction.NumDstRegs; i++ ) { + struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i]; + struct tgsi_full_dst_register *fd = &fi->FullDstRegisters[i]; + + EOL(); + TXT( "\nFile : " ); + ENM( dst->DstRegister.File, TGSI_FILES ); + if( deflt || fd->DstRegister.WriteMask != dst->DstRegister.WriteMask ) { + TXT( "\nWriteMask: " ); + ENM( dst->DstRegister.WriteMask, TGSI_WRITEMASKS ); + } + if( ignored ) { + if( deflt || fd->DstRegister.Indirect != dst->DstRegister.Indirect ) { + TXT( "\nIndirect : " ); + UID( dst->DstRegister.Indirect ); + } + if( deflt || fd->DstRegister.Dimension != dst->DstRegister.Dimension ) { + TXT( "\nDimension: " ); + UID( dst->DstRegister.Dimension ); + } + } + if( deflt || fd->DstRegister.Index != dst->DstRegister.Index ) { + TXT( "\nIndex : " ); + SID( dst->DstRegister.Index ); + } + if( ignored ) { + TXT( "\nPadding : " ); + UIX( dst->DstRegister.Padding ); + if( deflt || fd->DstRegister.Extended != dst->DstRegister.Extended ) { + TXT( "\nExtended : " ); + UID( dst->DstRegister.Extended ); + } + } + + if( deflt || tgsi_compare_dst_register_ext_concode( dst->DstRegisterExtConcode, fd->DstRegisterExtConcode ) ) { + EOL(); + TXT( "\nType : " ); + ENM( dst->DstRegisterExtConcode.Type, TGSI_DST_REGISTER_EXTS ); + if( deflt || fd->DstRegisterExtConcode.CondMask != dst->DstRegisterExtConcode.CondMask ) { + TXT( "\nCondMask : " ); + ENM( dst->DstRegisterExtConcode.CondMask, TGSI_CCS ); + } + if( deflt || fd->DstRegisterExtConcode.CondSwizzleX != dst->DstRegisterExtConcode.CondSwizzleX ) { + TXT( "\nCondSwizzleX: " ); + ENM( dst->DstRegisterExtConcode.CondSwizzleX, TGSI_SWIZZLES ); + } + if( deflt || fd->DstRegisterExtConcode.CondSwizzleY != dst->DstRegisterExtConcode.CondSwizzleY ) { + TXT( "\nCondSwizzleY: " ); + ENM( dst->DstRegisterExtConcode.CondSwizzleY, TGSI_SWIZZLES ); + } + if( deflt || fd->DstRegisterExtConcode.CondSwizzleZ != dst->DstRegisterExtConcode.CondSwizzleZ ) { + TXT( "\nCondSwizzleZ: " ); + ENM( dst->DstRegisterExtConcode.CondSwizzleZ, TGSI_SWIZZLES ); + } + if( deflt || fd->DstRegisterExtConcode.CondSwizzleW != dst->DstRegisterExtConcode.CondSwizzleW ) { + TXT( "\nCondSwizzleW: " ); + ENM( dst->DstRegisterExtConcode.CondSwizzleW, TGSI_SWIZZLES ); + } + if( deflt || fd->DstRegisterExtConcode.CondSrcIndex != dst->DstRegisterExtConcode.CondSrcIndex ) { + TXT( "\nCondSrcIndex: " ); + UID( dst->DstRegisterExtConcode.CondSrcIndex ); + } + if( ignored ) { + TXT( "\nPadding : " ); + UIX( dst->DstRegisterExtConcode.Padding ); + if( deflt || fd->DstRegisterExtConcode.Extended != dst->DstRegisterExtConcode.Extended ) { + TXT( "\nExtended : " ); + UID( dst->DstRegisterExtConcode.Extended ); + } + } + } + + if( deflt || tgsi_compare_dst_register_ext_modulate( dst->DstRegisterExtModulate, fd->DstRegisterExtModulate ) ) { + EOL(); + TXT( "\nType : " ); + ENM( dst->DstRegisterExtModulate.Type, TGSI_DST_REGISTER_EXTS ); + if( deflt || fd->DstRegisterExtModulate.Modulate != dst->DstRegisterExtModulate.Modulate ) { + TXT( "\nModulate: " ); + ENM( dst->DstRegisterExtModulate.Modulate, TGSI_MODULATES ); + } + if( ignored ) { + TXT( "\nPadding : " ); + UIX( dst->DstRegisterExtModulate.Padding ); + if( deflt || fd->DstRegisterExtModulate.Extended != dst->DstRegisterExtModulate.Extended ) { + TXT( "\nExtended: " ); + UID( dst->DstRegisterExtModulate.Extended ); + } + } + } + } + 
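+ /* The source registers are dumped next with the same filtering as the destination registers above: each field is printed only when it differs from the corresponding default in 'fi', unless the TGSI_DUMP_C_DEFAULT flag set 'deflt'. */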
+ for( i = 0; i < inst->Instruction.NumSrcRegs; i++ ) { + struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i]; + struct tgsi_full_src_register *fs = &fi->FullSrcRegisters[i]; + + EOL(); + TXT( "\nFile : "); + ENM( src->SrcRegister.File, TGSI_FILES ); + if( deflt || fs->SrcRegister.SwizzleX != src->SrcRegister.SwizzleX ) { + TXT( "\nSwizzleX : " ); + ENM( src->SrcRegister.SwizzleX, TGSI_SWIZZLES ); + } + if( deflt || fs->SrcRegister.SwizzleY != src->SrcRegister.SwizzleY ) { + TXT( "\nSwizzleY : " ); + ENM( src->SrcRegister.SwizzleY, TGSI_SWIZZLES ); + } + if( deflt || fs->SrcRegister.SwizzleZ != src->SrcRegister.SwizzleZ ) { + TXT( "\nSwizzleZ : " ); + ENM( src->SrcRegister.SwizzleZ, TGSI_SWIZZLES ); + } + if( deflt || fs->SrcRegister.SwizzleW != src->SrcRegister.SwizzleW ) { + TXT( "\nSwizzleW : " ); + ENM( src->SrcRegister.SwizzleW, TGSI_SWIZZLES ); + } + if( deflt || fs->SrcRegister.Negate != src->SrcRegister.Negate ) { + TXT( "\nNegate : " ); + UID( src->SrcRegister.Negate ); + } + if( ignored ) { + if( deflt || fs->SrcRegister.Indirect != src->SrcRegister.Indirect ) { + TXT( "\nIndirect : " ); + UID( src->SrcRegister.Indirect ); + } + if( deflt || fs->SrcRegister.Dimension != src->SrcRegister.Dimension ) { + TXT( "\nDimension: " ); + UID( src->SrcRegister.Dimension ); + } + } + if( deflt || fs->SrcRegister.Index != src->SrcRegister.Index ) { + TXT( "\nIndex : " ); + SID( src->SrcRegister.Index ); + } + if( ignored ) { + if( deflt || fs->SrcRegister.Extended != src->SrcRegister.Extended ) { + TXT( "\nExtended : " ); + UID( src->SrcRegister.Extended ); + } + } + + if( deflt || tgsi_compare_src_register_ext_swz( src->SrcRegisterExtSwz, fs->SrcRegisterExtSwz ) ) { + EOL(); + TXT( "\nType : " ); + ENM( src->SrcRegisterExtSwz.Type, TGSI_SRC_REGISTER_EXTS ); + if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleX != src->SrcRegisterExtSwz.ExtSwizzleX ) { + TXT( "\nExtSwizzleX: " ); + ENM( src->SrcRegisterExtSwz.ExtSwizzleX, TGSI_EXTSWIZZLES ); + } + if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleY != src->SrcRegisterExtSwz.ExtSwizzleY ) { + TXT( "\nExtSwizzleY: " ); + ENM( src->SrcRegisterExtSwz.ExtSwizzleY, TGSI_EXTSWIZZLES ); + } + if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleZ != src->SrcRegisterExtSwz.ExtSwizzleZ ) { + TXT( "\nExtSwizzleZ: " ); + ENM( src->SrcRegisterExtSwz.ExtSwizzleZ, TGSI_EXTSWIZZLES ); + } + if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleW != src->SrcRegisterExtSwz.ExtSwizzleW ) { + TXT( "\nExtSwizzleW: " ); + ENM( src->SrcRegisterExtSwz.ExtSwizzleW, TGSI_EXTSWIZZLES ); + } + if( deflt || fs->SrcRegisterExtSwz.NegateX != src->SrcRegisterExtSwz.NegateX ) { + TXT( "\nNegateX : " ); + UID( src->SrcRegisterExtSwz.NegateX ); + } + if( deflt || fs->SrcRegisterExtSwz.NegateY != src->SrcRegisterExtSwz.NegateY ) { + TXT( "\nNegateY : " ); + UID( src->SrcRegisterExtSwz.NegateY ); + } + if( deflt || fs->SrcRegisterExtSwz.NegateZ != src->SrcRegisterExtSwz.NegateZ ) { + TXT( "\nNegateZ : " ); + UID( src->SrcRegisterExtSwz.NegateZ ); + } + if( deflt || fs->SrcRegisterExtSwz.NegateW != src->SrcRegisterExtSwz.NegateW ) { + TXT( "\nNegateW : " ); + UID( src->SrcRegisterExtSwz.NegateW ); + } + if( ignored ) { + TXT( "\nPadding : " ); + UIX( src->SrcRegisterExtSwz.Padding ); + if( deflt || fs->SrcRegisterExtSwz.Extended != src->SrcRegisterExtSwz.Extended ) { + TXT( "\nExtended : " ); + UID( src->SrcRegisterExtSwz.Extended ); + } + } + } + + if( deflt || tgsi_compare_src_register_ext_mod( src->SrcRegisterExtMod, fs->SrcRegisterExtMod ) ) { + EOL(); + TXT( "\nType : " ); + ENM( 
src->SrcRegisterExtMod.Type, TGSI_SRC_REGISTER_EXTS ); + if( deflt || fs->SrcRegisterExtMod.Complement != src->SrcRegisterExtMod.Complement ) { + TXT( "\nComplement: " ); + UID( src->SrcRegisterExtMod.Complement ); + } + if( deflt || fs->SrcRegisterExtMod.Bias != src->SrcRegisterExtMod.Bias ) { + TXT( "\nBias : " ); + UID( src->SrcRegisterExtMod.Bias ); + } + if( deflt || fs->SrcRegisterExtMod.Scale2X != src->SrcRegisterExtMod.Scale2X ) { + TXT( "\nScale2X : " ); + UID( src->SrcRegisterExtMod.Scale2X ); + } + if( deflt || fs->SrcRegisterExtMod.Absolute != src->SrcRegisterExtMod.Absolute ) { + TXT( "\nAbsolute : " ); + UID( src->SrcRegisterExtMod.Absolute ); + } + if( deflt || fs->SrcRegisterExtMod.Negate != src->SrcRegisterExtMod.Negate ) { + TXT( "\nNegate : " ); + UID( src->SrcRegisterExtMod.Negate ); + } + if( ignored ) { + TXT( "\nPadding : " ); + UIX( src->SrcRegisterExtMod.Padding ); + if( deflt || fs->SrcRegisterExtMod.Extended != src->SrcRegisterExtMod.Extended ) { + TXT( "\nExtended : " ); + UID( src->SrcRegisterExtMod.Extended ); + } + } + } + } +} + +void +tgsi_dump_c( + const struct tgsi_token *tokens, + uint flags ) +{ + struct tgsi_parse_context parse; + struct tgsi_full_instruction fi; + struct tgsi_full_declaration fd; + uint ignored = flags & TGSI_DUMP_C_IGNORED; + uint deflt = flags & TGSI_DUMP_C_DEFAULT; + uint instno = 0; + + tgsi_parse_init( &parse, tokens ); + + TXT( "tgsi-dump begin -----------------" ); + + TXT( "\nMajorVersion: " ); + UID( parse.FullVersion.Version.MajorVersion ); + TXT( "\nMinorVersion: " ); + UID( parse.FullVersion.Version.MinorVersion ); + EOL(); + + TXT( "\nHeaderSize: " ); + UID( parse.FullHeader.Header.HeaderSize ); + TXT( "\nBodySize : " ); + UID( parse.FullHeader.Header.BodySize ); + TXT( "\nProcessor : " ); + ENM( parse.FullHeader.Processor.Processor, TGSI_PROCESSOR_TYPES ); + EOL(); + + fi = tgsi_default_full_instruction(); + fd = tgsi_default_full_declaration(); + + while( !tgsi_parse_end_of_tokens( &parse ) ) { + tgsi_parse_token( &parse ); + + TXT( "\nType : " ); + ENM( parse.FullToken.Token.Type, TGSI_TOKEN_TYPES ); + if( ignored ) { + TXT( "\nSize : " ); + UID( parse.FullToken.Token.Size ); + if( deflt || parse.FullToken.Token.Extended ) { + TXT( "\nExtended : " ); + UID( parse.FullToken.Token.Extended ); + } + } + + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + dump_declaration_verbose( + &parse.FullToken.FullDeclaration, + ignored, + deflt, + &fd ); + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + dump_immediate_verbose( + &parse.FullToken.FullImmediate, + ignored ); + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + dump_instruction_verbose( + &parse.FullToken.FullInstruction, + ignored, + deflt, + &fi ); + break; + + default: + assert( 0 ); + } + + EOL(); + } + + TXT( "\ntgsi-dump end -------------------\n" ); + + tgsi_parse_free( &parse ); +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump_c.h b/src/gallium/auxiliary/tgsi/tgsi_dump_c.h new file mode 100644 index 0000000000..d91cd35b3b --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_dump_c.h @@ -0,0 +1,49 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TGSI_DUMP_C_H +#define TGSI_DUMP_C_H + +#include "pipe/p_shader_tokens.h" + +#if defined __cplusplus +extern "C" { +#endif + +#define TGSI_DUMP_C_IGNORED 1 +#define TGSI_DUMP_C_DEFAULT 2 + +void +tgsi_dump_c( + const struct tgsi_token *tokens, + uint flags ); + +#if defined __cplusplus +} +#endif + +#endif /* TGSI_DUMP_C_H */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c new file mode 100644 index 0000000000..a2d2cfd1fc --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -0,0 +1,2923 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * TGSI interpreter/executor. + * + * Flow control information: + * + * Since we operate on 'quads' (4 pixels or 4 vertices in parallel) + * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special + * care since a condition may be true for some quad components but false + * for other components. 
+ * + * We basically execute all statements (even if they're in the part of + * an IF/ELSE clause that's "not taken") and use a special mask to + * control writing to destination registers. This is the ExecMask. + * See store_dest(). + * + * The ExecMask is computed from four other masks (CondMask, LoopMask, + * ContMask and FuncMask) which are controlled by the flow control + * instructions (namely IF/ELSE/ENDIF, LOOP/ENDLOOP, CONT and CAL/RET). + * + * + * Authors: + * Michal Krol + * Brian Paul + */ + +#include "pipe/p_compiler.h" +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi_exec.h" +#include "util/u_memory.h" +#include "util/u_math.h" + +#define FAST_MATH 1 + +#define TILE_TOP_LEFT 0 +#define TILE_TOP_RIGHT 1 +#define TILE_BOTTOM_LEFT 2 +#define TILE_BOTTOM_RIGHT 3 + +#define CHAN_X 0 +#define CHAN_Y 1 +#define CHAN_Z 2 +#define CHAN_W 3 + +/* + * Shorthand locations of various utility registers (_I = Index, _C = Channel) + */ +#define TEMP_0_I TGSI_EXEC_TEMP_00000000_I +#define TEMP_0_C TGSI_EXEC_TEMP_00000000_C +#define TEMP_7F_I TGSI_EXEC_TEMP_7FFFFFFF_I +#define TEMP_7F_C TGSI_EXEC_TEMP_7FFFFFFF_C +#define TEMP_80_I TGSI_EXEC_TEMP_80000000_I +#define TEMP_80_C TGSI_EXEC_TEMP_80000000_C +#define TEMP_FF_I TGSI_EXEC_TEMP_FFFFFFFF_I +#define TEMP_FF_C TGSI_EXEC_TEMP_FFFFFFFF_C +#define TEMP_1_I TGSI_EXEC_TEMP_ONE_I +#define TEMP_1_C TGSI_EXEC_TEMP_ONE_C +#define TEMP_2_I TGSI_EXEC_TEMP_TWO_I +#define TEMP_2_C TGSI_EXEC_TEMP_TWO_C +#define TEMP_128_I TGSI_EXEC_TEMP_128_I +#define TEMP_128_C TGSI_EXEC_TEMP_128_C +#define TEMP_M128_I TGSI_EXEC_TEMP_MINUS_128_I +#define TEMP_M128_C TGSI_EXEC_TEMP_MINUS_128_C +#define TEMP_KILMASK_I TGSI_EXEC_TEMP_KILMASK_I +#define TEMP_KILMASK_C TGSI_EXEC_TEMP_KILMASK_C +#define TEMP_OUTPUT_I TGSI_EXEC_TEMP_OUTPUT_I +#define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C +#define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I +#define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C +#define TEMP_CC_I TGSI_EXEC_TEMP_CC_I +#define TEMP_CC_C TGSI_EXEC_TEMP_CC_C +#define TEMP_3_I TGSI_EXEC_TEMP_THREE_I +#define TEMP_3_C TGSI_EXEC_TEMP_THREE_C +#define TEMP_HALF_I TGSI_EXEC_TEMP_HALF_I +#define TEMP_HALF_C TGSI_EXEC_TEMP_HALF_C +#define TEMP_R0 TGSI_EXEC_TEMP_R0 + +#define IS_CHANNEL_ENABLED(INST, CHAN)\ + ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN))) + +#define IS_CHANNEL_ENABLED2(INST, CHAN)\ + ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN))) + +#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\ + for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\ + if (IS_CHANNEL_ENABLED( INST, CHAN )) + +#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\ + for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)\ + if (IS_CHANNEL_ENABLED2( INST, CHAN )) + + +/** The execution mask is the AND of the conditional, loop, continue and function masks */ +#define UPDATE_EXEC_MASK(MACH) \ + MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask + +/** + * Initialize machine state by expanding tokens to full instructions, + * allocating temporary storage, setting up constants, etc. + * After this, we can call tgsi_exec_machine_run() many times.
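+ * + * A typical calling sequence might look like this (a sketch only; the exact prototype of tgsi_exec_machine_run() is declared in tgsi_exec.h): + * + * struct tgsi_exec_machine mach; + * tgsi_exec_machine_init( &mach ); + * tgsi_exec_machine_bind_shader( &mach, tokens, numSamplers, samplers ); + * ... point mach.Consts at the constant buffer and fill mach.Inputs ... + * tgsi_exec_machine_run( &mach );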
+ */ +void +tgsi_exec_machine_bind_shader( + struct tgsi_exec_machine *mach, + const struct tgsi_token *tokens, + uint numSamplers, + struct tgsi_sampler **samplers) +{ + uint k; + struct tgsi_parse_context parse; + struct tgsi_exec_labels *labels = &mach->Labels; + struct tgsi_full_instruction *instructions; + struct tgsi_full_declaration *declarations; + uint maxInstructions = 10, numInstructions = 0; + uint maxDeclarations = 10, numDeclarations = 0; + uint instno = 0; + +#if 0 + tgsi_dump(tokens, 0); +#endif + + util_init_math(); + + mach->Tokens = tokens; + mach->Samplers = samplers; + + k = tgsi_parse_init (&parse, mach->Tokens); + if (k != TGSI_PARSE_OK) { + debug_printf( "Problem parsing!\n" ); + return; + } + + mach->Processor = parse.FullHeader.Processor.Processor; + mach->ImmLimit = 0; + labels->count = 0; + + declarations = (struct tgsi_full_declaration *) + MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) ); + + if (!declarations) { + return; + } + + instructions = (struct tgsi_full_instruction *) + MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) ); + + if (!instructions) { + FREE( declarations ); + return; + } + + while( !tgsi_parse_end_of_tokens( &parse ) ) { + uint pointer = parse.Position; + uint i; + + tgsi_parse_token( &parse ); + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + /* save expanded declaration */ + if (numDeclarations == maxDeclarations) { + declarations = REALLOC(declarations, + maxDeclarations + * sizeof(struct tgsi_full_declaration), + (maxDeclarations + 10) + * sizeof(struct tgsi_full_declaration)); + maxDeclarations += 10; + } + memcpy(declarations + numDeclarations, + &parse.FullToken.FullDeclaration, + sizeof(declarations[0])); + numDeclarations++; + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + uint size = parse.FullToken.FullImmediate.Immediate.Size - 1; + assert( size % 4 == 0 ); + assert( mach->ImmLimit + size / 4 <= TGSI_EXEC_NUM_IMMEDIATES ); + + for( i = 0; i < size; i++ ) { + mach->Imms[mach->ImmLimit + i / 4][i % 4] = + parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float; + } + mach->ImmLimit += size / 4; + } + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + assert( labels->count < MAX_LABELS ); + + labels->labels[labels->count][0] = instno; + labels->labels[labels->count][1] = pointer; + labels->count++; + + /* save expanded instruction */ + if (numInstructions == maxInstructions) { + instructions = REALLOC(instructions, + maxInstructions + * sizeof(struct tgsi_full_instruction), + (maxInstructions + 10) + * sizeof(struct tgsi_full_instruction)); + maxInstructions += 10; + } + memcpy(instructions + numInstructions, + &parse.FullToken.FullInstruction, + sizeof(instructions[0])); + numInstructions++; + break; + + default: + assert( 0 ); + } + } + tgsi_parse_free (&parse); + + if (mach->Declarations) { + FREE( mach->Declarations ); + } + mach->Declarations = declarations; + mach->NumDeclarations = numDeclarations; + + if (mach->Instructions) { + FREE( mach->Instructions ); + } + mach->Instructions = instructions; + mach->NumInstructions = numInstructions; +} + + +void +tgsi_exec_machine_init( + struct tgsi_exec_machine *mach ) +{ + uint i; + + mach->Temps = (struct tgsi_exec_vector *) tgsi_align_128bit( mach->_Temps); + mach->Addrs = &mach->Temps[TGSI_EXEC_TEMP_ADDR]; + + /* Setup constants. 
*/ + for( i = 0; i < 4; i++ ) { + mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000; + mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF; + mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000; + mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF; + mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f; + mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f; + mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f; + mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f; + mach->Temps[TEMP_3_I].xyzw[TEMP_3_C].f[i] = 3.0f; + mach->Temps[TEMP_HALF_I].xyzw[TEMP_HALF_C].f[i] = 0.5f; + } +} + + +void +tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach) +{ + if (mach->Instructions) { + FREE(mach->Instructions); + mach->Instructions = NULL; + mach->NumInstructions = 0; + } + if (mach->Declarations) { + FREE(mach->Declarations); + mach->Declarations = NULL; + mach->NumDeclarations = 0; + } +} + + +static void +micro_abs( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->f[0] = fabsf( src->f[0] ); + dst->f[1] = fabsf( src->f[1] ); + dst->f[2] = fabsf( src->f[2] ); + dst->f[3] = fabsf( src->f[3] ); +} + +static void +micro_add( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->f[0] = src0->f[0] + src1->f[0]; + dst->f[1] = src0->f[1] + src1->f[1]; + dst->f[2] = src0->f[2] + src1->f[2]; + dst->f[3] = src0->f[3] + src1->f[3]; +} + +static void +micro_iadd( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->i[0] = src0->i[0] + src1->i[0]; + dst->i[1] = src0->i[1] + src1->i[1]; + dst->i[2] = src0->i[2] + src1->i[2]; + dst->i[3] = src0->i[3] + src1->i[3]; +} + +static void +micro_and( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->u[0] = src0->u[0] & src1->u[0]; + dst->u[1] = src0->u[1] & src1->u[1]; + dst->u[2] = src0->u[2] & src1->u[2]; + dst->u[3] = src0->u[3] & src1->u[3]; +} + +static void +micro_ceil( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->f[0] = ceilf( src->f[0] ); + dst->f[1] = ceilf( src->f[1] ); + dst->f[2] = ceilf( src->f[2] ); + dst->f[3] = ceilf( src->f[3] ); +} + +static void +micro_cos( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->f[0] = cosf( src->f[0] ); + dst->f[1] = cosf( src->f[1] ); + dst->f[2] = cosf( src->f[2] ); + dst->f[3] = cosf( src->f[3] ); +} + +static void +micro_ddx( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->f[0] = + dst->f[1] = + dst->f[2] = + dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT]; +} + +static void +micro_ddy( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->f[0] = + dst->f[1] = + dst->f[2] = + dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT]; +} + +static void +micro_div( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + if (src1->f[0] != 0) { + dst->f[0] = src0->f[0] / src1->f[0]; + } + if (src1->f[1] != 0) { + dst->f[1] = src0->f[1] / src1->f[1]; + } + if (src1->f[2] != 0) { + dst->f[2] = src0->f[2] / src1->f[2]; + } + if (src1->f[3] != 0) { + dst->f[3] = src0->f[3] / src1->f[3]; + } +} + +static void +micro_udiv( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->u[0] = src0->u[0] / 
src1->u[0]; + dst->u[1] = src0->u[1] / src1->u[1]; + dst->u[2] = src0->u[2] / src1->u[2]; + dst->u[3] = src0->u[3] / src1->u[3]; +} + +static void +micro_eq( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) +{ + dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0]; + dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1]; + dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2]; + dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3]; +} + +static void +micro_ieq( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) +{ + dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0]; + dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1]; + dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2]; + dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3]; +} + +static void +micro_exp2( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src) +{ +#if FAST_MATH + dst->f[0] = util_fast_exp2( src->f[0] ); + dst->f[1] = util_fast_exp2( src->f[1] ); + dst->f[2] = util_fast_exp2( src->f[2] ); + dst->f[3] = util_fast_exp2( src->f[3] ); +#else + dst->f[0] = powf( 2.0f, src->f[0] ); + dst->f[1] = powf( 2.0f, src->f[1] ); + dst->f[2] = powf( 2.0f, src->f[2] ); + dst->f[3] = powf( 2.0f, src->f[3] ); +#endif +} + +static void +micro_f2ut( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->u[0] = (uint) src->f[0]; + dst->u[1] = (uint) src->f[1]; + dst->u[2] = (uint) src->f[2]; + dst->u[3] = (uint) src->f[3]; +} + +static void +micro_flr( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->f[0] = floorf( src->f[0] ); + dst->f[1] = floorf( src->f[1] ); + dst->f[2] = floorf( src->f[2] ); + dst->f[3] = floorf( src->f[3] ); +} + +static void +micro_frc( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->f[0] = src->f[0] - floorf( src->f[0] ); + dst->f[1] = src->f[1] - floorf( src->f[1] ); + dst->f[2] = src->f[2] - floorf( src->f[2] ); + dst->f[3] = src->f[3] - floorf( src->f[3] ); +} + +static void +micro_ge( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) +{ + dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0]; + dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1]; + dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2]; + dst->f[3] = src0->f[3] >= src1->f[3] ? 
src2->f[3] : src3->f[3]; +} + +static void +micro_i2f( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->f[0] = (float) src->i[0]; + dst->f[1] = (float) src->i[1]; + dst->f[2] = (float) src->i[2]; + dst->f[3] = (float) src->i[3]; +} + +static void +micro_lg2( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ +#if FAST_MATH + dst->f[0] = util_fast_log2( src->f[0] ); + dst->f[1] = util_fast_log2( src->f[1] ); + dst->f[2] = util_fast_log2( src->f[2] ); + dst->f[3] = util_fast_log2( src->f[3] ); +#else + dst->f[0] = logf( src->f[0] ) * 1.442695f; + dst->f[1] = logf( src->f[1] ) * 1.442695f; + dst->f[2] = logf( src->f[2] ) * 1.442695f; + dst->f[3] = logf( src->f[3] ) * 1.442695f; +#endif +} + +static void +micro_le( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) +{ + dst->f[0] = src0->f[0] <= src1->f[0] ? src2->f[0] : src3->f[0]; + dst->f[1] = src0->f[1] <= src1->f[1] ? src2->f[1] : src3->f[1]; + dst->f[2] = src0->f[2] <= src1->f[2] ? src2->f[2] : src3->f[2]; + dst->f[3] = src0->f[3] <= src1->f[3] ? src2->f[3] : src3->f[3]; +} + +static void +micro_lt( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) +{ + dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0]; + dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1]; + dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2]; + dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3]; +} + +static void +micro_ilt( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) +{ + dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0]; + dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1]; + dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2]; + dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3]; +} + +static void +micro_ult( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2, + const union tgsi_exec_channel *src3 ) +{ + dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0]; + dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1]; + dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2]; + dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3]; +} + +static void +micro_max( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0]; + dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1]; + dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2]; + dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3]; +} + +static void +micro_imax( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0]; + dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1]; + dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2]; + dst->i[3] = src0->i[3] > src1->i[3] ? 
src0->i[3] : src1->i[3]; +} + +static void +micro_umax( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0]; + dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1]; + dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2]; + dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3]; +} + +static void +micro_min( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0]; + dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1]; + dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2]; + dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3]; +} + +static void +micro_imin( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0]; + dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1]; + dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2]; + dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3]; +} + +static void +micro_umin( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0]; + dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1]; + dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2]; + dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3]; +} + +static void +micro_umod( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->u[0] = src0->u[0] % src1->u[0]; + dst->u[1] = src0->u[1] % src1->u[1]; + dst->u[2] = src0->u[2] % src1->u[2]; + dst->u[3] = src0->u[3] % src1->u[3]; +} + +static void +micro_mul( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->f[0] = src0->f[0] * src1->f[0]; + dst->f[1] = src0->f[1] * src1->f[1]; + dst->f[2] = src0->f[2] * src1->f[2]; + dst->f[3] = src0->f[3] * src1->f[3]; +} + +static void +micro_imul( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->i[0] = src0->i[0] * src1->i[0]; + dst->i[1] = src0->i[1] * src1->i[1]; + dst->i[2] = src0->i[2] * src1->i[2]; + dst->i[3] = src0->i[3] * src1->i[3]; +} + +static void +micro_imul64( + union tgsi_exec_channel *dst0, + union tgsi_exec_channel *dst1, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst1->i[0] = src0->i[0] * src1->i[0]; + dst1->i[1] = src0->i[1] * src1->i[1]; + dst1->i[2] = src0->i[2] * src1->i[2]; + dst1->i[3] = src0->i[3] * src1->i[3]; + dst0->i[0] = 0; + dst0->i[1] = 0; + dst0->i[2] = 0; + dst0->i[3] = 0; +} + +static void +micro_umul64( + union tgsi_exec_channel *dst0, + union tgsi_exec_channel *dst1, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst1->u[0] = src0->u[0] * src1->u[0]; + dst1->u[1] = src0->u[1] * src1->u[1]; + dst1->u[2] = src0->u[2] * src1->u[2]; + dst1->u[3] = src0->u[3] * src1->u[3]; + dst0->u[0] = 0; + dst0->u[1] = 0; + dst0->u[2] = 0; + dst0->u[3] = 0; +} + +static void +micro_movc( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1, + const union tgsi_exec_channel *src2 ) 
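+ /* per-channel conditional move: dst[i] = src0[i] ? src1[i] : src2[i] */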
+{ + dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0]; + dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1]; + dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2]; + dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3]; +} + +static void +micro_neg( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->f[0] = -src->f[0]; + dst->f[1] = -src->f[1]; + dst->f[2] = -src->f[2]; + dst->f[3] = -src->f[3]; +} + +static void +micro_ineg( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->i[0] = -src->i[0]; + dst->i[1] = -src->i[1]; + dst->i[2] = -src->i[2]; + dst->i[3] = -src->i[3]; +} + +static void +micro_not( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->u[0] = ~src->u[0]; + dst->u[1] = ~src->u[1]; + dst->u[2] = ~src->u[2]; + dst->u[3] = ~src->u[3]; +} + +static void +micro_or( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->u[0] = src0->u[0] | src1->u[0]; + dst->u[1] = src0->u[1] | src1->u[1]; + dst->u[2] = src0->u[2] | src1->u[2]; + dst->u[3] = src0->u[3] | src1->u[3]; +} + +static void +micro_pow( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ +#if FAST_MATH + dst->f[0] = util_fast_pow( src0->f[0], src1->f[0] ); + dst->f[1] = util_fast_pow( src0->f[1], src1->f[1] ); + dst->f[2] = util_fast_pow( src0->f[2], src1->f[2] ); + dst->f[3] = util_fast_pow( src0->f[3], src1->f[3] ); +#else + dst->f[0] = powf( src0->f[0], src1->f[0] ); + dst->f[1] = powf( src0->f[1], src1->f[1] ); + dst->f[2] = powf( src0->f[2], src1->f[2] ); + dst->f[3] = powf( src0->f[3], src1->f[3] ); +#endif +} + +static void +micro_rnd( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->f[0] = floorf( src->f[0] + 0.5f ); + dst->f[1] = floorf( src->f[1] + 0.5f ); + dst->f[2] = floorf( src->f[2] + 0.5f ); + dst->f[3] = floorf( src->f[3] + 0.5f ); +} + +static void +micro_sgn( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->f[0] = src->f[0] < 0.0f ? -1.0f : src->f[0] > 0.0f ? 1.0f : 0.0f; + dst->f[1] = src->f[1] < 0.0f ? -1.0f : src->f[1] > 0.0f ? 1.0f : 0.0f; + dst->f[2] = src->f[2] < 0.0f ? -1.0f : src->f[2] > 0.0f ? 1.0f : 0.0f; + dst->f[3] = src->f[3] < 0.0f ? -1.0f : src->f[3] > 0.0f ? 
1.0f : 0.0f; +} + +static void +micro_shl( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->i[0] = src0->i[0] << src1->i[0]; + dst->i[1] = src0->i[1] << src1->i[1]; + dst->i[2] = src0->i[2] << src1->i[2]; + dst->i[3] = src0->i[3] << src1->i[3]; +} + +static void +micro_ishr( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->i[0] = src0->i[0] >> src1->i[0]; + dst->i[1] = src0->i[1] >> src1->i[1]; + dst->i[2] = src0->i[2] >> src1->i[2]; + dst->i[3] = src0->i[3] >> src1->i[3]; +} + +static void +micro_trunc( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0 ) +{ + dst->f[0] = (float) (int) src0->f[0]; + dst->f[1] = (float) (int) src0->f[1]; + dst->f[2] = (float) (int) src0->f[2]; + dst->f[3] = (float) (int) src0->f[3]; +} + +static void +micro_ushr( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->u[0] = src0->u[0] >> src1->u[0]; + dst->u[1] = src0->u[1] >> src1->u[1]; + dst->u[2] = src0->u[2] >> src1->u[2]; + dst->u[3] = src0->u[3] >> src1->u[3]; +} + +static void +micro_sin( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->f[0] = sinf( src->f[0] ); + dst->f[1] = sinf( src->f[1] ); + dst->f[2] = sinf( src->f[2] ); + dst->f[3] = sinf( src->f[3] ); +} + +static void +micro_sqrt( union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->f[0] = sqrtf( src->f[0] ); + dst->f[1] = sqrtf( src->f[1] ); + dst->f[2] = sqrtf( src->f[2] ); + dst->f[3] = sqrtf( src->f[3] ); +} + +static void +micro_sub( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->f[0] = src0->f[0] - src1->f[0]; + dst->f[1] = src0->f[1] - src1->f[1]; + dst->f[2] = src0->f[2] - src1->f[2]; + dst->f[3] = src0->f[3] - src1->f[3]; +} + +static void +micro_u2f( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src ) +{ + dst->f[0] = (float) src->u[0]; + dst->f[1] = (float) src->u[1]; + dst->f[2] = (float) src->u[2]; + dst->f[3] = (float) src->u[3]; +} + +static void +micro_xor( + union tgsi_exec_channel *dst, + const union tgsi_exec_channel *src0, + const union tgsi_exec_channel *src1 ) +{ + dst->u[0] = src0->u[0] ^ src1->u[0]; + dst->u[1] = src0->u[1] ^ src1->u[1]; + dst->u[2] = src0->u[2] ^ src1->u[2]; + dst->u[3] = src0->u[3] ^ src1->u[3]; +} + +static void +fetch_src_file_channel( + const struct tgsi_exec_machine *mach, + const uint file, + const uint swizzle, + const union tgsi_exec_channel *index, + union tgsi_exec_channel *chan ) +{ + switch( swizzle ) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + switch( file ) { + case TGSI_FILE_CONSTANT: + assert(mach->Consts); + if (index->i[0] < 0) + chan->f[0] = 0.0f; + else + chan->f[0] = mach->Consts[index->i[0]][swizzle]; + if (index->i[1] < 0) + chan->f[1] = 0.0f; + else + chan->f[1] = mach->Consts[index->i[1]][swizzle]; + if (index->i[2] < 0) + chan->f[2] = 0.0f; + else + chan->f[2] = mach->Consts[index->i[2]][swizzle]; + if (index->i[3] < 0) + chan->f[3] = 0.0f; + else + chan->f[3] = mach->Consts[index->i[3]][swizzle]; + break; + + case TGSI_FILE_INPUT: + chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = 
mach->Inputs[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_TEMPORARY: + assert(index->i[0] < TGSI_EXEC_NUM_TEMPS); + chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_IMMEDIATE: + assert( index->i[0] < (int) mach->ImmLimit ); + chan->f[0] = mach->Imms[index->i[0]][swizzle]; + assert( index->i[1] < (int) mach->ImmLimit ); + chan->f[1] = mach->Imms[index->i[1]][swizzle]; + assert( index->i[2] < (int) mach->ImmLimit ); + chan->f[2] = mach->Imms[index->i[2]][swizzle]; + assert( index->i[3] < (int) mach->ImmLimit ); + chan->f[3] = mach->Imms[index->i[3]][swizzle]; + break; + + case TGSI_FILE_ADDRESS: + chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_OUTPUT: + /* vertex/fragment output vars can be read too */ + chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3]; + break; + + default: + assert( 0 ); + } + break; + + case TGSI_EXTSWIZZLE_ZERO: + *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]; + break; + + case TGSI_EXTSWIZZLE_ONE: + *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]; + break; + + default: + assert( 0 ); + } +} + +static void +fetch_source( + const struct tgsi_exec_machine *mach, + union tgsi_exec_channel *chan, + const struct tgsi_full_src_register *reg, + const uint chan_index ) +{ + union tgsi_exec_channel index; + uint swizzle; + + /* We start with a direct index into a register file. + * + * file[1], + * where: + * file = SrcRegister.File + * [1] = SrcRegister.Index + */ + index.i[0] = + index.i[1] = + index.i[2] = + index.i[3] = reg->SrcRegister.Index; + + /* There is an extra source register that indirectly subscripts + * a register file. The direct index now becomes an offset + * that is being added to the indirect register. + * + * file[ind[2].x+1], + * where: + * ind = SrcRegisterInd.File + * [2] = SrcRegisterInd.Index + * .x = SrcRegisterInd.SwizzleX + */ + if (reg->SrcRegister.Indirect) { + union tgsi_exec_channel index2; + union tgsi_exec_channel indir_index; + const uint execmask = mach->ExecMask; + uint i; + + /* which address register (always zero now) */ + index2.i[0] = + index2.i[1] = + index2.i[2] = + index2.i[3] = reg->SrcRegisterInd.Index; + + /* get current value of address register[swizzle] */ + swizzle = tgsi_util_get_src_register_swizzle( ®->SrcRegisterInd, CHAN_X ); + fetch_src_file_channel( + mach, + reg->SrcRegisterInd.File, + swizzle, + &index2, + &indir_index ); + + /* add value of address register to the offset */ + index.i[0] += (int) indir_index.f[0]; + index.i[1] += (int) indir_index.f[1]; + index.i[2] += (int) indir_index.f[2]; + index.i[3] += (int) indir_index.f[3]; + + /* for disabled execution channels, zero-out the index to + * avoid using a potential garbage value. + */ + for (i = 0; i < QUAD_SIZE; i++) { + if ((execmask & (1 << i)) == 0) + index.i[i] = 0; + } + } + + /* There is an extra source register that is a second + * subscript to a register file. 
Effectively it means that + * the register file is actually a 2D array of registers. + * + * file[1][3] == file[1*sizeof(file[1])+3], + * where: + * [3] = SrcRegisterDim.Index + */ + if (reg->SrcRegister.Dimension) { + /* The size of the first-order array depends on the register file type. + * We need to multiply the index to the first array to get an effective, + * "flat" index that points to the beginning of the second-order array. + */ + switch (reg->SrcRegister.File) { + case TGSI_FILE_INPUT: + index.i[0] *= TGSI_EXEC_MAX_INPUT_ATTRIBS; + index.i[1] *= TGSI_EXEC_MAX_INPUT_ATTRIBS; + index.i[2] *= TGSI_EXEC_MAX_INPUT_ATTRIBS; + index.i[3] *= TGSI_EXEC_MAX_INPUT_ATTRIBS; + break; + case TGSI_FILE_CONSTANT: + index.i[0] *= TGSI_EXEC_MAX_CONST_BUFFER; + index.i[1] *= TGSI_EXEC_MAX_CONST_BUFFER; + index.i[2] *= TGSI_EXEC_MAX_CONST_BUFFER; + index.i[3] *= TGSI_EXEC_MAX_CONST_BUFFER; + break; + default: + assert( 0 ); + } + + index.i[0] += reg->SrcRegisterDim.Index; + index.i[1] += reg->SrcRegisterDim.Index; + index.i[2] += reg->SrcRegisterDim.Index; + index.i[3] += reg->SrcRegisterDim.Index; + + /* Again, the second subscript index can be addressed indirectly + * identically to the first one. + * Nothing stops us from indirectly addressing the indirect register, + * but there is no need for that, so we won't exercise it. + * + * file[1][ind[4].y+3], + * where: + * ind = SrcRegisterDimInd.File + * [4] = SrcRegisterDimInd.Index + * .y = SrcRegisterDimInd.SwizzleX + */ + if (reg->SrcRegisterDim.Indirect) { + union tgsi_exec_channel index2; + union tgsi_exec_channel indir_index; + const uint execmask = mach->ExecMask; + uint i; + + index2.i[0] = + index2.i[1] = + index2.i[2] = + index2.i[3] = reg->SrcRegisterDimInd.Index; + + swizzle = tgsi_util_get_src_register_swizzle( ®->SrcRegisterDimInd, CHAN_X ); + fetch_src_file_channel( + mach, + reg->SrcRegisterDimInd.File, + swizzle, + &index2, + &indir_index ); + + index.i[0] += (int) indir_index.f[0]; + index.i[1] += (int) indir_index.f[1]; + index.i[2] += (int) indir_index.f[2]; + index.i[3] += (int) indir_index.f[3]; + + /* for disabled execution channels, zero-out the index to + * avoid using a potential garbage value. + */ + for (i = 0; i < QUAD_SIZE; i++) { + if ((execmask & (1 << i)) == 0) + index.i[i] = 0; + } + } + + /* If by any chance there was a need for a 3D array of register + * files, we would have to check whether SrcRegisterDim is followed + * by a dimension register and continue the saga. 
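+ * (As a concrete illustration of the flattening above: the effective index + * is SrcRegister.Index * stride + SrcRegisterDim.Index, so with the INPUT + * stride of TGSI_EXEC_MAX_INPUT_ATTRIBS == 17 a reference to INPUT[2][3] + * resolves to element 2*17 + 3 = 37 of the flat Inputs array.)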
+ */ + } + + swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index ); + fetch_src_file_channel( + mach, + reg->SrcRegister.File, + swizzle, + &index, + chan ); + + switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) { + case TGSI_UTIL_SIGN_CLEAR: + micro_abs( chan, chan ); + break; + + case TGSI_UTIL_SIGN_SET: + micro_abs( chan, chan ); + micro_neg( chan, chan ); + break; + + case TGSI_UTIL_SIGN_TOGGLE: + micro_neg( chan, chan ); + break; + + case TGSI_UTIL_SIGN_KEEP: + break; + } + + if (reg->SrcRegisterExtMod.Complement) { + micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan ); + } +} + +static void +store_dest( + struct tgsi_exec_machine *mach, + const union tgsi_exec_channel *chan, + const struct tgsi_full_dst_register *reg, + const struct tgsi_full_instruction *inst, + uint chan_index ) +{ + uint i; + union tgsi_exec_channel null; + union tgsi_exec_channel *dst; + uint execmask = mach->ExecMask; + + switch (reg->DstRegister.File) { + case TGSI_FILE_NULL: + dst = &null; + break; + + case TGSI_FILE_OUTPUT: + dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] + + reg->DstRegister.Index].xyzw[chan_index]; + break; + + case TGSI_FILE_TEMPORARY: + assert( reg->DstRegister.Index < TGSI_EXEC_NUM_TEMPS ); + dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index]; + break; + + case TGSI_FILE_ADDRESS: + dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index]; + break; + + default: + assert( 0 ); + return; + } + + if (inst->InstructionExtNv.CondFlowEnable) { + union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C]; + uint swizzle; + uint shift; + uint mask; + uint test; + + /* Only CC0 supported. + */ + assert( inst->InstructionExtNv.CondFlowIndex < 1 ); + + switch (chan_index) { + case CHAN_X: + swizzle = inst->InstructionExtNv.CondSwizzleX; + break; + case CHAN_Y: + swizzle = inst->InstructionExtNv.CondSwizzleY; + break; + case CHAN_Z: + swizzle = inst->InstructionExtNv.CondSwizzleZ; + break; + case CHAN_W: + swizzle = inst->InstructionExtNv.CondSwizzleW; + break; + default: + assert( 0 ); + return; + } + + switch (swizzle) { + case TGSI_SWIZZLE_X: + shift = TGSI_EXEC_CC_X_SHIFT; + mask = TGSI_EXEC_CC_X_MASK; + break; + case TGSI_SWIZZLE_Y: + shift = TGSI_EXEC_CC_Y_SHIFT; + mask = TGSI_EXEC_CC_Y_MASK; + break; + case TGSI_SWIZZLE_Z: + shift = TGSI_EXEC_CC_Z_SHIFT; + mask = TGSI_EXEC_CC_Z_MASK; + break; + case TGSI_SWIZZLE_W: + shift = TGSI_EXEC_CC_W_SHIFT; + mask = TGSI_EXEC_CC_W_MASK; + break; + default: + assert( 0 ); + return; + } + + switch (inst->InstructionExtNv.CondMask) { + case TGSI_CC_GT: + test = ~(TGSI_EXEC_CC_GT << shift) & mask; + for (i = 0; i < QUAD_SIZE; i++) + if (cc->u[i] & test) + execmask &= ~(1 << i); + break; + + case TGSI_CC_EQ: + test = ~(TGSI_EXEC_CC_EQ << shift) & mask; + for (i = 0; i < QUAD_SIZE; i++) + if (cc->u[i] & test) + execmask &= ~(1 << i); + break; + + case TGSI_CC_LT: + test = ~(TGSI_EXEC_CC_LT << shift) & mask; + for (i = 0; i < QUAD_SIZE; i++) + if (cc->u[i] & test) + execmask &= ~(1 << i); + break; + + case TGSI_CC_GE: + test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_EQ) << shift) & mask; + for (i = 0; i < QUAD_SIZE; i++) + if (cc->u[i] & test) + execmask &= ~(1 << i); + break; + + case TGSI_CC_LE: + test = ~((TGSI_EXEC_CC_LT | TGSI_EXEC_CC_EQ) << shift) & mask; + for (i = 0; i < QUAD_SIZE; i++) + if (cc->u[i] & test) + execmask &= ~(1 << i); + break; + + case TGSI_CC_NE: + test = ~((TGSI_EXEC_CC_GT | TGSI_EXEC_CC_LT | TGSI_EXEC_CC_UN) << shift) & mask; + for (i = 0; i < 
QUAD_SIZE; i++) + if (cc->u[i] & test) + execmask &= ~(1 << i); + break; + + case TGSI_CC_TR: + break; + + case TGSI_CC_FL: + for (i = 0; i < QUAD_SIZE; i++) + execmask &= ~(1 << i); + break; + + default: + assert( 0 ); + return; + } + } + + switch (inst->Instruction.Saturate) { + case TGSI_SAT_NONE: + for (i = 0; i < QUAD_SIZE; i++) + if (execmask & (1 << i)) + dst->i[i] = chan->i[i]; + break; + + case TGSI_SAT_ZERO_ONE: + for (i = 0; i < QUAD_SIZE; i++) + if (execmask & (1 << i)) { + if (chan->f[i] < 0.0f) + dst->f[i] = 0.0f; + else if (chan->f[i] > 1.0f) + dst->f[i] = 1.0f; + else + dst->i[i] = chan->i[i]; + } + break; + + case TGSI_SAT_MINUS_PLUS_ONE: + for (i = 0; i < QUAD_SIZE; i++) + if (execmask & (1 << i)) { + if (chan->f[i] < -1.0f) + dst->f[i] = -1.0f; + else if (chan->f[i] > 1.0f) + dst->f[i] = 1.0f; + else + dst->i[i] = chan->i[i]; + } + break; + + default: + assert( 0 ); + } + + if (inst->InstructionExtNv.CondDstUpdate) { + union tgsi_exec_channel *cc = &mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C]; + uint shift; + uint mask; + + /* Only CC0 supported. + */ + assert( inst->InstructionExtNv.CondDstIndex < 1 ); + + switch (chan_index) { + case CHAN_X: + shift = TGSI_EXEC_CC_X_SHIFT; + mask = ~TGSI_EXEC_CC_X_MASK; + break; + case CHAN_Y: + shift = TGSI_EXEC_CC_Y_SHIFT; + mask = ~TGSI_EXEC_CC_Y_MASK; + break; + case CHAN_Z: + shift = TGSI_EXEC_CC_Z_SHIFT; + mask = ~TGSI_EXEC_CC_Z_MASK; + break; + case CHAN_W: + shift = TGSI_EXEC_CC_W_SHIFT; + mask = ~TGSI_EXEC_CC_W_MASK; + break; + default: + assert( 0 ); + return; + } + + for (i = 0; i < QUAD_SIZE; i++) + if (execmask & (1 << i)) { + cc->u[i] &= mask; + if (dst->f[i] < 0.0f) + cc->u[i] |= TGSI_EXEC_CC_LT << shift; + else if (dst->f[i] > 0.0f) + cc->u[i] |= TGSI_EXEC_CC_GT << shift; + else if (dst->f[i] == 0.0f) + cc->u[i] |= TGSI_EXEC_CC_EQ << shift; + else + cc->u[i] |= TGSI_EXEC_CC_UN << shift; + } + } +} + +#define FETCH(VAL,INDEX,CHAN)\ + fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN) + +#define STORE(VAL,INDEX,CHAN)\ + store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN ) + + +/** + * Execute ARB-style KIL which is predicated by a src register. + * Kill fragment if any of the four values is less than zero. + */ +static void +exec_kil(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + uint uniquemask; + uint chan_index; + uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */ + union tgsi_exec_channel r[1]; + + /* This mask stores component bits that were already tested. Note that + * we test if the value is less than zero, so 1.0 and 0.0 need not to be + * tested. */ + uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE); + + for (chan_index = 0; chan_index < 4; chan_index++) + { + uint swizzle; + uint i; + + /* unswizzle channel */ + swizzle = tgsi_util_get_full_src_register_extswizzle ( + &inst->FullSrcRegisters[0], + chan_index); + + /* check if the component has not been already tested */ + if (uniquemask & (1 << swizzle)) + continue; + uniquemask |= 1 << swizzle; + + FETCH(&r[0], 0, chan_index); + for (i = 0; i < 4; i++) + if (r[0].f[i] < 0.0f) + kilmask |= 1 << i; + } + + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask; +} + +/** + * Execute NVIDIA-style KIL which is predicated by a condition code. + * Kill fragment if the condition code is TRUE. 
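+ * (Note: the actual condition-code test is still a TODO in the body below; + * once implemented it would examine the per-channel CC field selected by + * CondSwizzleX..W against CondMask, mirroring the predication already done + * in store_dest() above.)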
+ */ +static void +exec_kilp(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + uint kilmask; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */ + + if (inst->InstructionExtNv.CondFlowEnable) { + uint swizzle[4]; + uint chan_index; + + kilmask = 0x0; + + swizzle[0] = inst->InstructionExtNv.CondSwizzleX; + swizzle[1] = inst->InstructionExtNv.CondSwizzleY; + swizzle[2] = inst->InstructionExtNv.CondSwizzleZ; + swizzle[3] = inst->InstructionExtNv.CondSwizzleW; + + for (chan_index = 0; chan_index < 4; chan_index++) + { + uint i; + + for (i = 0; i < 4; i++) { + /* TODO: evaluate the condition code */ + if (0) + kilmask |= 1 << i; + } + } + } + else { + /* "unconditional" kil */ + kilmask = mach->ExecMask; + } + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask; +} + + +/* + * Fetch four texture samples using STR texture coordinates. + */ +static void +fetch_texel( struct tgsi_sampler *sampler, + const union tgsi_exec_channel *s, + const union tgsi_exec_channel *t, + const union tgsi_exec_channel *p, + float lodbias, /* XXX should be float[4] */ + union tgsi_exec_channel *r, + union tgsi_exec_channel *g, + union tgsi_exec_channel *b, + union tgsi_exec_channel *a ) +{ + uint j; + float rgba[NUM_CHANNELS][QUAD_SIZE]; + + sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba); + + for (j = 0; j < 4; j++) { + r->f[j] = rgba[0][j]; + g->f[j] = rgba[1][j]; + b->f[j] = rgba[2][j]; + a->f[j] = rgba[3][j]; + } +} + + +static void +exec_tex(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst, + boolean biasLod, + boolean projected) +{ + const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index; + union tgsi_exec_channel r[4]; + uint chan_index; + float lodBias; + + /* debug_printf("Sampler %u unit %u\n", sampler, unit); */ + + switch (inst->InstructionExtTexture.Texture) { + case TGSI_TEXTURE_1D: + + FETCH(&r[0], 0, CHAN_X); + + if (projected) { + FETCH(&r[1], 0, CHAN_W); + micro_div( &r[0], &r[0], &r[1] ); + } + + if (biasLod) { + FETCH(&r[1], 0, CHAN_W); + lodBias = r[1].f[0]; + } + else + lodBias = 0.0; + + fetch_texel(mach->Samplers[unit], + &r[0], NULL, NULL, lodBias, /* S, T, P, BIAS */ + &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */ + break; + + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 0, CHAN_Z); + + if (projected) { + FETCH(&r[3], 0, CHAN_W); + micro_div( &r[0], &r[0], &r[3] ); + micro_div( &r[1], &r[1], &r[3] ); + micro_div( &r[2], &r[2], &r[3] ); + } + + if (biasLod) { + FETCH(&r[3], 0, CHAN_W); + lodBias = r[3].f[0]; + } + else + lodBias = 0.0; + + fetch_texel(mach->Samplers[unit], + &r[0], &r[1], &r[2], lodBias, /* inputs */ + &r[0], &r[1], &r[2], &r[3]); /* outputs */ + break; + + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 0, CHAN_Z); + + if (projected) { + FETCH(&r[3], 0, CHAN_W); + micro_div( &r[0], &r[0], &r[3] ); + micro_div( &r[1], &r[1], &r[3] ); + micro_div( &r[2], &r[2], &r[3] ); + } + + if (biasLod) { + FETCH(&r[3], 0, CHAN_W); + lodBias = r[3].f[0]; + } + else + lodBias = 0.0; + + fetch_texel(mach->Samplers[unit], + &r[0], &r[1], &r[2], lodBias, + &r[0], &r[1], &r[2], &r[3]); + break; + + default: + assert (0); + } + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[chan_index], 0, chan_index ); + } +} + + +/** + * Evaluate a constant-valued coefficient at the position of the + * current quad.
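+ * (The three eval_*_coef() helpers below interpolate the plane equation + * attr(x, y) = a0 + dadx * x + dady * y at the quad's four pixel positions + * (x, y), (x+1, y), (x, y+1) and (x+1, y+1); the constant variant simply + * broadcasts a0, while the perspective variant also divides by W.)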
+ */ +static void +eval_constant_coef( + struct tgsi_exec_machine *mach, + unsigned attrib, + unsigned chan ) +{ + unsigned i; + + for( i = 0; i < QUAD_SIZE; i++ ) { + mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan]; + } +} + +/** + * Evaluate a linear-valued coefficient at the position of the + * current quad. + */ +static void +eval_linear_coef( + struct tgsi_exec_machine *mach, + unsigned attrib, + unsigned chan ) +{ + const float x = mach->QuadPos.xyzw[0].f[0]; + const float y = mach->QuadPos.xyzw[1].f[0]; + const float dadx = mach->InterpCoefs[attrib].dadx[chan]; + const float dady = mach->InterpCoefs[attrib].dady[chan]; + const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; + mach->Inputs[attrib].xyzw[chan].f[0] = a0; + mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx; + mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady; + mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady; +} + +/** + * Evaluate a perspective-valued coefficient at the position of the + * current quad. + */ +static void +eval_perspective_coef( + struct tgsi_exec_machine *mach, + unsigned attrib, + unsigned chan ) +{ + const float x = mach->QuadPos.xyzw[0].f[0]; + const float y = mach->QuadPos.xyzw[1].f[0]; + const float dadx = mach->InterpCoefs[attrib].dadx[chan]; + const float dady = mach->InterpCoefs[attrib].dady[chan]; + const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; + const float *w = mach->QuadPos.xyzw[3].f; + /* divide by W here */ + mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0]; + mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1]; + mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2]; + mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3]; +} + + +typedef void (* eval_coef_func)( + struct tgsi_exec_machine *mach, + unsigned attrib, + unsigned chan ); + +static void +exec_declaration( + struct tgsi_exec_machine *mach, + const struct tgsi_full_declaration *decl ) +{ + if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) { + if( decl->Declaration.File == TGSI_FILE_INPUT ) { + unsigned first, last, mask; + eval_coef_func eval; + + first = decl->DeclarationRange.First; + last = decl->DeclarationRange.Last; + mask = decl->Declaration.UsageMask; + + switch( decl->Declaration.Interpolate ) { + case TGSI_INTERPOLATE_CONSTANT: + eval = eval_constant_coef; + break; + + case TGSI_INTERPOLATE_LINEAR: + eval = eval_linear_coef; + break; + + case TGSI_INTERPOLATE_PERSPECTIVE: + eval = eval_perspective_coef; + break; + + default: + eval = NULL; + assert( 0 ); + } + + if( mask == TGSI_WRITEMASK_XYZW ) { + unsigned i, j; + + for( i = first; i <= last; i++ ) { + for( j = 0; j < NUM_CHANNELS; j++ ) { + eval( mach, i, j ); + } + } + } + else { + unsigned i, j; + + for( j = 0; j < NUM_CHANNELS; j++ ) { + if( mask & (1 << j) ) { + for( i = first; i <= last; i++ ) { + eval( mach, i, j ); + } + } + } + } + } + } +} + +static void +exec_instruction( + struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst, + int *pc ) +{ + uint chan_index; + union tgsi_exec_channel r[8]; + + (*pc)++; + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ARL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_trunc( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_SWZ: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_LIT: + if 
(IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[0], 0, CHAN_X ); + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, CHAN_Y ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[1], 0, CHAN_Y ); + micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + + FETCH( &r[2], 0, CHAN_W ); + micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] ); + micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] ); + micro_pow( &r[1], &r[1], &r[2] ); + micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, CHAN_Z ); + } + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_RCP: + /* TGSI_OPCODE_RECIP */ + FETCH( &r[0], 0, CHAN_X ); + micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] ); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_RSQ: + /* TGSI_OPCODE_RECIPSQRT */ + FETCH( &r[0], 0, CHAN_X ); + micro_sqrt( &r[0], &r[0] ); + micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] ); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXP: + FETCH( &r[0], 0, CHAN_X ); + micro_flr( &r[1], &r[0] ); /* r1 = floor(r0) */ + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + micro_exp2( &r[2], &r[1] ); /* r2 = 2 ^ r1 */ + STORE( &r[2], 0, CHAN_X ); /* store r2 */ + } + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + micro_sub( &r[2], &r[0], &r[1] ); /* r2 = r0 - r1 */ + STORE( &r[2], 0, CHAN_Y ); /* store r2 */ + } + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + micro_exp2( &r[2], &r[0] ); /* r2 = 2 ^ r0 */ + STORE( &r[2], 0, CHAN_Z ); /* store r2 */ + } + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_LOG: + FETCH( &r[0], 0, CHAN_X ); + micro_abs( &r[2], &r[0] ); /* r2 = abs(r0) */ + micro_lg2( &r[1], &r[2] ); /* r1 = lg2(r2) */ + micro_flr( &r[0], &r[1] ); /* r0 = floor(r1) */ + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &r[0], 0, CHAN_X ); + } + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + micro_exp2( &r[0], &r[0] ); /* r0 = 2 ^ r0 */ + micro_div( &r[0], &r[2], &r[0] ); /* r0 = r2 / r0 */ + STORE( &r[0], 0, CHAN_Y ); + } + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + STORE( &r[1], 0, CHAN_Z ); + } + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MUL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) + { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + micro_mul( &r[0], &r[0], &r[1] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_ADD: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_add( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP3: + /* TGSI_OPCODE_DOT3 */ + FETCH( &r[0], 0, CHAN_X ); + FETCH( &r[1], 1, CHAN_X ); + micro_mul( &r[0], &r[0], &r[1] ); + + FETCH( &r[1], 0, CHAN_Y ); + FETCH( &r[2], 1, CHAN_Y ); + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); 
+ + FETCH( &r[1], 0, CHAN_Z ); + FETCH( &r[2], 1, CHAN_Z ); + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP4: + /* TGSI_OPCODE_DOT4 */ + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + micro_mul( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 1, CHAN_Y); + + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 0, CHAN_Z); + FETCH(&r[2], 1, CHAN_Z); + + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 0, CHAN_W); + FETCH(&r[2], 1, CHAN_W); + + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DST: + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + FETCH( &r[0], 0, CHAN_Y ); + FETCH( &r[1], 1, CHAN_Y); + micro_mul( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, CHAN_Y ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[0], 0, CHAN_Z ); + STORE( &r[0], 0, CHAN_Z ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + FETCH( &r[0], 1, CHAN_W ); + STORE( &r[0], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MIN: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + /* XXX use micro_min()?? */ + micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_MAX: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + /* XXX use micro_max()?? 
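+ * micro_lt(dst, a, b, c, d) writes c where a < b and d elsewhere, so with + * the last two operands swapped as done here it yields max(a, b), just as + * the MIN case above keeps the natural order to yield min(a, b).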
*/ + micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] ); + + STORE(&r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLT: + /* TGSI_OPCODE_SETLT */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SGE: + /* TGSI_OPCODE_SETGE */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MAD: + /* TGSI_OPCODE_MADD */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_mul( &r[0], &r[0], &r[1] ); + FETCH( &r[1], 2, chan_index ); + micro_add( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SUB: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + micro_sub( &r[0], &r[0], &r[1] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_LERP: + /* TGSI_OPCODE_LRP */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + FETCH(&r[2], 2, chan_index); + + micro_sub( &r[1], &r[1], &r[2] ); + micro_mul( &r[0], &r[0], &r[1] ); + micro_add( &r[0], &r[0], &r[2] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_CND: + assert (0); + break; + + case TGSI_OPCODE_CND0: + assert (0); + break; + + case TGSI_OPCODE_DOT2ADD: + /* TGSI_OPCODE_DP2A */ + FETCH( &r[0], 0, CHAN_X ); + FETCH( &r[1], 1, CHAN_X ); + micro_mul( &r[0], &r[0], &r[1] ); + + FETCH( &r[1], 0, CHAN_Y ); + FETCH( &r[2], 1, CHAN_Y ); + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH( &r[2], 2, CHAN_X ); + micro_add( &r[0], &r[0], &r[2] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_INDEX: + assert (0); + break; + + case TGSI_OPCODE_NEGATE: + assert (0); + break; + + case TGSI_OPCODE_FRAC: + /* TGSI_OPCODE_FRC */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_frc( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_CLAMP: + assert (0); + break; + + case TGSI_OPCODE_FLOOR: + /* TGSI_OPCODE_FLR */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_flr( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_ROUND: + case TGSI_OPCODE_ARR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_rnd( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXPBASE2: + /* TGSI_OPCODE_EX2 */ + FETCH(&r[0], 0, CHAN_X); + +#if FAST_MATH + micro_exp2( &r[0], &r[0] ); +#else + micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] ); +#endif + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_LOGBASE2: + /* TGSI_OPCODE_LG2 */ + FETCH( &r[0], 0, CHAN_X ); + micro_lg2( &r[0], &r[0] ); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_POWER: + /* TGSI_OPCODE_POW */ + 
FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + micro_pow( &r[0], &r[0], &r[1] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_CROSSPRODUCT: + /* TGSI_OPCODE_XPD */ + FETCH(&r[0], 0, CHAN_Y); + FETCH(&r[1], 1, CHAN_Z); + + micro_mul( &r[2], &r[0], &r[1] ); + + FETCH(&r[3], 0, CHAN_Z); + FETCH(&r[4], 1, CHAN_Y); + + micro_mul( &r[5], &r[3], &r[4] ); + micro_sub( &r[2], &r[2], &r[5] ); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &r[2], 0, CHAN_X ); + } + + FETCH(&r[2], 1, CHAN_X); + + micro_mul( &r[3], &r[3], &r[2] ); + + FETCH(&r[5], 0, CHAN_X); + + micro_mul( &r[1], &r[1], &r[5] ); + micro_sub( &r[3], &r[3], &r[1] ); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + STORE( &r[3], 0, CHAN_Y ); + } + + micro_mul( &r[5], &r[5], &r[4] ); + micro_mul( &r[0], &r[0], &r[2] ); + micro_sub( &r[5], &r[5], &r[0] ); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + STORE( &r[5], 0, CHAN_Z ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MULTIPLYMATRIX: + assert (0); + break; + + case TGSI_OPCODE_ABS: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + + micro_abs( &r[0], &r[0] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_RCC: + assert (0); + break; + + case TGSI_OPCODE_DPH: + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + micro_mul( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 1, CHAN_Y); + + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 0, CHAN_Z); + FETCH(&r[2], 1, CHAN_Z); + + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FETCH(&r[1], 1, CHAN_W); + + micro_add( &r[0], &r[0], &r[1] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_COS: + FETCH(&r[0], 0, CHAN_X); + + micro_cos( &r[0], &r[0] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DDX: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_ddx( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DDY: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_ddy( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_KILP: + exec_kilp (mach, inst); + break; + + case TGSI_OPCODE_KIL: + exec_kil (mach, inst); + break; + + case TGSI_OPCODE_PK2H: + assert (0); + break; + + case TGSI_OPCODE_PK2US: + assert (0); + break; + + case TGSI_OPCODE_PK4B: + assert (0); + break; + + case TGSI_OPCODE_PK4UB: + assert (0); + break; + + case TGSI_OPCODE_RFL: + assert (0); + break; + + case TGSI_OPCODE_SEQ: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_eq( &r[0], &r[0], &r[1], + &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], + &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SFL: + assert (0); + break; + + case TGSI_OPCODE_SGT: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SIN: + 
FETCH( &r[0], 0, CHAN_X ); + micro_sin( &r[0], &r[0] ); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLE: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_le( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SNE: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_STR: + assert (0); + break; + + case TGSI_OPCODE_TEX: + /* simple texture lookup */ + /* src[0] = texcoord */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, FALSE, FALSE); + break; + + case TGSI_OPCODE_TXB: + /* Texture lookup with lod bias */ + /* src[0] = texcoord (src[0].w = LOD bias) */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, TRUE, FALSE); + break; + + case TGSI_OPCODE_TXD: + /* Texture lookup with explicit partial derivatives */ + /* src[0] = texcoord */ + /* src[1] = d[strq]/dx */ + /* src[2] = d[strq]/dy */ + /* src[3] = sampler unit */ + assert (0); + break; + + case TGSI_OPCODE_TXL: + /* Texture lookup with explicit LOD */ + /* src[0] = texcoord (src[0].w = LOD) */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, TRUE, FALSE); + break; + + case TGSI_OPCODE_TXP: + /* Texture lookup with projection */ + /* src[0] = texcoord (src[0].w = projection) */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, FALSE, TRUE); + break; + + case TGSI_OPCODE_UP2H: + assert (0); + break; + + case TGSI_OPCODE_UP2US: + assert (0); + break; + + case TGSI_OPCODE_UP4B: + assert (0); + break; + + case TGSI_OPCODE_UP4UB: + assert (0); + break; + + case TGSI_OPCODE_X2D: + assert (0); + break; + + case TGSI_OPCODE_ARA: + assert (0); + break; + + case TGSI_OPCODE_BRA: + assert (0); + break; + + case TGSI_OPCODE_CAL: + /* skip the call if no execution channels are enabled */ + if (mach->ExecMask) { + /* do the call */ + + /* push the Cond, Loop, Cont stacks */ + assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING); + mach->CondStack[mach->CondStackTop++] = mach->CondMask; + assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask; + assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->ContStack[mach->ContStackTop++] = mach->ContMask; + + assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING); + mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask; + + /* note that PC was already incremented above */ + mach->CallStack[mach->CallStackTop++] = *pc; + *pc = inst->InstructionExtLabel.Label; + } + break; + + case TGSI_OPCODE_RET: + mach->FuncMask &= ~mach->ExecMask; + UPDATE_EXEC_MASK(mach); + + if (mach->FuncMask == 0x0) { + /* really return now (otherwise, keep executing) */ + + if (mach->CallStackTop == 0) { + /* returning from main() */ + *pc = -1; + return; + } + *pc = mach->CallStack[--mach->CallStackTop]; + + /* pop the Cond, Loop, Cont stacks */ + assert(mach->CondStackTop > 0); + mach->CondMask = mach->CondStack[--mach->CondStackTop]; + assert(mach->LoopStackTop > 0); + mach->LoopMask = mach->LoopStack[--mach->LoopStackTop]; + assert(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[--mach->ContStackTop]; + assert(mach->FuncStackTop > 0); +
mach->FuncMask = mach->FuncStack[--mach->FuncStackTop]; + + UPDATE_EXEC_MASK(mach); + } + break; + + case TGSI_OPCODE_SSG: + /* TGSI_OPCODE_SGN */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_sgn( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_CMP: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + FETCH(&r[2], 2, chan_index); + + micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] ); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_SCS: + if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + FETCH( &r[0], 0, CHAN_X ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) { + micro_cos( &r[1], &r[0] ); + STORE( &r[1], 0, CHAN_X ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + micro_sin( &r[1], &r[0] ); + STORE( &r[1], 0, CHAN_Y ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_NRM: + /* 3-component vector normalize */ + { + union tgsi_exec_channel tmp, dot; + + /* tmp = dp3(src0, src0): */ + FETCH( &r[0], 0, CHAN_X ); + micro_mul( &tmp, &r[0], &r[0] ); + + FETCH( &r[1], 0, CHAN_Y ); + micro_mul( &dot, &r[1], &r[1] ); + micro_add( &tmp, &tmp, &dot ); + + FETCH( &r[2], 0, CHAN_Z ); + micro_mul( &dot, &r[2], &r[2] ); + micro_add( &tmp, &tmp, &dot ); + + /* tmp = 1 / sqrt(tmp) */ + micro_sqrt( &tmp, &tmp ); + micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp ); + + /* note: w channel is undefined */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + /* chan = chan * tmp */ + micro_mul( &r[chan_index], &tmp, &r[chan_index] ); + STORE( &r[chan_index], 0, chan_index ); + } + } + break; + + case TGSI_OPCODE_NRM4: + /* 4-component vector normalize */ + { + union tgsi_exec_channel tmp, dot; + + /* tmp = dp4(src0, src0): */ + FETCH( &r[0], 0, CHAN_X ); + micro_mul( &tmp, &r[0], &r[0] ); + + FETCH( &r[1], 0, CHAN_Y ); + micro_mul( &dot, &r[1], &r[1] ); + micro_add( &tmp, &tmp, &dot ); + + FETCH( &r[2], 0, CHAN_Z ); + micro_mul( &dot, &r[2], &r[2] ); + micro_add( &tmp, &tmp, &dot ); + + FETCH( &r[3], 0, CHAN_W ); + micro_mul( &dot, &r[3], &r[3] ); + micro_add( &tmp, &tmp, &dot ); + + /* tmp = 1 / sqrt(tmp) */ + micro_sqrt( &tmp, &tmp ); + micro_div( &tmp, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &tmp ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + /* chan = chan * tmp */ + micro_mul( &r[chan_index], &tmp, &r[chan_index] ); + STORE( &r[chan_index], 0, chan_index ); + } + } + break; + + case TGSI_OPCODE_DIV: + assert( 0 ); + break; + + case TGSI_OPCODE_DP2: + FETCH( &r[0], 0, CHAN_X ); + FETCH( &r[1], 1, CHAN_X ); + micro_mul( &r[0], &r[0], &r[1] ); + + FETCH( &r[1], 0, CHAN_Y ); + FETCH( &r[2], 1, CHAN_Y ); + micro_mul( &r[1], &r[1], &r[2] ); + micro_add( &r[0], &r[0], &r[1] ); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_IF: + /* push CondMask */ + assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING); + mach->CondStack[mach->CondStackTop++] = mach->CondMask; + FETCH( &r[0], 0, CHAN_X ); + /* update CondMask */ + if( ! r[0].u[0] ) { + mach->CondMask &= ~0x1; + } + if( ! r[0].u[1] ) { + mach->CondMask &= ~0x2; + } + if( ! r[0].u[2] ) { + mach->CondMask &= ~0x4; + } + if( ! 
r[0].u[3] ) { + mach->CondMask &= ~0x8; + } + UPDATE_EXEC_MASK(mach); + /* Todo: If CondMask==0, jump to ELSE */ + break; + + case TGSI_OPCODE_ELSE: + /* invert CondMask wrt previous mask */ + { + uint prevMask; + assert(mach->CondStackTop > 0); + prevMask = mach->CondStack[mach->CondStackTop - 1]; + mach->CondMask = ~mach->CondMask & prevMask; + UPDATE_EXEC_MASK(mach); + /* Todo: If CondMask==0, jump to ENDIF */ + } + break; + + case TGSI_OPCODE_ENDIF: + /* pop CondMask */ + assert(mach->CondStackTop > 0); + mach->CondMask = mach->CondStack[--mach->CondStackTop]; + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_END: + /* halt execution */ + *pc = -1; + break; + + case TGSI_OPCODE_REP: + assert (0); + break; + + case TGSI_OPCODE_ENDREP: + assert (0); + break; + + case TGSI_OPCODE_PUSHA: + assert (0); + break; + + case TGSI_OPCODE_POPA: + assert (0); + break; + + case TGSI_OPCODE_CEIL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_ceil( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_I2F: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_i2f( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_NOT: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_not( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_TRUNC: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + micro_trunc( &r[0], &r[0] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SHL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_shl( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SHR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_ishr( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_AND: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_and( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_OR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_or( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MOD: + assert (0); + break; + + case TGSI_OPCODE_XOR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + micro_xor( &r[0], &r[0], &r[1] ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SAD: + assert (0); + break; + + case TGSI_OPCODE_TXF: + assert (0); + break; + + case TGSI_OPCODE_TXQ: + assert (0); + break; + + case TGSI_OPCODE_EMIT: + mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16; + mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++; + break; + + case TGSI_OPCODE_ENDPRIM: + mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++; + mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0; + break; + + case TGSI_OPCODE_LOOP: + /* fall-through (for now) */ + case TGSI_OPCODE_BGNLOOP2: + /* push LoopMask and ContMasks */ + assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask; + 
assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->ContStack[mach->ContStackTop++] = mach->ContMask; + break; + + case TGSI_OPCODE_ENDLOOP: + /* fall-through (for now at least) */ + case TGSI_OPCODE_ENDLOOP2: + /* Restore ContMask, but don't pop */ + assert(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[mach->ContStackTop - 1]; + UPDATE_EXEC_MASK(mach); + if (mach->ExecMask) { + /* repeat loop: jump to instruction just past BGNLOOP */ + *pc = inst->InstructionExtLabel.Label + 1; + } + else { + /* exit loop: pop LoopMask */ + assert(mach->LoopStackTop > 0); + mach->LoopMask = mach->LoopStack[--mach->LoopStackTop]; + /* pop ContMask */ + assert(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[--mach->ContStackTop]; + } + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_BRK: + /* turn off loop channels for each enabled exec channel */ + mach->LoopMask &= ~mach->ExecMask; + /* Todo: if mach->LoopMask == 0, jump to end of loop */ + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_CONT: + /* turn off cont channels for each enabled exec channel */ + mach->ContMask &= ~mach->ExecMask; + /* Todo: if mach->LoopMask == 0, jump to end of loop */ + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_BGNSUB: + /* no-op */ + break; + + case TGSI_OPCODE_ENDSUB: + /* no-op */ + break; + + case TGSI_OPCODE_NOISE1: + assert( 0 ); + break; + + case TGSI_OPCODE_NOISE2: + assert( 0 ); + break; + + case TGSI_OPCODE_NOISE3: + assert( 0 ); + break; + + case TGSI_OPCODE_NOISE4: + assert( 0 ); + break; + + case TGSI_OPCODE_NOP: + break; + + default: + assert( 0 ); + } +} + + +/** + * Run TGSI interpreter. + * \return bitmask of "alive" quad components + */ +uint +tgsi_exec_machine_run( struct tgsi_exec_machine *mach ) +{ + uint i; + int pc = 0; + + mach->CondMask = 0xf; + mach->LoopMask = 0xf; + mach->ContMask = 0xf; + mach->FuncMask = 0xf; + mach->ExecMask = 0xf; + + mach->CondStackTop = 0; /* temporarily subvert this assertion */ + assert(mach->CondStackTop == 0); + assert(mach->LoopStackTop == 0); + assert(mach->ContStackTop == 0); + assert(mach->CallStackTop == 0); + + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0; + mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0; + + if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) { + mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0; + mach->Primitives[0] = 0; + } + + for (i = 0; i < QUAD_SIZE; i++) { + mach->Temps[TEMP_CC_I].xyzw[TEMP_CC_C].u[i] = + (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_X_SHIFT) | + (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Y_SHIFT) | + (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_Z_SHIFT) | + (TGSI_EXEC_CC_EQ << TGSI_EXEC_CC_W_SHIFT); + } + + /* execute declarations (interpolants) */ + for (i = 0; i < mach->NumDeclarations; i++) { + exec_declaration( mach, mach->Declarations+i ); + } + + /* execute instructions, until pc is set to -1 */ + while (pc != -1) { + assert(pc < (int) mach->NumInstructions); + exec_instruction( mach, mach->Instructions + pc, &pc ); + } + +#if 0 + /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */ + if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) { + /* + * Scale back depth component. 
+ */ + for (i = 0; i < 4; i++) + mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF; + } +#endif + + return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; +} + + diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h new file mode 100644 index 0000000000..4ffd4efbff --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h @@ -0,0 +1,300 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TGSI_EXEC_H +#define TGSI_EXEC_H + +#include "pipe/p_compiler.h" + +#if defined __cplusplus +extern "C" { +#endif + +#define MAX_LABELS 1024 + +#define NUM_CHANNELS 4 /* R,G,B,A */ +#define QUAD_SIZE 4 /* 4 pixel/quad */ + +/** + * Registers may be treated as float, signed int or unsigned int. + */ +union tgsi_exec_channel +{ + float f[QUAD_SIZE]; + int i[QUAD_SIZE]; + unsigned u[QUAD_SIZE]; +}; + +/** + * A vector[RGBA] of channels[4 pixels] + */ +struct tgsi_exec_vector +{ + union tgsi_exec_channel xyzw[NUM_CHANNELS]; +}; + +/** + * For fragment programs, information for computing fragment input + * values from plane equation of the triangle/line. + */ +struct tgsi_interp_coef +{ + float a0[NUM_CHANNELS]; /* in an xyzw layout */ + float dadx[NUM_CHANNELS]; + float dady[NUM_CHANNELS]; +}; + +/** + * Information for sampling textures, which must be implemented + * by code outside the TGSI executor. + */ +struct tgsi_sampler +{ + /** Get samples for four fragments in a quad */ + void (*get_samples)(struct tgsi_sampler *sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]); +}; + +/** + * For branching/calling subroutines. 
+ */ +struct tgsi_exec_labels +{ + unsigned labels[MAX_LABELS][2]; + unsigned count; +}; + + +#define TGSI_EXEC_NUM_TEMPS 128 +#define TGSI_EXEC_NUM_TEMP_EXTRAS 6 +#define TGSI_EXEC_NUM_IMMEDIATES 256 + +/* + * Locations of various utility registers (_I = Index, _C = Channel) + */ +#define TGSI_EXEC_TEMP_00000000_I (TGSI_EXEC_NUM_TEMPS + 0) +#define TGSI_EXEC_TEMP_00000000_C 0 + +#define TGSI_EXEC_TEMP_7FFFFFFF_I (TGSI_EXEC_NUM_TEMPS + 0) +#define TGSI_EXEC_TEMP_7FFFFFFF_C 1 + +#define TGSI_EXEC_TEMP_80000000_I (TGSI_EXEC_NUM_TEMPS + 0) +#define TGSI_EXEC_TEMP_80000000_C 2 + +#define TGSI_EXEC_TEMP_FFFFFFFF_I (TGSI_EXEC_NUM_TEMPS + 0) +#define TGSI_EXEC_TEMP_FFFFFFFF_C 3 + +#define TGSI_EXEC_TEMP_ONE_I (TGSI_EXEC_NUM_TEMPS + 1) +#define TGSI_EXEC_TEMP_ONE_C 0 + +#define TGSI_EXEC_TEMP_TWO_I (TGSI_EXEC_NUM_TEMPS + 1) +#define TGSI_EXEC_TEMP_TWO_C 1 + +#define TGSI_EXEC_TEMP_128_I (TGSI_EXEC_NUM_TEMPS + 1) +#define TGSI_EXEC_TEMP_128_C 2 + +#define TGSI_EXEC_TEMP_MINUS_128_I (TGSI_EXEC_NUM_TEMPS + 1) +#define TGSI_EXEC_TEMP_MINUS_128_C 3 + +#define TGSI_EXEC_TEMP_KILMASK_I (TGSI_EXEC_NUM_TEMPS + 2) +#define TGSI_EXEC_TEMP_KILMASK_C 0 + +#define TGSI_EXEC_TEMP_OUTPUT_I (TGSI_EXEC_NUM_TEMPS + 2) +#define TGSI_EXEC_TEMP_OUTPUT_C 1 + +#define TGSI_EXEC_TEMP_PRIMITIVE_I (TGSI_EXEC_NUM_TEMPS + 2) +#define TGSI_EXEC_TEMP_PRIMITIVE_C 2 + +/* NVIDIA condition code (CC) vector + */ +#define TGSI_EXEC_CC_GT 0x01 +#define TGSI_EXEC_CC_EQ 0x02 +#define TGSI_EXEC_CC_LT 0x04 +#define TGSI_EXEC_CC_UN 0x08 + +#define TGSI_EXEC_CC_X_MASK 0x000000ff +#define TGSI_EXEC_CC_X_SHIFT 0 +#define TGSI_EXEC_CC_Y_MASK 0x0000ff00 +#define TGSI_EXEC_CC_Y_SHIFT 8 +#define TGSI_EXEC_CC_Z_MASK 0x00ff0000 +#define TGSI_EXEC_CC_Z_SHIFT 16 +#define TGSI_EXEC_CC_W_MASK 0xff000000 +#define TGSI_EXEC_CC_W_SHIFT 24 + +#define TGSI_EXEC_TEMP_CC_I (TGSI_EXEC_NUM_TEMPS + 2) +#define TGSI_EXEC_TEMP_CC_C 3 + +#define TGSI_EXEC_TEMP_THREE_I (TGSI_EXEC_NUM_TEMPS + 3) +#define TGSI_EXEC_TEMP_THREE_C 0 + +#define TGSI_EXEC_TEMP_HALF_I (TGSI_EXEC_NUM_TEMPS + 3) +#define TGSI_EXEC_TEMP_HALF_C 1 + +/* execution mask, each value is either 0 or ~0 */ +#define TGSI_EXEC_MASK_I (TGSI_EXEC_NUM_TEMPS + 3) +#define TGSI_EXEC_MASK_C 2 + +#define TGSI_EXEC_TEMP_R0 (TGSI_EXEC_NUM_TEMPS + 4) + +#define TGSI_EXEC_TEMP_ADDR (TGSI_EXEC_NUM_TEMPS + 5) + + +#define TGSI_EXEC_MAX_COND_NESTING 20 +#define TGSI_EXEC_MAX_LOOP_NESTING 20 +#define TGSI_EXEC_MAX_CALL_NESTING 20 + +/* The maximum number of input attributes per vertex. For 2D + * input register files, this is the stride between two 1D + * arrays. + */ +#define TGSI_EXEC_MAX_INPUT_ATTRIBS 17 + +/* The maximum number of constant vectors per constant buffer. + */ +#define TGSI_EXEC_MAX_CONST_BUFFER 4096 + +/** + * Run-time virtual machine state for executing TGSI shader. + */ +struct tgsi_exec_machine +{ + /* Total = program temporaries + internal temporaries + * + 1 padding to align to 16 bytes + */ + struct tgsi_exec_vector _Temps[TGSI_EXEC_NUM_TEMPS + + TGSI_EXEC_NUM_TEMP_EXTRAS + 1]; + + /* + * This will point to _Temps after aligning to 16B boundary. + */ + struct tgsi_exec_vector *Temps; + struct tgsi_exec_vector *Addrs; + + struct tgsi_sampler **Samplers; + + float Imms[TGSI_EXEC_NUM_IMMEDIATES][4]; + unsigned ImmLimit; + const float (*Consts)[4]; + struct tgsi_exec_vector *Inputs; + struct tgsi_exec_vector *Outputs; + const struct tgsi_token *Tokens; + unsigned Processor; + + /* GEOMETRY processor only. */ + unsigned *Primitives; + + /* FRAGMENT processor only. 
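+ * InterpCoefs and QuadPos feed the eval_*_coef() interpolators in + * tgsi_exec.c: QuadPos carries the quad's window-space x, y and w values + * and InterpCoefs the a0/dadx/dady plane-equation coefficients for each + * interpolated attribute.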
*/ + const struct tgsi_interp_coef *InterpCoefs; + struct tgsi_exec_vector QuadPos; + + /* Conditional execution masks */ + uint CondMask; /**< For IF/ELSE/ENDIF */ + uint LoopMask; /**< For BGNLOOP/ENDLOOP */ + uint ContMask; /**< For loop CONT statements */ + uint FuncMask; /**< For function calls */ + uint ExecMask; /**< = CondMask & LoopMask */ + + /** Condition mask stack (for nested conditionals) */ + uint CondStack[TGSI_EXEC_MAX_COND_NESTING]; + int CondStackTop; + + /** Loop mask stack (for nested loops) */ + uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING]; + int LoopStackTop; + + /** Loop continue mask stack (see comments in tgsi_exec.c) */ + uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING]; + int ContStackTop; + + /** Function execution mask stack (for executing subroutine code) */ + uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING]; + int FuncStackTop; + + /** Function call stack for saving/restoring the program counter */ + uint CallStack[TGSI_EXEC_MAX_CALL_NESTING]; + int CallStackTop; + + struct tgsi_full_instruction *Instructions; + uint NumInstructions; + + struct tgsi_full_declaration *Declarations; + uint NumDeclarations; + + struct tgsi_exec_labels Labels; +}; + +void +tgsi_exec_machine_init( + struct tgsi_exec_machine *mach ); + + +void +tgsi_exec_machine_bind_shader( + struct tgsi_exec_machine *mach, + const struct tgsi_token *tokens, + uint numSamplers, + struct tgsi_sampler **samplers); + +uint +tgsi_exec_machine_run( + struct tgsi_exec_machine *mach ); + + +void +tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach); + + +static INLINE void +tgsi_set_kill_mask(struct tgsi_exec_machine *mach, unsigned mask) +{ + mach->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] = + mask; +} + + +/** Set execution mask values prior to executing the shader */ +static INLINE void +tgsi_set_exec_mask(struct tgsi_exec_machine *mach, + boolean ch0, boolean ch1, boolean ch2, boolean ch3) +{ + int *mask = mach->Temps[TGSI_EXEC_MASK_I].xyzw[TGSI_EXEC_MASK_C].i; + mask[0] = ch0 ? ~0 : 0; + mask[1] = ch1 ? ~0 : 0; + mask[2] = ch2 ? ~0 : 0; + mask[3] = ch3 ? ~0 : 0; +} + + +#if defined __cplusplus +} /* extern "C" */ +#endif + +#endif /* TGSI_EXEC_H */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c new file mode 100644 index 0000000000..68c7a6b7f5 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_info.c @@ -0,0 +1,161 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_debug.h" +#include "tgsi_info.h" + +static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] = +{ + { 1, 1, 0, 0, "ARL" }, + { 1, 1, 0, 0, "MOV" }, + { 1, 1, 0, 0, "LIT" }, + { 1, 1, 0, 0, "RCP" }, + { 1, 1, 0, 0, "RSQ" }, + { 1, 1, 0, 0, "EXP" }, + { 1, 1, 0, 0, "LOG" }, + { 1, 2, 0, 0, "MUL" }, + { 1, 2, 0, 0, "ADD" }, + { 1, 2, 0, 0, "DP3" }, + { 1, 2, 0, 0, "DP4" }, + { 1, 2, 0, 0, "DST" }, + { 1, 2, 0, 0, "MIN" }, + { 1, 2, 0, 0, "MAX" }, + { 1, 2, 0, 0, "SLT" }, + { 1, 2, 0, 0, "SGE" }, + { 1, 3, 0, 0, "MAD" }, + { 1, 2, 0, 0, "SUB" }, + { 1, 3, 0, 0, "LERP" }, + { 1, 3, 0, 0, "CND" }, + { 1, 3, 0, 0, "CND0" }, + { 1, 3, 0, 0, "DOT2ADD" }, + { 1, 2, 0, 0, "INDEX" }, + { 1, 1, 0, 0, "NEGATE" }, + { 1, 1, 0, 0, "FRAC" }, + { 1, 3, 0, 0, "CLAMP" }, + { 1, 1, 0, 0, "FLOOR" }, + { 1, 1, 0, 0, "ROUND" }, + { 1, 1, 0, 0, "EXPBASE2" }, + { 1, 1, 0, 0, "LOGBASE2" }, + { 1, 2, 0, 0, "POWER" }, + { 1, 2, 0, 0, "CROSSPRODUCT" }, + { 1, 2, 0, 0, "MULTIPLYMATRIX" }, + { 1, 1, 0, 0, "ABS" }, + { 1, 1, 0, 0, "RCC" }, + { 1, 2, 0, 0, "DPH" }, + { 1, 1, 0, 0, "COS" }, + { 1, 1, 0, 0, "DDX" }, + { 1, 1, 0, 0, "DDY" }, + { 0, 0, 0, 0, "KILP" }, + { 1, 1, 0, 0, "PK2H" }, + { 1, 1, 0, 0, "PK2US" }, + { 1, 1, 0, 0, "PK4B" }, + { 1, 1, 0, 0, "PK4UB" }, + { 1, 2, 0, 0, "RFL" }, + { 1, 2, 0, 0, "SEQ" }, + { 1, 2, 0, 0, "SFL" }, + { 1, 2, 0, 0, "SGT" }, + { 1, 1, 0, 0, "SIN" }, + { 1, 2, 0, 0, "SLE" }, + { 1, 2, 0, 0, "SNE" }, + { 1, 2, 0, 0, "STR" }, + { 1, 2, 1, 0, "TEX" }, + { 1, 4, 1, 0, "TXD" }, + { 1, 2, 1, 0, "TXP" }, + { 1, 1, 0, 0, "UP2H" }, + { 1, 1, 0, 0, "UP2US" }, + { 1, 1, 0, 0, "UP4B" }, + { 1, 1, 0, 0, "UP4UB" }, + { 1, 3, 0, 0, "X2D" }, + { 1, 1, 0, 0, "ARA" }, + { 1, 1, 0, 0, "ARR" }, + { 0, 1, 0, 0, "BRA" }, + { 0, 0, 0, 1, "CAL" }, + { 0, 0, 0, 0, "RET" }, + { 1, 1, 0, 0, "SSG" }, + { 1, 3, 0, 0, "CMP" }, + { 1, 1, 0, 0, "SCS" }, + { 1, 2, 1, 0, "TXB" }, + { 1, 1, 0, 0, "NRM" }, + { 1, 2, 0, 0, "DIV" }, + { 1, 2, 0, 0, "DP2" }, + { 1, 2, 1, 0, "TXL" }, + { 0, 0, 0, 0, "BRK" }, + { 0, 1, 0, 1, "IF" }, + { 0, 0, 0, 0, "LOOP" }, + { 0, 1, 0, 0, "REP" }, + { 0, 0, 0, 1, "ELSE" }, + { 0, 0, 0, 0, "ENDIF" }, + { 0, 0, 0, 0, "ENDLOOP" }, + { 0, 0, 0, 0, "ENDREP" }, + { 0, 1, 0, 0, "PUSHA" }, + { 1, 0, 0, 0, "POPA" }, + { 1, 1, 0, 0, "CEIL" }, + { 1, 1, 0, 0, "I2F" }, + { 1, 1, 0, 0, "NOT" }, + { 1, 1, 0, 0, "TRUNC" }, + { 1, 2, 0, 0, "SHL" }, + { 1, 2, 0, 0, "SHR" }, + { 1, 2, 0, 0, "AND" }, + { 1, 2, 0, 0, "OR" }, + { 1, 2, 0, 0, "MOD" }, + { 1, 2, 0, 0, "XOR" }, + { 1, 3, 0, 0, "SAD" }, + { 1, 2, 1, 0, "TXF" }, + { 1, 2, 1, 0, "TXQ" }, + { 0, 0, 0, 0, "CONT" }, + { 0, 0, 0, 0, "EMIT" }, + { 0, 0, 0, 0, "ENDPRIM" }, + { 0, 0, 0, 1, "BGNLOOP2" }, + { 0, 0, 0, 0, "BGNSUB" }, + { 0, 0, 0, 1, "ENDLOOP2" }, + { 0, 0, 0, 0, "ENDSUB" }, + { 1, 1, 0, 0, "NOISE1" }, + { 1, 1, 0, 0, "NOISE2" }, + { 1, 1, 0, 0, "NOISE3" }, + { 1, 1, 0, 0, "NOISE4" }, + { 0, 0, 0, 0, "NOP" }, + { 1, 2, 0, 0, "M4X3" }, + { 1, 2, 0, 0, "M3X4" }, + { 1, 2, 0, 0, "M3X3" }, + { 1, 2, 0, 0, "M3X2" }, + { 1, 1, 0, 0, "NRM4" }, + { 0, 1, 0, 0, "CALLNZ" }, + { 0, 1, 0, 0, "IFC" }, + { 0, 1, 0, 0, "BREAKC" }, + { 0, 1, 0, 0, "KIL" }, + { 0, 0, 0, 0, 
"END" }, + { 1, 1, 0, 0, "SWZ" } +}; + +const struct tgsi_opcode_info * +tgsi_get_opcode_info( uint opcode ) +{ + if (opcode < TGSI_OPCODE_LAST) + return &opcode_info[opcode]; + assert( 0 ); + return NULL; +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.h b/src/gallium/auxiliary/tgsi/tgsi_info.h new file mode 100644 index 0000000000..7230bdaae3 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_info.h @@ -0,0 +1,53 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TGSI_INFO_H +#define TGSI_INFO_H + +#include "pipe/p_shader_tokens.h" + +#if defined __cplusplus +extern "C" { +#endif + +struct tgsi_opcode_info +{ + uint num_dst; + uint num_src; + boolean is_tex; + boolean is_branch; + const char *mnemonic; +}; + +const struct tgsi_opcode_info * +tgsi_get_opcode_info( uint opcode ); + +#if defined __cplusplus +} +#endif + +#endif /* TGSI_INFO_H */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_iterate.c b/src/gallium/auxiliary/tgsi/tgsi_iterate.c new file mode 100644 index 0000000000..5371a88b96 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_iterate.c @@ -0,0 +1,85 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_debug.h" +#include "tgsi_iterate.h" + +boolean +tgsi_iterate_shader( + const struct tgsi_token *tokens, + struct tgsi_iterate_context *ctx ) +{ + struct tgsi_parse_context parse; + + if (tgsi_parse_init( &parse, tokens ) != TGSI_PARSE_OK) + return FALSE; + + ctx->processor = parse.FullHeader.Processor; + ctx->version = parse.FullVersion.Version; + + if (ctx->prolog) + if (!ctx->prolog( ctx )) + goto fail; + + while (!tgsi_parse_end_of_tokens( &parse )) { + tgsi_parse_token( &parse ); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_INSTRUCTION: + if (ctx->iterate_instruction) + if (!ctx->iterate_instruction( ctx, &parse.FullToken.FullInstruction )) + goto fail; + break; + + case TGSI_TOKEN_TYPE_DECLARATION: + if (ctx->iterate_declaration) + if (!ctx->iterate_declaration( ctx, &parse.FullToken.FullDeclaration )) + goto fail; + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + if (ctx->iterate_immediate) + if (!ctx->iterate_immediate( ctx, &parse.FullToken.FullImmediate )) + goto fail; + break; + + default: + assert( 0 ); + } + } + + if (ctx->epilog) + if (!ctx->epilog( ctx )) + goto fail; + + tgsi_parse_free( &parse ); + return TRUE; + +fail: + tgsi_parse_free( &parse ); + return FALSE; +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_iterate.h b/src/gallium/auxiliary/tgsi/tgsi_iterate.h new file mode 100644 index 0000000000..ec7b85bf63 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_iterate.h @@ -0,0 +1,76 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef TGSI_ITERATE_H +#define TGSI_ITERATE_H + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" + +#if defined __cplusplus +extern "C" { +#endif + +struct tgsi_iterate_context +{ + boolean + (* prolog)( + struct tgsi_iterate_context *ctx ); + + boolean + (* iterate_instruction)( + struct tgsi_iterate_context *ctx, + struct tgsi_full_instruction *inst ); + + boolean + (* iterate_declaration)( + struct tgsi_iterate_context *ctx, + struct tgsi_full_declaration *decl ); + + boolean + (* iterate_immediate)( + struct tgsi_iterate_context *ctx, + struct tgsi_full_immediate *imm ); + + boolean + (* epilog)( + struct tgsi_iterate_context *ctx ); + + struct tgsi_processor processor; + struct tgsi_version version; +}; + +boolean +tgsi_iterate_shader( + const struct tgsi_token *tokens, + struct tgsi_iterate_context *ctx ); + +#if defined __cplusplus +} +#endif + +#endif /* TGSI_ITERATE_H */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c new file mode 100644 index 0000000000..2cd56e413a --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c @@ -0,0 +1,344 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "pipe/p_debug.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi_parse.h" +#include "tgsi_build.h" +#include "util/u_memory.h" + +void +tgsi_full_token_init( + union tgsi_full_token *full_token ) +{ + full_token->Token.Type = TGSI_TOKEN_TYPE_DECLARATION; +} + +void +tgsi_full_token_free( + union tgsi_full_token *full_token ) +{ + if( full_token->Token.Type == TGSI_TOKEN_TYPE_IMMEDIATE ) { + FREE( (void *) full_token->FullImmediate.u.Pointer ); + } +} + +unsigned +tgsi_parse_init( + struct tgsi_parse_context *ctx, + const struct tgsi_token *tokens ) +{ + ctx->FullVersion.Version = *(struct tgsi_version *) &tokens[0]; + if( ctx->FullVersion.Version.MajorVersion > 1 ) { + return TGSI_PARSE_ERROR; + } + + ctx->FullHeader.Header = *(struct tgsi_header *) &tokens[1]; + if( ctx->FullHeader.Header.HeaderSize >= 2 ) { + ctx->FullHeader.Processor = *(struct tgsi_processor *) &tokens[2]; + } + else { + ctx->FullHeader.Processor = tgsi_default_processor(); + } + + ctx->Tokens = tokens; + ctx->Position = 1 + ctx->FullHeader.Header.HeaderSize; + + tgsi_full_token_init( &ctx->FullToken ); + + return TGSI_PARSE_OK; +} + +void +tgsi_parse_free( + struct tgsi_parse_context *ctx ) +{ + tgsi_full_token_free( &ctx->FullToken ); +} + +boolean +tgsi_parse_end_of_tokens( + struct tgsi_parse_context *ctx ) +{ + return ctx->Position >= + 1 + ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize; +} + + +/** + * This function is used to avoid and work-around type punning/aliasing + * warnings. The warnings seem harmless on x86 but on PPC they cause + * real failures. + */ +static INLINE void +copy_token(void *dst, const void *src) +{ + memcpy(dst, src, 4); +} + + +/** + * Get next 4-byte token, return it at address specified by 'token' + */ +static void +next_token( + struct tgsi_parse_context *ctx, + void *token ) +{ + assert( !tgsi_parse_end_of_tokens( ctx ) ); + copy_token(token, &ctx->Tokens[ctx->Position]); + ctx->Position++; +} + + +void +tgsi_parse_token( + struct tgsi_parse_context *ctx ) +{ + struct tgsi_token token; + unsigned i; + + tgsi_full_token_free( &ctx->FullToken ); + tgsi_full_token_init( &ctx->FullToken ); + + next_token( ctx, &token ); + + switch( token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + { + struct tgsi_full_declaration *decl = &ctx->FullToken.FullDeclaration; + + *decl = tgsi_default_full_declaration(); + copy_token(&decl->Declaration, &token); + + next_token( ctx, &decl->DeclarationRange ); + + if( decl->Declaration.Semantic ) { + next_token( ctx, &decl->Semantic ); + } + + break; + } + + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate; + + *imm = tgsi_default_full_immediate(); + copy_token(&imm->Immediate, &token); + assert( !imm->Immediate.Extended ); + + switch (imm->Immediate.DataType) { + case TGSI_IMM_FLOAT32: + imm->u.Pointer = MALLOC( + sizeof( struct tgsi_immediate_float32 ) * (imm->Immediate.Size - 1) ); + for( i = 0; i < imm->Immediate.Size - 1; i++ ) { + next_token( ctx, (struct tgsi_immediate_float32 *) &imm->u.ImmediateFloat32[i] ); + } + break; + + default: + assert( 0 ); + } + + break; + } + + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + struct tgsi_full_instruction *inst = &ctx->FullToken.FullInstruction; + unsigned extended; + + *inst = tgsi_default_full_instruction(); + copy_token(&inst->Instruction, &token); + extended = inst->Instruction.Extended; + + while( extended ) { + struct tgsi_src_register_ext 
token; + + next_token( ctx, &token ); + + switch( token.Type ) { + case TGSI_INSTRUCTION_EXT_TYPE_NV: + copy_token(&inst->InstructionExtNv, &token); + break; + + case TGSI_INSTRUCTION_EXT_TYPE_LABEL: + copy_token(&inst->InstructionExtLabel, &token); + break; + + case TGSI_INSTRUCTION_EXT_TYPE_TEXTURE: + copy_token(&inst->InstructionExtTexture, &token); + break; + + default: + assert( 0 ); + } + + extended = token.Extended; + } + + assert( inst->Instruction.NumDstRegs <= TGSI_FULL_MAX_DST_REGISTERS ); + + for( i = 0; i < inst->Instruction.NumDstRegs; i++ ) { + unsigned extended; + + next_token( ctx, &inst->FullDstRegisters[i].DstRegister ); + + /* + * No support for indirect or multi-dimensional addressing. + */ + assert( !inst->FullDstRegisters[i].DstRegister.Indirect ); + assert( !inst->FullDstRegisters[i].DstRegister.Dimension ); + + extended = inst->FullDstRegisters[i].DstRegister.Extended; + + while( extended ) { + struct tgsi_src_register_ext token; + + next_token( ctx, &token ); + + switch( token.Type ) { + case TGSI_DST_REGISTER_EXT_TYPE_CONDCODE: + copy_token(&inst->FullDstRegisters[i].DstRegisterExtConcode, + &token); + break; + + case TGSI_DST_REGISTER_EXT_TYPE_MODULATE: + copy_token(&inst->FullDstRegisters[i].DstRegisterExtModulate, + &token); + break; + + default: + assert( 0 ); + } + + extended = token.Extended; + } + } + + assert( inst->Instruction.NumSrcRegs <= TGSI_FULL_MAX_SRC_REGISTERS ); + + for( i = 0; i < inst->Instruction.NumSrcRegs; i++ ) { + unsigned extended; + + next_token( ctx, &inst->FullSrcRegisters[i].SrcRegister ); + + extended = inst->FullSrcRegisters[i].SrcRegister.Extended; + + while( extended ) { + struct tgsi_src_register_ext token; + + next_token( ctx, &token ); + + switch( token.Type ) { + case TGSI_SRC_REGISTER_EXT_TYPE_SWZ: + copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtSwz, + &token); + break; + + case TGSI_SRC_REGISTER_EXT_TYPE_MOD: + copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtMod, + &token); + break; + + default: + assert( 0 ); + } + + extended = token.Extended; + } + + if( inst->FullSrcRegisters[i].SrcRegister.Indirect ) { + next_token( ctx, &inst->FullSrcRegisters[i].SrcRegisterInd ); + + /* + * No support for indirect or multi-dimensional addressing. + */ + assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Indirect ); + assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Dimension ); + assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Extended ); + } + + if( inst->FullSrcRegisters[i].SrcRegister.Dimension ) { + next_token( ctx, &inst->FullSrcRegisters[i].SrcRegisterDim ); + + /* + * No support for multi-dimensional addressing. + */ + assert( !inst->FullSrcRegisters[i].SrcRegisterDim.Dimension ); + assert( !inst->FullSrcRegisters[i].SrcRegisterDim.Extended ); + + if( inst->FullSrcRegisters[i].SrcRegisterDim.Indirect ) { + next_token( ctx, &inst->FullSrcRegisters[i].SrcRegisterDimInd ); + + /* + * No support for indirect or multi-dimensional addressing. + */ + assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Indirect ); + assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Dimension ); + assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Extended ); + } + } + } + + break; + } + + default: + assert( 0 ); + } +} + + +unsigned +tgsi_num_tokens(const struct tgsi_token *tokens) +{ + struct tgsi_parse_context ctx; + if (tgsi_parse_init(&ctx, tokens) == TGSI_PARSE_OK) { + unsigned len = (ctx.FullHeader.Header.HeaderSize + + ctx.FullHeader.Header.BodySize + + 1); + return len; + } + return 0; +} + + +/** + * Make a new copy of a token array. 
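+ * The copy is allocated with MALLOC(); returns NULL if that allocation fails.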
+ */ +struct tgsi_token * +tgsi_dup_tokens(const struct tgsi_token *tokens) +{ + unsigned n = tgsi_num_tokens(tokens); + unsigned bytes = n * sizeof(struct tgsi_token); + struct tgsi_token *new_tokens = (struct tgsi_token *) MALLOC(bytes); + if (new_tokens) + memcpy(new_tokens, tokens, bytes); + return new_tokens; +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.h b/src/gallium/auxiliary/tgsi/tgsi_parse.h new file mode 100644 index 0000000000..054350712d --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_parse.h @@ -0,0 +1,151 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef TGSI_PARSE_H +#define TGSI_PARSE_H + +#include "pipe/p_shader_tokens.h" + +#if defined __cplusplus +extern "C" { +#endif + +struct tgsi_full_version +{ + struct tgsi_version Version; +}; + +struct tgsi_full_header +{ + struct tgsi_header Header; + struct tgsi_processor Processor; +}; + +struct tgsi_full_dst_register +{ + struct tgsi_dst_register DstRegister; + struct tgsi_dst_register_ext_concode DstRegisterExtConcode; + struct tgsi_dst_register_ext_modulate DstRegisterExtModulate; +}; + +struct tgsi_full_src_register +{ + struct tgsi_src_register SrcRegister; + struct tgsi_src_register_ext_swz SrcRegisterExtSwz; + struct tgsi_src_register_ext_mod SrcRegisterExtMod; + struct tgsi_src_register SrcRegisterInd; + struct tgsi_dimension SrcRegisterDim; + struct tgsi_src_register SrcRegisterDimInd; +}; + +struct tgsi_full_declaration +{ + struct tgsi_declaration Declaration; + struct tgsi_declaration_range DeclarationRange; + struct tgsi_declaration_semantic Semantic; +}; + +struct tgsi_full_immediate +{ + struct tgsi_immediate Immediate; + union + { + const void *Pointer; + const struct tgsi_immediate_float32 *ImmediateFloat32; + } u; +}; + +#define TGSI_FULL_MAX_DST_REGISTERS 2 +#define TGSI_FULL_MAX_SRC_REGISTERS 4 /* TXD has 4 */ + +struct tgsi_full_instruction +{ + struct tgsi_instruction Instruction; + struct tgsi_instruction_ext_nv InstructionExtNv; + struct tgsi_instruction_ext_label InstructionExtLabel; + struct tgsi_instruction_ext_texture InstructionExtTexture; + struct tgsi_full_dst_register FullDstRegisters[TGSI_FULL_MAX_DST_REGISTERS]; + struct tgsi_full_src_register FullSrcRegisters[TGSI_FULL_MAX_SRC_REGISTERS]; +}; + +union tgsi_full_token +{ + struct tgsi_token Token; + struct tgsi_full_declaration FullDeclaration; + struct tgsi_full_immediate FullImmediate; + struct tgsi_full_instruction FullInstruction; +}; + +void +tgsi_full_token_init( + union tgsi_full_token *full_token ); + +void +tgsi_full_token_free( + union tgsi_full_token *full_token ); + +struct tgsi_parse_context +{ + const struct tgsi_token *Tokens; + unsigned Position; + struct tgsi_full_version FullVersion; + struct tgsi_full_header FullHeader; + union tgsi_full_token FullToken; +}; + +#define TGSI_PARSE_OK 0 +#define TGSI_PARSE_ERROR 1 + +unsigned +tgsi_parse_init( + struct tgsi_parse_context *ctx, + const struct tgsi_token *tokens ); + +void +tgsi_parse_free( + struct tgsi_parse_context *ctx ); + +boolean +tgsi_parse_end_of_tokens( + struct tgsi_parse_context *ctx ); + +void +tgsi_parse_token( + struct tgsi_parse_context *ctx ); + +unsigned +tgsi_num_tokens(const struct tgsi_token *tokens); + +struct tgsi_token * +tgsi_dup_tokens(const struct tgsi_token *tokens); + +#if defined __cplusplus +} +#endif + +#endif /* TGSI_PARSE_H */ + diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.c b/src/gallium/auxiliary/tgsi/tgsi_ppc.c new file mode 100644 index 0000000000..a92b1902e3 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.c @@ -0,0 +1,1329 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * TGSI to PowerPC code generation. + */ + +#include "pipe/p_config.h" + +#if defined(PIPE_ARCH_PPC) + +#include "pipe/p_debug.h" +#include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_sse.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi_dump.h" +#include "tgsi_exec.h" +#include "tgsi_ppc.h" +#include "rtasm/rtasm_ppc.h" + + +/** + * Since it's pretty much impossible to form PPC vector immediates, load + * them from memory here: + */ +const float ppc_builtin_constants[] ALIGN16_ATTRIB = { + 1.0f, -128.0f, 128.0f, 0.0f +}; + + +#define FOR_EACH_CHANNEL( CHAN )\ + for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++) + +#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\ + ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN))) + +#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\ + if (IS_DST0_CHANNEL_ENABLED( INST, CHAN )) + +#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\ + FOR_EACH_CHANNEL( CHAN )\ + IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN ) + +#define CHAN_X 0 +#define CHAN_Y 1 +#define CHAN_Z 2 +#define CHAN_W 3 + + +/** + * How many TGSI temps should be implemented with real PPC vector registers + * rather than memory. + */ +#define MAX_PPC_TEMPS 4 + + +struct reg_chan_vec +{ + struct tgsi_full_src_register src; + uint chan; + uint vec; +}; + + +/** + * Context/state used during code gen. + */ +struct gen_context +{ + struct ppc_function *f; + int inputs_reg; /**< GP register pointing to input params */ + int outputs_reg; /**< GP register pointing to output params */ + int temps_reg; /**< GP register pointing to temporary "registers" */ + int immed_reg; /**< GP register pointing to immediates buffer */ + int const_reg; /**< GP register pointing to constants buffer */ + int builtins_reg; /**< GP register pointing to built-in constants */ + + int offset_reg; /**< used to reduce redundant li instructions */ + int offset_value; + + int one_vec; /**< vector register with {1.0, 1.0, 1.0, 1.0} */ + int bit31_vec; /**< vector register with {1<<31, 1<<31, 1<<31, 1<<31} */ + + /** + * Map TGSI temps to PPC vector temps. + * We have 32 PPC vector regs. Use 16 of them for storing 4 TGSI temps. + * XXX currently only do this for TGSI temps [0..MAX_PPC_TEMPS-1].
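+ * temps_map[i][chan] is the PPC vector register allocated for TGSI temp i, channel chan (see init_gen_context).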
+ */ + int temps_map[MAX_PPC_TEMPS][4]; + + /** + * Cache of src registers. + * This is used to avoid redundant load instructions. + */ + struct { + struct tgsi_full_src_register src; + uint chan; + uint vec; + } regs[12]; /* 3 src regs, 4 channels */ + uint num_regs; +}; + + +/** + * Initialize code generation context. + */ +static void +init_gen_context(struct gen_context *gen, struct ppc_function *func) +{ + uint i; + + memset(gen, 0, sizeof(*gen)); + gen->f = func; + gen->inputs_reg = ppc_reserve_register(func, 3); /* first function param */ + gen->outputs_reg = ppc_reserve_register(func, 4); /* second function param */ + gen->temps_reg = ppc_reserve_register(func, 5); /* ... */ + gen->immed_reg = ppc_reserve_register(func, 6); + gen->const_reg = ppc_reserve_register(func, 7); + gen->builtins_reg = ppc_reserve_register(func, 8); + gen->one_vec = -1; + gen->bit31_vec = -1; + gen->offset_reg = -1; + gen->offset_value = -9999999; + for (i = 0; i < MAX_PPC_TEMPS; i++) { + gen->temps_map[i][0] = ppc_allocate_vec_register(gen->f); + gen->temps_map[i][1] = ppc_allocate_vec_register(gen->f); + gen->temps_map[i][2] = ppc_allocate_vec_register(gen->f); + gen->temps_map[i][3] = ppc_allocate_vec_register(gen->f); + } +} + + +/** + * All PPC vector load/store instructions form an effective address + * by adding the contents of two registers. For example: + * lvx v2,r8,r9 # v2 = memory[r8 + r9] + * stvx v2,r8,r9 # memory[r8 + r9] = v2; + * So our lvx/stvx instructions are typically preceded by an 'li' instruction + * to load r9 (above) with an immediate (an offset). + * This code emits that 'li' instruction, but only if the offset value is + * different than the previous 'li'. + * This optimization seems to save about 10% in the instruction count. + * Note that we need to unconditionally emit an 'li' inside basic blocks + * (such as inside loops). + */ +static int +emit_li_offset(struct gen_context *gen, int offset) +{ + if (gen->offset_reg <= 0) { + /* allocate a GP register for storing load/store offset */ + gen->offset_reg = ppc_allocate_register(gen->f); + } + + /* emit new 'li' if offset is changing */ + if (gen->offset_value < 0 || gen->offset_value != offset) { + gen->offset_value = offset; + ppc_li(gen->f, gen->offset_reg, offset); + } + + return gen->offset_reg; +} + + +/** + * Forces subsequent emit_li_offset() calls to emit an 'li'. + * To be called at the top of basic blocks. + */ +static void +reset_li_offset(struct gen_context *gen) +{ + gen->offset_value = -9999999; +} + + + +/** + * Load the given vector register with {value, value, value, value}. + * The value must be in the ppu_builtin_constants[] array. + * We wouldn't need this if there was a simple way to load PPC vector + * registers with immediate values! + */ +static void +load_constant_vec(struct gen_context *gen, int dst_vec, float value) +{ + uint pos; + for (pos = 0; pos < Elements(ppc_builtin_constants); pos++) { + if (ppc_builtin_constants[pos] == value) { + int offset = pos * 4; + int offset_reg = emit_li_offset(gen, offset); + + /* Load 4-byte word into vector register. + * The vector slot depends on the effective address we load from. + * We know that our builtins start at a 16-byte boundary so we + * know that 'swizzle' tells us which vector slot will have the + * loaded word. The other vector slots will be undefined. 
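+ * (The builtins array is 16-byte aligned and offset = pos * 4, so lvewx puts the word in vector element pos % 4, the slot the vspltw below splats from.)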
+ */ + ppc_lvewx(gen->f, dst_vec, gen->builtins_reg, offset_reg); + /* splat word[pos % 4] across the vector reg */ + ppc_vspltw(gen->f, dst_vec, dst_vec, pos % 4); + return; + } + } + assert(0 && "Need to add new constant to ppc_builtin_constants array"); +} + + +/** + * Return index of vector register containing {1.0, 1.0, 1.0, 1.0}. + */ +static int +gen_one_vec(struct gen_context *gen) +{ + if (gen->one_vec < 0) { + gen->one_vec = ppc_allocate_vec_register(gen->f); + load_constant_vec(gen, gen->one_vec, 1.0f); + } + return gen->one_vec; +} + +/** + * Return index of vector register containing {1<<31, 1<<31, 1<<31, 1<<31}. + */ +static int +gen_get_bit31_vec(struct gen_context *gen) +{ + if (gen->bit31_vec < 0) { + gen->bit31_vec = ppc_allocate_vec_register(gen->f); + ppc_vspltisw(gen->f, gen->bit31_vec, -1); + ppc_vslw(gen->f, gen->bit31_vec, gen->bit31_vec, gen->bit31_vec); + } + return gen->bit31_vec; +} + + +/** + * Register fetch. Return PPC vector register with result. + */ +static int +emit_fetch(struct gen_context *gen, + const struct tgsi_full_src_register *reg, + const unsigned chan_index) +{ + uint swizzle = tgsi_util_get_full_src_register_extswizzle(reg, chan_index); + int dst_vec = -1; + + switch (swizzle) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + switch (reg->SrcRegister.File) { + case TGSI_FILE_INPUT: + { + int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16; + int offset_reg = emit_li_offset(gen, offset); + dst_vec = ppc_allocate_vec_register(gen->f); + ppc_lvx(gen->f, dst_vec, gen->inputs_reg, offset_reg); + } + break; + case TGSI_FILE_TEMPORARY: + if (reg->SrcRegister.Index < MAX_PPC_TEMPS) { + /* use PPC vec register */ + dst_vec = gen->temps_map[reg->SrcRegister.Index][swizzle]; + } + else { + /* use memory-based temp register "file" */ + int offset = (reg->SrcRegister.Index * 4 + swizzle) * 16; + int offset_reg = emit_li_offset(gen, offset); + dst_vec = ppc_allocate_vec_register(gen->f); + ppc_lvx(gen->f, dst_vec, gen->temps_reg, offset_reg); + } + break; + case TGSI_FILE_IMMEDIATE: + { + int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4; + int offset_reg = emit_li_offset(gen, offset); + dst_vec = ppc_allocate_vec_register(gen->f); + /* Load 4-byte word into vector register. + * The vector slot depends on the effective address we load from. + * We know that our immediates start at a 16-byte boundary so we + * know that 'swizzle' tells us which vector slot will have the + * loaded word. The other vector slots will be undefined. + */ + ppc_lvewx(gen->f, dst_vec, gen->immed_reg, offset_reg); + /* splat word[swizzle] across the vector reg */ + ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle); + } + break; + case TGSI_FILE_CONSTANT: + { + int offset = (reg->SrcRegister.Index * 4 + swizzle) * 4; + int offset_reg = emit_li_offset(gen, offset); + dst_vec = ppc_allocate_vec_register(gen->f); + /* Load 4-byte word into vector register. + * The vector slot depends on the effective address we load from. + * We know that our constants start at a 16-byte boundary so we + * know that 'swizzle' tells us which vector slot will have the + * loaded word. The other vector slots will be undefined. 
+ */ + ppc_lvewx(gen->f, dst_vec, gen->const_reg, offset_reg); + /* splat word[swizzle] across the vector reg */ + ppc_vspltw(gen->f, dst_vec, dst_vec, swizzle); + } + break; + default: + assert( 0 ); + } + break; + case TGSI_EXTSWIZZLE_ZERO: + dst_vec = ppc_allocate_vec_register(gen->f); + ppc_vzero(gen->f, dst_vec); + break; + case TGSI_EXTSWIZZLE_ONE: + { + int one_vec = gen_one_vec(gen); + dst_vec = ppc_allocate_vec_register(gen->f); + ppc_vmove(gen->f, dst_vec, one_vec); + } + break; + default: + assert( 0 ); + } + + assert(dst_vec >= 0); + + { + uint sign_op = tgsi_util_get_full_src_register_sign_mode(reg, chan_index); + if (sign_op != TGSI_UTIL_SIGN_KEEP) { + int bit31_vec = gen_get_bit31_vec(gen); + + switch (sign_op) { + case TGSI_UTIL_SIGN_CLEAR: + /* vec = vec & ~bit31 */ + ppc_vandc(gen->f, dst_vec, dst_vec, bit31_vec); + break; + case TGSI_UTIL_SIGN_SET: + /* vec = vec | bit31 */ + ppc_vor(gen->f, dst_vec, dst_vec, bit31_vec); + break; + case TGSI_UTIL_SIGN_TOGGLE: + /* vec = vec ^ bit31 */ + ppc_vxor(gen->f, dst_vec, dst_vec, bit31_vec); + break; + default: + assert(0); + } + } + } + + return dst_vec; +} + + + +/** + * Test if two TGSI src registers refer to the same memory location. + * We use this to avoid redundant register loads. + */ +static boolean +equal_src_locs(const struct tgsi_full_src_register *a, uint chan_a, + const struct tgsi_full_src_register *b, uint chan_b) +{ + int swz_a, swz_b; + int sign_a, sign_b; + if (a->SrcRegister.File != b->SrcRegister.File) + return FALSE; + if (a->SrcRegister.Index != b->SrcRegister.Index) + return FALSE; + swz_a = tgsi_util_get_full_src_register_extswizzle(a, chan_a); + swz_b = tgsi_util_get_full_src_register_extswizzle(b, chan_b); + if (swz_a != swz_b) + return FALSE; + sign_a = tgsi_util_get_full_src_register_sign_mode(a, chan_a); + sign_b = tgsi_util_get_full_src_register_sign_mode(b, chan_b); + if (sign_a != sign_b) + return FALSE; + return TRUE; +} + + +/** + * Given a TGSI src register and channel index, return the PPC vector + * register containing the value. We use a cache to prevent re-loading + * the same register multiple times. + * \return index of PPC vector register with the desired src operand + */ +static int +get_src_vec(struct gen_context *gen, + struct tgsi_full_instruction *inst, int src_reg, uint chan) +{ + const struct tgsi_full_src_register *src = + &inst->FullSrcRegisters[src_reg]; + int vec; + uint i; + + /* check the cache */ + for (i = 0; i < gen->num_regs; i++) { + if (equal_src_locs(&gen->regs[i].src, gen->regs[i].chan, src, chan)) { + /* cache hit */ + assert(gen->regs[i].vec >= 0); + return gen->regs[i].vec; + } + } + + /* cache miss: allocate new vec reg and emit fetch/load code */ + vec = emit_fetch(gen, src, chan); + gen->regs[gen->num_regs].src = *src; + gen->regs[gen->num_regs].chan = chan; + gen->regs[gen->num_regs].vec = vec; + gen->num_regs++; + + assert(gen->num_regs <= Elements(gen->regs)); + + assert(vec >= 0); + + return vec; +} + + +/** + * Clear the src operand cache. To be called at the end of each emit function.
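+ * Vectors for TGSI temps that live in dedicated PPC registers (index < MAX_PPC_TEMPS) are kept; all other cached vectors are released.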
+ */ +static void +release_src_vecs(struct gen_context *gen) +{ + uint i; + for (i = 0; i < gen->num_regs; i++) { + const struct tgsi_full_src_register src = gen->regs[i].src; + if (!(src.SrcRegister.File == TGSI_FILE_TEMPORARY && + src.SrcRegister.Index < MAX_PPC_TEMPS)) { + ppc_release_vec_register(gen->f, gen->regs[i].vec); + } + } + gen->num_regs = 0; +} + + + +static int +get_dst_vec(struct gen_context *gen, + const struct tgsi_full_instruction *inst, + unsigned chan_index) +{ + const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0]; + + if (reg->DstRegister.File == TGSI_FILE_TEMPORARY && + reg->DstRegister.Index < MAX_PPC_TEMPS) { + int vec = gen->temps_map[reg->DstRegister.Index][chan_index]; + return vec; + } + else { + return ppc_allocate_vec_register(gen->f); + } +} + + +/** + * Register store. Store 'src_vec' at location indicated by 'reg'. + * \param free_vec Should the src_vec be released when done? + */ +static void +emit_store(struct gen_context *gen, + int src_vec, + const struct tgsi_full_instruction *inst, + unsigned chan_index, + boolean free_vec) +{ + const struct tgsi_full_dst_register *reg = &inst->FullDstRegisters[0]; + + switch (reg->DstRegister.File) { + case TGSI_FILE_OUTPUT: + { + int offset = (reg->DstRegister.Index * 4 + chan_index) * 16; + int offset_reg = emit_li_offset(gen, offset); + ppc_stvx(gen->f, src_vec, gen->outputs_reg, offset_reg); + } + break; + case TGSI_FILE_TEMPORARY: + if (reg->DstRegister.Index < MAX_PPC_TEMPS) { + if (!free_vec) { + int dst_vec = gen->temps_map[reg->DstRegister.Index][chan_index]; + if (dst_vec != src_vec) + ppc_vmove(gen->f, dst_vec, src_vec); + } + free_vec = FALSE; + } + else { + int offset = (reg->DstRegister.Index * 4 + chan_index) * 16; + int offset_reg = emit_li_offset(gen, offset); + ppc_stvx(gen->f, src_vec, gen->temps_reg, offset_reg); + } + break; +#if 0 + case TGSI_FILE_ADDRESS: + emit_addrs( + func, + xmm, + reg->DstRegister.Index, + chan_index ); + break; +#endif + default: + assert( 0 ); + } + +#if 0 + switch( inst->Instruction.Saturate ) { + case TGSI_SAT_NONE: + break; + + case TGSI_SAT_ZERO_ONE: + /* assert( 0 ); */ + break; + + case TGSI_SAT_MINUS_PLUS_ONE: + assert( 0 ); + break; + } +#endif + + if (free_vec) + ppc_release_vec_register(gen->f, src_vec); +} + + +static void +emit_scalar_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + int v0, v1; + uint chan_index; + + v0 = get_src_vec(gen, inst, 0, CHAN_X); + v1 = ppc_allocate_vec_register(gen->f); + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_RSQ: + /* v1 = 1.0 / sqrt(v0) */ + ppc_vrsqrtefp(gen->f, v1, v0); + break; + case TGSI_OPCODE_RCP: + /* v1 = 1.0 / v0 */ + ppc_vrefp(gen->f, v1, v0); + break; + default: + assert(0); + } + + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + emit_store(gen, v1, inst, chan_index, FALSE); + } + + release_src_vecs(gen); + ppc_release_vec_register(gen->f, v1); +} + + +static void +emit_unaryop(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + uint chan_index; + FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) { + int v0 = get_src_vec(gen, inst, 0, chan_index); /* v0 = srcreg[0] */ + int v1 = get_dst_vec(gen, inst, chan_index); + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + /* turn off the most significant bit of each vector float word */ + { + int bit31_vec = gen_get_bit31_vec(gen); + ppc_vandc(gen->f, v1, v0, bit31_vec); /* v1 = v0 & ~bit31 */ + } + break; + case TGSI_OPCODE_FLOOR: + ppc_vrfim(gen->f, v1, v0); /* v1 =
floor(v0) */ + break; + case TGSI_OPCODE_FRAC: + ppc_vrfim(gen->f, v1, v0); /* tmp = floor(v0) */ + ppc_vsubfp(gen->f, v1, v0, v1); /* v1 = v0 - v1 */ + break; + case TGSI_OPCODE_EXPBASE2: + ppc_vexptefp(gen->f, v1, v0); /* v1 = 2^v0 */ + break; + case TGSI_OPCODE_LOGBASE2: + /* XXX this may be broken! */ + ppc_vlogefp(gen->f, v1, v0); /* v1 = log2(v0) */ + break; + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_SWZ: + if (v0 != v1) + ppc_vmove(gen->f, v1, v0); + break; + default: + assert(0); + } + emit_store(gen, v1, inst, chan_index, TRUE); /* store v0 */ + } + + release_src_vecs(gen); +} + + +static void +emit_binop(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + int zero_vec = -1; + uint chan; + + if (inst->Instruction.Opcode == TGSI_OPCODE_MUL) { + zero_vec = ppc_allocate_vec_register(gen->f); + ppc_vzero(gen->f, zero_vec); + } + + FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) { + /* fetch src operands */ + int v0 = get_src_vec(gen, inst, 0, chan); + int v1 = get_src_vec(gen, inst, 1, chan); + int v2 = get_dst_vec(gen, inst, chan); + + /* emit binop */ + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ADD: + ppc_vaddfp(gen->f, v2, v0, v1); + break; + case TGSI_OPCODE_SUB: + ppc_vsubfp(gen->f, v2, v0, v1); + break; + case TGSI_OPCODE_MUL: + ppc_vmaddfp(gen->f, v2, v0, v1, zero_vec); + break; + case TGSI_OPCODE_MIN: + ppc_vminfp(gen->f, v2, v0, v1); + break; + case TGSI_OPCODE_MAX: + ppc_vmaxfp(gen->f, v2, v0, v1); + break; + default: + assert(0); + } + + /* store v2 */ + emit_store(gen, v2, inst, chan, TRUE); + } + + if (inst->Instruction.Opcode == TGSI_OPCODE_MUL) + ppc_release_vec_register(gen->f, zero_vec); + + release_src_vecs(gen); +} + + +static void +emit_triop(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + uint chan; + + FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) { + /* fetch src operands */ + int v0 = get_src_vec(gen, inst, 0, chan); + int v1 = get_src_vec(gen, inst, 1, chan); + int v2 = get_src_vec(gen, inst, 2, chan); + int v3 = get_dst_vec(gen, inst, chan); + + /* emit ALU */ + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_MAD: + ppc_vmaddfp(gen->f, v3, v0, v1, v2); /* v3 = v0 * v1 + v2 */ + break; + case TGSI_OPCODE_LRP: + ppc_vsubfp(gen->f, v3, v1, v2); /* v3 = v1 - v2 */ + ppc_vmaddfp(gen->f, v3, v0, v3, v2); /* v3 = v0 * v3 + v2 */ + break; + default: + assert(0); + } + + /* store v3 */ + emit_store(gen, v3, inst, chan, TRUE); + } + + release_src_vecs(gen); +} + + +/** + * Vector comparisons, resulting in 1.0 or 0.0 values. + */ +static void +emit_inequality(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + uint chan; + int one_vec = gen_one_vec(gen); + + FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) { + /* fetch src operands */ + int v0 = get_src_vec(gen, inst, 0, chan); + int v1 = get_src_vec(gen, inst, 1, chan); + int v2 = get_dst_vec(gen, inst, chan); + boolean complement = FALSE; + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_SNE: + complement = TRUE; + /* fall-through */ + case TGSI_OPCODE_SEQ: + ppc_vcmpeqfpx(gen->f, v2, v0, v1); /* v2 = v0 == v1 ? ~0 : 0 */ + break; + + case TGSI_OPCODE_SGE: + complement = TRUE; + /* fall-through */ + case TGSI_OPCODE_SLT: + ppc_vcmpgtfpx(gen->f, v2, v1, v0); /* v2 = v1 > v0 ? ~0 : 0 */ + break; + + case TGSI_OPCODE_SLE: + complement = TRUE; + /* fall-through */ + case TGSI_OPCODE_SGT: + ppc_vcmpgtfpx(gen->f, v2, v0, v1); /* v2 = v0 > v1 ? 
~0 : 0 */ + break; + default: + assert(0); + } + + /* v2 is now {0,0,0,0} or {~0,~0,~0,~0} */ + + if (complement) + ppc_vandc(gen->f, v2, one_vec, v2); /* v2 = one_vec & ~v2 */ + else + ppc_vand(gen->f, v2, one_vec, v2); /* v2 = one_vec & v2 */ + + /* store v2 */ + emit_store(gen, v2, inst, chan, TRUE); + } + + release_src_vecs(gen); +} + + +static void +emit_dotprod(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + int v0, v1, v2; + uint chan_index; + + v2 = ppc_allocate_vec_register(gen->f); + + ppc_vxor(gen->f, v2, v2, v2); /* v2 = {0, 0, 0, 0} */ + + v0 = get_src_vec(gen, inst, 0, CHAN_X); /* v0 = src0.XXXX */ + v1 = get_src_vec(gen, inst, 1, CHAN_X); /* v1 = src1.XXXX */ + ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */ + + v0 = get_src_vec(gen, inst, 0, CHAN_Y); /* v0 = src0.YYYY */ + v1 = get_src_vec(gen, inst, 1, CHAN_Y); /* v1 = src1.YYYY */ + ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */ + + v0 = get_src_vec(gen, inst, 0, CHAN_Z); /* v0 = src0.ZZZZ */ + v1 = get_src_vec(gen, inst, 1, CHAN_Z); /* v1 = src1.ZZZZ */ + ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */ + + if (inst->Instruction.Opcode == TGSI_OPCODE_DP4) { + v0 = get_src_vec(gen, inst, 0, CHAN_W); /* v0 = src0.WWWW */ + v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */ + ppc_vmaddfp(gen->f, v2, v0, v1, v2); /* v2 = v0 * v1 + v2 */ + } + else if (inst->Instruction.Opcode == TGSI_OPCODE_DPH) { + v1 = get_src_vec(gen, inst, 1, CHAN_W); /* v1 = src1.WWWW */ + ppc_vaddfp(gen->f, v2, v2, v1); /* v2 = v2 + v1 */ + } + + FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan_index) { + emit_store(gen, v2, inst, chan_index, FALSE); /* store v2, free v2 later */ + } + + release_src_vecs(gen); + + ppc_release_vec_register(gen->f, v2); +} + + +/** Approximation for vr = pow(va, vb) */ +static void +ppc_vec_pow(struct ppc_function *f, int vr, int va, int vb) +{ + /* pow(a,b) ~= exp2(log2(a) * b) */ + int t_vec = ppc_allocate_vec_register(f); + int zero_vec = ppc_allocate_vec_register(f); + + ppc_vzero(f, zero_vec); + + ppc_vlogefp(f, t_vec, va); /* t = log2(va) */ + ppc_vmaddfp(f, t_vec, t_vec, vb, zero_vec); /* t = t * vb */ + ppc_vexptefp(f, vr, t_vec); /* vr = 2^t */ + + ppc_release_vec_register(f, t_vec); + ppc_release_vec_register(f, zero_vec); +} + + +static void +emit_lit(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + int one_vec = gen_one_vec(gen); + + /* Compute X */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) { + emit_store(gen, one_vec, inst, CHAN_X, FALSE); + } + + /* Compute Y, Z */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) || + IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + int x_vec; + int zero_vec = ppc_allocate_vec_register(gen->f); + + x_vec = get_src_vec(gen, inst, 0, CHAN_X); /* x_vec = src[0].x */ + + ppc_vzero(gen->f, zero_vec); /* zero = {0,0,0,0} */ + ppc_vmaxfp(gen->f, x_vec, x_vec, zero_vec); /* x_vec = max(x_vec, 0) */ + + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { + emit_store(gen, x_vec, inst, CHAN_Y, FALSE); + } + + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + int y_vec, w_vec; + int z_vec = ppc_allocate_vec_register(gen->f); + int pow_vec = ppc_allocate_vec_register(gen->f); + int pos_vec = ppc_allocate_vec_register(gen->f); + int p128_vec = ppc_allocate_vec_register(gen->f); + int n128_vec = ppc_allocate_vec_register(gen->f); + + y_vec = get_src_vec(gen, inst, 0, CHAN_Y); /* y_vec = src[0].y */ + ppc_vmaxfp(gen->f, y_vec, y_vec, zero_vec); /* y_vec = max(y_vec, 0) */ + + w_vec = get_src_vec(gen, inst, 0, CHAN_W); /* 
w_vec = src[0].w */ + + /* clamp W to [-128, 128] */ + load_constant_vec(gen, p128_vec, 128.0f); + load_constant_vec(gen, n128_vec, -128.0f); + ppc_vmaxfp(gen->f, w_vec, w_vec, n128_vec); /* w = max(w, -128) */ + ppc_vminfp(gen->f, w_vec, w_vec, p128_vec); /* w = min(w, 128) */ + + /* if temp.x > 0 + * z = pow(tmp.y, tmp.w) + * else + * z = 0.0 + */ + ppc_vec_pow(gen->f, pow_vec, y_vec, w_vec); /* pow = pow(y, w) */ + ppc_vcmpgtfpx(gen->f, pos_vec, x_vec, zero_vec); /* pos = x > 0 */ + ppc_vand(gen->f, z_vec, pow_vec, pos_vec); /* z = pow & pos */ + + emit_store(gen, z_vec, inst, CHAN_Z, FALSE); + + ppc_release_vec_register(gen->f, z_vec); + ppc_release_vec_register(gen->f, pow_vec); + ppc_release_vec_register(gen->f, pos_vec); + ppc_release_vec_register(gen->f, p128_vec); + ppc_release_vec_register(gen->f, n128_vec); + } + + ppc_release_vec_register(gen->f, zero_vec); + } + + /* Compute W */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) { + emit_store(gen, one_vec, inst, CHAN_W, FALSE); + } + + release_src_vecs(gen); +} + + +static void +emit_exp(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + const int one_vec = gen_one_vec(gen); + int src_vec; + + /* get src arg */ + src_vec = get_src_vec(gen, inst, 0, CHAN_X); + + /* Compute X = 2^floor(src) */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) { + int dst_vec = get_dst_vec(gen, inst, CHAN_X); + int tmp_vec = ppc_allocate_vec_register(gen->f); + ppc_vrfim(gen->f, tmp_vec, src_vec); /* tmp = floor(src); */ + ppc_vexptefp(gen->f, dst_vec, tmp_vec); /* dst = 2 ^ tmp */ + emit_store(gen, dst_vec, inst, CHAN_X, TRUE); + ppc_release_vec_register(gen->f, tmp_vec); + } + + /* Compute Y = src - floor(src) */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { + int dst_vec = get_dst_vec(gen, inst, CHAN_Y); + int tmp_vec = ppc_allocate_vec_register(gen->f); + ppc_vrfim(gen->f, tmp_vec, src_vec); /* tmp = floor(src); */ + ppc_vsubfp(gen->f, dst_vec, src_vec, tmp_vec); /* dst = src - tmp */ + emit_store(gen, dst_vec, inst, CHAN_Y, TRUE); + ppc_release_vec_register(gen->f, tmp_vec); + } + + /* Compute Z = RoughApprox2ToX(src) */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + int dst_vec = get_dst_vec(gen, inst, CHAN_Z); + ppc_vexptefp(gen->f, dst_vec, src_vec); /* dst = 2 ^ src */ + emit_store(gen, dst_vec, inst, CHAN_Z, TRUE); + } + + /* Compute W = 1.0 */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) { + emit_store(gen, one_vec, inst, CHAN_W, FALSE); + } + + release_src_vecs(gen); +} + + +static void +emit_log(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + const int bit31_vec = gen_get_bit31_vec(gen); + const int one_vec = gen_one_vec(gen); + int src_vec, abs_vec; + + /* get src arg */ + src_vec = get_src_vec(gen, inst, 0, CHAN_X); + + /* compute abs(src) */ + abs_vec = ppc_allocate_vec_register(gen->f); + ppc_vandc(gen->f, abs_vec, src_vec, bit31_vec); /* abs = src & ~bit31 */ + + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && + IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { + + /* compute tmp = floor(log2(abs)) */ + int tmp_vec = ppc_allocate_vec_register(gen->f); + ppc_vlogefp(gen->f, tmp_vec, abs_vec); /* tmp = log2(abs) */ + ppc_vrfim(gen->f, tmp_vec, tmp_vec); /* tmp = floor(tmp); */ + + /* Compute X = tmp */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) { + emit_store(gen, tmp_vec, inst, CHAN_X, FALSE); + } + + /* Compute Y = abs / 2^tmp */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { + const int zero_vec = ppc_allocate_vec_register(gen->f); + ppc_vzero(gen->f, zero_vec); + ppc_vexptefp(gen->f, tmp_vec, 
tmp_vec); /* tmp = 2 ^ tmp */ + ppc_vrefp(gen->f, tmp_vec, tmp_vec); /* tmp = 1 / tmp */ + /* tmp = abs * tmp + zero */ + ppc_vmaddfp(gen->f, tmp_vec, abs_vec, tmp_vec, zero_vec); + emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE); + ppc_release_vec_register(gen->f, zero_vec); + } + + ppc_release_vec_register(gen->f, tmp_vec); + } + + /* Compute Z = RoughApproxLog2(abs) */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + int dst_vec = get_dst_vec(gen, inst, CHAN_Z); + ppc_vlogefp(gen->f, dst_vec, abs_vec); /* dst = log2(abs) */ + emit_store(gen, dst_vec, inst, CHAN_Z, TRUE); + } + + /* Compute W = 1.0 */ + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) { + emit_store(gen, one_vec, inst, CHAN_W, FALSE); + } + + ppc_release_vec_register(gen->f, abs_vec); + release_src_vecs(gen); +} + + +static void +emit_pow(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + int s0_vec = get_src_vec(gen, inst, 0, CHAN_X); + int s1_vec = get_src_vec(gen, inst, 1, CHAN_X); + int pow_vec = ppc_allocate_vec_register(gen->f); + int chan; + + ppc_vec_pow(gen->f, pow_vec, s0_vec, s1_vec); + + FOR_EACH_DST0_ENABLED_CHANNEL(*inst, chan) { + emit_store(gen, pow_vec, inst, chan, FALSE); + } + + ppc_release_vec_register(gen->f, pow_vec); + + release_src_vecs(gen); +} + + +static void +emit_xpd(struct gen_context *gen, struct tgsi_full_instruction *inst) +{ + int x0_vec, y0_vec, z0_vec; + int x1_vec, y1_vec, z1_vec; + int zero_vec, tmp_vec; + int tmp2_vec; + + zero_vec = ppc_allocate_vec_register(gen->f); + ppc_vzero(gen->f, zero_vec); + + tmp_vec = ppc_allocate_vec_register(gen->f); + tmp2_vec = ppc_allocate_vec_register(gen->f); + + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) || + IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + x0_vec = get_src_vec(gen, inst, 0, CHAN_X); + x1_vec = get_src_vec(gen, inst, 1, CHAN_X); + } + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) || + IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) { + y0_vec = get_src_vec(gen, inst, 0, CHAN_Y); + y1_vec = get_src_vec(gen, inst, 1, CHAN_Y); + } + if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) || + IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) { + z0_vec = get_src_vec(gen, inst, 0, CHAN_Z); + z1_vec = get_src_vec(gen, inst, 1, CHAN_Z); + } + + IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) { + /* tmp = y0 * z1 */ + ppc_vmaddfp(gen->f, tmp_vec, y0_vec, z1_vec, zero_vec); + /* tmp = tmp - z0 * y1*/ + ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, z0_vec, y1_vec); + emit_store(gen, tmp_vec, inst, CHAN_X, FALSE); + } + IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) { + /* tmp = z0 * x1 */ + ppc_vmaddfp(gen->f, tmp_vec, z0_vec, x1_vec, zero_vec); + /* tmp = tmp - x0 * z1 */ + ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, x0_vec, z1_vec); + emit_store(gen, tmp_vec, inst, CHAN_Y, FALSE); + } + IF_IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) { + /* tmp = x0 * y1 */ + ppc_vmaddfp(gen->f, tmp_vec, x0_vec, y1_vec, zero_vec); + /* tmp = tmp - y0 * x1 */ + ppc_vnmsubfp(gen->f, tmp_vec, tmp_vec, y0_vec, x1_vec); + emit_store(gen, tmp_vec, inst, CHAN_Z, FALSE); + } + /* W is undefined */ + + ppc_release_vec_register(gen->f, tmp_vec); + ppc_release_vec_register(gen->f, zero_vec); + release_src_vecs(gen); +} + +static int +emit_instruction(struct gen_context *gen, + struct tgsi_full_instruction *inst) +{ + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_SWZ: + case TGSI_OPCODE_ABS: + case TGSI_OPCODE_FLOOR: + case TGSI_OPCODE_FRAC: + case TGSI_OPCODE_EXPBASE2: + case TGSI_OPCODE_LOGBASE2: + emit_unaryop(gen, inst); + break; + case TGSI_OPCODE_RSQ: + case TGSI_OPCODE_RCP: + 
emit_scalar_unaryop(gen, inst); + break; + case TGSI_OPCODE_ADD: + case TGSI_OPCODE_SUB: + case TGSI_OPCODE_MUL: + case TGSI_OPCODE_MIN: + case TGSI_OPCODE_MAX: + emit_binop(gen, inst); + break; + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SGE: + emit_inequality(gen, inst); + break; + case TGSI_OPCODE_MAD: + case TGSI_OPCODE_LRP: + emit_triop(gen, inst); + break; + case TGSI_OPCODE_DP3: + case TGSI_OPCODE_DP4: + case TGSI_OPCODE_DPH: + emit_dotprod(gen, inst); + break; + case TGSI_OPCODE_LIT: + emit_lit(gen, inst); + break; + case TGSI_OPCODE_LOG: + emit_log(gen, inst); + break; + case TGSI_OPCODE_EXP: + emit_exp(gen, inst); + break; + case TGSI_OPCODE_POW: + emit_pow(gen, inst); + break; + case TGSI_OPCODE_XPD: + emit_xpd(gen, inst); + break; + case TGSI_OPCODE_END: + /* normal end */ + return 1; + default: + return 0; + } + return 1; +} + + +static void +emit_declaration( + struct ppc_function *func, + struct tgsi_full_declaration *decl ) +{ + if( decl->Declaration.File == TGSI_FILE_INPUT ) { +#if 0 + unsigned first, last, mask; + unsigned i, j; + + first = decl->DeclarationRange.First; + last = decl->DeclarationRange.Last; + mask = decl->Declaration.UsageMask; + + for( i = first; i <= last; i++ ) { + for( j = 0; j < NUM_CHANNELS; j++ ) { + if( mask & (1 << j) ) { + switch( decl->Declaration.Interpolate ) { + case TGSI_INTERPOLATE_CONSTANT: + emit_coef_a0( func, 0, i, j ); + emit_inputs( func, 0, i, j ); + break; + + case TGSI_INTERPOLATE_LINEAR: + emit_tempf( func, 0, 0, TGSI_SWIZZLE_X ); + emit_coef_dadx( func, 1, i, j ); + emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y ); + emit_coef_dady( func, 3, i, j ); + emit_mul( func, 0, 1 ); /* x * dadx */ + emit_coef_a0( func, 4, i, j ); + emit_mul( func, 2, 3 ); /* y * dady */ + emit_add( func, 0, 4 ); /* x * dadx + a0 */ + emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */ + emit_inputs( func, 0, i, j ); + break; + + case TGSI_INTERPOLATE_PERSPECTIVE: + emit_tempf( func, 0, 0, TGSI_SWIZZLE_X ); + emit_coef_dadx( func, 1, i, j ); + emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y ); + emit_coef_dady( func, 3, i, j ); + emit_mul( func, 0, 1 ); /* x * dadx */ + emit_tempf( func, 4, 0, TGSI_SWIZZLE_W ); + emit_coef_a0( func, 5, i, j ); + emit_rcp( func, 4, 4 ); /* 1.0 / w */ + emit_mul( func, 2, 3 ); /* y * dady */ + emit_add( func, 0, 5 ); /* x * dadx + a0 */ + emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */ + emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */ + emit_inputs( func, 0, i, j ); + break; + + default: + assert( 0 ); + break; + } + } + } + } +#endif + } +} + + + +static void +emit_prologue(struct ppc_function *func) +{ + /* XXX set up stack frame */ +} + + +static void +emit_epilogue(struct ppc_function *func) +{ + ppc_return(func); + /* XXX restore prev stack frame */ + debug_printf("PPC: Emitted %u instructions\n", func->num_inst); +} + + + +/** + * Translate a TGSI vertex/fragment shader to PPC code. 
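+ * The generated code operates on SoA-style data; each immediate component is splatted into a float[4] vector (see the IMMEDIATE case below).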
+ * + * \param tokens the TGSI input shader + * \param func the output PPC code/function + * \param immediates buffer to place immediates, later passed to PPC func + * \return TRUE for success, FALSE if translation failed + */ +boolean +tgsi_emit_ppc(const struct tgsi_token *tokens, + struct ppc_function *func, + float (*immediates)[4], + boolean do_swizzles ) +{ + static int use_ppc_asm = -1; + struct tgsi_parse_context parse; + /*boolean instruction_phase = FALSE;*/ + unsigned ok = 1; + uint num_immediates = 0; + struct gen_context gen; + + if (use_ppc_asm < 0) { + /* If GALLIUM_NOPPC is set, don't use PPC codegen */ + use_ppc_asm = !debug_get_bool_option("GALLIUM_NOPPC", FALSE); + } + if (!use_ppc_asm) + return FALSE; + + if (0) { + debug_printf("\n********* TGSI->PPC ********\n"); + tgsi_dump(tokens, 0); + } + + util_init_math(); + + init_gen_context(&gen, func); + + emit_prologue(func); + + tgsi_parse_init( &parse, tokens ); + + while (!tgsi_parse_end_of_tokens(&parse) && ok) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: + if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { + emit_declaration(func, &parse.FullToken.FullDeclaration ); + } + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + ok = emit_instruction(&gen, &parse.FullToken.FullInstruction); + + if (!ok) { + debug_printf("failed to translate tgsi opcode %d to PPC (%s)\n", + parse.FullToken.FullInstruction.Instruction.Opcode, + parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ? + "vertex shader" : "fragment shader"); + } + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + /* splat each immediate component into a float[4] vector for SoA */ + { + const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1; + uint i; + assert(size <= 4); + assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES); + for (i = 0; i < size; i++) { + immediates[num_immediates][i] = + parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float; + } + num_immediates++; + } + break; + + default: + ok = 0; + assert( 0 ); + } + } + + emit_epilogue(func); + + tgsi_parse_free( &parse ); + + if (ppc_num_instructions(func) == 0) { + /* ran out of memory for instructions */ + ok = FALSE; + } + + if (!ok) + debug_printf("TGSI->PPC translation failed\n"); + + return ok; +} + +#endif /* PIPE_ARCH_PPC */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_ppc.h b/src/gallium/auxiliary/tgsi/tgsi_ppc.h new file mode 100644 index 0000000000..829ec075e7 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_ppc.h @@ -0,0 +1,51 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TGSI_PPC_H +#define TGSI_PPC_H + +#if defined __cplusplus +extern "C" { +#endif + +struct tgsi_token; +struct ppc_function; + +extern const float ppc_builtin_constants[]; + + +boolean +tgsi_emit_ppc(const struct tgsi_token *tokens, + struct ppc_function *function, + float (*immediates)[4], + boolean do_swizzles); + +#if defined __cplusplus +} +#endif + +#endif /* TGSI_PPC_H */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c new file mode 100644 index 0000000000..bc7b941b78 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c @@ -0,0 +1,364 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_debug.h" +#include "tgsi_sanity.h" +#include "tgsi_info.h" +#include "tgsi_iterate.h" + +#define MAX_REGISTERS 256 + +typedef uint reg_flag; + +#define BITS_IN_REG_FLAG (sizeof( reg_flag ) * 8) + +struct sanity_check_ctx +{ + struct tgsi_iterate_context iter; + + reg_flag regs_decl[TGSI_FILE_COUNT][MAX_REGISTERS / BITS_IN_REG_FLAG]; + reg_flag regs_used[TGSI_FILE_COUNT][MAX_REGISTERS / BITS_IN_REG_FLAG]; + boolean regs_ind_used[TGSI_FILE_COUNT]; + uint num_imms; + uint num_instructions; + uint index_of_END; + + uint errors; + uint warnings; +}; + +static void +report_error( + struct sanity_check_ctx *ctx, + const char *format, + ... ) +{ + va_list args; + + debug_printf( "Error : " ); + va_start( args, format ); + _debug_vprintf( format, args ); + va_end( args ); + debug_printf( "\n" ); + ctx->errors++; +} + +static void +report_warning( + struct sanity_check_ctx *ctx, + const char *format, + ... 
) +{ + va_list args; + + debug_printf( "Warning: " ); + va_start( args, format ); + _debug_vprintf( format, args ); + va_end( args ); + debug_printf( "\n" ); + ctx->warnings++; +} + +static boolean +check_file_name( + struct sanity_check_ctx *ctx, + uint file ) +{ + if (file <= TGSI_FILE_NULL || file >= TGSI_FILE_COUNT) { + report_error( ctx, "Invalid register file name" ); + return FALSE; + } + return TRUE; +} + +static boolean +is_register_declared( + struct sanity_check_ctx *ctx, + uint file, + int index ) +{ + assert( index >= 0 && index < MAX_REGISTERS ); + + return (ctx->regs_decl[file][index / BITS_IN_REG_FLAG] & (1 << (index % BITS_IN_REG_FLAG))) ? TRUE : FALSE; +} + +static boolean +is_any_register_declared( + struct sanity_check_ctx *ctx, + uint file ) +{ + uint i; + + for (i = 0; i < MAX_REGISTERS / BITS_IN_REG_FLAG; i++) + if (ctx->regs_decl[file][i]) + return TRUE; + return FALSE; +} + +static boolean +is_register_used( + struct sanity_check_ctx *ctx, + uint file, + int index ) +{ + assert( index < MAX_REGISTERS ); + + return (ctx->regs_used[file][index / BITS_IN_REG_FLAG] & (1 << (index % BITS_IN_REG_FLAG))) ? TRUE : FALSE; +} + +static const char *file_names[] = +{ + "NULL", + "CONST", + "IN", + "OUT", + "TEMP", + "SAMP", + "ADDR", + "IMM" +}; + +static boolean +check_register_usage( + struct sanity_check_ctx *ctx, + uint file, + int index, + const char *name, + boolean indirect_access ) +{ + if (!check_file_name( ctx, file )) + return FALSE; + + if (indirect_access) { + /* Note that 'index' is an offset relative to the value of the + * address register. No range checking done here. + */ + if (!is_any_register_declared( ctx, file )) + report_error( ctx, "%s: Undeclared %s register", file_names[file], name ); + ctx->regs_ind_used[file] = TRUE; + } + else { + if (index < 0 || index > MAX_REGISTERS) { + report_error( ctx, "%s[%i]: Invalid index %s", + file_names[file], index, name ); + return FALSE; + } + + if (!is_register_declared( ctx, file, index )) + report_error( ctx, "%s[%d]: Undeclared %s register", file_names[file], index, name ); + ctx->regs_used[file][index / BITS_IN_REG_FLAG] |= (1 << (index % BITS_IN_REG_FLAG)); + } + return TRUE; +} + +static boolean +iter_instruction( + struct tgsi_iterate_context *iter, + struct tgsi_full_instruction *inst ) +{ + struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter; + const struct tgsi_opcode_info *info; + uint i; + + if (inst->Instruction.Opcode == TGSI_OPCODE_END) { + if (ctx->index_of_END != ~0) { + report_error( ctx, "Too many END instructions" ); + } + ctx->index_of_END = ctx->num_instructions; + } + + info = tgsi_get_opcode_info( inst->Instruction.Opcode ); + if (info == NULL) { + report_error( ctx, "Invalid instruction opcode" ); + return TRUE; + } + + if (info->num_dst != inst->Instruction.NumDstRegs) { + report_error( ctx, "Invalid number of destination operands" ); + } + if (info->num_src != inst->Instruction.NumSrcRegs) { + report_error( ctx, "Invalid number of source operands" ); + } + + /* Check destination and source registers' validity. + * Mark the registers as used. 
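The declared/used tracking above is a plain per-file bitmap: with a 32-bit reg_flag and MAX_REGISTERS == 256, each register file needs eight words, and register index i maps to bit i % 32 of word i / 32. A minimal sketch of the same bookkeeping:

static void
set_reg_bit(reg_flag *words, int index)
{
   words[index / BITS_IN_REG_FLAG] |= 1 << (index % BITS_IN_REG_FLAG);
}

static boolean
test_reg_bit(const reg_flag *words, int index)
{
   return (words[index / BITS_IN_REG_FLAG] >> (index % BITS_IN_REG_FLAG)) & 1;
}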
+ */ + for (i = 0; i < inst->Instruction.NumDstRegs; i++) { + check_register_usage( + ctx, + inst->FullDstRegisters[i].DstRegister.File, + inst->FullDstRegisters[i].DstRegister.Index, + "destination", + FALSE ); + } + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + check_register_usage( + ctx, + inst->FullSrcRegisters[i].SrcRegister.File, + inst->FullSrcRegisters[i].SrcRegister.Index, + "source", + (boolean)inst->FullSrcRegisters[i].SrcRegister.Indirect ); + if (inst->FullSrcRegisters[i].SrcRegister.Indirect) { + uint file; + int index; + + file = inst->FullSrcRegisters[i].SrcRegisterInd.File; + index = inst->FullSrcRegisters[i].SrcRegisterInd.Index; + check_register_usage( + ctx, + file, + index, + "indirect", + FALSE ); + if (file != TGSI_FILE_ADDRESS || index != 0) + report_warning( ctx, "Indirect register not ADDR[0]" ); + } + } + + ctx->num_instructions++; + + return TRUE; +} + +static boolean +iter_declaration( + struct tgsi_iterate_context *iter, + struct tgsi_full_declaration *decl ) +{ + struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter; + uint file; + uint i; + + /* No declarations allowed after the first instruction. + */ + if (ctx->num_instructions > 0) + report_error( ctx, "Instruction expected but declaration found" ); + + /* Check registers' validity. + * Mark the registers as declared. + */ + file = decl->Declaration.File; + if (!check_file_name( ctx, file )) + return TRUE; + for (i = decl->DeclarationRange.First; i <= decl->DeclarationRange.Last; i++) { + if (is_register_declared( ctx, file, i )) + report_error( ctx, "The same register declared twice" ); + ctx->regs_decl[file][i / BITS_IN_REG_FLAG] |= (1 << (i % BITS_IN_REG_FLAG)); + } + + return TRUE; +} + +static boolean +iter_immediate( + struct tgsi_iterate_context *iter, + struct tgsi_full_immediate *imm ) +{ + struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter; + + assert( ctx->num_imms < MAX_REGISTERS ); + + /* No immediates allowed after the first instruction. + */ + if (ctx->num_instructions > 0) + report_error( ctx, "Instruction expected but immediate found" ); + + /* Mark the register as declared. + */ + ctx->regs_decl[TGSI_FILE_IMMEDIATE][ctx->num_imms / BITS_IN_REG_FLAG] |= (1 << (ctx->num_imms % BITS_IN_REG_FLAG)); + ctx->num_imms++; + + /* Check data type validity. + */ + if (imm->Immediate.DataType != TGSI_IMM_FLOAT32) { + report_error( ctx, "Invalid immediate data type" ); + return TRUE; + } + + return TRUE; +} + +static boolean +epilog( + struct tgsi_iterate_context *iter ) +{ + struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter; + uint file; + + /* There must be an END instruction somewhere. + */ + if (ctx->index_of_END == ~0) { + report_error( ctx, "Missing END instruction" ); + } + + /* Check if all declared registers were used. + */ + for (file = TGSI_FILE_NULL; file < TGSI_FILE_COUNT; file++) { + uint i; + + for (i = 0; i < MAX_REGISTERS; i++) { + if (is_register_declared( ctx, file, i ) && !is_register_used( ctx, file, i ) && !ctx->regs_ind_used[file]) { + report_warning( ctx, "Register never used" ); + } + } + } + + /* Print totals, if any. 
+ */ + if (ctx->errors || ctx->warnings) + debug_printf( "%u errors, %u warnings\n", ctx->errors, ctx->warnings ); + + return TRUE; +} + +boolean +tgsi_sanity_check( + struct tgsi_token *tokens ) +{ + struct sanity_check_ctx ctx; + + ctx.iter.prolog = NULL; + ctx.iter.iterate_instruction = iter_instruction; + ctx.iter.iterate_declaration = iter_declaration; + ctx.iter.iterate_immediate = iter_immediate; + ctx.iter.epilog = epilog; + + memset( ctx.regs_decl, 0, sizeof( ctx.regs_decl ) ); + memset( ctx.regs_used, 0, sizeof( ctx.regs_used ) ); + memset( ctx.regs_ind_used, 0, sizeof( ctx.regs_ind_used ) ); + ctx.num_imms = 0; + ctx.num_instructions = 0; + ctx.index_of_END = ~0; + + ctx.errors = 0; + ctx.warnings = 0; + + if (!tgsi_iterate_shader( tokens, &ctx.iter )) + return FALSE; + + return ctx.errors == 0; +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.h b/src/gallium/auxiliary/tgsi/tgsi_sanity.h new file mode 100644 index 0000000000..ca45e94c7a --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.h @@ -0,0 +1,49 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TGSI_SANITY_H +#define TGSI_SANITY_H + +#include "pipe/p_shader_tokens.h" + +#if defined __cplusplus +extern "C" { +#endif + +/* Check the given token stream for errors and common mistakes. + * Diagnostic messages are printed out to the debug output. + * Returns TRUE if there are no errors, even though there could be some warnings. + */ +boolean +tgsi_sanity_check( + struct tgsi_token *tokens ); + +#if defined __cplusplus +} +#endif + +#endif /* TGSI_SANITY_H */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c new file mode 100644 index 0000000000..be4870a498 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c @@ -0,0 +1,226 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
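Typical usage of the checker declared in tgsi_sanity.h above is just a boolean gate on the token stream, for example (where tokens is whatever shader the state tracker handed over):

if (!tgsi_sanity_check(tokens)) {
   debug_printf("TGSI shader failed sanity check\n");
   /* reject the shader or fall back to a safe path */
}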
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * TGSI program scan utility. + * Used to determine which registers and instructions are used by a shader. + * + * Authors: Brian Paul + */ + + +#include "util/u_math.h" +#include "tgsi/tgsi_build.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_scan.h" + + + + +/** + */ +void +tgsi_scan_shader(const struct tgsi_token *tokens, + struct tgsi_shader_info *info) +{ + uint procType, i; + struct tgsi_parse_context parse; + + memset(info, 0, sizeof(*info)); + for (i = 0; i < TGSI_FILE_COUNT; i++) + info->file_max[i] = -1; + + /** + ** Setup to begin parsing input shader + **/ + if (tgsi_parse_init( &parse, tokens ) != TGSI_PARSE_OK) { + debug_printf("tgsi_parse_init() failed in tgsi_scan_shader()!\n"); + return; + } + procType = parse.FullHeader.Processor.Processor; + assert(procType == TGSI_PROCESSOR_FRAGMENT || + procType == TGSI_PROCESSOR_VERTEX || + procType == TGSI_PROCESSOR_GEOMETRY); + + + /** + ** Loop over incoming program tokens/instructions + */ + while( !tgsi_parse_end_of_tokens( &parse ) ) { + + info->num_tokens++; + + tgsi_parse_token( &parse ); + + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + struct tgsi_full_instruction *fullinst + = &parse.FullToken.FullInstruction; + + assert(fullinst->Instruction.Opcode < TGSI_OPCODE_LAST); + info->opcode_count[fullinst->Instruction.Opcode]++; + } + break; + + case TGSI_TOKEN_TYPE_DECLARATION: + { + struct tgsi_full_declaration *fulldecl + = &parse.FullToken.FullDeclaration; + uint file = fulldecl->Declaration.File; + uint i; + for (i = fulldecl->DeclarationRange.First; + i <= fulldecl->DeclarationRange.Last; + i++) { + + /* only first 32 regs will appear in this bitfield */ + info->file_mask[file] |= (1 << i); + info->file_count[file]++; + info->file_max[file] = MAX2(info->file_max[file], (int)i); + + if (file == TGSI_FILE_INPUT) { + info->input_semantic_name[i] = (ubyte)fulldecl->Semantic.SemanticName; + info->input_semantic_index[i] = (ubyte)fulldecl->Semantic.SemanticIndex; + info->num_inputs++; + } + + if (file == TGSI_FILE_OUTPUT) { + info->output_semantic_name[i] = (ubyte)fulldecl->Semantic.SemanticName; + info->output_semantic_index[i] = (ubyte)fulldecl->Semantic.SemanticIndex; + info->num_outputs++; + } + + /* special case */ + if (procType == TGSI_PROCESSOR_FRAGMENT && + file == 
TGSI_FILE_OUTPUT && + fulldecl->Semantic.SemanticName == TGSI_SEMANTIC_POSITION) { + info->writes_z = TRUE; + } + } + } + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + info->immediate_count++; + break; + + default: + assert( 0 ); + } + } + + assert( info->file_max[TGSI_FILE_INPUT] + 1 == info->num_inputs ); + assert( info->file_max[TGSI_FILE_OUTPUT] + 1 == info->num_outputs ); + + info->uses_kill = (info->opcode_count[TGSI_OPCODE_KIL] || + info->opcode_count[TGSI_OPCODE_KILP]); + + tgsi_parse_free (&parse); +} + + + +/** + * Check if the given shader is a "passthrough" shader consisting of only + * MOV instructions of the form: MOV OUT[n], IN[n] + * + */ +boolean +tgsi_is_passthrough_shader(const struct tgsi_token *tokens) +{ + struct tgsi_parse_context parse; + + /** + ** Setup to begin parsing input shader + **/ + if (tgsi_parse_init(&parse, tokens) != TGSI_PARSE_OK) { + debug_printf("tgsi_parse_init() failed in tgsi_is_passthrough_shader()!\n"); + return FALSE; + } + + /** + ** Loop over incoming program tokens/instructions + */ + while (!tgsi_parse_end_of_tokens(&parse)) { + + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + struct tgsi_full_instruction *fullinst = + &parse.FullToken.FullInstruction; + const struct tgsi_full_src_register *src = + &fullinst->FullSrcRegisters[0]; + const struct tgsi_full_dst_register *dst = + &fullinst->FullDstRegisters[0]; + + /* Do a whole bunch of checks for a simple move */ + if (fullinst->Instruction.Opcode != TGSI_OPCODE_MOV || + src->SrcRegister.File != TGSI_FILE_INPUT || + dst->DstRegister.File != TGSI_FILE_OUTPUT || + src->SrcRegister.Index != dst->DstRegister.Index || + + src->SrcRegister.Negate || + src->SrcRegisterExtMod.Negate || + src->SrcRegisterExtMod.Absolute || + src->SrcRegisterExtMod.Scale2X || + src->SrcRegisterExtMod.Bias || + src->SrcRegisterExtMod.Complement || + + src->SrcRegister.SwizzleX != TGSI_SWIZZLE_X || + src->SrcRegister.SwizzleY != TGSI_SWIZZLE_Y || + src->SrcRegister.SwizzleZ != TGSI_SWIZZLE_Z || + src->SrcRegister.SwizzleW != TGSI_SWIZZLE_W || + + src->SrcRegisterExtSwz.ExtSwizzleX != TGSI_EXTSWIZZLE_X || + src->SrcRegisterExtSwz.ExtSwizzleY != TGSI_EXTSWIZZLE_Y || + src->SrcRegisterExtSwz.ExtSwizzleZ != TGSI_EXTSWIZZLE_Z || + src->SrcRegisterExtSwz.ExtSwizzleW != TGSI_EXTSWIZZLE_W || + + dst->DstRegister.WriteMask != TGSI_WRITEMASK_XYZW) + { + tgsi_parse_free(&parse); + return FALSE; + } + } + break; + + case TGSI_TOKEN_TYPE_DECLARATION: + /* fall-through */ + case TGSI_TOKEN_TYPE_IMMEDIATE: + /* fall-through */ + default: + ; /* no-op */ + } + } + + tgsi_parse_free(&parse); + + /* if we get here, it's a pass-through shader */ + return TRUE; +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h new file mode 100644 index 0000000000..5cb6efb343 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h @@ -0,0 +1,74 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
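A driver would typically run the scan once at shader-create time and branch on the summary, roughly as below; the field names come from struct tgsi_shader_info in tgsi_scan.h:

struct tgsi_shader_info info;

tgsi_scan_shader(tokens, &info);

if (info.uses_kill || info.writes_z) {
   /* e.g. be conservative about early depth/stencil optimizations */
}
if (tgsi_is_passthrough_shader(tokens)) {
   /* e.g. skip codegen for a pure "MOV OUT[n], IN[n]" shader */
}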
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TGSI_SCAN_H +#define TGSI_SCAN_H + + +#include "pipe/p_compiler.h" +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" + + +/** + * Shader summary info + */ +struct tgsi_shader_info +{ + uint num_tokens; + + /* XXX eventually remove the corresponding fields from pipe_shader_state: */ + ubyte num_inputs; + ubyte num_outputs; + ubyte input_semantic_name[PIPE_MAX_SHADER_INPUTS]; /**< TGSI_SEMANTIC_x */ + ubyte input_semantic_index[PIPE_MAX_SHADER_INPUTS]; + ubyte output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; /**< TGSI_SEMANTIC_x */ + ubyte output_semantic_index[PIPE_MAX_SHADER_OUTPUTS]; + + uint file_mask[TGSI_FILE_COUNT]; /**< bitmask of declared registers */ + uint file_count[TGSI_FILE_COUNT]; /**< number of declared registers */ + int file_max[TGSI_FILE_COUNT]; /**< highest index of declared registers */ + + uint immediate_count; /**< number of immediates declared */ + + uint opcode_count[TGSI_OPCODE_LAST]; /**< opcode histogram */ + + boolean writes_z; /**< does fragment shader write Z value? */ + boolean uses_kill; /**< KIL or KILP instruction used? */ +}; + + +extern void +tgsi_scan_shader(const struct tgsi_token *tokens, + struct tgsi_shader_info *info); + + +extern boolean +tgsi_is_passthrough_shader(const struct tgsi_token *tokens); + + +#endif /* TGSI_SCAN_H */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c new file mode 100644 index 0000000000..cac44af7f4 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c @@ -0,0 +1,2723 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_config.h" + +#if defined(PIPE_ARCH_X86) + +#include "pipe/p_debug.h" +#include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#if defined(PIPE_ARCH_SSE) +#include "util/u_sse.h" +#endif +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi_exec.h" +#include "tgsi_sse2.h" + +#include "rtasm/rtasm_x86sse.h" + +/* for 1/sqrt() + * + * This costs about 100fps (close to 10%) in gears: + */ +#define HIGH_PRECISION 1 + +#define FAST_MATH 1 + + +#define FOR_EACH_CHANNEL( CHAN )\ + for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++) + +#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\ + ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN))) + +#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\ + if (IS_DST0_CHANNEL_ENABLED( INST, CHAN )) + +#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\ + FOR_EACH_CHANNEL( CHAN )\ + IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN ) + +#define CHAN_X 0 +#define CHAN_Y 1 +#define CHAN_Z 2 +#define CHAN_W 3 + +#define TEMP_ONE_I TGSI_EXEC_TEMP_ONE_I +#define TEMP_ONE_C TGSI_EXEC_TEMP_ONE_C + +#define TEMP_R0 TGSI_EXEC_TEMP_R0 +#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR +#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I +#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C + + +/** + * X86 utility functions. + */ + +static struct x86_reg +make_xmm( + unsigned xmm ) +{ + return x86_make_reg( + file_XMM, + (enum x86_reg_name) xmm ); +} + +/** + * X86 register mapping helpers. + */ + +static struct x86_reg +get_const_base( void ) +{ + return x86_make_reg( + file_REG32, + reg_CX ); +} + +static struct x86_reg +get_input_base( void ) +{ + return x86_make_reg( + file_REG32, + reg_AX ); +} + +static struct x86_reg +get_output_base( void ) +{ + return x86_make_reg( + file_REG32, + reg_DX ); +} + +static struct x86_reg +get_temp_base( void ) +{ + return x86_make_reg( + file_REG32, + reg_BX ); +} + +static struct x86_reg +get_coef_base( void ) +{ + return get_output_base(); +} + +static struct x86_reg +get_immediate_base( void ) +{ + return x86_make_reg( + file_REG32, + reg_DI ); +} + + +/** + * Data access helpers. 
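The base-register helpers above, together with the displacement helpers that follow, imply a memory layout roughly like the sketch below; the array bounds are illustrative, QUAD_SIZE == 4 is assumed, and the base pointers are assumed to be set up before the generated code runs (not shown in this hunk). Inputs, outputs and temporaries are SoA with one float per quad lane, while constants and immediates hold a single float per channel and are splatted at load time.

/* eAX = inputs, eCX = constants, eDX = outputs (and coefficients),
 * eBX = temporaries, eDI = immediates. */
float input [NUM_INPUTS ][4][4];   /* [vec][chan][quad], disp (vec*4+chan)*16 */
float output[NUM_OUTPUTS][4][4];   /* [vec][chan][quad], disp (vec*4+chan)*16 */
float temp  [NUM_TEMPS  ][4][4];   /* [vec][chan][quad], disp (vec*4+chan)*16 */
float consts[NUM_CONSTS ][4];      /* [vec][chan],       disp (vec*4+chan)*4  */
float imms  [NUM_IMMS   ][4];      /* [vec][chan],       disp (vec*4+chan)*4  */
float coef  [NUM_ATTRIBS][3][4];   /* [attrib][a0|dadx|dady][chan]            */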
+ */ + + +static struct x86_reg +get_immediate( + unsigned vec, + unsigned chan ) +{ + return x86_make_disp( + get_immediate_base(), + (vec * 4 + chan) * 4 ); +} + +static struct x86_reg +get_const( + unsigned vec, + unsigned chan ) +{ + return x86_make_disp( + get_const_base(), + (vec * 4 + chan) * 4 ); +} + +static struct x86_reg +get_input( + unsigned vec, + unsigned chan ) +{ + return x86_make_disp( + get_input_base(), + (vec * 4 + chan) * 16 ); +} + +static struct x86_reg +get_output( + unsigned vec, + unsigned chan ) +{ + return x86_make_disp( + get_output_base(), + (vec * 4 + chan) * 16 ); +} + +static struct x86_reg +get_temp( + unsigned vec, + unsigned chan ) +{ + return x86_make_disp( + get_temp_base(), + (vec * 4 + chan) * 16 ); +} + +static struct x86_reg +get_coef( + unsigned vec, + unsigned chan, + unsigned member ) +{ + return x86_make_disp( + get_coef_base(), + ((vec * 3 + member) * 4 + chan) * 4 ); +} + + +static void +emit_ret( + struct x86_function *func ) +{ + x86_ret( func ); +} + + +/** + * Data fetch helpers. + */ + +/** + * Copy a shader constant to xmm register + * \param xmm the destination xmm register + * \param vec the src const buffer index + * \param chan src channel to fetch (X, Y, Z or W) + */ +static void +emit_const( + struct x86_function *func, + uint xmm, + int vec, + uint chan, + uint indirect, + uint indirectFile, + int indirectIndex ) +{ + if (indirect) { + /* 'vec' is the offset from the address register's value. + * We're loading CONST[ADDR+vec] into an xmm register. + */ + struct x86_reg r0 = get_input_base(); + struct x86_reg r1 = get_output_base(); + uint i; + + assert( indirectFile == TGSI_FILE_ADDRESS ); + assert( indirectIndex == 0 ); + + x86_push( func, r0 ); + x86_push( func, r1 ); + + /* + * Loop over the four pixels or vertices in the quad. + * Get the value of the address (offset) register for pixel/vertex[i], + * add it to the src offset and index into the constant buffer. + * Note that we're working on SOA data. + * If any of the pixel/vertex execution channels are unused their + * values will be garbage. It's very important that we don't use + * those garbage values as indexes into the constant buffer since + * that'll cause segfaults. + * The solution is to bitwise-AND the offset with the execution mask + * register whose values are either 0 or ~0. + * The caller must setup the execution mask register to indicate + * which channels are valid/alive before running the shader. + * The execution mask will also figure into loops and conditionals + * someday. + */ + for (i = 0; i < QUAD_SIZE; i++) { + /* r1 = address register[i] */ + x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) ); + /* r0 = execution mask[i] */ + x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) ); + /* r1 = r1 & r0 */ + x86_and( func, r1, r0 ); + /* r0 = 'vec', the offset */ + x86_lea( func, r0, get_const( vec, chan ) ); + + /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm. 
+ */ + x86_add( func, r1, r1 ); + x86_add( func, r1, r1 ); + x86_add( func, r1, r1 ); + x86_add( func, r1, r1 ); + + x86_add( func, r0, r1 ); /* r0 = r0 + r1 */ + x86_mov( func, r1, x86_deref( r0 ) ); + x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 ); + } + + x86_pop( func, r1 ); + x86_pop( func, r0 ); + + sse_movaps( + func, + make_xmm( xmm ), + get_temp( TEMP_R0, CHAN_X ) ); + } + else { + /* 'vec' is the index into the src register file, such as TEMP[vec] */ + assert( vec >= 0 ); + + sse_movss( + func, + make_xmm( xmm ), + get_const( vec, chan ) ); + sse_shufps( + func, + make_xmm( xmm ), + make_xmm( xmm ), + SHUF( 0, 0, 0, 0 ) ); + } +} + +static void +emit_immediate( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) +{ + sse_movss( + func, + make_xmm( xmm ), + get_immediate( vec, chan ) ); + sse_shufps( + func, + make_xmm( xmm ), + make_xmm( xmm ), + SHUF( 0, 0, 0, 0 ) ); +} + + +/** + * Copy a shader input to xmm register + * \param xmm the destination xmm register + * \param vec the src input attrib + * \param chan src channel to fetch (X, Y, Z or W) + */ +static void +emit_inputf( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) +{ + sse_movups( + func, + make_xmm( xmm ), + get_input( vec, chan ) ); +} + +/** + * Store an xmm register to a shader output + * \param xmm the source xmm register + * \param vec the dest output attrib + * \param chan src dest channel to store (X, Y, Z or W) + */ +static void +emit_output( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) +{ + sse_movups( + func, + get_output( vec, chan ), + make_xmm( xmm ) ); +} + +/** + * Copy a shader temporary to xmm register + * \param xmm the destination xmm register + * \param vec the src temp register + * \param chan src channel to fetch (X, Y, Z or W) + */ +static void +emit_tempf( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) +{ + sse_movaps( + func, + make_xmm( xmm ), + get_temp( vec, chan ) ); +} + +/** + * Load an xmm register with an input attrib coefficient (a0, dadx or dady) + * \param xmm the destination xmm register + * \param vec the src input/attribute coefficient index + * \param chan src channel to fetch (X, Y, Z or W) + * \param member 0=a0, 1=dadx, 2=dady + */ +static void +emit_coef( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan, + unsigned member ) +{ + sse_movss( + func, + make_xmm( xmm ), + get_coef( vec, chan, member ) ); + sse_shufps( + func, + make_xmm( xmm ), + make_xmm( xmm ), + SHUF( 0, 0, 0, 0 ) ); +} + +/** + * Data store helpers. + */ + +static void +emit_inputs( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) +{ + sse_movups( + func, + get_input( vec, chan ), + make_xmm( xmm ) ); +} + +static void +emit_temps( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) +{ + sse_movaps( + func, + get_temp( vec, chan ), + make_xmm( xmm ) ); +} + +static void +emit_addrs( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) +{ + assert( vec == 0 ); + + emit_temps( + func, + xmm, + vec + TGSI_EXEC_TEMP_ADDR, + chan ); +} + +/** + * Coefficent fetch helpers. 
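Stripped of the x86 details, the indirect path of emit_const() above computes, per quad lane, CONST[vec + ADDR.x] with the address first ANDed against the execution mask so that dead lanes read offset 0 rather than garbage; the four x86_add() calls are simply a left shift by 4 (times 16 bytes, one float[4] constant) standing in for the missing SHL. A scalar sketch, reusing the illustrative consts[][4] layout from earlier, with addr_x and exec_mask standing for the ADDR and execution-mask temporaries:

float tmp[4];
unsigned i;

for (i = 0; i < 4; i++) {                  /* QUAD_SIZE lanes */
   int a = addr_x[i] & exec_mask[i];       /* exec_mask[i] is 0 or ~0 */
   tmp[i] = consts[vec + a][chan];         /* CONST[ADDR + vec].chan  */
}
/* tmp[0..3] is what ends up in the destination xmm register. */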
+ */ + +static void +emit_coef_a0( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) +{ + emit_coef( + func, + xmm, + vec, + chan, + 0 ); +} + +static void +emit_coef_dadx( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) +{ + emit_coef( + func, + xmm, + vec, + chan, + 1 ); +} + +static void +emit_coef_dady( + struct x86_function *func, + unsigned xmm, + unsigned vec, + unsigned chan ) +{ + emit_coef( + func, + xmm, + vec, + chan, + 2 ); +} + +/** + * Function call helpers. + */ + +/** + * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be + * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee + * that the stack pointer is 16 byte aligned, as expected. + */ +static void +emit_func_call_dst( + struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst, + void (PIPE_CDECL *code)() ) +{ + struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX ); + unsigned i, n; + unsigned xmm_mask; + + /* Bitmask of the xmm registers to save */ + xmm_mask = (1 << xmm_save) - 1; + xmm_mask &= ~(1 << xmm_dst); + + sse_movaps( + func, + get_temp( TEMP_R0, 0 ), + make_xmm( xmm_dst ) ); + + x86_push( + func, + x86_make_reg( file_REG32, reg_AX) ); + x86_push( + func, + x86_make_reg( file_REG32, reg_CX) ); + x86_push( + func, + x86_make_reg( file_REG32, reg_DX) ); + + for(i = 0, n = 0; i < 8; ++i) + if(xmm_mask & (1 << i)) + ++n; + + x86_sub_imm( + func, + x86_make_reg( file_REG32, reg_SP ), + n*16); + + for(i = 0, n = 0; i < 8; ++i) + if(xmm_mask & (1 << i)) { + sse_movups( + func, + x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ), + make_xmm( i ) ); + ++n; + } + + x86_lea( + func, + ecx, + get_temp( TEMP_R0, 0 ) ); + + x86_push( func, ecx ); + x86_mov_reg_imm( func, ecx, (unsigned long) code ); + x86_call( func, ecx ); + x86_pop(func, ecx ); + + for(i = 0, n = 0; i < 8; ++i) + if(xmm_mask & (1 << i)) { + sse_movups( + func, + make_xmm( i ), + x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) ); + ++n; + } + + x86_add_imm( + func, + x86_make_reg( file_REG32, reg_SP ), + n*16); + + /* Restore GP registers in a reverse order. + */ + x86_pop( + func, + x86_make_reg( file_REG32, reg_DX) ); + x86_pop( + func, + x86_make_reg( file_REG32, reg_CX) ); + x86_pop( + func, + x86_make_reg( file_REG32, reg_AX) ); + + sse_movaps( + func, + make_xmm( xmm_dst ), + get_temp( TEMP_R0, 0 ) ); +} + +static void +emit_func_call_dst_src( + struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst, + unsigned xmm_src, + void (PIPE_CDECL *code)() ) +{ + sse_movaps( + func, + get_temp( TEMP_R0, 1 ), + make_xmm( xmm_src ) ); + + emit_func_call_dst( + func, + xmm_save, + xmm_dst, + code ); +} + + +#if defined(PIPE_ARCH_SSE) + +/* + * Fast SSE2 implementation of special math functions. 
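The call-out helpers above rely on a simple contract: the generated code spills the destination xmm to TEMP_R0, passes its address as the single PIPE_CDECL argument, and the C helper rewrites the four quad lanes in place (two-operand helpers, see emit_func_call_dst_src(), find the second operand at store[4..7]). A hypothetical helper following that shape, purely for illustration:

static void PIPE_CDECL
tan4f( float *store )
{
   store[0] = tanf( store[0] );
   store[1] = tanf( store[1] );
   store[2] = tanf( store[2] );
   store[3] = tanf( store[3] );
}

For the register spills, xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst); with xmm_save == 3 and xmm_dst == 1 that is 0x5, so xmm0 and xmm2 are saved around the call while xmm1 is reloaded from TEMP_R0 afterwards.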
+ */ + +#define POLY0(x, c0) _mm_set1_ps(c0) +#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0)) +#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0)) +#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0)) +#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0)) +#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0)) + +#define EXP_POLY_DEGREE 3 +#define LOG_POLY_DEGREE 5 + +/** + * See http://www.devmaster.net/forums/showthread.php?p=43580 + */ +static INLINE __m128 +exp2f4(__m128 x) +{ + __m128i ipart; + __m128 fpart, expipart, expfpart; + + x = _mm_min_ps(x, _mm_set1_ps( 129.00000f)); + x = _mm_max_ps(x, _mm_set1_ps(-126.99999f)); + + /* ipart = int(x - 0.5) */ + ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f))); + + /* fpart = x - ipart */ + fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart)); + + /* expipart = (float) (1 << ipart) */ + expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23)); + + /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */ +#if EXP_POLY_DEGREE == 5 + expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f); +#elif EXP_POLY_DEGREE == 4 + expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f); +#elif EXP_POLY_DEGREE == 3 + expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f); +#elif EXP_POLY_DEGREE == 2 + expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f); +#else +#error +#endif + + return _mm_mul_ps(expipart, expfpart); +} + + +/** + * See http://www.devmaster.net/forums/showthread.php?p=43580 + */ +static INLINE __m128 +log2f4(__m128 x) +{ + __m128i expmask = _mm_set1_epi32(0x7f800000); + __m128i mantmask = _mm_set1_epi32(0x007fffff); + __m128 one = _mm_set1_ps(1.0f); + + __m128i i = _mm_castps_si128(x); + + /* exp = (float) exponent(x) */ + __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127))); + + /* mant = (float) mantissa(x) */ + __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one); + + __m128 logmant; + + /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ + * These coefficients can be generate with + * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html + */ +#if LOG_POLY_DEGREE == 6 + logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f); +#elif LOG_POLY_DEGREE == 5 + logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f); +#elif LOG_POLY_DEGREE == 4 + logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f); +#elif LOG_POLY_DEGREE == 3 + logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f); +#else +#error +#endif + + /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/ + logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one)); + + return _mm_add_ps(logmant, exp); +} + + +static INLINE __m128 +powf4(__m128 x, __m128 y) +{ + return 
exp2f4(_mm_mul_ps(log2f4(x), y)); +} + +#endif /* PIPE_ARCH_SSE */ + + + +/** + * Low-level instruction translators. + */ + +static void +emit_abs( + struct x86_function *func, + unsigned xmm ) +{ + sse_andps( + func, + make_xmm( xmm ), + get_temp( + TGSI_EXEC_TEMP_7FFFFFFF_I, + TGSI_EXEC_TEMP_7FFFFFFF_C ) ); +} + +static void +emit_add( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) +{ + sse_addps( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); +} + +static void PIPE_CDECL +cos4f( + float *store ) +{ + store[0] = cosf( store[0] ); + store[1] = cosf( store[1] ); + store[2] = cosf( store[2] ); + store[3] = cosf( store[3] ); +} + +static void +emit_cos( + struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst ) +{ + emit_func_call_dst( + func, + xmm_save, + xmm_dst, + cos4f ); +} + +static void PIPE_CDECL +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE) +__attribute__((force_align_arg_pointer)) +#endif +ex24f( + float *store ) +{ +#if defined(PIPE_ARCH_SSE) + _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) )); +#else + store[0] = util_fast_exp2( store[0] ); + store[1] = util_fast_exp2( store[1] ); + store[2] = util_fast_exp2( store[2] ); + store[3] = util_fast_exp2( store[3] ); +#endif +} + +static void +emit_ex2( + struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst ) +{ + emit_func_call_dst( + func, + xmm_save, + xmm_dst, + ex24f ); +} + +static void +emit_f2it( + struct x86_function *func, + unsigned xmm ) +{ + sse2_cvttps2dq( + func, + make_xmm( xmm ), + make_xmm( xmm ) ); +} + +static void +emit_i2f( + struct x86_function *func, + unsigned xmm ) +{ + sse2_cvtdq2ps( + func, + make_xmm( xmm ), + make_xmm( xmm ) ); +} + +static void PIPE_CDECL +flr4f( + float *store ) +{ + store[0] = floorf( store[0] ); + store[1] = floorf( store[1] ); + store[2] = floorf( store[2] ); + store[3] = floorf( store[3] ); +} + +static void +emit_flr( + struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst ) +{ + emit_func_call_dst( + func, + xmm_save, + xmm_dst, + flr4f ); +} + +static void PIPE_CDECL +frc4f( + float *store ) +{ + store[0] -= floorf( store[0] ); + store[1] -= floorf( store[1] ); + store[2] -= floorf( store[2] ); + store[3] -= floorf( store[3] ); +} + +static void +emit_frc( + struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst ) +{ + emit_func_call_dst( + func, + xmm_save, + xmm_dst, + frc4f ); +} + +static void PIPE_CDECL +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE) +__attribute__((force_align_arg_pointer)) +#endif +lg24f( + float *store ) +{ +#if defined(PIPE_ARCH_SSE) + _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) )); +#else + store[0] = util_fast_log2( store[0] ); + store[1] = util_fast_log2( store[1] ); + store[2] = util_fast_log2( store[2] ); + store[3] = util_fast_log2( store[3] ); +#endif +} + +static void +emit_lg2( + struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst ) +{ + emit_func_call_dst( + func, + xmm_save, + xmm_dst, + lg24f ); +} + +static void +emit_MOV( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) +{ + sse_movups( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); +} + +static void +emit_mul (struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src) +{ + sse_mulps( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); +} + +static void +emit_neg( + struct x86_function *func, + unsigned xmm ) +{ + sse_xorps( + func, + make_xmm( xmm ), + get_temp( + TGSI_EXEC_TEMP_80000000_I, + TGSI_EXEC_TEMP_80000000_C ) ); 
+} + +static void PIPE_CDECL +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE) +__attribute__((force_align_arg_pointer)) +#endif +pow4f( + float *store ) +{ +#if defined(PIPE_ARCH_SSE) + _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) )); +#else + store[0] = util_fast_pow( store[0], store[4] ); + store[1] = util_fast_pow( store[1], store[5] ); + store[2] = util_fast_pow( store[2], store[6] ); + store[3] = util_fast_pow( store[3], store[7] ); +#endif +} + +static void +emit_pow( + struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst, + unsigned xmm_src ) +{ + emit_func_call_dst_src( + func, + xmm_save, + xmm_dst, + xmm_src, + pow4f ); +} + +static void +emit_rcp ( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) +{ + /* On Intel CPUs at least, this is only accurate to 12 bits -- not + * good enough. Need to either emit a proper divide or use the + * iterative technique described below in emit_rsqrt(). + */ + sse2_rcpps( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); +} + +static void PIPE_CDECL +rnd4f( + float *store ) +{ + store[0] = floorf( store[0] + 0.5f ); + store[1] = floorf( store[1] + 0.5f ); + store[2] = floorf( store[2] + 0.5f ); + store[3] = floorf( store[3] + 0.5f ); +} + +static void +emit_rnd( + struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst ) +{ + emit_func_call_dst( + func, + xmm_save, + xmm_dst, + rnd4f ); +} + +static void +emit_rsqrt( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) +{ +#if HIGH_PRECISION + /* Although rsqrtps() and rcpps() are low precision on some/all SSE + * implementations, it is possible to improve its precision at + * fairly low cost, using a newton/raphson step, as below: + * + * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a) + * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)] + * + * See: http://softwarecommunity.intel.com/articles/eng/1818.htm + */ + { + struct x86_reg dst = make_xmm( xmm_dst ); + struct x86_reg src = make_xmm( xmm_src ); + struct x86_reg tmp0 = make_xmm( 2 ); + struct x86_reg tmp1 = make_xmm( 3 ); + + assert( xmm_dst != xmm_src ); + assert( xmm_dst != 2 && xmm_dst != 3 ); + assert( xmm_src != 2 && xmm_src != 3 ); + + sse_movaps( func, dst, get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) ); + sse_movaps( func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) ); + sse_rsqrtps( func, tmp1, src ); + sse_mulps( func, src, tmp1 ); + sse_mulps( func, dst, tmp1 ); + sse_mulps( func, src, tmp1 ); + sse_subps( func, tmp0, src ); + sse_mulps( func, dst, tmp0 ); + } +#else + /* On Intel CPUs at least, this is only accurate to 12 bits -- not + * good enough. + */ + sse_rsqrtps( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); +#endif +} + +static void +emit_setsign( + struct x86_function *func, + unsigned xmm ) +{ + sse_orps( + func, + make_xmm( xmm ), + get_temp( + TGSI_EXEC_TEMP_80000000_I, + TGSI_EXEC_TEMP_80000000_C ) ); +} + +static void PIPE_CDECL +sgn4f( + float *store ) +{ + store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f; + store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f; + store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f; + store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 
1.0f : 0.0f; +} + +static void +emit_sgn( + struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst ) +{ + emit_func_call_dst( + func, + xmm_save, + xmm_dst, + sgn4f ); +} + +static void PIPE_CDECL +sin4f( + float *store ) +{ + store[0] = sinf( store[0] ); + store[1] = sinf( store[1] ); + store[2] = sinf( store[2] ); + store[3] = sinf( store[3] ); +} + +static void +emit_sin (struct x86_function *func, + unsigned xmm_save, + unsigned xmm_dst) +{ + emit_func_call_dst( + func, + xmm_save, + xmm_dst, + sin4f ); +} + +static void +emit_sub( + struct x86_function *func, + unsigned xmm_dst, + unsigned xmm_src ) +{ + sse_subps( + func, + make_xmm( xmm_dst ), + make_xmm( xmm_src ) ); +} + +/** + * Register fetch. + */ + +static void +emit_fetch( + struct x86_function *func, + unsigned xmm, + const struct tgsi_full_src_register *reg, + const unsigned chan_index ) +{ + unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index ); + + switch (swizzle) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + switch (reg->SrcRegister.File) { + case TGSI_FILE_CONSTANT: + emit_const( + func, + xmm, + reg->SrcRegister.Index, + swizzle, + reg->SrcRegister.Indirect, + reg->SrcRegisterInd.File, + reg->SrcRegisterInd.Index ); + break; + + case TGSI_FILE_IMMEDIATE: + emit_immediate( + func, + xmm, + reg->SrcRegister.Index, + swizzle ); + break; + + case TGSI_FILE_INPUT: + emit_inputf( + func, + xmm, + reg->SrcRegister.Index, + swizzle ); + break; + + case TGSI_FILE_TEMPORARY: + emit_tempf( + func, + xmm, + reg->SrcRegister.Index, + swizzle ); + break; + + default: + assert( 0 ); + } + break; + + case TGSI_EXTSWIZZLE_ZERO: + emit_tempf( + func, + xmm, + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ); + break; + + case TGSI_EXTSWIZZLE_ONE: + emit_tempf( + func, + xmm, + TEMP_ONE_I, + TEMP_ONE_C ); + break; + + default: + assert( 0 ); + } + + switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) { + case TGSI_UTIL_SIGN_CLEAR: + emit_abs( func, xmm ); + break; + + case TGSI_UTIL_SIGN_SET: + emit_setsign( func, xmm ); + break; + + case TGSI_UTIL_SIGN_TOGGLE: + emit_neg( func, xmm ); + break; + + case TGSI_UTIL_SIGN_KEEP: + break; + } +} + +#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\ + emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN ) + +/** + * Register store. + */ + +static void +emit_store( + struct x86_function *func, + unsigned xmm, + const struct tgsi_full_dst_register *reg, + const struct tgsi_full_instruction *inst, + unsigned chan_index ) +{ + switch( reg->DstRegister.File ) { + case TGSI_FILE_OUTPUT: + emit_output( + func, + xmm, + reg->DstRegister.Index, + chan_index ); + break; + + case TGSI_FILE_TEMPORARY: + emit_temps( + func, + xmm, + reg->DstRegister.Index, + chan_index ); + break; + + case TGSI_FILE_ADDRESS: + emit_addrs( + func, + xmm, + reg->DstRegister.Index, + chan_index ); + break; + + default: + assert( 0 ); + } + + switch( inst->Instruction.Saturate ) { + case TGSI_SAT_NONE: + break; + + case TGSI_SAT_ZERO_ONE: + /* assert( 0 ); */ + break; + + case TGSI_SAT_MINUS_PLUS_ONE: + assert( 0 ); + break; + } +} + +#define STORE( FUNC, INST, XMM, INDEX, CHAN )\ + emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN ) + +/** + * High-level instruction translators. 
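Condensed to scalar C, emit_fetch() above resolves the extended swizzle first (ZERO/ONE select constants, X..W select a channel) and then applies the sign mode; the helper below is only a sketch of that mapping, not code from the change:

static float
fetch_chan( const float src[4], unsigned extswizzle, unsigned sign_mode )
{
   float v;

   switch (extswizzle) {
   case TGSI_EXTSWIZZLE_ZERO: v = 0.0f; break;
   case TGSI_EXTSWIZZLE_ONE:  v = 1.0f; break;
   default:                   v = src[extswizzle]; break;  /* X, Y, Z or W */
   }

   switch (sign_mode) {
   case TGSI_UTIL_SIGN_CLEAR:  return fabsf( v );   /* andps 0x7fffffff */
   case TGSI_UTIL_SIGN_SET:    return -fabsf( v );  /* orps  0x80000000 */
   case TGSI_UTIL_SIGN_TOGGLE: return -v;           /* xorps 0x80000000 */
   default:                    return v;            /* TGSI_UTIL_SIGN_KEEP */
   }
}

Note also that emit_store() above currently ignores TGSI_SAT_ZERO_ONE saturation (its assert is commented out) while still asserting on TGSI_SAT_MINUS_PLUS_ONE.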
+ */ + +static void +emit_kil( + struct x86_function *func, + const struct tgsi_full_src_register *reg ) +{ + unsigned uniquemask; + unsigned registers[4]; + unsigned nextregister = 0; + unsigned firstchan = ~0; + unsigned chan_index; + + /* This mask stores component bits that were already tested. Note that + * we test if the value is less than zero, so 1.0 and 0.0 need not to be + * tested. */ + uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE); + + FOR_EACH_CHANNEL( chan_index ) { + unsigned swizzle; + + /* unswizzle channel */ + swizzle = tgsi_util_get_full_src_register_extswizzle( + reg, + chan_index ); + + /* check if the component has not been already tested */ + if( !(uniquemask & (1 << swizzle)) ) { + uniquemask |= 1 << swizzle; + + /* allocate register */ + registers[chan_index] = nextregister; + emit_fetch( + func, + nextregister, + reg, + chan_index ); + nextregister++; + + /* mark the first channel used */ + if( firstchan == ~0 ) { + firstchan = chan_index; + } + } + } + + x86_push( + func, + x86_make_reg( file_REG32, reg_AX ) ); + x86_push( + func, + x86_make_reg( file_REG32, reg_DX ) ); + + FOR_EACH_CHANNEL( chan_index ) { + if( uniquemask & (1 << chan_index) ) { + sse_cmpps( + func, + make_xmm( registers[chan_index] ), + get_temp( + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ), + cc_LessThan ); + + if( chan_index == firstchan ) { + sse_pmovmskb( + func, + x86_make_reg( file_REG32, reg_AX ), + make_xmm( registers[chan_index] ) ); + } + else { + sse_pmovmskb( + func, + x86_make_reg( file_REG32, reg_DX ), + make_xmm( registers[chan_index] ) ); + x86_or( + func, + x86_make_reg( file_REG32, reg_AX ), + x86_make_reg( file_REG32, reg_DX ) ); + } + } + } + + x86_or( + func, + get_temp( + TGSI_EXEC_TEMP_KILMASK_I, + TGSI_EXEC_TEMP_KILMASK_C ), + x86_make_reg( file_REG32, reg_AX ) ); + + x86_pop( + func, + x86_make_reg( file_REG32, reg_DX ) ); + x86_pop( + func, + x86_make_reg( file_REG32, reg_AX ) ); +} + + +static void +emit_kilp( + struct x86_function *func ) +{ + /* XXX todo / fix me */ +} + + +static void +emit_setcc( + struct x86_function *func, + struct tgsi_full_instruction *inst, + enum sse_cc cc ) +{ + unsigned chan_index; + + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + sse_cmpps( + func, + make_xmm( 0 ), + make_xmm( 1 ), + cc ); + sse_andps( + func, + make_xmm( 0 ), + get_temp( + TEMP_ONE_I, + TEMP_ONE_C ) ); + STORE( func, *inst, 0, 0, chan_index ); + } +} + +static void +emit_cmp( + struct x86_function *func, + struct tgsi_full_instruction *inst ) +{ + unsigned chan_index; + + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + FETCH( func, *inst, 2, 2, chan_index ); + sse_cmpps( + func, + make_xmm( 0 ), + get_temp( + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ), + cc_LessThan ); + sse_andps( + func, + make_xmm( 1 ), + make_xmm( 0 ) ); + sse_andnps( + func, + make_xmm( 0 ), + make_xmm( 2 ) ); + sse_orps( + func, + make_xmm( 0 ), + make_xmm( 1 ) ); + STORE( func, *inst, 0, 0, chan_index ); + } +} + +static int +emit_instruction( + struct x86_function *func, + struct tgsi_full_instruction *inst ) +{ + unsigned chan_index; + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ARL: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_f2it( func, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } + 
break; + + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_SWZ: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_LIT: + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) { + emit_tempf( + func, + 0, + TEMP_ONE_I, + TEMP_ONE_C); + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) { + STORE( func, *inst, 0, 0, CHAN_X ); + } + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) { + STORE( func, *inst, 0, 0, CHAN_W ); + } + } + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + FETCH( func, *inst, 0, 0, CHAN_X ); + sse_maxps( + func, + make_xmm( 0 ), + get_temp( + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ) ); + STORE( func, *inst, 0, 0, CHAN_Y ); + } + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + /* XMM[1] = SrcReg[0].yyyy */ + FETCH( func, *inst, 1, 0, CHAN_Y ); + /* XMM[1] = max(XMM[1], 0) */ + sse_maxps( + func, + make_xmm( 1 ), + get_temp( + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ) ); + /* XMM[2] = SrcReg[0].wwww */ + FETCH( func, *inst, 2, 0, CHAN_W ); + /* XMM[2] = min(XMM[2], 128.0) */ + sse_minps( + func, + make_xmm( 2 ), + get_temp( + TGSI_EXEC_TEMP_128_I, + TGSI_EXEC_TEMP_128_C ) ); + /* XMM[2] = max(XMM[2], -128.0) */ + sse_maxps( + func, + make_xmm( 2 ), + get_temp( + TGSI_EXEC_TEMP_MINUS_128_I, + TGSI_EXEC_TEMP_MINUS_128_C ) ); + emit_pow( func, 3, 1, 2 ); + FETCH( func, *inst, 0, 0, CHAN_X ); + sse_xorps( + func, + make_xmm( 2 ), + make_xmm( 2 ) ); + sse_cmpps( + func, + make_xmm( 2 ), + make_xmm( 0 ), + cc_LessThanEqual ); + sse_andps( + func, + make_xmm( 2 ), + make_xmm( 1 ) ); + STORE( func, *inst, 2, 0, CHAN_Z ); + } + } + break; + + case TGSI_OPCODE_RCP: + /* TGSI_OPCODE_RECIP */ + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_rcp( func, 0, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_RSQ: + /* TGSI_OPCODE_RECIPSQRT */ + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_rsqrt( func, 1, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 1, 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXP: + if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( func, *inst, 0, 0, CHAN_X ); + if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { + emit_MOV( func, 1, 0 ); + emit_flr( func, 2, 1 ); + /* dst.x = ex2(floor(src.x)) */ + if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) { + emit_MOV( func, 2, 1 ); + emit_ex2( func, 3, 2 ); + STORE( func, *inst, 2, 0, CHAN_X ); + } + /* dst.y = src.x - floor(src.x) */ + if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { + emit_MOV( func, 2, 0 ); + emit_sub( func, 2, 1 ); + STORE( func, *inst, 2, 0, CHAN_Y ); + } + } + /* dst.z = ex2(src.x) */ + if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { + emit_ex2( func, 3, 0 ); + STORE( func, *inst, 0, 0, CHAN_Z ); + } + } + /* dst.w = 1.0 */ + if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) { + emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C ); + STORE( func, *inst, 0, 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_LOG: + if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( func, *inst, 0, 0, CHAN_X ); 
+ emit_abs( func, 0 ); + emit_MOV( func, 1, 0 ); + emit_lg2( func, 2, 1 ); + /* dst.z = lg2(abs(src.x)) */ + if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) { + STORE( func, *inst, 1, 0, CHAN_Z ); + } + if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { + emit_flr( func, 2, 1 ); + /* dst.x = floor(lg2(abs(src.x))) */ + if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( func, *inst, 1, 0, CHAN_X ); + } + /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */ + if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) { + emit_ex2( func, 2, 1 ); + emit_rcp( func, 1, 1 ); + emit_mul( func, 0, 1 ); + STORE( func, *inst, 0, 0, CHAN_Y ); + } + } + } + /* dst.w = 1.0 */ + if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) { + emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C ); + STORE( func, *inst, 0, 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MUL: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + emit_mul( func, 0, 1 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_ADD: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + emit_add( func, 0, 1 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP3: + /* TGSI_OPCODE_DOT3 */ + FETCH( func, *inst, 0, 0, CHAN_X ); + FETCH( func, *inst, 1, 1, CHAN_X ); + emit_mul( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Y ); + FETCH( func, *inst, 2, 1, CHAN_Y ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Z ); + FETCH( func, *inst, 2, 1, CHAN_Z ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP4: + /* TGSI_OPCODE_DOT4 */ + FETCH( func, *inst, 0, 0, CHAN_X ); + FETCH( func, *inst, 1, 1, CHAN_X ); + emit_mul( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Y ); + FETCH( func, *inst, 2, 1, CHAN_Y ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Z ); + FETCH( func, *inst, 2, 1, CHAN_Z ); + emit_mul(func, 1, 2 ); + emit_add(func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_W ); + FETCH( func, *inst, 2, 1, CHAN_W ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_DST: + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { + emit_tempf( + func, + 0, + TEMP_ONE_I, + TEMP_ONE_C ); + STORE( func, *inst, 0, 0, CHAN_X ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { + FETCH( func, *inst, 0, 0, CHAN_Y ); + FETCH( func, *inst, 1, 1, CHAN_Y ); + emit_mul( func, 0, 1 ); + STORE( func, *inst, 0, 0, CHAN_Y ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { + FETCH( func, *inst, 0, 0, CHAN_Z ); + STORE( func, *inst, 0, 0, CHAN_Z ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) { + FETCH( func, *inst, 0, 1, CHAN_W ); + STORE( func, *inst, 0, 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MIN: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + sse_minps( + func, + make_xmm( 0 ), + make_xmm( 1 ) ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_MAX: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, 
*inst, 1, 1, chan_index ); + sse_maxps( + func, + make_xmm( 0 ), + make_xmm( 1 ) ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLT: + /* TGSI_OPCODE_SETLT */ + emit_setcc( func, inst, cc_LessThan ); + break; + + case TGSI_OPCODE_SGE: + /* TGSI_OPCODE_SETGE */ + emit_setcc( func, inst, cc_NotLessThan ); + break; + + case TGSI_OPCODE_MAD: + /* TGSI_OPCODE_MADD */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + FETCH( func, *inst, 2, 2, chan_index ); + emit_mul( func, 0, 1 ); + emit_add( func, 0, 2 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_SUB: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + emit_sub( func, 0, 1 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_LERP: + /* TGSI_OPCODE_LRP */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + FETCH( func, *inst, 1, 1, chan_index ); + FETCH( func, *inst, 2, 2, chan_index ); + emit_sub( func, 1, 2 ); + emit_mul( func, 0, 1 ); + emit_add( func, 0, 2 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_CND: + return 0; + break; + + case TGSI_OPCODE_CND0: + return 0; + break; + + case TGSI_OPCODE_DOT2ADD: + /* TGSI_OPCODE_DP2A */ + FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */ + FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */ + emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */ + FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */ + FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */ + emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */ + emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */ + FETCH( func, *inst, 1, 2, CHAN_X ); /* xmm1 = src[2].x */ + emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */ + } + break; + + case TGSI_OPCODE_INDEX: + return 0; + break; + + case TGSI_OPCODE_NEGATE: + return 0; + break; + + case TGSI_OPCODE_FRAC: + /* TGSI_OPCODE_FRC */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_frc( func, 0, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_CLAMP: + return 0; + break; + + case TGSI_OPCODE_FLOOR: + /* TGSI_OPCODE_FLR */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_flr( func, 0, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_ROUND: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_rnd( func, 0, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXPBASE2: + /* TGSI_OPCODE_EX2 */ + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_ex2( func, 0, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_LOGBASE2: + /* TGSI_OPCODE_LG2 */ + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_lg2( func, 0, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_POWER: + /* TGSI_OPCODE_POW */ + FETCH( func, *inst, 0, 0, CHAN_X ); + FETCH( func, *inst, 1, 1, CHAN_X ); + emit_pow( func, 0, 0, 1 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, 
chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_CROSSPRODUCT: + /* TGSI_OPCODE_XPD */ + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + FETCH( func, *inst, 1, 1, CHAN_Z ); + FETCH( func, *inst, 3, 0, CHAN_Z ); + } + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + FETCH( func, *inst, 0, 0, CHAN_Y ); + FETCH( func, *inst, 4, 1, CHAN_Y ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { + emit_MOV( func, 2, 0 ); + emit_mul( func, 2, 1 ); + emit_MOV( func, 5, 3 ); + emit_mul( func, 5, 4 ); + emit_sub( func, 2, 5 ); + STORE( func, *inst, 2, 0, CHAN_X ); + } + if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) || + IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + FETCH( func, *inst, 2, 1, CHAN_X ); + FETCH( func, *inst, 5, 0, CHAN_X ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { + emit_mul( func, 3, 2 ); + emit_mul( func, 1, 5 ); + emit_sub( func, 3, 1 ); + STORE( func, *inst, 3, 0, CHAN_Y ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { + emit_mul( func, 5, 4 ); + emit_mul( func, 0, 2 ); + emit_sub( func, 5, 0 ); + STORE( func, *inst, 5, 0, CHAN_Z ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) { + emit_tempf( + func, + 0, + TEMP_ONE_I, + TEMP_ONE_C ); + STORE( func, *inst, 0, 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MULTIPLYMATRIX: + return 0; + break; + + case TGSI_OPCODE_ABS: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_abs( func, 0) ; + + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_RCC: + return 0; + break; + + case TGSI_OPCODE_DPH: + FETCH( func, *inst, 0, 0, CHAN_X ); + FETCH( func, *inst, 1, 1, CHAN_X ); + emit_mul( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Y ); + FETCH( func, *inst, 2, 1, CHAN_Y ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FETCH( func, *inst, 1, 0, CHAN_Z ); + FETCH( func, *inst, 2, 1, CHAN_Z ); + emit_mul( func, 1, 2 ); + emit_add( func, 0, 1 ); + FETCH( func, *inst, 1, 1, CHAN_W ); + emit_add( func, 0, 1 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_COS: + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_cos( func, 0, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_DDX: + return 0; + break; + + case TGSI_OPCODE_DDY: + return 0; + break; + + case TGSI_OPCODE_KILP: + /* predicated kill */ + emit_kilp( func ); + return 0; /* XXX fix me */ + break; + + case TGSI_OPCODE_KIL: + /* conditional kill */ + emit_kil( func, &inst->FullSrcRegisters[0] ); + break; + + case TGSI_OPCODE_PK2H: + return 0; + break; + + case TGSI_OPCODE_PK2US: + return 0; + break; + + case TGSI_OPCODE_PK4B: + return 0; + break; + + case TGSI_OPCODE_PK4UB: + return 0; + break; + + case TGSI_OPCODE_RFL: + return 0; + break; + + case TGSI_OPCODE_SEQ: + return 0; + break; + + case TGSI_OPCODE_SFL: + return 0; + break; + + case TGSI_OPCODE_SGT: + return 0; + break; + + case TGSI_OPCODE_SIN: + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_sin( func, 0, 0 ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLE: + return 0; + break; + + case TGSI_OPCODE_SNE: + return 0; + break; + + case TGSI_OPCODE_STR: + return 0; + break; + + case TGSI_OPCODE_TEX: + if (0) { + /* Disable dummy texture code: + */ + 
emit_tempf( + func, + 0, + TEMP_ONE_I, + TEMP_ONE_C ); + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); + } + } + else { + return 0; + } + break; + + case TGSI_OPCODE_TXD: + return 0; + break; + + case TGSI_OPCODE_UP2H: + return 0; + break; + + case TGSI_OPCODE_UP2US: + return 0; + break; + + case TGSI_OPCODE_UP4B: + return 0; + break; + + case TGSI_OPCODE_UP4UB: + return 0; + break; + + case TGSI_OPCODE_X2D: + return 0; + break; + + case TGSI_OPCODE_ARA: + return 0; + break; + + case TGSI_OPCODE_ARR: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_rnd( func, 0, 0 ); + emit_f2it( func, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_BRA: + return 0; + break; + + case TGSI_OPCODE_CAL: + return 0; + break; + + case TGSI_OPCODE_RET: + emit_ret( func ); + break; + + case TGSI_OPCODE_END: + break; + + case TGSI_OPCODE_SSG: + /* TGSI_OPCODE_SGN */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_sgn( func, 0, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_CMP: + emit_cmp (func, inst); + break; + + case TGSI_OPCODE_SCS: + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) { + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_cos( func, 0, 0 ); + STORE( func, *inst, 0, 0, CHAN_X ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) { + FETCH( func, *inst, 0, 0, CHAN_X ); + emit_sin( func, 0, 0 ); + STORE( func, *inst, 0, 0, CHAN_Y ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) { + emit_tempf( + func, + 0, + TGSI_EXEC_TEMP_00000000_I, + TGSI_EXEC_TEMP_00000000_C ); + STORE( func, *inst, 0, 0, CHAN_Z ); + } + IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) { + emit_tempf( + func, + 0, + TEMP_ONE_I, + TEMP_ONE_C ); + STORE( func, *inst, 0, 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_TXB: + return 0; + break; + + case TGSI_OPCODE_NRM: + /* fall-through */ + case TGSI_OPCODE_NRM4: + /* 3 or 4-component normalization */ + { + uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 
3 : 4; + /* note: cannot use xmm regs 2/3 here (see emit_rsqrt() above) */ + FETCH( func, *inst, 4, 0, CHAN_X ); /* xmm4 = src[0].x */ + FETCH( func, *inst, 5, 0, CHAN_Y ); /* xmm5 = src[0].y */ + FETCH( func, *inst, 6, 0, CHAN_Z ); /* xmm6 = src[0].z */ + if (dims == 4) { + FETCH( func, *inst, 7, 0, CHAN_W ); /* xmm7 = src[0].w */ + } + emit_MOV( func, 0, 4 ); /* xmm0 = xmm3 */ + emit_mul( func, 0, 4 ); /* xmm0 *= xmm3 */ + emit_MOV( func, 1, 5 ); /* xmm1 = xmm4 */ + emit_mul( func, 1, 5 ); /* xmm1 *= xmm4 */ + emit_add( func, 0, 1 ); /* xmm0 += xmm1 */ + emit_MOV( func, 1, 6 ); /* xmm1 = xmm5 */ + emit_mul( func, 1, 6 ); /* xmm1 *= xmm5 */ + emit_add( func, 0, 1 ); /* xmm0 += xmm1 */ + if (dims == 4) { + emit_MOV( func, 1, 7 ); /* xmm1 = xmm7 */ + emit_mul( func, 1, 7 ); /* xmm1 *= xmm7 */ + emit_add( func, 0, 0 ); /* xmm0 += xmm1 */ + } + emit_rsqrt( func, 1, 0 ); /* xmm1 = 1/sqrt(xmm0) */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + if (chan_index < dims) { + emit_mul( func, 4+chan_index, 1); /* xmm[4+ch] *= xmm1 */ + STORE( func, *inst, 4+chan_index, 0, chan_index ); + } + } + } + break; + + case TGSI_OPCODE_DIV: + return 0; + break; + + case TGSI_OPCODE_DP2: + FETCH( func, *inst, 0, 0, CHAN_X ); /* xmm0 = src[0].x */ + FETCH( func, *inst, 1, 1, CHAN_X ); /* xmm1 = src[1].x */ + emit_mul( func, 0, 1 ); /* xmm0 = xmm0 * xmm1 */ + FETCH( func, *inst, 1, 0, CHAN_Y ); /* xmm1 = src[0].y */ + FETCH( func, *inst, 2, 1, CHAN_Y ); /* xmm2 = src[1].y */ + emit_mul( func, 1, 2 ); /* xmm1 = xmm1 * xmm2 */ + emit_add( func, 0, 1 ); /* xmm0 = xmm0 + xmm1 */ + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( func, *inst, 0, 0, chan_index ); /* dest[ch] = xmm0 */ + } + break; + + case TGSI_OPCODE_TXL: + return 0; + break; + + case TGSI_OPCODE_BRK: + return 0; + break; + + case TGSI_OPCODE_IF: + return 0; + break; + + case TGSI_OPCODE_LOOP: + return 0; + break; + + case TGSI_OPCODE_REP: + return 0; + break; + + case TGSI_OPCODE_ELSE: + return 0; + break; + + case TGSI_OPCODE_ENDIF: + return 0; + break; + + case TGSI_OPCODE_ENDLOOP: + return 0; + break; + + case TGSI_OPCODE_ENDREP: + return 0; + break; + + case TGSI_OPCODE_PUSHA: + return 0; + break; + + case TGSI_OPCODE_POPA: + return 0; + break; + + case TGSI_OPCODE_CEIL: + return 0; + break; + + case TGSI_OPCODE_I2F: + return 0; + break; + + case TGSI_OPCODE_NOT: + return 0; + break; + + case TGSI_OPCODE_TRUNC: + FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( func, *inst, 0, 0, chan_index ); + emit_f2it( func, 0 ); + emit_i2f( func, 0 ); + STORE( func, *inst, 0, 0, chan_index ); + } + break; + + case TGSI_OPCODE_SHL: + return 0; + break; + + case TGSI_OPCODE_SHR: + return 0; + break; + + case TGSI_OPCODE_AND: + return 0; + break; + + case TGSI_OPCODE_OR: + return 0; + break; + + case TGSI_OPCODE_MOD: + return 0; + break; + + case TGSI_OPCODE_XOR: + return 0; + break; + + case TGSI_OPCODE_SAD: + return 0; + break; + + case TGSI_OPCODE_TXF: + return 0; + break; + + case TGSI_OPCODE_TXQ: + return 0; + break; + + case TGSI_OPCODE_CONT: + return 0; + break; + + case TGSI_OPCODE_EMIT: + return 0; + break; + + case TGSI_OPCODE_ENDPRIM: + return 0; + break; + + default: + return 0; + } + + return 1; +} + +static void +emit_declaration( + struct x86_function *func, + struct tgsi_full_declaration *decl ) +{ + if( decl->Declaration.File == TGSI_FILE_INPUT ) { + unsigned first, last, mask; + unsigned i, j; + + first = decl->DeclarationRange.First; + last = decl->DeclarationRange.Last; + mask = 
decl->Declaration.UsageMask; + + for( i = first; i <= last; i++ ) { + for( j = 0; j < NUM_CHANNELS; j++ ) { + if( mask & (1 << j) ) { + switch( decl->Declaration.Interpolate ) { + case TGSI_INTERPOLATE_CONSTANT: + emit_coef_a0( func, 0, i, j ); + emit_inputs( func, 0, i, j ); + break; + + case TGSI_INTERPOLATE_LINEAR: + emit_tempf( func, 0, 0, TGSI_SWIZZLE_X ); + emit_coef_dadx( func, 1, i, j ); + emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y ); + emit_coef_dady( func, 3, i, j ); + emit_mul( func, 0, 1 ); /* x * dadx */ + emit_coef_a0( func, 4, i, j ); + emit_mul( func, 2, 3 ); /* y * dady */ + emit_add( func, 0, 4 ); /* x * dadx + a0 */ + emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */ + emit_inputs( func, 0, i, j ); + break; + + case TGSI_INTERPOLATE_PERSPECTIVE: + emit_tempf( func, 0, 0, TGSI_SWIZZLE_X ); + emit_coef_dadx( func, 1, i, j ); + emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y ); + emit_coef_dady( func, 3, i, j ); + emit_mul( func, 0, 1 ); /* x * dadx */ + emit_tempf( func, 4, 0, TGSI_SWIZZLE_W ); + emit_coef_a0( func, 5, i, j ); + emit_rcp( func, 4, 4 ); /* 1.0 / w */ + emit_mul( func, 2, 3 ); /* y * dady */ + emit_add( func, 0, 5 ); /* x * dadx + a0 */ + emit_add( func, 0, 2 ); /* x * dadx + y * dady + a0 */ + emit_mul( func, 0, 4 ); /* (x * dadx + y * dady + a0) / w */ + emit_inputs( func, 0, i, j ); + break; + + default: + assert( 0 ); + break; + } + } + } + } + } +} + +static void aos_to_soa( struct x86_function *func, + uint arg_aos, + uint arg_soa, + uint arg_num, + uint arg_stride ) +{ + struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX ); + struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX ); + struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX ); + struct x86_reg stride = x86_make_reg( file_REG32, reg_DX ); + int inner_loop; + + + /* Save EBX */ + x86_push( func, x86_make_reg( file_REG32, reg_BX ) ); + + x86_mov( func, aos_input, x86_fn_arg( func, arg_aos ) ); + x86_mov( func, soa_input, x86_fn_arg( func, arg_soa ) ); + x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) ); + x86_mov( func, stride, x86_fn_arg( func, arg_stride ) ); + + /* do */ + inner_loop = x86_get_label( func ); + { + x86_push( func, aos_input ); + sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) ); + sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) ); + x86_add( func, aos_input, stride ); + sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) ); + sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) ); + x86_add( func, aos_input, stride ); + sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) ); + sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) ); + x86_add( func, aos_input, stride ); + sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) ); + sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) ); + x86_pop( func, aos_input ); + + sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) ); + sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) ); + sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 ); + sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd ); + sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 ); + sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd ); + + sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) ); + sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) ); + sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) ); + sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) ); + + /* Advance to next input */ + x86_lea( func, 
aos_input, x86_make_disp(aos_input, 16) ); + x86_lea( func, soa_input, x86_make_disp(soa_input, 64) ); + } + /* while --num_inputs */ + x86_dec( func, num_inputs ); + x86_jcc( func, cc_NE, inner_loop ); + + /* Restore EBX */ + x86_pop( func, aos_input ); +} + +static void soa_to_aos( struct x86_function *func, uint aos, uint soa, uint num, uint stride ) +{ + struct x86_reg soa_output; + struct x86_reg aos_output; + struct x86_reg num_outputs; + struct x86_reg temp; + int inner_loop; + + soa_output = x86_make_reg( file_REG32, reg_AX ); + aos_output = x86_make_reg( file_REG32, reg_BX ); + num_outputs = x86_make_reg( file_REG32, reg_CX ); + temp = x86_make_reg( file_REG32, reg_DX ); + + /* Save EBX */ + x86_push( func, aos_output ); + + x86_mov( func, soa_output, x86_fn_arg( func, soa ) ); + x86_mov( func, aos_output, x86_fn_arg( func, aos ) ); + x86_mov( func, num_outputs, x86_fn_arg( func, num ) ); + + /* do */ + inner_loop = x86_get_label( func ); + { + sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) ); + sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) ); + sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) ); + sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) ); + + sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) ); + sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) ); + sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) ); + sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) ); + sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) ); + sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) ); + + x86_mov( func, temp, x86_fn_arg( func, stride ) ); + x86_push( func, aos_output ); + sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) ); + sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) ); + x86_add( func, aos_output, temp ); + sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) ); + sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) ); + x86_add( func, aos_output, temp ); + sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) ); + sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) ); + x86_add( func, aos_output, temp ); + sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) ); + sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) ); + x86_pop( func, aos_output ); + + /* Advance to next output */ + x86_lea( func, aos_output, x86_make_disp(aos_output, 16) ); + x86_lea( func, soa_output, x86_make_disp(soa_output, 64) ); + } + /* while --num_outputs */ + x86_dec( func, num_outputs ); + x86_jcc( func, cc_NE, inner_loop ); + + /* Restore EBX */ + x86_pop( func, aos_output ); +} + +/** + * Translate a TGSI vertex/fragment shader to SSE2 code. + * Slightly different things are done for vertex vs. fragment shaders. + * + * Note that fragment shaders are responsible for interpolating shader + * inputs. Because on x86 we have only 4 GP registers, and here we + * have 5 shader arguments (input, output, const, temp and coef), the + * code is split into two phases -- DECLARATION and INSTRUCTION phase. + * GP register holding the output argument is aliased with the coeff + * argument, as outputs are not needed in the DECLARATION phase. 
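The movlps/movhps/shufps sequence above is a 4x4 transpose: each pass of the inner loop gathers one 16-byte attribute from four consecutive vertices spaced 'stride' bytes apart and stores it as four SoA rows (all x, all y, all z, all w), 64 bytes per iteration, before stepping to the next attribute. A scalar reference in plain C, using hypothetical names that are not part of this patch, shows the same reordering for a single attribute:

static void
aos_to_soa_ref( const char *aos, float *soa, unsigned stride )
{
   /* 'aos' points at the attribute of vertex 0; the same attribute of
    * vertices 1..3 follows at 'stride' byte intervals.  'soa' receives
    * x0 x1 x2 x3 | y0 y1 y2 y3 | z0 z1 z2 z3 | w0 w1 w2 w3, matching
    * the four movups stores emitted above.
    */
   unsigned vert, chan;

   for (vert = 0; vert < 4; vert++) {
      const float *attr = (const float *) (aos + vert * stride);
      for (chan = 0; chan < 4; chan++)
         soa[chan * 4 + vert] = attr[chan];
   }
}

soa_to_aos() emits the inverse transpose, using unpcklps/unpckhps to rebuild the per-vertex rows before scattering them back out at 'stride' byte intervals.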
+ * + * \param tokens the TGSI input shader + * \param func the output SSE code/function + * \param immediates buffer to place immediates, later passed to SSE func + * \param return 1 for success, 0 if translation failed + */ +unsigned +tgsi_emit_sse2( + const struct tgsi_token *tokens, + struct x86_function *func, + float (*immediates)[4], + boolean do_swizzles ) +{ + struct tgsi_parse_context parse; + boolean instruction_phase = FALSE; + unsigned ok = 1; + uint num_immediates = 0; + + util_init_math(); + + func->csr = func->store; + + tgsi_parse_init( &parse, tokens ); + + /* Can't just use EDI, EBX without save/restoring them: + */ + x86_push( + func, + get_immediate_base() ); + + x86_push( + func, + get_temp_base() ); + + + /* + * Different function args for vertex/fragment shaders: + */ + if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { + /* DECLARATION phase, do not load output argument. */ + x86_mov( + func, + get_input_base(), + x86_fn_arg( func, 1 ) ); + /* skipping outputs argument here */ + x86_mov( + func, + get_const_base(), + x86_fn_arg( func, 3 ) ); + x86_mov( + func, + get_temp_base(), + x86_fn_arg( func, 4 ) ); + x86_mov( + func, + get_coef_base(), + x86_fn_arg( func, 5 ) ); + x86_mov( + func, + get_immediate_base(), + x86_fn_arg( func, 6 ) ); + } + else { + assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX); + + if (do_swizzles) + aos_to_soa( func, + 6, /* aos_input */ + 1, /* machine->input */ + 7, /* num_inputs */ + 8 ); /* input_stride */ + + x86_mov( + func, + get_input_base(), + x86_fn_arg( func, 1 ) ); + x86_mov( + func, + get_output_base(), + x86_fn_arg( func, 2 ) ); + x86_mov( + func, + get_const_base(), + x86_fn_arg( func, 3 ) ); + x86_mov( + func, + get_temp_base(), + x86_fn_arg( func, 4 ) ); + x86_mov( + func, + get_immediate_base(), + x86_fn_arg( func, 5 ) ); + } + + while( !tgsi_parse_end_of_tokens( &parse ) && ok ) { + tgsi_parse_token( &parse ); + + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { + emit_declaration( + func, + &parse.FullToken.FullDeclaration ); + } + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) { + if( !instruction_phase ) { + /* INSTRUCTION phase, overwrite coeff with output. */ + instruction_phase = TRUE; + x86_mov( + func, + get_output_base(), + x86_fn_arg( func, 2 ) ); + } + } + + ok = emit_instruction( + func, + &parse.FullToken.FullInstruction ); + + if (!ok) { + debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n", + parse.FullToken.FullInstruction.Instruction.Opcode, + parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ? 
+ "vertex shader" : "fragment shader"); + } + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + /* simply copy the immediate values into the next immediates[] slot */ + { + const uint size = parse.FullToken.FullImmediate.Immediate.Size - 1; + uint i; + assert(size <= 4); + assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES); + for( i = 0; i < size; i++ ) { + immediates[num_immediates][i] = + parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float; + } +#if 0 + debug_printf("SSE FS immediate[%d] = %f %f %f %f\n", + num_immediates, + immediates[num_immediates][0], + immediates[num_immediates][1], + immediates[num_immediates][2], + immediates[num_immediates][3]); +#endif + num_immediates++; + } + break; + + default: + ok = 0; + assert( 0 ); + } + } + + if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) { + if (do_swizzles) + soa_to_aos( func, 9, 2, 10, 11 ); + } + + /* Can't just use EBX, EDI without save/restoring them: + */ + x86_pop( + func, + get_temp_base() ); + + x86_pop( + func, + get_immediate_base() ); + + emit_ret( func ); + + tgsi_parse_free( &parse ); + + return ok; +} + +#endif /* PIPE_ARCH_X86 */ + diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/tgsi_sse2.h new file mode 100644 index 0000000000..af838b2a25 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.h @@ -0,0 +1,49 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TGSI_SSE2_H +#define TGSI_SSE2_H + +#if defined __cplusplus +extern "C" { +#endif + +struct tgsi_token; +struct x86_function; + +unsigned +tgsi_emit_sse2( + const struct tgsi_token *tokens, + struct x86_function *function, + float (*immediates)[4], + boolean do_swizzles ); + +#if defined __cplusplus +} +#endif + +#endif /* TGSI_SSE2_H */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c new file mode 100644 index 0000000000..9454563361 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_text.c @@ -0,0 +1,1097 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_debug.h" +#include "tgsi_text.h" +#include "tgsi_build.h" +#include "tgsi_info.h" +#include "tgsi_parse.h" +#include "tgsi_sanity.h" +#include "tgsi_util.h" + +static boolean is_alpha_underscore( const char *cur ) +{ + return + (*cur >= 'a' && *cur <= 'z') || + (*cur >= 'A' && *cur <= 'Z') || + *cur == '_'; +} + +static boolean is_digit( const char *cur ) +{ + return *cur >= '0' && *cur <= '9'; +} + +static boolean is_digit_alpha_underscore( const char *cur ) +{ + return is_digit( cur ) || is_alpha_underscore( cur ); +} + +static boolean uprcase( char c ) +{ + if (c >= 'a' && c <= 'z') + return c += 'A' - 'a'; + return c; +} + +static boolean str_match_no_case( const char **pcur, const char *str ) +{ + const char *cur = *pcur; + + while (*str != '\0' && *str == uprcase( *cur )) { + str++; + cur++; + } + if (*str == '\0') { + *pcur = cur; + return TRUE; + } + return FALSE; +} + +/* Eat zero or more whitespaces. + */ +static void eat_opt_white( const char **pcur ) +{ + while (**pcur == ' ' || **pcur == '\t' || **pcur == '\n') + (*pcur)++; +} + +/* Eat one or more whitespaces. + * Return TRUE if at least one whitespace eaten. + */ +static boolean eat_white( const char **pcur ) +{ + const char *cur = *pcur; + + eat_opt_white( pcur ); + return *pcur > cur; +} + +/* Parse unsigned integer. + * No checks for overflow. + */ +static boolean parse_uint( const char **pcur, uint *val ) +{ + const char *cur = *pcur; + + if (is_digit( cur )) { + *val = *cur++ - '0'; + while (is_digit( cur )) + *val = *val * 10 + *cur++ - '0'; + *pcur = cur; + return TRUE; + } + return FALSE; +} + +/* Parse floating point. 
+ */ +static boolean parse_float( const char **pcur, float *val ) +{ + const char *cur = *pcur; + boolean integral_part = FALSE; + boolean fractional_part = FALSE; + + *val = (float) atof( cur ); + + if (*cur == '-' || *cur == '+') + cur++; + if (is_digit( cur )) { + cur++; + integral_part = TRUE; + while (is_digit( cur )) + cur++; + } + if (*cur == '.') { + cur++; + if (is_digit( cur )) { + cur++; + fractional_part = TRUE; + while (is_digit( cur )) + cur++; + } + } + if (!integral_part && !fractional_part) + return FALSE; + if (uprcase( *cur ) == 'E') { + cur++; + if (*cur == '-' || *cur == '+') + cur++; + if (is_digit( cur )) { + cur++; + while (is_digit( cur )) + cur++; + } + else + return FALSE; + } + *pcur = cur; + return TRUE; +} + +struct translate_ctx +{ + const char *text; + const char *cur; + struct tgsi_token *tokens; + struct tgsi_token *tokens_cur; + struct tgsi_token *tokens_end; + struct tgsi_header *header; +}; + +static void report_error( struct translate_ctx *ctx, const char *msg ) +{ + debug_printf( "\nError: %s", msg ); +} + +/* Parse shader header. + * Return TRUE for one of the following headers. + * FRAG1.1 + * GEOM1.1 + * VERT1.1 + */ +static boolean parse_header( struct translate_ctx *ctx ) +{ + uint processor; + + if (str_match_no_case( &ctx->cur, "FRAG1.1" )) + processor = TGSI_PROCESSOR_FRAGMENT; + else if (str_match_no_case( &ctx->cur, "VERT1.1" )) + processor = TGSI_PROCESSOR_VERTEX; + else if (str_match_no_case( &ctx->cur, "GEOM1.1" )) + processor = TGSI_PROCESSOR_GEOMETRY; + else { + report_error( ctx, "Unknown header" ); + return FALSE; + } + + if (ctx->tokens_cur >= ctx->tokens_end) + return FALSE; + *(struct tgsi_version *) ctx->tokens_cur++ = tgsi_build_version(); + + if (ctx->tokens_cur >= ctx->tokens_end) + return FALSE; + ctx->header = (struct tgsi_header *) ctx->tokens_cur++; + *ctx->header = tgsi_build_header(); + + if (ctx->tokens_cur >= ctx->tokens_end) + return FALSE; + *(struct tgsi_processor *) ctx->tokens_cur++ = tgsi_build_processor( processor, ctx->header ); + + return TRUE; +} + +static boolean parse_label( struct translate_ctx *ctx, uint *val ) +{ + const char *cur = ctx->cur; + + if (parse_uint( &cur, val )) { + eat_opt_white( &cur ); + if (*cur == ':') { + cur++; + ctx->cur = cur; + return TRUE; + } + } + return FALSE; +} + +static const char *file_names[TGSI_FILE_COUNT] = +{ + "NULL", + "CONST", + "IN", + "OUT", + "TEMP", + "SAMP", + "ADDR", + "IMM" +}; + +static boolean +parse_file( const char **pcur, uint *file ) +{ + uint i; + + for (i = 0; i < TGSI_FILE_COUNT; i++) { + const char *cur = *pcur; + + if (str_match_no_case( &cur, file_names[i] )) { + if (!is_digit_alpha_underscore( cur )) { + *pcur = cur; + *file = i; + return TRUE; + } + } + } + return FALSE; +} + +static boolean +parse_opt_writemask( + struct translate_ctx *ctx, + uint *writemask ) +{ + const char *cur; + + cur = ctx->cur; + eat_opt_white( &cur ); + if (*cur == '.') { + cur++; + *writemask = TGSI_WRITEMASK_NONE; + eat_opt_white( &cur ); + if (uprcase( *cur ) == 'X') { + cur++; + *writemask |= TGSI_WRITEMASK_X; + } + if (uprcase( *cur ) == 'Y') { + cur++; + *writemask |= TGSI_WRITEMASK_Y; + } + if (uprcase( *cur ) == 'Z') { + cur++; + *writemask |= TGSI_WRITEMASK_Z; + } + if (uprcase( *cur ) == 'W') { + cur++; + *writemask |= TGSI_WRITEMASK_W; + } + + if (*writemask == TGSI_WRITEMASK_NONE) { + report_error( ctx, "Writemask expected" ); + return FALSE; + } + + ctx->cur = cur; + } + else { + *writemask = TGSI_WRITEMASK_XYZW; + } + return TRUE; +} + +/* 
<register_file_bracket> ::= <file> `[' + */ +static boolean +parse_register_file_bracket( + struct translate_ctx *ctx, + uint *file ) +{ + if (!parse_file( &ctx->cur, file )) { + report_error( ctx, "Unknown register file" ); + return FALSE; + } + eat_opt_white( &ctx->cur ); + if (*ctx->cur != '[') { + report_error( ctx, "Expected `['" ); + return FALSE; + } + ctx->cur++; + return TRUE; +} + +/* <register_file_bracket_index> ::= <register_file_bracket> <uint> + */ +static boolean +parse_register_file_bracket_index( + struct translate_ctx *ctx, + uint *file, + int *index ) +{ + uint uindex; + + if (!parse_register_file_bracket( ctx, file )) + return FALSE; + eat_opt_white( &ctx->cur ); + if (!parse_uint( &ctx->cur, &uindex )) { + report_error( ctx, "Expected literal unsigned integer" ); + return FALSE; + } + *index = (int) uindex; + return TRUE; +} + +/* Parse destination register operand. + * <register_dst> ::= <register_file_bracket_index> `]' + */ +static boolean +parse_register_dst( + struct translate_ctx *ctx, + uint *file, + int *index ) +{ + if (!parse_register_file_bracket_index( ctx, file, index )) + return FALSE; + eat_opt_white( &ctx->cur ); + if (*ctx->cur != ']') { + report_error( ctx, "Expected `]'" ); + return FALSE; + } + ctx->cur++; + return TRUE; +} + +/* Parse source register operand. + * <register_src> ::= <register_file_bracket_index> `]' | + * <register_file_bracket> <register_dst> `]' | + * <register_file_bracket> <register_dst> `+' <uint> `]' | + * <register_file_bracket> <register_dst> `-' <uint> `]' + */ +static boolean +parse_register_src( + struct translate_ctx *ctx, + uint *file, + int *index, + uint *ind_file, + int *ind_index ) +{ + const char *cur; + uint uindex; + + if (!parse_register_file_bracket( ctx, file )) + return FALSE; + eat_opt_white( &ctx->cur ); + cur = ctx->cur; + if (parse_file( &cur, ind_file )) { + if (!parse_register_dst( ctx, ind_file, ind_index )) + return FALSE; + eat_opt_white( &ctx->cur ); + if (*ctx->cur == '+' || *ctx->cur == '-') { + boolean negate; + + negate = *ctx->cur == '-'; + ctx->cur++; + eat_opt_white( &ctx->cur ); + if (!parse_uint( &ctx->cur, &uindex )) { + report_error( ctx, "Expected literal unsigned integer" ); + return FALSE; + } + if (negate) + *index = -(int) uindex; + else + *index = (int) uindex; + } + else { + *index = 0; + } + } + else { + if (!parse_uint( &ctx->cur, &uindex )) { + report_error( ctx, "Expected literal unsigned integer" ); + return FALSE; + } + *index = (int) uindex; + *ind_file = TGSI_FILE_NULL; + *ind_index = 0; + } + eat_opt_white( &ctx->cur ); + if (*ctx->cur != ']') { + report_error( ctx, "Expected `]'" ); + return FALSE; + } + ctx->cur++; + return TRUE; +} + +/* Parse register declaration. + * <register_dcl> ::= <register_file_bracket_index> `]' | + * <register_file_bracket_index> `..' <index> `]' + */ +static boolean +parse_register_dcl( + struct translate_ctx *ctx, + uint *file, + int *first, + int *last ) +{ + if (!parse_register_file_bracket_index( ctx, file, first )) + return FALSE; + eat_opt_white( &ctx->cur ); + if (ctx->cur[0] == '.' 
&& ctx->cur[1] == '.') { + uint uindex; + + ctx->cur += 2; + eat_opt_white( &ctx->cur ); + if (!parse_uint( &ctx->cur, &uindex )) { + report_error( ctx, "Expected literal integer" ); + return FALSE; + } + *last = (int) uindex; + eat_opt_white( &ctx->cur ); + } + else { + *last = *first; + } + if (*ctx->cur != ']') { + report_error( ctx, "Expected `]' or `..'" ); + return FALSE; + } + ctx->cur++; + return TRUE; +} + +static const char *modulate_names[TGSI_MODULATE_COUNT] = +{ + "_1X", + "_2X", + "_4X", + "_8X", + "_D2", + "_D4", + "_D8" +}; + +static boolean +parse_dst_operand( + struct translate_ctx *ctx, + struct tgsi_full_dst_register *dst ) +{ + uint file; + int index; + uint writemask; + const char *cur; + + if (!parse_register_dst( ctx, &file, &index )) + return FALSE; + + cur = ctx->cur; + eat_opt_white( &cur ); + if (*cur == '_') { + uint i; + + for (i = 0; i < TGSI_MODULATE_COUNT; i++) { + if (str_match_no_case( &cur, modulate_names[i] )) { + if (!is_digit_alpha_underscore( cur )) { + dst->DstRegisterExtModulate.Modulate = i; + ctx->cur = cur; + break; + } + } + } + } + + if (!parse_opt_writemask( ctx, &writemask )) + return FALSE; + + dst->DstRegister.File = file; + dst->DstRegister.Index = index; + dst->DstRegister.WriteMask = writemask; + return TRUE; +} + +static boolean +parse_optional_swizzle( + struct translate_ctx *ctx, + uint swizzle[4], + boolean *parsed_swizzle, + boolean *parsed_extswizzle ) +{ + const char *cur = ctx->cur; + + *parsed_swizzle = FALSE; + *parsed_extswizzle = FALSE; + + eat_opt_white( &cur ); + if (*cur == '.') { + uint i; + + cur++; + eat_opt_white( &cur ); + for (i = 0; i < 4; i++) { + if (uprcase( *cur ) == 'X') + swizzle[i] = TGSI_SWIZZLE_X; + else if (uprcase( *cur ) == 'Y') + swizzle[i] = TGSI_SWIZZLE_Y; + else if (uprcase( *cur ) == 'Z') + swizzle[i] = TGSI_SWIZZLE_Z; + else if (uprcase( *cur ) == 'W') + swizzle[i] = TGSI_SWIZZLE_W; + else { + if (*cur == '0') + swizzle[i] = TGSI_EXTSWIZZLE_ZERO; + else if (*cur == '1') + swizzle[i] = TGSI_EXTSWIZZLE_ONE; + else { + report_error( ctx, "Expected register swizzle component `x', `y', `z', `w', `0' or `1'" ); + return FALSE; + } + *parsed_extswizzle = TRUE; + } + cur++; + } + *parsed_swizzle = TRUE; + ctx->cur = cur; + } + return TRUE; +} + +static boolean +parse_src_operand( + struct translate_ctx *ctx, + struct tgsi_full_src_register *src ) +{ + const char *cur; + float value; + uint file; + int index; + uint ind_file; + int ind_index; + uint swizzle[4]; + boolean parsed_swizzle; + boolean parsed_extswizzle; + + if (*ctx->cur == '-') { + cur = ctx->cur; + cur++; + eat_opt_white( &cur ); + if (*cur == '(') { + cur++; + src->SrcRegisterExtMod.Negate = 1; + eat_opt_white( &cur ); + ctx->cur = cur; + } + } + + if (*ctx->cur == '|') { + ctx->cur++; + eat_opt_white( &ctx->cur ); + src->SrcRegisterExtMod.Absolute = 1; + } + + if (*ctx->cur == '-') { + ctx->cur++; + eat_opt_white( &ctx->cur ); + src->SrcRegister.Negate = 1; + } + + cur = ctx->cur; + if (parse_float( &cur, &value )) { + if (value == 2.0f) { + eat_opt_white( &cur ); + if (*cur != '*') { + report_error( ctx, "Expected `*'" ); + return FALSE; + } + cur++; + if (*cur != '(') { + report_error( ctx, "Expected `('" ); + return FALSE; + } + cur++; + src->SrcRegisterExtMod.Scale2X = 1; + eat_opt_white( &cur ); + ctx->cur = cur; + } + } + + if (*ctx->cur == '(') { + ctx->cur++; + eat_opt_white( &ctx->cur ); + src->SrcRegisterExtMod.Bias = 1; + } + + cur = ctx->cur; + if (parse_float( &cur, &value )) { + if (value == 1.0f) { + eat_opt_white( &cur ); + 
if (*cur != '-') { + report_error( ctx, "Expected `-'" ); + return FALSE; + } + cur++; + if (*cur != '(') { + report_error( ctx, "Expected `('" ); + return FALSE; + } + cur++; + src->SrcRegisterExtMod.Complement = 1; + eat_opt_white( &cur ); + ctx->cur = cur; + } + } + + if (!parse_register_src( ctx, &file, &index, &ind_file, &ind_index )) + return FALSE; + src->SrcRegister.File = file; + src->SrcRegister.Index = index; + if (ind_file != TGSI_FILE_NULL) { + src->SrcRegister.Indirect = 1; + src->SrcRegisterInd.File = ind_file; + src->SrcRegisterInd.Index = ind_index; + } + + /* Parse optional swizzle. + */ + if (parse_optional_swizzle( ctx, swizzle, &parsed_swizzle, &parsed_extswizzle )) { + if (parsed_extswizzle) { + assert( parsed_swizzle ); + + src->SrcRegisterExtSwz.ExtSwizzleX = swizzle[0]; + src->SrcRegisterExtSwz.ExtSwizzleY = swizzle[1]; + src->SrcRegisterExtSwz.ExtSwizzleZ = swizzle[2]; + src->SrcRegisterExtSwz.ExtSwizzleW = swizzle[3]; + } + else if (parsed_swizzle) { + src->SrcRegister.SwizzleX = swizzle[0]; + src->SrcRegister.SwizzleY = swizzle[1]; + src->SrcRegister.SwizzleZ = swizzle[2]; + src->SrcRegister.SwizzleW = swizzle[3]; + } + } + + if (src->SrcRegisterExtMod.Complement) { + eat_opt_white( &ctx->cur ); + if (*ctx->cur != ')') { + report_error( ctx, "Expected `)'" ); + return FALSE; + } + ctx->cur++; + } + + if (src->SrcRegisterExtMod.Bias) { + eat_opt_white( &ctx->cur ); + if (*ctx->cur != ')') { + report_error( ctx, "Expected `)'" ); + return FALSE; + } + ctx->cur++; + eat_opt_white( &ctx->cur ); + if (*ctx->cur != '-') { + report_error( ctx, "Expected `-'" ); + return FALSE; + } + ctx->cur++; + eat_opt_white( &ctx->cur ); + if (!parse_float( &ctx->cur, &value )) { + report_error( ctx, "Expected literal floating point" ); + return FALSE; + } + if (value != 0.5f) { + report_error( ctx, "Expected 0.5" ); + return FALSE; + } + } + + if (src->SrcRegisterExtMod.Scale2X) { + eat_opt_white( &ctx->cur ); + if (*ctx->cur != ')') { + report_error( ctx, "Expected `)'" ); + return FALSE; + } + ctx->cur++; + } + + if (src->SrcRegisterExtMod.Absolute) { + eat_opt_white( &ctx->cur ); + if (*ctx->cur != '|') { + report_error( ctx, "Expected `|'" ); + return FALSE; + } + ctx->cur++; + } + + if (src->SrcRegisterExtMod.Negate) { + eat_opt_white( &ctx->cur ); + if (*ctx->cur != ')') { + report_error( ctx, "Expected `)'" ); + return FALSE; + } + ctx->cur++; + } + + return TRUE; +} + +static const char *texture_names[TGSI_TEXTURE_COUNT] = +{ + "UNKNOWN", + "1D", + "2D", + "3D", + "CUBE", + "RECT", + "SHADOW1D", + "SHADOW2D", + "SHADOWRECT" +}; + +static boolean +parse_instruction( + struct translate_ctx *ctx, + boolean has_label ) +{ + uint i; + uint saturate = TGSI_SAT_NONE; + const struct tgsi_opcode_info *info; + struct tgsi_full_instruction inst; + uint advance; + + /* Parse instruction name. 
+ */ + eat_opt_white( &ctx->cur ); + for (i = 0; i < TGSI_OPCODE_LAST; i++) { + const char *cur = ctx->cur; + + info = tgsi_get_opcode_info( i ); + if (str_match_no_case( &cur, info->mnemonic )) { + if (str_match_no_case( &cur, "_SATNV" )) + saturate = TGSI_SAT_MINUS_PLUS_ONE; + else if (str_match_no_case( &cur, "_SAT" )) + saturate = TGSI_SAT_ZERO_ONE; + + if (info->num_dst + info->num_src + info->is_tex == 0) { + if (!is_digit_alpha_underscore( cur )) { + ctx->cur = cur; + break; + } + } + else if (*cur == '\0' || eat_white( &cur )) { + ctx->cur = cur; + break; + } + } + } + if (i == TGSI_OPCODE_LAST) { + if (has_label) + report_error( ctx, "Unknown opcode" ); + else + report_error( ctx, "Expected `DCL', `IMM' or a label" ); + return FALSE; + } + + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = i; + inst.Instruction.Saturate = saturate; + inst.Instruction.NumDstRegs = info->num_dst; + inst.Instruction.NumSrcRegs = info->num_src; + + /* Parse instruction operands. + */ + for (i = 0; i < info->num_dst + info->num_src + info->is_tex; i++) { + if (i > 0) { + eat_opt_white( &ctx->cur ); + if (*ctx->cur != ',') { + report_error( ctx, "Expected `,'" ); + return FALSE; + } + ctx->cur++; + eat_opt_white( &ctx->cur ); + } + + if (i < info->num_dst) { + if (!parse_dst_operand( ctx, &inst.FullDstRegisters[i] )) + return FALSE; + } + else if (i < info->num_dst + info->num_src) { + if (!parse_src_operand( ctx, &inst.FullSrcRegisters[i - info->num_dst] )) + return FALSE; + } + else { + uint j; + + for (j = 0; j < TGSI_TEXTURE_COUNT; j++) { + if (str_match_no_case( &ctx->cur, texture_names[j] )) { + if (!is_digit_alpha_underscore( ctx->cur )) { + inst.InstructionExtTexture.Texture = j; + break; + } + } + } + if (j == TGSI_TEXTURE_COUNT) { + report_error( ctx, "Expected texture target" ); + return FALSE; + } + } + } + + if (info->is_branch) { + uint target; + + eat_opt_white( &ctx->cur ); + if (*ctx->cur != ':') { + report_error( ctx, "Expected `:'" ); + return FALSE; + } + ctx->cur++; + eat_opt_white( &ctx->cur ); + if (!parse_uint( &ctx->cur, &target )) { + report_error( ctx, "Expected a label" ); + return FALSE; + } + inst.InstructionExtLabel.Label = target; + } + + advance = tgsi_build_full_instruction( + &inst, + ctx->tokens_cur, + ctx->header, + (uint) (ctx->tokens_end - ctx->tokens_cur) ); + if (advance == 0) + return FALSE; + ctx->tokens_cur += advance; + + return TRUE; +} + +static const char *semantic_names[TGSI_SEMANTIC_COUNT] = +{ + "POSITION", + "COLOR", + "BCOLOR", + "FOG", + "PSIZE", + "GENERIC", + "NORMAL" +}; + +static const char *interpolate_names[TGSI_INTERPOLATE_COUNT] = +{ + "CONSTANT", + "LINEAR", + "PERSPECTIVE" +}; + +static boolean parse_declaration( struct translate_ctx *ctx ) +{ + struct tgsi_full_declaration decl; + uint file; + int first; + int last; + uint writemask; + const char *cur; + uint advance; + + if (!eat_white( &ctx->cur )) { + report_error( ctx, "Syntax error" ); + return FALSE; + } + if (!parse_register_dcl( ctx, &file, &first, &last )) + return FALSE; + if (!parse_opt_writemask( ctx, &writemask )) + return FALSE; + + decl = tgsi_default_full_declaration(); + decl.Declaration.File = file; + decl.Declaration.UsageMask = writemask; + decl.DeclarationRange.First = first; + decl.DeclarationRange.Last = last; + + cur = ctx->cur; + eat_opt_white( &cur ); + if (*cur == ',') { + uint i; + + cur++; + eat_opt_white( &cur ); + for (i = 0; i < TGSI_SEMANTIC_COUNT; i++) { + if (str_match_no_case( &cur, semantic_names[i] )) { + const char *cur2 = cur; + 
uint index; + + if (is_digit_alpha_underscore( cur )) + continue; + eat_opt_white( &cur2 ); + if (*cur2 == '[') { + cur2++; + eat_opt_white( &cur2 ); + if (!parse_uint( &cur2, &index )) { + report_error( ctx, "Expected literal integer" ); + return FALSE; + } + eat_opt_white( &cur2 ); + if (*cur2 != ']') { + report_error( ctx, "Expected `]'" ); + return FALSE; + } + cur2++; + + decl.Semantic.SemanticIndex = index; + + cur = cur2; + } + + decl.Declaration.Semantic = 1; + decl.Semantic.SemanticName = i; + + ctx->cur = cur; + break; + } + } + } + + cur = ctx->cur; + eat_opt_white( &cur ); + if (*cur == ',') { + uint i; + + cur++; + eat_opt_white( &cur ); + for (i = 0; i < TGSI_INTERPOLATE_COUNT; i++) { + if (str_match_no_case( &cur, interpolate_names[i] )) { + if (is_digit_alpha_underscore( cur )) + continue; + decl.Declaration.Interpolate = i; + + ctx->cur = cur; + break; + } + } + if (i == TGSI_INTERPOLATE_COUNT) { + report_error( ctx, "Expected semantic or interpolate attribute" ); + return FALSE; + } + } + + advance = tgsi_build_full_declaration( + &decl, + ctx->tokens_cur, + ctx->header, + (uint) (ctx->tokens_end - ctx->tokens_cur) ); + if (advance == 0) + return FALSE; + ctx->tokens_cur += advance; + + return TRUE; +} + +static boolean parse_immediate( struct translate_ctx *ctx ) +{ + struct tgsi_full_immediate imm; + uint i; + float values[4]; + uint advance; + + if (!eat_white( &ctx->cur )) { + report_error( ctx, "Syntax error" ); + return FALSE; + } + if (!str_match_no_case( &ctx->cur, "FLT32" ) || is_digit_alpha_underscore( ctx->cur )) { + report_error( ctx, "Expected `FLT32'" ); + return FALSE; + } + eat_opt_white( &ctx->cur ); + if (*ctx->cur != '{') { + report_error( ctx, "Expected `{'" ); + return FALSE; + } + ctx->cur++; + for (i = 0; i < 4; i++) { + eat_opt_white( &ctx->cur ); + if (i > 0) { + if (*ctx->cur != ',') { + report_error( ctx, "Expected `,'" ); + return FALSE; + } + ctx->cur++; + eat_opt_white( &ctx->cur ); + } + if (!parse_float( &ctx->cur, &values[i] )) { + report_error( ctx, "Expected literal floating point" ); + return FALSE; + } + } + eat_opt_white( &ctx->cur ); + if (*ctx->cur != '}') { + report_error( ctx, "Expected `}'" ); + return FALSE; + } + ctx->cur++; + + imm = tgsi_default_full_immediate(); + imm.Immediate.Size += 4; + imm.Immediate.DataType = TGSI_IMM_FLOAT32; + imm.u.Pointer = values; + + advance = tgsi_build_full_immediate( + &imm, + ctx->tokens_cur, + ctx->header, + (uint) (ctx->tokens_end - ctx->tokens_cur) ); + if (advance == 0) + return FALSE; + ctx->tokens_cur += advance; + + return TRUE; +} + +static boolean translate( struct translate_ctx *ctx ) +{ + eat_opt_white( &ctx->cur ); + if (!parse_header( ctx )) + return FALSE; + + while (*ctx->cur != '\0') { + uint label_val = 0; + + if (!eat_white( &ctx->cur )) { + report_error( ctx, "Syntax error" ); + return FALSE; + } + + if (*ctx->cur == '\0') + break; + + if (parse_label( ctx, &label_val )) { + if (!parse_instruction( ctx, TRUE )) + return FALSE; + } + else if (str_match_no_case( &ctx->cur, "DCL" )) { + if (!parse_declaration( ctx )) + return FALSE; + } + else if (str_match_no_case( &ctx->cur, "IMM" )) { + if (!parse_immediate( ctx )) + return FALSE; + } + else if (!parse_instruction( ctx, FALSE )) { + return FALSE; + } + } + + return TRUE; +} + +boolean +tgsi_text_translate( + const char *text, + struct tgsi_token *tokens, + uint num_tokens ) +{ + struct translate_ctx ctx; + + ctx.text = text; + ctx.cur = text; + ctx.tokens = tokens; + ctx.tokens_cur = tokens; + ctx.tokens_end = tokens + 
num_tokens; + + if (!translate( &ctx )) + return FALSE; + + return tgsi_sanity_check( tokens ); +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.h b/src/gallium/auxiliary/tgsi/tgsi_text.h new file mode 100644 index 0000000000..8eeeeef140 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_text.h @@ -0,0 +1,47 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TGSI_TEXT_H +#define TGSI_TEXT_H + +#include "pipe/p_shader_tokens.h" + +#if defined __cplusplus +extern "C" { +#endif + +boolean +tgsi_text_translate( + const char *text, + struct tgsi_token *tokens, + uint num_tokens ); + +#if defined __cplusplus +} +#endif + +#endif /* TGSI_TEXT_H */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.c b/src/gallium/auxiliary/tgsi/tgsi_transform.c new file mode 100644 index 0000000000..ea87da31e5 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_transform.c @@ -0,0 +1,200 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
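Putting the parser above together, the accepted text form is a processor header followed by DCL and IMM statements and instructions, separated by arbitrary whitespace. A small illustrative fragment shader in that syntax:

   FRAG1.1
   DCL IN[0], COLOR, PERSPECTIVE
   DCL OUT[0], COLOR
   IMM FLT32 { 0.5, 0.5, 0.5, 1.0 }
   MUL OUT[0], IN[0], IMM[0]
   END

Source operands also accept the modifier spellings handled by parse_src_operand(), for example -IN[0], |IN[0]|, 2*(IN[0]), (IN[0])-0.5 and 1-(IN[0]). A caller hands the string to tgsi_text_translate() together with a token buffer of its own choosing (the size below is arbitrary):

   struct tgsi_token tokens[1024];

   if (!tgsi_text_translate( text, tokens, 1024 ))
      debug_printf( "TGSI text did not parse\n" );

On success the tokens have already passed tgsi_sanity_check().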
+ * + **************************************************************************/ + +/** + * TGSI program transformation utility. + * + * Authors: Brian Paul + */ + +#include "pipe/p_debug.h" + +#include "tgsi_transform.h" + + + +static void +emit_instruction(struct tgsi_transform_context *ctx, + const struct tgsi_full_instruction *inst) +{ + uint ti = ctx->ti; + + ti += tgsi_build_full_instruction(inst, + ctx->tokens_out + ti, + ctx->header, + ctx->max_tokens_out - ti); + ctx->ti = ti; +} + + +static void +emit_declaration(struct tgsi_transform_context *ctx, + const struct tgsi_full_declaration *decl) +{ + uint ti = ctx->ti; + + ti += tgsi_build_full_declaration(decl, + ctx->tokens_out + ti, + ctx->header, + ctx->max_tokens_out - ti); + ctx->ti = ti; +} + + +static void +emit_immediate(struct tgsi_transform_context *ctx, + const struct tgsi_full_immediate *imm) +{ + uint ti = ctx->ti; + + ti += tgsi_build_full_immediate(imm, + ctx->tokens_out + ti, + ctx->header, + ctx->max_tokens_out - ti); + ctx->ti = ti; +} + + + +/** + * Apply user-defined transformations to the input shader to produce + * the output shader. + * For example, a register search-and-replace operation could be applied + * by defining a transform_instruction() callback that examined and changed + * the instruction src/dest regs. + * + * \return number of tokens emitted + */ +int +tgsi_transform_shader(const struct tgsi_token *tokens_in, + struct tgsi_token *tokens_out, + uint max_tokens_out, + struct tgsi_transform_context *ctx) +{ + uint procType; + + /* input shader */ + struct tgsi_parse_context parse; + + /* output shader */ + struct tgsi_processor *processor; + + + /** + ** callback context init + **/ + ctx->emit_instruction = emit_instruction; + ctx->emit_declaration = emit_declaration; + ctx->emit_immediate = emit_immediate; + ctx->tokens_out = tokens_out; + ctx->max_tokens_out = max_tokens_out; + + + /** + ** Setup to begin parsing input shader + **/ + if (tgsi_parse_init( &parse, tokens_in ) != TGSI_PARSE_OK) { + debug_printf("tgsi_parse_init() failed in tgsi_transform_shader()!\n"); + return -1; + } + procType = parse.FullHeader.Processor.Processor; + assert(procType == TGSI_PROCESSOR_FRAGMENT || + procType == TGSI_PROCESSOR_VERTEX || + procType == TGSI_PROCESSOR_GEOMETRY); + + + /** + ** Setup output shader + **/ + *(struct tgsi_version *) &tokens_out[0] = tgsi_build_version(); + + ctx->header = (struct tgsi_header *) (tokens_out + 1); + *ctx->header = tgsi_build_header(); + + processor = (struct tgsi_processor *) (tokens_out + 2); + *processor = tgsi_build_processor( procType, ctx->header ); + + ctx->ti = 3; + + + /** + ** Loop over incoming program tokens/instructions + */ + while( !tgsi_parse_end_of_tokens( &parse ) ) { + + tgsi_parse_token( &parse ); + + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + struct tgsi_full_instruction *fullinst + = &parse.FullToken.FullInstruction; + + if (ctx->transform_instruction) + ctx->transform_instruction(ctx, fullinst); + else + ctx->emit_instruction(ctx, fullinst); + } + break; + + case TGSI_TOKEN_TYPE_DECLARATION: + { + struct tgsi_full_declaration *fulldecl + = &parse.FullToken.FullDeclaration; + + if (ctx->transform_declaration) + ctx->transform_declaration(ctx, fulldecl); + else + ctx->emit_declaration(ctx, fulldecl); + } + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + struct tgsi_full_immediate *fullimm + = &parse.FullToken.FullImmediate; + + if (ctx->transform_immediate) + ctx->transform_immediate(ctx, fullimm); + else + 
ctx->emit_immediate(ctx, fullimm); + } + break; + + default: + assert( 0 ); + } + } + + if (ctx->epilog) { + ctx->epilog(ctx); + } + + tgsi_parse_free (&parse); + + return ctx->ti; +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.h b/src/gallium/auxiliary/tgsi/tgsi_transform.h new file mode 100644 index 0000000000..a121adbaef --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_transform.h @@ -0,0 +1,92 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TGSI_TRANSFORM_H +#define TGSI_TRANSFORM_H + + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_build.h" + + + +/** + * Subclass this to add caller-specific data + */ +struct tgsi_transform_context +{ +/**** PUBLIC ***/ + + /** + * User-defined callbacks invoked per instruction. + */ + void (*transform_instruction)(struct tgsi_transform_context *ctx, + struct tgsi_full_instruction *inst); + + void (*transform_declaration)(struct tgsi_transform_context *ctx, + struct tgsi_full_declaration *decl); + + void (*transform_immediate)(struct tgsi_transform_context *ctx, + struct tgsi_full_immediate *imm); + + /** + * Called at end of input program to allow caller to append extra + * instructions. Return number of tokens emitted. + */ + void (*epilog)(struct tgsi_transform_context *ctx); + + +/*** PRIVATE ***/ + + /** + * These are setup by tgsi_transform_shader() and cannot be overridden. + * Meant to be called from in the above user callback functions. 
+ */ + void (*emit_instruction)(struct tgsi_transform_context *ctx, + const struct tgsi_full_instruction *inst); + void (*emit_declaration)(struct tgsi_transform_context *ctx, + const struct tgsi_full_declaration *decl); + void (*emit_immediate)(struct tgsi_transform_context *ctx, + const struct tgsi_full_immediate *imm); + + struct tgsi_header *header; + uint max_tokens_out; + struct tgsi_token *tokens_out; + uint ti; +}; + + + +extern int +tgsi_transform_shader(const struct tgsi_token *tokens_in, + struct tgsi_token *tokens_out, + uint max_tokens_out, + struct tgsi_transform_context *ctx); + + +#endif /* TGSI_TRANSFORM_H */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c new file mode 100644 index 0000000000..50101a9bb0 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_util.c @@ -0,0 +1,299 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_debug.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi_parse.h" +#include "tgsi_build.h" +#include "tgsi_util.h" + +union pointer_hack +{ + void *pointer; + uint64_t uint64; +}; + +void * +tgsi_align_128bit( + void *unaligned ) +{ + union pointer_hack ph; + + ph.uint64 = 0; + ph.pointer = unaligned; + ph.uint64 = (ph.uint64 + 15) & ~15; + return ph.pointer; +} + +unsigned +tgsi_util_get_src_register_swizzle( + const struct tgsi_src_register *reg, + unsigned component ) +{ + switch( component ) { + case 0: + return reg->SwizzleX; + case 1: + return reg->SwizzleY; + case 2: + return reg->SwizzleZ; + case 3: + return reg->SwizzleW; + default: + assert( 0 ); + } + return 0; +} + +unsigned +tgsi_util_get_src_register_extswizzle( + const struct tgsi_src_register_ext_swz *reg, + unsigned component ) +{ + switch( component ) { + case 0: + return reg->ExtSwizzleX; + case 1: + return reg->ExtSwizzleY; + case 2: + return reg->ExtSwizzleZ; + case 3: + return reg->ExtSwizzleW; + default: + assert( 0 ); + } + return 0; +} + +unsigned +tgsi_util_get_full_src_register_extswizzle( + const struct tgsi_full_src_register *reg, + unsigned component ) +{ + unsigned swizzle; + + /* + * First, calculate the extended swizzle for a given channel. 
This will give + * us either a channel index into the simple swizzle or a constant 1 or 0. + */ + swizzle = tgsi_util_get_src_register_extswizzle( + ®->SrcRegisterExtSwz, + component ); + + assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X); + assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y); + assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z); + assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W); + assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W); + assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W); + + /* + * Second, calculate the simple swizzle for the unswizzled channel index. + * Leave the constants intact, they are not affected by the simple swizzle. + */ + if( swizzle <= TGSI_SWIZZLE_W ) { + swizzle = tgsi_util_get_src_register_swizzle( + ®->SrcRegister, + swizzle ); + } + + return swizzle; +} + +void +tgsi_util_set_src_register_swizzle( + struct tgsi_src_register *reg, + unsigned swizzle, + unsigned component ) +{ + switch( component ) { + case 0: + reg->SwizzleX = swizzle; + break; + case 1: + reg->SwizzleY = swizzle; + break; + case 2: + reg->SwizzleZ = swizzle; + break; + case 3: + reg->SwizzleW = swizzle; + break; + default: + assert( 0 ); + } +} + +void +tgsi_util_set_src_register_extswizzle( + struct tgsi_src_register_ext_swz *reg, + unsigned swizzle, + unsigned component ) +{ + switch( component ) { + case 0: + reg->ExtSwizzleX = swizzle; + break; + case 1: + reg->ExtSwizzleY = swizzle; + break; + case 2: + reg->ExtSwizzleZ = swizzle; + break; + case 3: + reg->ExtSwizzleW = swizzle; + break; + default: + assert( 0 ); + } +} + +unsigned +tgsi_util_get_src_register_extnegate( + const struct tgsi_src_register_ext_swz *reg, + unsigned component ) +{ + switch( component ) { + case 0: + return reg->NegateX; + case 1: + return reg->NegateY; + case 2: + return reg->NegateZ; + case 3: + return reg->NegateW; + default: + assert( 0 ); + } + return 0; +} + +void +tgsi_util_set_src_register_extnegate( + struct tgsi_src_register_ext_swz *reg, + unsigned negate, + unsigned component ) +{ + switch( component ) { + case 0: + reg->NegateX = negate; + break; + case 1: + reg->NegateY = negate; + break; + case 2: + reg->NegateZ = negate; + break; + case 3: + reg->NegateW = negate; + break; + default: + assert( 0 ); + } +} + +unsigned +tgsi_util_get_full_src_register_sign_mode( + const struct tgsi_full_src_register *reg, + unsigned component ) +{ + unsigned sign_mode; + + if( reg->SrcRegisterExtMod.Absolute ) { + /* Consider only the post-abs negation. */ + + if( reg->SrcRegisterExtMod.Negate ) { + sign_mode = TGSI_UTIL_SIGN_SET; + } + else { + sign_mode = TGSI_UTIL_SIGN_CLEAR; + } + } + else { + /* Accumulate the three negations. 
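+ * The three sources of negation are SrcRegister.Negate, the
+ * per-component Negate bit in SrcRegisterExtSwz and
+ * SrcRegisterExtMod.Negate.  Each one flips the sign, so they
+ * combine like an XOR: an odd number of set bits gives
+ * TGSI_UTIL_SIGN_TOGGLE, an even number gives TGSI_UTIL_SIGN_KEEP.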
*/ + + unsigned negate; + + negate = reg->SrcRegister.Negate; + if( tgsi_util_get_src_register_extnegate( ®->SrcRegisterExtSwz, component ) ) { + negate = !negate; + } + if( reg->SrcRegisterExtMod.Negate ) { + negate = !negate; + } + + if( negate ) { + sign_mode = TGSI_UTIL_SIGN_TOGGLE; + } + else { + sign_mode = TGSI_UTIL_SIGN_KEEP; + } + } + + return sign_mode; +} + +void +tgsi_util_set_full_src_register_sign_mode( + struct tgsi_full_src_register *reg, + unsigned sign_mode ) +{ + reg->SrcRegisterExtSwz.NegateX = 0; + reg->SrcRegisterExtSwz.NegateY = 0; + reg->SrcRegisterExtSwz.NegateZ = 0; + reg->SrcRegisterExtSwz.NegateW = 0; + + switch (sign_mode) + { + case TGSI_UTIL_SIGN_CLEAR: + reg->SrcRegister.Negate = 0; + reg->SrcRegisterExtMod.Absolute = 1; + reg->SrcRegisterExtMod.Negate = 0; + break; + + case TGSI_UTIL_SIGN_SET: + reg->SrcRegister.Negate = 0; + reg->SrcRegisterExtMod.Absolute = 1; + reg->SrcRegisterExtMod.Negate = 1; + break; + + case TGSI_UTIL_SIGN_TOGGLE: + reg->SrcRegister.Negate = 1; + reg->SrcRegisterExtMod.Absolute = 0; + reg->SrcRegisterExtMod.Negate = 0; + break; + + case TGSI_UTIL_SIGN_KEEP: + reg->SrcRegister.Negate = 0; + reg->SrcRegisterExtMod.Absolute = 0; + reg->SrcRegisterExtMod.Negate = 0; + break; + + default: + assert( 0 ); + } +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.h b/src/gallium/auxiliary/tgsi/tgsi_util.h new file mode 100644 index 0000000000..7877f34558 --- /dev/null +++ b/src/gallium/auxiliary/tgsi/tgsi_util.h @@ -0,0 +1,96 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef TGSI_UTIL_H +#define TGSI_UTIL_H + +#if defined __cplusplus +extern "C" { +#endif + +void * +tgsi_align_128bit( + void *unaligned ); + +unsigned +tgsi_util_get_src_register_swizzle( + const struct tgsi_src_register *reg, + unsigned component ); + +unsigned +tgsi_util_get_src_register_extswizzle( + const struct tgsi_src_register_ext_swz *reg, + unsigned component); + +unsigned +tgsi_util_get_full_src_register_extswizzle( + const struct tgsi_full_src_register *reg, + unsigned component ); + +void +tgsi_util_set_src_register_swizzle( + struct tgsi_src_register *reg, + unsigned swizzle, + unsigned component ); + +void +tgsi_util_set_src_register_extswizzle( + struct tgsi_src_register_ext_swz *reg, + unsigned swizzle, + unsigned component ); + +unsigned +tgsi_util_get_src_register_extnegate( + const struct tgsi_src_register_ext_swz *reg, + unsigned component ); + +void +tgsi_util_set_src_register_extnegate( + struct tgsi_src_register_ext_swz *reg, + unsigned negate, + unsigned component ); + +#define TGSI_UTIL_SIGN_CLEAR 0 /* Force positive */ +#define TGSI_UTIL_SIGN_SET 1 /* Force negative */ +#define TGSI_UTIL_SIGN_TOGGLE 2 /* Negate */ +#define TGSI_UTIL_SIGN_KEEP 3 /* No change */ + +unsigned +tgsi_util_get_full_src_register_sign_mode( + const struct tgsi_full_src_register *reg, + unsigned component ); + +void +tgsi_util_set_full_src_register_sign_mode( + struct tgsi_full_src_register *reg, + unsigned sign_mode ); + +#if defined __cplusplus +} +#endif + +#endif /* TGSI_UTIL_H */ diff --git a/src/gallium/auxiliary/translate/Makefile b/src/gallium/auxiliary/translate/Makefile new file mode 100644 index 0000000000..ad2a5b705e --- /dev/null +++ b/src/gallium/auxiliary/translate/Makefile @@ -0,0 +1,15 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = translate + +C_SOURCES = \ + translate_generic.c \ + translate_sse.c \ + translate.c \ + translate_cache.c + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/auxiliary/translate/SConscript b/src/gallium/auxiliary/translate/SConscript new file mode 100644 index 0000000000..9553a67537 --- /dev/null +++ b/src/gallium/auxiliary/translate/SConscript @@ -0,0 +1,12 @@ +Import('*') + +translate = env.ConvenienceLibrary( + target = 'translate', + source = [ + 'translate_generic.c', + 'translate_sse.c', + 'translate.c', + 'translate_cache.c', + ]) + +auxiliaries.insert(0, translate) diff --git a/src/gallium/auxiliary/translate/translate.c b/src/gallium/auxiliary/translate/translate.c new file mode 100644 index 0000000000..7678903f75 --- /dev/null +++ b/src/gallium/auxiliary/translate/translate.c @@ -0,0 +1,48 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "pipe/p_config.h" +#include "pipe/p_state.h" +#include "translate.h" + +struct translate *translate_create( const struct translate_key *key ) +{ + struct translate *translate = NULL; + +#if defined(PIPE_ARCH_X86) + translate = translate_sse2_create( key ); + if (translate) + return translate; +#endif + + return translate_generic_create( key ); +} diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h new file mode 100644 index 0000000000..34526eb061 --- /dev/null +++ b/src/gallium/auxiliary/translate/translate.h @@ -0,0 +1,127 @@ +/* + * Copyright 2008 Tungsten Graphics, inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +/** + * Vertex fetch/store/convert code. This functionality is used in two places: + * 1. Vertex fetch/convert - to grab vertex data from incoming vertex + * arrays and convert to format needed by vertex shaders. + * 2. Vertex store/emit - to convert simple float[][4] vertex attributes + * (which is the organization used throughout the draw/prim pipeline) to + * hardware-specific formats and emit into hardware vertex buffers. 
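+ *
+ * A minimal usage sketch (illustrative only; "vertex_data", "nr_verts"
+ * and "hw_verts" stand for caller-provided arrays/counts and are not
+ * part of this interface):
+ *
+ *    struct translate_key key;
+ *    struct translate *t;
+ *
+ *    key.output_stride = 4 * sizeof(float);
+ *    key.nr_elements = 1;
+ *    key.element[0].input_format = PIPE_FORMAT_R32G32B32_FLOAT;
+ *    key.element[0].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+ *    key.element[0].input_buffer = 0;
+ *    key.element[0].input_offset = 0;
+ *    key.element[0].output_offset = 0;
+ *
+ *    t = translate_create(&key);
+ *    t->set_buffer(t, 0, vertex_data, 3 * sizeof(float));
+ *    t->run(t, 0, nr_verts, hw_verts);
+ *    t->release(t);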
+ * + * + * Authors: + * Keith Whitwell <keithw@tungstengraphics.com> + */ + +#ifndef _TRANSLATE_H +#define _TRANSLATE_H + + +#include "pipe/p_compiler.h" +#include "pipe/p_format.h" +#include "pipe/p_state.h" + +struct translate_element +{ + enum pipe_format input_format; + enum pipe_format output_format; + unsigned input_buffer:8; + unsigned input_offset:24; + unsigned output_offset; +}; + + +struct translate_key { + unsigned output_stride; + unsigned nr_elements; + struct translate_element element[PIPE_MAX_ATTRIBS + 1]; +}; + + +struct translate { + struct translate_key key; + + void (*release)( struct translate * ); + + void (*set_buffer)( struct translate *, + unsigned i, + const void *ptr, + unsigned stride ); + + void (PIPE_CDECL *run_elts)( struct translate *, + const unsigned *elts, + unsigned count, + void *output_buffer); + + void (PIPE_CDECL *run)( struct translate *, + unsigned start, + unsigned count, + void *output_buffer); +}; + + + +#if 0 +struct translate_context *translate_context_create( void ); +void translate_context_destroy( struct translate_context * ); + +struct translate *translate_lookup_or_create( struct translate_context *tctx, + const struct translate_key *key ); +#endif + + +struct translate *translate_create( const struct translate_key *key ); + +static INLINE int translate_keysize( const struct translate_key *key ) +{ + return 2 * sizeof(int) + key->nr_elements * sizeof(struct translate_element); +} + +static INLINE int translate_key_compare( const struct translate_key *a, + const struct translate_key *b ) +{ + int keysize = translate_keysize(a); + return memcmp(a, b, keysize); +} + + +static INLINE void translate_key_sanitize( struct translate_key *a ) +{ + int keysize = translate_keysize(a); + char *ptr = (char *)a; + memset(ptr + keysize, 0, sizeof(*a) - keysize); +} + + +/******************************************************************************* + * Private: + */ +struct translate *translate_sse2_create( const struct translate_key *key ); + +struct translate *translate_generic_create( const struct translate_key *key ); + + +#endif diff --git a/src/gallium/auxiliary/translate/translate_cache.c b/src/gallium/auxiliary/translate/translate_cache.c new file mode 100644 index 0000000000..d8069a149c --- /dev/null +++ b/src/gallium/auxiliary/translate/translate_cache.c @@ -0,0 +1,102 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_memory.h" +#include "pipe/p_state.h" +#include "translate.h" +#include "translate_cache.h" + +#include "cso_cache/cso_cache.h" +#include "cso_cache/cso_hash.h" + +struct translate_cache { + struct cso_hash *hash; +}; + +struct translate_cache * translate_cache_create( void ) +{ + struct translate_cache *cache = MALLOC_STRUCT(translate_cache); + cache->hash = cso_hash_create(); + return cache; +} + + +static INLINE void delete_translates(struct translate_cache *cache) +{ + struct cso_hash *hash = cache->hash; + struct cso_hash_iter iter = cso_hash_first_node(hash); + while (!cso_hash_iter_is_null(iter)) { + struct translate *state = (struct translate*)cso_hash_iter_data(iter); + iter = cso_hash_iter_next(iter); + if (state) { + state->release(state); + } + } +} + +void translate_cache_destroy(struct translate_cache *cache) +{ + delete_translates(cache); + cso_hash_delete(cache->hash); + FREE(cache); +} + + +static INLINE unsigned translate_hash_key_size(struct translate_key *key) +{ + unsigned size = sizeof(struct translate_key) - + sizeof(struct translate_element) * (PIPE_MAX_ATTRIBS - key->nr_elements); + return size; +} + +static INLINE unsigned create_key(struct translate_key *key) +{ + unsigned hash_key; + unsigned size = translate_hash_key_size(key); + /*debug_printf("key size = %d, (els = %d)\n", + size, key->nr_elements);*/ + hash_key = cso_construct_key(key, size); + return hash_key; +} + +struct translate * translate_cache_find(struct translate_cache *cache, + struct translate_key *key) +{ + unsigned hash_key = create_key(key); + struct translate *translate = (struct translate*) + cso_hash_find_data_from_template(cache->hash, + hash_key, + key, sizeof(*key)); + + if (!translate) { + /* create/insert */ + translate = translate_create(key); + cso_hash_insert(cache->hash, hash_key, translate); + } + + return translate; +} diff --git a/src/gallium/auxiliary/translate/translate_cache.h b/src/gallium/auxiliary/translate/translate_cache.h new file mode 100644 index 0000000000..7dba871e57 --- /dev/null +++ b/src/gallium/auxiliary/translate/translate_cache.h @@ -0,0 +1,54 @@ +/* + * Copyright 2008 Tungsten Graphics, inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _TRANSLATE_CACHE_H +#define _TRANSLATE_CACHE_H + + +/******************************************************************************* + * Translate cache. + * Simply used to cache created translates. Avoids unecessary creation of + * translate's if one suitable for a given translate_key has already been + * created. + * + * Note: this functionality depends and requires the CSO module. + */ +struct translate_cache; + +struct translate_key; +struct translate; + +struct translate_cache *translate_cache_create( void ); +void translate_cache_destroy(struct translate_cache *cache); + +/** + * Will try to find a translate structure matched by the given key. + * If such a structure doesn't exist in the cache the function + * will automatically create it, insert it in the cache and + * return the created version. + * + */ +struct translate *translate_cache_find(struct translate_cache *cache, + struct translate_key *key); + +#endif diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c new file mode 100644 index 0000000000..8d39b64c6c --- /dev/null +++ b/src/gallium/auxiliary/translate/translate_generic.c @@ -0,0 +1,700 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "util/u_memory.h" +#include "pipe/p_state.h" +#include "translate.h" + + +#define DRAW_DBG 0 + +typedef void (*fetch_func)(const void *ptr, float *attrib); +typedef void (*emit_func)(const float *attrib, void *ptr); + + + +struct translate_generic { + struct translate translate; + + struct { + fetch_func fetch; + unsigned buffer; + unsigned input_offset; + + emit_func emit; + unsigned output_offset; + + char *input_ptr; + unsigned input_stride; + + } attrib[PIPE_MAX_ATTRIBS]; + + unsigned nr_attrib; +}; + + +static struct translate_generic *translate_generic( struct translate *translate ) +{ + return (struct translate_generic *)translate; +} + +/** + * Fetch a float[4] vertex attribute from memory, doing format/type + * conversion as needed. + * + * This is probably needed/dupliocated elsewhere, eg format + * conversion, texture sampling etc. + */ +#define ATTRIB( NAME, SZ, TYPE, FROM, TO ) \ +static void \ +fetch_##NAME(const void *ptr, float *attrib) \ +{ \ + const float defaults[4] = { 0.0f,0.0f,0.0f,1.0f }; \ + unsigned i; \ + \ + for (i = 0; i < SZ; i++) { \ + attrib[i] = FROM(i); \ + } \ + \ + for (; i < 4; i++) { \ + attrib[i] = defaults[i]; \ + } \ +} \ + \ +static void \ +emit_##NAME(const float *attrib, void *ptr) \ +{ \ + unsigned i; \ + TYPE *out = (TYPE *)ptr; \ + \ + for (i = 0; i < SZ; i++) { \ + out[i] = TO(attrib[i]); \ + } \ +} + + +#define FROM_64_FLOAT(i) ((float) ((double *) ptr)[i]) +#define FROM_32_FLOAT(i) (((float *) ptr)[i]) + +#define FROM_8_USCALED(i) ((float) ((unsigned char *) ptr)[i]) +#define FROM_16_USCALED(i) ((float) ((unsigned short *) ptr)[i]) +#define FROM_32_USCALED(i) ((float) ((unsigned int *) ptr)[i]) + +#define FROM_8_SSCALED(i) ((float) ((char *) ptr)[i]) +#define FROM_16_SSCALED(i) ((float) ((short *) ptr)[i]) +#define FROM_32_SSCALED(i) ((float) ((int *) ptr)[i]) + +#define FROM_8_UNORM(i) ((float) ((unsigned char *) ptr)[i] / 255.0f) +#define FROM_16_UNORM(i) ((float) ((unsigned short *) ptr)[i] / 65535.0f) +#define FROM_32_UNORM(i) ((float) ((unsigned int *) ptr)[i] / 4294967295.0f) + +#define FROM_8_SNORM(i) ((float) ((char *) ptr)[i] / 127.0f) +#define FROM_16_SNORM(i) ((float) ((short *) ptr)[i] / 32767.0f) +#define FROM_32_SNORM(i) ((float) ((int *) ptr)[i] / 2147483647.0f) + +#define FROM_32_FIXED(i) (((int *) ptr)[i] / 65536.0f) + +#define TO_64_FLOAT(x) ((double) x) +#define TO_32_FLOAT(x) (x) + +#define TO_8_USCALED(x) ((unsigned char) x) +#define TO_16_USCALED(x) ((unsigned short) x) +#define TO_32_USCALED(x) ((unsigned int) x) + +#define TO_8_SSCALED(x) ((char) x) +#define TO_16_SSCALED(x) ((short) x) +#define TO_32_SSCALED(x) ((int) x) + +#define TO_8_UNORM(x) ((unsigned char) (x * 255.0f)) +#define TO_16_UNORM(x) ((unsigned short) (x * 65535.0f)) +#define TO_32_UNORM(x) ((unsigned int) (x * 4294967295.0f)) + +#define TO_8_SNORM(x) ((char) (x * 127.0f)) +#define TO_16_SNORM(x) ((short) (x * 32767.0f)) +#define TO_32_SNORM(x) ((int) (x * 2147483647.0f)) + +#define TO_32_FIXED(x) ((int) (x * 65536.0f)) + + + +ATTRIB( R64G64B64A64_FLOAT, 4, double, FROM_64_FLOAT, TO_64_FLOAT ) +ATTRIB( R64G64B64_FLOAT, 3, double, FROM_64_FLOAT, TO_64_FLOAT ) +ATTRIB( R64G64_FLOAT, 2, double, FROM_64_FLOAT, TO_64_FLOAT ) +ATTRIB( R64_FLOAT, 1, double, FROM_64_FLOAT, TO_64_FLOAT ) + +ATTRIB( R32G32B32A32_FLOAT, 4, float, FROM_32_FLOAT, TO_32_FLOAT ) +ATTRIB( R32G32B32_FLOAT, 3, float, 
FROM_32_FLOAT, TO_32_FLOAT ) +ATTRIB( R32G32_FLOAT, 2, float, FROM_32_FLOAT, TO_32_FLOAT ) +ATTRIB( R32_FLOAT, 1, float, FROM_32_FLOAT, TO_32_FLOAT ) + +ATTRIB( R32G32B32A32_USCALED, 4, unsigned, FROM_32_USCALED, TO_32_USCALED ) +ATTRIB( R32G32B32_USCALED, 3, unsigned, FROM_32_USCALED, TO_32_USCALED ) +ATTRIB( R32G32_USCALED, 2, unsigned, FROM_32_USCALED, TO_32_USCALED ) +ATTRIB( R32_USCALED, 1, unsigned, FROM_32_USCALED, TO_32_USCALED ) + +ATTRIB( R32G32B32A32_SSCALED, 4, int, FROM_32_SSCALED, TO_32_SSCALED ) +ATTRIB( R32G32B32_SSCALED, 3, int, FROM_32_SSCALED, TO_32_SSCALED ) +ATTRIB( R32G32_SSCALED, 2, int, FROM_32_SSCALED, TO_32_SSCALED ) +ATTRIB( R32_SSCALED, 1, int, FROM_32_SSCALED, TO_32_SSCALED ) + +ATTRIB( R32G32B32A32_UNORM, 4, unsigned, FROM_32_UNORM, TO_32_UNORM ) +ATTRIB( R32G32B32_UNORM, 3, unsigned, FROM_32_UNORM, TO_32_UNORM ) +ATTRIB( R32G32_UNORM, 2, unsigned, FROM_32_UNORM, TO_32_UNORM ) +ATTRIB( R32_UNORM, 1, unsigned, FROM_32_UNORM, TO_32_UNORM ) + +ATTRIB( R32G32B32A32_SNORM, 4, int, FROM_32_SNORM, TO_32_SNORM ) +ATTRIB( R32G32B32_SNORM, 3, int, FROM_32_SNORM, TO_32_SNORM ) +ATTRIB( R32G32_SNORM, 2, int, FROM_32_SNORM, TO_32_SNORM ) +ATTRIB( R32_SNORM, 1, int, FROM_32_SNORM, TO_32_SNORM ) + +ATTRIB( R16G16B16A16_USCALED, 4, ushort, FROM_16_USCALED, TO_16_USCALED ) +ATTRIB( R16G16B16_USCALED, 3, ushort, FROM_16_USCALED, TO_16_USCALED ) +ATTRIB( R16G16_USCALED, 2, ushort, FROM_16_USCALED, TO_16_USCALED ) +ATTRIB( R16_USCALED, 1, ushort, FROM_16_USCALED, TO_16_USCALED ) + +ATTRIB( R16G16B16A16_SSCALED, 4, short, FROM_16_SSCALED, TO_16_SSCALED ) +ATTRIB( R16G16B16_SSCALED, 3, short, FROM_16_SSCALED, TO_16_SSCALED ) +ATTRIB( R16G16_SSCALED, 2, short, FROM_16_SSCALED, TO_16_SSCALED ) +ATTRIB( R16_SSCALED, 1, short, FROM_16_SSCALED, TO_16_SSCALED ) + +ATTRIB( R16G16B16A16_UNORM, 4, ushort, FROM_16_UNORM, TO_16_UNORM ) +ATTRIB( R16G16B16_UNORM, 3, ushort, FROM_16_UNORM, TO_16_UNORM ) +ATTRIB( R16G16_UNORM, 2, ushort, FROM_16_UNORM, TO_16_UNORM ) +ATTRIB( R16_UNORM, 1, ushort, FROM_16_UNORM, TO_16_UNORM ) + +ATTRIB( R16G16B16A16_SNORM, 4, short, FROM_16_SNORM, TO_16_SNORM ) +ATTRIB( R16G16B16_SNORM, 3, short, FROM_16_SNORM, TO_16_SNORM ) +ATTRIB( R16G16_SNORM, 2, short, FROM_16_SNORM, TO_16_SNORM ) +ATTRIB( R16_SNORM, 1, short, FROM_16_SNORM, TO_16_SNORM ) + +ATTRIB( R8G8B8A8_USCALED, 4, ubyte, FROM_8_USCALED, TO_8_USCALED ) +ATTRIB( R8G8B8_USCALED, 3, ubyte, FROM_8_USCALED, TO_8_USCALED ) +ATTRIB( R8G8_USCALED, 2, ubyte, FROM_8_USCALED, TO_8_USCALED ) +ATTRIB( R8_USCALED, 1, ubyte, FROM_8_USCALED, TO_8_USCALED ) + +ATTRIB( R8G8B8A8_SSCALED, 4, char, FROM_8_SSCALED, TO_8_SSCALED ) +ATTRIB( R8G8B8_SSCALED, 3, char, FROM_8_SSCALED, TO_8_SSCALED ) +ATTRIB( R8G8_SSCALED, 2, char, FROM_8_SSCALED, TO_8_SSCALED ) +ATTRIB( R8_SSCALED, 1, char, FROM_8_SSCALED, TO_8_SSCALED ) + +ATTRIB( R8G8B8A8_UNORM, 4, ubyte, FROM_8_UNORM, TO_8_UNORM ) +ATTRIB( R8G8B8_UNORM, 3, ubyte, FROM_8_UNORM, TO_8_UNORM ) +ATTRIB( R8G8_UNORM, 2, ubyte, FROM_8_UNORM, TO_8_UNORM ) +ATTRIB( R8_UNORM, 1, ubyte, FROM_8_UNORM, TO_8_UNORM ) + +ATTRIB( R8G8B8A8_SNORM, 4, char, FROM_8_SNORM, TO_8_SNORM ) +ATTRIB( R8G8B8_SNORM, 3, char, FROM_8_SNORM, TO_8_SNORM ) +ATTRIB( R8G8_SNORM, 2, char, FROM_8_SNORM, TO_8_SNORM ) +ATTRIB( R8_SNORM, 1, char, FROM_8_SNORM, TO_8_SNORM ) + +ATTRIB( A8R8G8B8_UNORM, 4, ubyte, FROM_8_UNORM, TO_8_UNORM ) +//ATTRIB( R8G8B8A8_UNORM, 4, ubyte, FROM_8_UNORM, TO_8_UNORM ) + +ATTRIB( R32G32B32A32_FIXED, 4, int, FROM_32_FIXED, TO_32_FIXED ) +ATTRIB( R32G32B32_FIXED, 3, int, FROM_32_FIXED, 
TO_32_FIXED ) +ATTRIB( R32G32_FIXED, 2, int, FROM_32_FIXED, TO_32_FIXED ) +ATTRIB( R32_FIXED, 1, int, FROM_32_FIXED, TO_32_FIXED ) + + + +static void +fetch_B8G8R8A8_UNORM(const void *ptr, float *attrib) +{ + attrib[2] = FROM_8_UNORM(0); + attrib[1] = FROM_8_UNORM(1); + attrib[0] = FROM_8_UNORM(2); + attrib[3] = FROM_8_UNORM(3); +} + +static void +emit_B8G8R8A8_UNORM( const float *attrib, void *ptr) +{ + ubyte *out = (ubyte *)ptr; + out[2] = TO_8_UNORM(attrib[0]); + out[1] = TO_8_UNORM(attrib[1]); + out[0] = TO_8_UNORM(attrib[2]); + out[3] = TO_8_UNORM(attrib[3]); +} + +static void +fetch_NULL( const void *ptr, float *attrib ) +{ + attrib[0] = 0; + attrib[1] = 0; + attrib[2] = 0; + attrib[3] = 1; +} + +static void +emit_NULL( const float *attrib, void *ptr ) +{ + /* do nothing is the only sensible option */ +} + +static fetch_func get_fetch_func( enum pipe_format format ) +{ + switch (format) { + case PIPE_FORMAT_R64_FLOAT: + return &fetch_R64_FLOAT; + case PIPE_FORMAT_R64G64_FLOAT: + return &fetch_R64G64_FLOAT; + case PIPE_FORMAT_R64G64B64_FLOAT: + return &fetch_R64G64B64_FLOAT; + case PIPE_FORMAT_R64G64B64A64_FLOAT: + return &fetch_R64G64B64A64_FLOAT; + + case PIPE_FORMAT_R32_FLOAT: + return &fetch_R32_FLOAT; + case PIPE_FORMAT_R32G32_FLOAT: + return &fetch_R32G32_FLOAT; + case PIPE_FORMAT_R32G32B32_FLOAT: + return &fetch_R32G32B32_FLOAT; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + return &fetch_R32G32B32A32_FLOAT; + + case PIPE_FORMAT_R32_UNORM: + return &fetch_R32_UNORM; + case PIPE_FORMAT_R32G32_UNORM: + return &fetch_R32G32_UNORM; + case PIPE_FORMAT_R32G32B32_UNORM: + return &fetch_R32G32B32_UNORM; + case PIPE_FORMAT_R32G32B32A32_UNORM: + return &fetch_R32G32B32A32_UNORM; + + case PIPE_FORMAT_R32_USCALED: + return &fetch_R32_USCALED; + case PIPE_FORMAT_R32G32_USCALED: + return &fetch_R32G32_USCALED; + case PIPE_FORMAT_R32G32B32_USCALED: + return &fetch_R32G32B32_USCALED; + case PIPE_FORMAT_R32G32B32A32_USCALED: + return &fetch_R32G32B32A32_USCALED; + + case PIPE_FORMAT_R32_SNORM: + return &fetch_R32_SNORM; + case PIPE_FORMAT_R32G32_SNORM: + return &fetch_R32G32_SNORM; + case PIPE_FORMAT_R32G32B32_SNORM: + return &fetch_R32G32B32_SNORM; + case PIPE_FORMAT_R32G32B32A32_SNORM: + return &fetch_R32G32B32A32_SNORM; + + case PIPE_FORMAT_R32_SSCALED: + return &fetch_R32_SSCALED; + case PIPE_FORMAT_R32G32_SSCALED: + return &fetch_R32G32_SSCALED; + case PIPE_FORMAT_R32G32B32_SSCALED: + return &fetch_R32G32B32_SSCALED; + case PIPE_FORMAT_R32G32B32A32_SSCALED: + return &fetch_R32G32B32A32_SSCALED; + + case PIPE_FORMAT_R16_UNORM: + return &fetch_R16_UNORM; + case PIPE_FORMAT_R16G16_UNORM: + return &fetch_R16G16_UNORM; + case PIPE_FORMAT_R16G16B16_UNORM: + return &fetch_R16G16B16_UNORM; + case PIPE_FORMAT_R16G16B16A16_UNORM: + return &fetch_R16G16B16A16_UNORM; + + case PIPE_FORMAT_R16_USCALED: + return &fetch_R16_USCALED; + case PIPE_FORMAT_R16G16_USCALED: + return &fetch_R16G16_USCALED; + case PIPE_FORMAT_R16G16B16_USCALED: + return &fetch_R16G16B16_USCALED; + case PIPE_FORMAT_R16G16B16A16_USCALED: + return &fetch_R16G16B16A16_USCALED; + + case PIPE_FORMAT_R16_SNORM: + return &fetch_R16_SNORM; + case PIPE_FORMAT_R16G16_SNORM: + return &fetch_R16G16_SNORM; + case PIPE_FORMAT_R16G16B16_SNORM: + return &fetch_R16G16B16_SNORM; + case PIPE_FORMAT_R16G16B16A16_SNORM: + return &fetch_R16G16B16A16_SNORM; + + case PIPE_FORMAT_R16_SSCALED: + return &fetch_R16_SSCALED; + case PIPE_FORMAT_R16G16_SSCALED: + return &fetch_R16G16_SSCALED; + case PIPE_FORMAT_R16G16B16_SSCALED: + return &fetch_R16G16B16_SSCALED; + 
case PIPE_FORMAT_R16G16B16A16_SSCALED: + return &fetch_R16G16B16A16_SSCALED; + + case PIPE_FORMAT_R8_UNORM: + return &fetch_R8_UNORM; + case PIPE_FORMAT_R8G8_UNORM: + return &fetch_R8G8_UNORM; + case PIPE_FORMAT_R8G8B8_UNORM: + return &fetch_R8G8B8_UNORM; + case PIPE_FORMAT_R8G8B8A8_UNORM: + return &fetch_R8G8B8A8_UNORM; + + case PIPE_FORMAT_R8_USCALED: + return &fetch_R8_USCALED; + case PIPE_FORMAT_R8G8_USCALED: + return &fetch_R8G8_USCALED; + case PIPE_FORMAT_R8G8B8_USCALED: + return &fetch_R8G8B8_USCALED; + case PIPE_FORMAT_R8G8B8A8_USCALED: + return &fetch_R8G8B8A8_USCALED; + + case PIPE_FORMAT_R8_SNORM: + return &fetch_R8_SNORM; + case PIPE_FORMAT_R8G8_SNORM: + return &fetch_R8G8_SNORM; + case PIPE_FORMAT_R8G8B8_SNORM: + return &fetch_R8G8B8_SNORM; + case PIPE_FORMAT_R8G8B8A8_SNORM: + return &fetch_R8G8B8A8_SNORM; + + case PIPE_FORMAT_R8_SSCALED: + return &fetch_R8_SSCALED; + case PIPE_FORMAT_R8G8_SSCALED: + return &fetch_R8G8_SSCALED; + case PIPE_FORMAT_R8G8B8_SSCALED: + return &fetch_R8G8B8_SSCALED; + case PIPE_FORMAT_R8G8B8A8_SSCALED: + return &fetch_R8G8B8A8_SSCALED; + + case PIPE_FORMAT_A8R8G8B8_UNORM: + return &fetch_A8R8G8B8_UNORM; + + case PIPE_FORMAT_B8G8R8A8_UNORM: + return &fetch_B8G8R8A8_UNORM; + + case PIPE_FORMAT_R32_FIXED: + return &fetch_R32_FIXED; + case PIPE_FORMAT_R32G32_FIXED: + return &fetch_R32G32_FIXED; + case PIPE_FORMAT_R32G32B32_FIXED: + return &fetch_R32G32B32_FIXED; + case PIPE_FORMAT_R32G32B32A32_FIXED: + return &fetch_R32G32B32A32_FIXED; + + default: + assert(0); + return &fetch_NULL; + } +} + + + + +static emit_func get_emit_func( enum pipe_format format ) +{ + /* silence warnings */ + (void) emit_R32G32B32A32_FIXED; + (void) emit_R32G32B32_FIXED; + (void) emit_R32G32_FIXED; + (void) emit_R32_FIXED; + + switch (format) { + case PIPE_FORMAT_R64_FLOAT: + return &emit_R64_FLOAT; + case PIPE_FORMAT_R64G64_FLOAT: + return &emit_R64G64_FLOAT; + case PIPE_FORMAT_R64G64B64_FLOAT: + return &emit_R64G64B64_FLOAT; + case PIPE_FORMAT_R64G64B64A64_FLOAT: + return &emit_R64G64B64A64_FLOAT; + + case PIPE_FORMAT_R32_FLOAT: + return &emit_R32_FLOAT; + case PIPE_FORMAT_R32G32_FLOAT: + return &emit_R32G32_FLOAT; + case PIPE_FORMAT_R32G32B32_FLOAT: + return &emit_R32G32B32_FLOAT; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + return &emit_R32G32B32A32_FLOAT; + + case PIPE_FORMAT_R32_UNORM: + return &emit_R32_UNORM; + case PIPE_FORMAT_R32G32_UNORM: + return &emit_R32G32_UNORM; + case PIPE_FORMAT_R32G32B32_UNORM: + return &emit_R32G32B32_UNORM; + case PIPE_FORMAT_R32G32B32A32_UNORM: + return &emit_R32G32B32A32_UNORM; + + case PIPE_FORMAT_R32_USCALED: + return &emit_R32_USCALED; + case PIPE_FORMAT_R32G32_USCALED: + return &emit_R32G32_USCALED; + case PIPE_FORMAT_R32G32B32_USCALED: + return &emit_R32G32B32_USCALED; + case PIPE_FORMAT_R32G32B32A32_USCALED: + return &emit_R32G32B32A32_USCALED; + + case PIPE_FORMAT_R32_SNORM: + return &emit_R32_SNORM; + case PIPE_FORMAT_R32G32_SNORM: + return &emit_R32G32_SNORM; + case PIPE_FORMAT_R32G32B32_SNORM: + return &emit_R32G32B32_SNORM; + case PIPE_FORMAT_R32G32B32A32_SNORM: + return &emit_R32G32B32A32_SNORM; + + case PIPE_FORMAT_R32_SSCALED: + return &emit_R32_SSCALED; + case PIPE_FORMAT_R32G32_SSCALED: + return &emit_R32G32_SSCALED; + case PIPE_FORMAT_R32G32B32_SSCALED: + return &emit_R32G32B32_SSCALED; + case PIPE_FORMAT_R32G32B32A32_SSCALED: + return &emit_R32G32B32A32_SSCALED; + + case PIPE_FORMAT_R16_UNORM: + return &emit_R16_UNORM; + case PIPE_FORMAT_R16G16_UNORM: + return &emit_R16G16_UNORM; + case PIPE_FORMAT_R16G16B16_UNORM: + return 
&emit_R16G16B16_UNORM; + case PIPE_FORMAT_R16G16B16A16_UNORM: + return &emit_R16G16B16A16_UNORM; + + case PIPE_FORMAT_R16_USCALED: + return &emit_R16_USCALED; + case PIPE_FORMAT_R16G16_USCALED: + return &emit_R16G16_USCALED; + case PIPE_FORMAT_R16G16B16_USCALED: + return &emit_R16G16B16_USCALED; + case PIPE_FORMAT_R16G16B16A16_USCALED: + return &emit_R16G16B16A16_USCALED; + + case PIPE_FORMAT_R16_SNORM: + return &emit_R16_SNORM; + case PIPE_FORMAT_R16G16_SNORM: + return &emit_R16G16_SNORM; + case PIPE_FORMAT_R16G16B16_SNORM: + return &emit_R16G16B16_SNORM; + case PIPE_FORMAT_R16G16B16A16_SNORM: + return &emit_R16G16B16A16_SNORM; + + case PIPE_FORMAT_R16_SSCALED: + return &emit_R16_SSCALED; + case PIPE_FORMAT_R16G16_SSCALED: + return &emit_R16G16_SSCALED; + case PIPE_FORMAT_R16G16B16_SSCALED: + return &emit_R16G16B16_SSCALED; + case PIPE_FORMAT_R16G16B16A16_SSCALED: + return &emit_R16G16B16A16_SSCALED; + + case PIPE_FORMAT_R8_UNORM: + return &emit_R8_UNORM; + case PIPE_FORMAT_R8G8_UNORM: + return &emit_R8G8_UNORM; + case PIPE_FORMAT_R8G8B8_UNORM: + return &emit_R8G8B8_UNORM; + case PIPE_FORMAT_R8G8B8A8_UNORM: + return &emit_R8G8B8A8_UNORM; + + case PIPE_FORMAT_R8_USCALED: + return &emit_R8_USCALED; + case PIPE_FORMAT_R8G8_USCALED: + return &emit_R8G8_USCALED; + case PIPE_FORMAT_R8G8B8_USCALED: + return &emit_R8G8B8_USCALED; + case PIPE_FORMAT_R8G8B8A8_USCALED: + return &emit_R8G8B8A8_USCALED; + + case PIPE_FORMAT_R8_SNORM: + return &emit_R8_SNORM; + case PIPE_FORMAT_R8G8_SNORM: + return &emit_R8G8_SNORM; + case PIPE_FORMAT_R8G8B8_SNORM: + return &emit_R8G8B8_SNORM; + case PIPE_FORMAT_R8G8B8A8_SNORM: + return &emit_R8G8B8A8_SNORM; + + case PIPE_FORMAT_R8_SSCALED: + return &emit_R8_SSCALED; + case PIPE_FORMAT_R8G8_SSCALED: + return &emit_R8G8_SSCALED; + case PIPE_FORMAT_R8G8B8_SSCALED: + return &emit_R8G8B8_SSCALED; + case PIPE_FORMAT_R8G8B8A8_SSCALED: + return &emit_R8G8B8A8_SSCALED; + + case PIPE_FORMAT_A8R8G8B8_UNORM: + return &emit_A8R8G8B8_UNORM; + + case PIPE_FORMAT_B8G8R8A8_UNORM: + return &emit_B8G8R8A8_UNORM; + + default: + assert(0); + return &emit_NULL; + } +} + + + +/** + * Fetch vertex attributes for 'count' vertices. 
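+ *
+ * For each index in 'elts', every attribute is fetched from its source
+ * buffer into a float[4] by the per-attribute fetch function, then
+ * written at its output offset by the per-attribute emit function;
+ * output vertices are packed key.output_stride bytes apart.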
+ */ +static void PIPE_CDECL generic_run_elts( struct translate *translate, + const unsigned *elts, + unsigned count, + void *output_buffer ) +{ + struct translate_generic *tg = translate_generic(translate); + char *vert = output_buffer; + unsigned nr_attrs = tg->nr_attrib; + unsigned attr; + unsigned i; + + /* loop over vertex attributes (vertex shader inputs) + */ + for (i = 0; i < count; i++) { + unsigned elt = *elts++; + + for (attr = 0; attr < nr_attrs; attr++) { + float data[4]; + + const char *src = (tg->attrib[attr].input_ptr + + tg->attrib[attr].input_stride * elt); + + char *dst = (vert + + tg->attrib[attr].output_offset); + + tg->attrib[attr].fetch( src, data ); + + if (0) debug_printf("vert %d/%d attr %d: %f %f %f %f\n", + i, elt, attr, data[0], data[1], data[2], data[3]); + + tg->attrib[attr].emit( data, dst ); + } + + vert += tg->translate.key.output_stride; + } +} + + + +static void PIPE_CDECL generic_run( struct translate *translate, + unsigned start, + unsigned count, + void *output_buffer ) +{ + struct translate_generic *tg = translate_generic(translate); + char *vert = output_buffer; + unsigned nr_attrs = tg->nr_attrib; + unsigned attr; + unsigned i; + + /* loop over vertex attributes (vertex shader inputs) + */ + for (i = 0; i < count; i++) { + unsigned elt = start + i; + + for (attr = 0; attr < nr_attrs; attr++) { + float data[4]; + + const char *src = (tg->attrib[attr].input_ptr + + tg->attrib[attr].input_stride * elt); + + char *dst = (vert + + tg->attrib[attr].output_offset); + + tg->attrib[attr].fetch( src, data ); + + if (0) debug_printf("vert %d attr %d: %f %f %f %f\n", + i, attr, data[0], data[1], data[2], data[3]); + + tg->attrib[attr].emit( data, dst ); + } + + vert += tg->translate.key.output_stride; + } +} + + + +static void generic_set_buffer( struct translate *translate, + unsigned buf, + const void *ptr, + unsigned stride ) +{ + struct translate_generic *tg = translate_generic(translate); + unsigned i; + + for (i = 0; i < tg->nr_attrib; i++) { + if (tg->attrib[i].buffer == buf) { + tg->attrib[i].input_ptr = ((char *)ptr + + tg->attrib[i].input_offset); + tg->attrib[i].input_stride = stride; + } + } +} + + +static void generic_release( struct translate *translate ) +{ + /* Refcount? + */ + FREE(translate); +} + +struct translate *translate_generic_create( const struct translate_key *key ) +{ + struct translate_generic *tg = CALLOC_STRUCT(translate_generic); + unsigned i; + + if (tg == NULL) + return NULL; + + tg->translate.key = *key; + tg->translate.release = generic_release; + tg->translate.set_buffer = generic_set_buffer; + tg->translate.run_elts = generic_run_elts; + tg->translate.run = generic_run; + + for (i = 0; i < key->nr_elements; i++) { + + tg->attrib[i].fetch = get_fetch_func(key->element[i].input_format); + tg->attrib[i].buffer = key->element[i].input_buffer; + tg->attrib[i].input_offset = key->element[i].input_offset; + + tg->attrib[i].emit = get_emit_func(key->element[i].output_format); + tg->attrib[i].output_offset = key->element[i].output_offset; + + } + + tg->nr_attrib = key->nr_elements; + + + return &tg->translate; +} diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c new file mode 100644 index 0000000000..b62db8d8f3 --- /dev/null +++ b/src/gallium/auxiliary/translate/translate_sse.c @@ -0,0 +1,706 @@ +/* + * Copyright 2003 Tungsten Graphics, inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Authors: + * Keith Whitwell <keithw@tungstengraphics.com> + */ + + +#include "pipe/p_config.h" +#include "pipe/p_compiler.h" +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "translate.h" + + +#if defined(PIPE_ARCH_X86) + +#include "rtasm/rtasm_cpu.h" +#include "rtasm/rtasm_x86sse.h" + + +#define X 0 +#define Y 1 +#define Z 2 +#define W 3 + + +typedef void (PIPE_CDECL *run_func)( struct translate *translate, + unsigned start, + unsigned count, + void *output_buffer ); + +typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, + const unsigned *elts, + unsigned count, + void *output_buffer ); + +struct translate_buffer { + const void *base_ptr; + unsigned stride; + void *ptr; /* updated per vertex */ +}; + + +struct translate_sse { + struct translate translate; + + struct x86_function linear_func; + struct x86_function elt_func; + struct x86_function *func; + + boolean loaded_identity; + boolean loaded_255; + boolean loaded_inv_255; + + float identity[4]; + float float_255[4]; + float inv_255[4]; + + struct translate_buffer buffer[PIPE_MAX_ATTRIBS]; + unsigned nr_buffers; + + run_func gen_run; + run_elts_func gen_run_elts; + + /* these are actually known values, but putting them in a struct + * like this is helpful to keep them in sync across the file. 
+ */ + struct x86_reg tmp_EAX; + struct x86_reg idx_EBX; /* either start+i or &elt[i] */ + struct x86_reg outbuf_ECX; + struct x86_reg machine_EDX; + struct x86_reg count_ESI; /* decrements to zero */ +}; + +static int get_offset( const void *a, const void *b ) +{ + return (const char *)b - (const char *)a; +} + + + +static struct x86_reg get_identity( struct translate_sse *p ) +{ + struct x86_reg reg = x86_make_reg(file_XMM, 6); + + if (!p->loaded_identity) { + p->loaded_identity = TRUE; + p->identity[0] = 0; + p->identity[1] = 0; + p->identity[2] = 0; + p->identity[3] = 1; + + sse_movups(p->func, reg, + x86_make_disp(p->machine_EDX, + get_offset(p, &p->identity[0]))); + } + + return reg; +} + +static struct x86_reg get_255( struct translate_sse *p ) +{ + struct x86_reg reg = x86_make_reg(file_XMM, 7); + + if (!p->loaded_255) { + p->loaded_255 = TRUE; + p->float_255[0] = + p->float_255[1] = + p->float_255[2] = + p->float_255[3] = 255.0f; + + sse_movups(p->func, reg, + x86_make_disp(p->machine_EDX, + get_offset(p, &p->float_255[0]))); + } + + return reg; +} + +static struct x86_reg get_inv_255( struct translate_sse *p ) +{ + struct x86_reg reg = x86_make_reg(file_XMM, 5); + + if (!p->loaded_inv_255) { + p->loaded_inv_255 = TRUE; + p->inv_255[0] = + p->inv_255[1] = + p->inv_255[2] = + p->inv_255[3] = 1.0f / 255.0f; + + sse_movups(p->func, reg, + x86_make_disp(p->machine_EDX, + get_offset(p, &p->inv_255[0]))); + } + + return reg; +} + + +static void emit_load_R32G32B32A32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0 ) +{ + sse_movups(p->func, data, arg0); +} + +static void emit_load_R32G32B32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0 ) +{ + /* Have to jump through some hoops: + * + * c 0 0 0 + * c 0 0 1 + * 0 0 c 1 + * a b c 1 + */ + sse_movss(p->func, data, x86_make_disp(arg0, 8)); + sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) ); + sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); + sse_movlps(p->func, data, arg0); +} + +static void emit_load_R32G32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0 ) +{ + /* 0 0 0 1 + * a b 0 1 + */ + sse_movups(p->func, data, get_identity(p) ); + sse_movlps(p->func, data, arg0); +} + + +static void emit_load_R32( struct translate_sse *p, + struct x86_reg data, + struct x86_reg arg0 ) +{ + /* a 0 0 0 + * a 0 0 1 + */ + sse_movss(p->func, data, arg0); + sse_orps(p->func, data, get_identity(p) ); +} + + +static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p, + struct x86_reg data, + struct x86_reg src ) +{ + + /* Load and unpack twice: + */ + sse_movss(p->func, data, src); + sse2_punpcklbw(p->func, data, get_identity(p)); + sse2_punpcklbw(p->func, data, get_identity(p)); + + /* Convert to float: + */ + sse2_cvtdq2ps(p->func, data, data); + + + /* Scale by 1/255.0 + */ + sse_mulps(p->func, data, get_inv_255(p)); +} + + + + +static void emit_store_R32G32B32A32( struct translate_sse *p, + struct x86_reg dest, + struct x86_reg dataXMM ) +{ + sse_movups(p->func, dest, dataXMM); +} + +static void emit_store_R32G32B32( struct translate_sse *p, + struct x86_reg dest, + struct x86_reg dataXMM ) +{ + /* Emit two, shuffle, emit one. + */ + sse_movlps(p->func, dest, dataXMM); + sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! 
destructive */ + sse_movss(p->func, x86_make_disp(dest,8), dataXMM); +} + +static void emit_store_R32G32( struct translate_sse *p, + struct x86_reg dest, + struct x86_reg dataXMM ) +{ + sse_movlps(p->func, dest, dataXMM); +} + +static void emit_store_R32( struct translate_sse *p, + struct x86_reg dest, + struct x86_reg dataXMM ) +{ + sse_movss(p->func, dest, dataXMM); +} + + + +static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p, + struct x86_reg dest, + struct x86_reg dataXMM ) +{ + /* Scale by 255.0 + */ + sse_mulps(p->func, dataXMM, get_255(p)); + + /* Pack and emit: + */ + sse2_cvtps2dq(p->func, dataXMM, dataXMM); + sse2_packssdw(p->func, dataXMM, dataXMM); + sse2_packuswb(p->func, dataXMM, dataXMM); + sse_movss(p->func, dest, dataXMM); +} + + + + + +/* Extended swizzles? Maybe later. + */ +static void emit_swizzle( struct translate_sse *p, + struct x86_reg dest, + struct x86_reg src, + unsigned char shuffle ) +{ + sse_shufps(p->func, dest, src, shuffle); +} + + +static boolean translate_attr( struct translate_sse *p, + const struct translate_element *a, + struct x86_reg srcECX, + struct x86_reg dstEAX) +{ + struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); + + switch (a->input_format) { + case PIPE_FORMAT_R32_FLOAT: + emit_load_R32(p, dataXMM, srcECX); + break; + case PIPE_FORMAT_R32G32_FLOAT: + emit_load_R32G32(p, dataXMM, srcECX); + break; + case PIPE_FORMAT_R32G32B32_FLOAT: + emit_load_R32G32B32(p, dataXMM, srcECX); + break; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + emit_load_R32G32B32A32(p, dataXMM, srcECX); + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); + emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); + break; + case PIPE_FORMAT_R8G8B8A8_UNORM: + emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); + break; + default: + return FALSE; + } + + switch (a->output_format) { + case PIPE_FORMAT_R32_FLOAT: + emit_store_R32(p, dstEAX, dataXMM); + break; + case PIPE_FORMAT_R32G32_FLOAT: + emit_store_R32G32(p, dstEAX, dataXMM); + break; + case PIPE_FORMAT_R32G32B32_FLOAT: + emit_store_R32G32B32(p, dstEAX, dataXMM); + break; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + emit_store_R32G32B32A32(p, dstEAX, dataXMM); + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); + emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); + break; + case PIPE_FORMAT_R8G8B8A8_UNORM: + emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); + break; + default: + return FALSE; + } + + return TRUE; +} + + +static boolean init_inputs( struct translate_sse *p, + boolean linear ) +{ + unsigned i; + if (linear) { + for (i = 0; i < p->nr_buffers; i++) { + struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, + get_offset(p, &p->buffer[i].stride)); + struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, + get_offset(p, &p->buffer[i].ptr)); + struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX, + get_offset(p, &p->buffer[i].base_ptr)); + struct x86_reg elt = p->idx_EBX; + struct x86_reg tmp = p->tmp_EAX; + + + /* Calculate pointer to first attrib: + */ + x86_mov(p->func, tmp, buf_stride); + x86_imul(p->func, tmp, elt); + x86_add(p->func, tmp, buf_base_ptr); + + + /* In the linear case, keep the buffer pointer instead of the + * index number. 
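+ * (When there is a single buffer, idx_EBX itself becomes the attribute
+ * pointer and incr_inputs() simply advances it by the stride each
+ * iteration; with several buffers the per-buffer 'ptr' fields in the
+ * machine struct are updated instead.)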
+ */ + if (p->nr_buffers == 1) + x86_mov( p->func, elt, tmp ); + else + x86_mov( p->func, buf_ptr, tmp ); + } + } + + return TRUE; +} + + +static struct x86_reg get_buffer_ptr( struct translate_sse *p, + boolean linear, + unsigned buf_idx, + struct x86_reg elt ) +{ + if (linear && p->nr_buffers == 1) { + return p->idx_EBX; + } + else if (linear) { + struct x86_reg ptr = p->tmp_EAX; + struct x86_reg buf_ptr = + x86_make_disp(p->machine_EDX, + get_offset(p, &p->buffer[buf_idx].ptr)); + + x86_mov(p->func, ptr, buf_ptr); + return ptr; + } + else { + struct x86_reg ptr = p->tmp_EAX; + + struct x86_reg buf_stride = + x86_make_disp(p->machine_EDX, + get_offset(p, &p->buffer[buf_idx].stride)); + + struct x86_reg buf_base_ptr = + x86_make_disp(p->machine_EDX, + get_offset(p, &p->buffer[buf_idx].base_ptr)); + + + + /* Calculate pointer to current attrib: + */ + x86_mov(p->func, ptr, buf_stride); + x86_imul(p->func, ptr, elt); + x86_add(p->func, ptr, buf_base_ptr); + return ptr; + } +} + + + +static boolean incr_inputs( struct translate_sse *p, + boolean linear ) +{ + if (linear && p->nr_buffers == 1) { + struct x86_reg stride = x86_make_disp(p->machine_EDX, + get_offset(p, &p->buffer[0].stride)); + + x86_add(p->func, p->idx_EBX, stride); + sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192)); + } + else if (linear) { + unsigned i; + + /* Is this worthwhile?? + */ + for (i = 0; i < p->nr_buffers; i++) { + struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, + get_offset(p, &p->buffer[i].ptr)); + struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, + get_offset(p, &p->buffer[i].stride)); + + x86_mov(p->func, p->tmp_EAX, buf_ptr); + x86_add(p->func, p->tmp_EAX, buf_stride); + if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192)); + x86_mov(p->func, buf_ptr, p->tmp_EAX); + } + } + else { + x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4)); + } + + return TRUE; +} + + +/* Build run( struct translate *machine, + * unsigned start, + * unsigned count, + * void *output_buffer ) + * or + * run_elts( struct translate *machine, + * unsigned *elts, + * unsigned count, + * void *output_buffer ) + * + * Lots of hardcoding + * + * EAX -- pointer to current output vertex + * ECX -- pointer to current attribute + * + */ +static boolean build_vertex_emit( struct translate_sse *p, + struct x86_function *func, + boolean linear ) +{ + int fixup, label; + unsigned j; + + p->tmp_EAX = x86_make_reg(file_REG32, reg_AX); + p->idx_EBX = x86_make_reg(file_REG32, reg_BX); + p->outbuf_ECX = x86_make_reg(file_REG32, reg_CX); + p->machine_EDX = x86_make_reg(file_REG32, reg_DX); + p->count_ESI = x86_make_reg(file_REG32, reg_SI); + + p->func = func; + p->loaded_inv_255 = FALSE; + p->loaded_255 = FALSE; + p->loaded_identity = FALSE; + + x86_init_func(p->func); + + /* Push a few regs? + */ + x86_push(p->func, p->idx_EBX); + x86_push(p->func, p->count_ESI); + + /* Load arguments into regs: + */ + x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1)); + x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2)); + x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3)); + x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 4)); + + /* Get vertex count, compare to zero + */ + x86_xor(p->func, p->tmp_EAX, p->tmp_EAX); + x86_cmp(p->func, p->count_ESI, p->tmp_EAX); + fixup = x86_jcc_forward(p->func, cc_E); + + /* always load, needed or not: + */ + init_inputs(p, linear); + + /* Note address for loop jump + */ + label = x86_get_label(p->func); + { + struct x86_reg elt = linear ? 
p->idx_EBX : x86_deref(p->idx_EBX); + int last_vb = -1; + struct x86_reg vb; + + for (j = 0; j < p->translate.key.nr_elements; j++) { + const struct translate_element *a = &p->translate.key.element[j]; + + /* Figure out source pointer address: + */ + if (a->input_buffer != last_vb) { + last_vb = a->input_buffer; + vb = get_buffer_ptr(p, linear, a->input_buffer, elt); + } + + if (!translate_attr( p, a, + x86_make_disp(vb, a->input_offset), + x86_make_disp(p->outbuf_ECX, a->output_offset))) + return FALSE; + } + + /* Next output vertex: + */ + x86_lea(p->func, + p->outbuf_ECX, + x86_make_disp(p->outbuf_ECX, + p->translate.key.output_stride)); + + /* Incr index + */ + incr_inputs( p, linear ); + } + + /* decr count, loop if not zero + */ + x86_dec(p->func, p->count_ESI); + x86_jcc(p->func, cc_NZ, label); + + /* Exit mmx state? + */ + if (p->func->need_emms) + mmx_emms(p->func); + + /* Land forward jump here: + */ + x86_fixup_fwd_jump(p->func, fixup); + + /* Pop regs and return + */ + + x86_pop(p->func, p->count_ESI); + x86_pop(p->func, p->idx_EBX); + x86_ret(p->func); + + return TRUE; +} + + + + + + + +static void translate_sse_set_buffer( struct translate *translate, + unsigned buf, + const void *ptr, + unsigned stride ) +{ + struct translate_sse *p = (struct translate_sse *)translate; + + if (buf < p->nr_buffers) { + p->buffer[buf].base_ptr = (char *)ptr; + p->buffer[buf].stride = stride; + } + + if (0) debug_printf("%s %d/%d: %p %d\n", + __FUNCTION__, buf, + p->nr_buffers, + ptr, stride); +} + + +static void translate_sse_release( struct translate *translate ) +{ + struct translate_sse *p = (struct translate_sse *)translate; + + x86_release_func( &p->linear_func ); + x86_release_func( &p->elt_func ); + + FREE(p); +} + +static void PIPE_CDECL translate_sse_run_elts( struct translate *translate, + const unsigned *elts, + unsigned count, + void *output_buffer ) +{ + struct translate_sse *p = (struct translate_sse *)translate; + + p->gen_run_elts( translate, + elts, + count, + output_buffer ); +} + +static void PIPE_CDECL translate_sse_run( struct translate *translate, + unsigned start, + unsigned count, + void *output_buffer ) +{ + struct translate_sse *p = (struct translate_sse *)translate; + + p->gen_run( translate, + start, + count, + output_buffer ); +} + + +struct translate *translate_sse2_create( const struct translate_key *key ) +{ + struct translate_sse *p = NULL; + unsigned i; + + if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2()) + goto fail; + + p = CALLOC_STRUCT( translate_sse ); + if (p == NULL) + goto fail; + + p->translate.key = *key; + p->translate.release = translate_sse_release; + p->translate.set_buffer = translate_sse_set_buffer; + p->translate.run_elts = translate_sse_run_elts; + p->translate.run = translate_sse_run; + + for (i = 0; i < key->nr_elements; i++) + p->nr_buffers = MAX2( p->nr_buffers, key->element[i].input_buffer + 1 ); + + if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers); + + if (!build_vertex_emit(p, &p->linear_func, TRUE)) + goto fail; + + if (!build_vertex_emit(p, &p->elt_func, FALSE)) + goto fail; + + p->gen_run = (run_func)x86_get_func(&p->linear_func); + if (p->gen_run == NULL) + goto fail; + + p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func); + if (p->gen_run_elts == NULL) + goto fail; + + return &p->translate; + + fail: + if (p) + translate_sse_release( &p->translate ); + + return NULL; +} + + + +#else + +struct translate *translate_sse2_create( const struct translate_key *key ) +{ + return NULL; +} + +#endif diff --git 
a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile new file mode 100644 index 0000000000..f611f82773 --- /dev/null +++ b/src/gallium/auxiliary/util/Makefile @@ -0,0 +1,29 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = util + +C_SOURCES = \ + p_debug.c \ + u_blit.c \ + u_draw_quad.c \ + u_gen_mipmap.c \ + u_handle_table.c \ + u_hash_table.c \ + u_keymap.c \ + u_linear.c \ + u_math.c \ + u_mm.c \ + u_rect.c \ + u_simple_shaders.c \ + u_snprintf.c \ + u_stream_stdc.c \ + u_stream_wd.c \ + u_tile.c \ + u_time.c \ + u_timed_winsys.c + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript new file mode 100644 index 0000000000..8a04955a16 --- /dev/null +++ b/src/gallium/auxiliary/util/SConscript @@ -0,0 +1,26 @@ +Import('*') + +util = env.ConvenienceLibrary( + target = 'util', + source = [ + 'p_debug.c', + 'p_debug_mem.c', + 'p_debug_prof.c', + 'u_blit.c', + 'u_draw_quad.c', + 'u_gen_mipmap.c', + 'u_handle_table.c', + 'u_hash_table.c', + 'u_keymap.c', + 'u_math.c', + 'u_mm.c', + 'u_rect.c', + 'u_simple_shaders.c', + 'u_snprintf.c', + 'u_stream_stdc.c', + 'u_stream_wd.c', + 'u_tile.c', + 'u_time.c', + ]) + +auxiliaries.insert(0, util) diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c new file mode 100644 index 0000000000..acdfa211c8 --- /dev/null +++ b/src/gallium/auxiliary/util/p_debug.c @@ -0,0 +1,756 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (c) 2008 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#include "pipe/p_config.h" + +#include <stdarg.h> + + +#ifdef PIPE_SUBSYSTEM_WINDOWS_DISPLAY + +#include <windows.h> +#include <winddi.h> + +#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE) + +#include <stdio.h> +#include <stdlib.h> +#include <windows.h> +#include <types.h> + +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) + +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers +#endif +#include <windows.h> + +#else + +#include <stdio.h> +#include <stdlib.h> + +#endif + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "pipe/p_format.h" +#include "pipe/p_state.h" +#include "pipe/p_inlines.h" +#include "util/u_memory.h" +#include "util/u_string.h" +#include "util/u_stream.h" +#include "util/u_math.h" +#include "util/u_tile.h" + + +#ifdef PIPE_SUBSYSTEM_WINDOWS_DISPLAY +static INLINE void +_EngDebugPrint(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + EngDebugPrint("", (PCHAR)format, ap); + va_end(ap); +} +#endif + + +void _debug_vprintf(const char *format, va_list ap) +{ +#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) + /* EngDebugPrint does not handle float point arguments, so we need to use + * our own vsnprintf implementation. It is also very slow, so buffer until + * we find a newline. */ + static char buf[512] = {'\0'}; + size_t len = strlen(buf); + int ret = util_vsnprintf(buf + len, sizeof(buf) - len, format, ap); + if(ret > (int)(sizeof(buf) - len - 1) || util_strchr(buf + len, '\n')) { + _EngDebugPrint("%s", buf); + buf[0] = '\0'; + } +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) + /* EngDebugPrint does not handle float point arguments, so we need to use + * our own vsnprintf implementation. It is also very slow, so buffer until + * we find a newline. */ + static char buf[512 + 1] = {'\0'}; + size_t len = strlen(buf); + int ret = util_vsnprintf(buf + len, sizeof(buf) - len, format, ap); + if(ret > (int)(sizeof(buf) - len - 1) || util_strchr(buf + len, '\n')) { + OutputDebugStringA(buf); + buf[0] = '\0'; + } +#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE) + wchar_t *wide_format; + long wide_str_len; + char buf[512]; + int ret; +#if (_WIN32_WCE < 600) + ret = vsprintf(buf, format, ap); + if(ret < 0){ + sprintf(buf, "Cant handle debug print!"); + ret = 25; + } +#else + ret = vsprintf_s(buf, 512, format, ap); + if(ret < 0){ + sprintf_s(buf, 512, "Cant handle debug print!"); + ret = 25; + } +#endif + buf[ret] = '\0'; + /* Format is ascii - needs to be converted to wchar_t for printing */ + wide_str_len = MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1, NULL, 0); + wide_format = (wchar_t *) malloc((wide_str_len+1) * sizeof(wchar_t)); + if (wide_format) { + MultiByteToWideChar(CP_ACP, 0, (const char *) buf, -1, + wide_format, wide_str_len); + NKDbgPrintfW(wide_format, wide_format); + free(wide_format); + } +#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) + /* TODO */ +#else /* !PIPE_SUBSYSTEM_WINDOWS */ +#ifdef DEBUG + vfprintf(stderr, format, ap); +#endif +#endif +} + + +#ifdef DEBUG +void debug_print_blob( const char *name, + const void *blob, + unsigned size ) +{ + const unsigned *ublob = (const unsigned *)blob; + unsigned i; + + debug_printf("%s (%d dwords%s)\n", name, size/4, + size%4 ? "... 
plus a few bytes" : ""); + + for (i = 0; i < size/4; i++) { + debug_printf("%d:\t%08x\n", i, ublob[i]); + } +} +#endif + + +void _debug_break(void) +{ +#if defined(PIPE_ARCH_X86) && defined(PIPE_CC_GCC) + __asm("int3"); +#elif defined(PIPE_ARCH_X86) && defined(PIPE_CC_MSVC) + _asm {int 3}; +#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) + EngDebugBreak(); +#else + abort(); +#endif +} + + +#ifdef PIPE_SUBSYSTEM_WINDOWS_DISPLAY +static const char * +find(const char *start, const char *end, char c) +{ + const char *p; + for(p = start; !end || p != end; ++p) { + if(*p == c) + return p; + if(*p < 32) + break; + } + return NULL; +} + +static int +compare(const char *start, const char *end, const char *s) +{ + const char *p, *q; + for(p = start, q = s; p != end && *q != '\0'; ++p, ++q) { + if(*p != *q) + return 0; + } + return p == end && *q == '\0'; +} + +static void +copy(char *dst, const char *start, const char *end, size_t n) +{ + const char *p; + char *q; + for(p = start, q = dst, n = n - 1; p != end && n; ++p, ++q, --n) + *q = *p; + *q = '\0'; +} +#endif + + +static INLINE const char * +_debug_get_option(const char *name) +{ +#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) + /* EngMapFile creates the file if it does not exists, so it must either be + * disabled on release versions (or put in a less conspicuous place). */ +#ifdef DEBUG + const char *result = NULL; + ULONG_PTR iFile = 0; + const void *pMap = NULL; + const char *sol, *eol, *sep; + static char output[1024]; + + pMap = EngMapFile(L"\\??\\c:\\gallium.cfg", 0, &iFile); + if(pMap) { + sol = (const char *)pMap; + while(1) { + /* TODO: handle LF line endings */ + eol = find(sol, NULL, '\r'); + if(!eol || eol == sol) + break; + sep = find(sol, eol, '='); + if(!sep) + break; + if(compare(sol, sep, name)) { + copy(output, sep + 1, eol, sizeof(output)); + result = output; + break; + } + sol = eol + 2; + } + EngUnmapFile(iFile); + } + return result; +#else + return NULL; +#endif +#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE) || defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) + /* TODO: implement */ + return NULL; +#else + return getenv(name); +#endif +} + +const char * +debug_get_option(const char *name, const char *dfault) +{ + const char *result; + + result = _debug_get_option(name); + if(!result) + result = dfault; + + debug_printf("%s: %s = %s\n", __FUNCTION__, name, result ? result : "(null)"); + + return result; +} + +boolean +debug_get_bool_option(const char *name, boolean dfault) +{ + const char *str = _debug_get_option(name); + boolean result; + + if(str == NULL) + result = dfault; + else if(!util_strcmp(str, "n")) + result = FALSE; + else if(!util_strcmp(str, "no")) + result = FALSE; + else if(!util_strcmp(str, "0")) + result = FALSE; + else if(!util_strcmp(str, "f")) + result = FALSE; + else if(!util_strcmp(str, "false")) + result = FALSE; + else + result = TRUE; + + debug_printf("%s: %s = %s\n", __FUNCTION__, name, result ? 
"TRUE" : "FALSE"); + + return result; +} + + +long +debug_get_num_option(const char *name, long dfault) +{ + long result; + const char *str; + + str = _debug_get_option(name); + if(!str) + result = dfault; + else { + long sign; + char c; + c = *str++; + if(c == '-') { + sign = -1; + c = *str++; + } + else { + sign = 1; + } + result = 0; + while('0' <= c && c <= '9') { + result = result*10 + (c - '0'); + c = *str++; + } + result *= sign; + } + + debug_printf("%s: %s = %li\n", __FUNCTION__, name, result); + + return result; +} + + +unsigned long +debug_get_flags_option(const char *name, + const struct debug_named_value *flags, + unsigned long dfault) +{ + unsigned long result; + const char *str; + + str = _debug_get_option(name); + if(!str) + result = dfault; + else if (!util_strcmp(str, "help")) { + result = dfault; + while (flags->name) { + debug_printf("%s: help for %s: %s [0x%lx]\n", __FUNCTION__, name, flags->name, flags->value); + flags++; + } + } + else { + result = 0; + while( flags->name ) { + if (!util_strcmp(str, "all") || util_strstr(str, flags->name )) + result |= flags->value; + ++flags; + } + } + + if (str) { + debug_printf("%s: %s = 0x%lx (%s)\n", __FUNCTION__, name, result, str); + } + else { + debug_printf("%s: %s = 0x%lx\n", __FUNCTION__, name, result); + } + + return result; +} + + +void _debug_assert_fail(const char *expr, + const char *file, + unsigned line, + const char *function) +{ + _debug_printf("%s:%u:%s: Assertion `%s' failed.\n", file, line, function, expr); +#if defined(PIPE_OS_WINDOWS) && !defined(PIPE_SUBSYSTEM_WINDOWS_USER) + if (debug_get_bool_option("GALLIUM_ABORT_ON_ASSERT", FALSE)) +#else + if (debug_get_bool_option("GALLIUM_ABORT_ON_ASSERT", TRUE)) +#endif + debug_break(); + else + _debug_printf("continuing...\n"); +} + + +const char * +debug_dump_enum(const struct debug_named_value *names, + unsigned long value) +{ + static char rest[64]; + + while(names->name) { + if(names->value == value) + return names->name; + ++names; + } + + util_snprintf(rest, sizeof(rest), "0x%08lx", value); + return rest; +} + + +const char * +debug_dump_flags(const struct debug_named_value *names, + unsigned long value) +{ + static char output[4096]; + static char rest[256]; + int first = 1; + + output[0] = '\0'; + + while(names->name) { + if((names->value & value) == names->value) { + if (!first) + util_strncat(output, "|", sizeof(output)); + else + first = 0; + util_strncat(output, names->name, sizeof(output)); + value &= ~names->value; + } + ++names; + } + + if (value) { + if (!first) + util_strncat(output, "|", sizeof(output)); + else + first = 0; + + util_snprintf(rest, sizeof(rest), "0x%08lx", value); + util_strncat(output, rest, sizeof(output)); + } + + if(first) + return "0"; + + return output; +} + + +static const struct debug_named_value pipe_format_names[] = { +#ifdef DEBUG + DEBUG_NAMED_VALUE(PIPE_FORMAT_NONE), + DEBUG_NAMED_VALUE(PIPE_FORMAT_A8R8G8B8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_X8R8G8B8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_B8G8R8A8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_B8G8R8X8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_A1R5G5B5_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_A4R4G4B4_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R5G6B5_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_A2B10G10R10_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_L8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_A8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_I8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_A8L8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_L16_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_YCBCR), + 
DEBUG_NAMED_VALUE(PIPE_FORMAT_YCBCR_REV), + DEBUG_NAMED_VALUE(PIPE_FORMAT_Z16_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_Z32_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_Z32_FLOAT), + DEBUG_NAMED_VALUE(PIPE_FORMAT_S8Z24_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_Z24S8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_X8Z24_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_Z24X8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_S8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R64_FLOAT), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R64G64_FLOAT), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R64G64B64_FLOAT), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R64G64B64A64_FLOAT), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32_FLOAT), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32_FLOAT), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32_FLOAT), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32A32_FLOAT), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32A32_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32_USCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32_USCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32_USCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32A32_USCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32A32_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32_SSCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32_SSCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32_SSCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R32G32B32A32_SSCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16A16_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16_USCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16_USCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16_USCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16A16_USCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16A16_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16_SSCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16_SSCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16_SSCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R16G16B16A16_SSCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8A8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8X8_UNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8_USCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8_USCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8_USCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8A8_USCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8X8_USCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8A8_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8X8_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_B6G5R5_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_A8B8G8R8_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_X8B8G8R8_SNORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8_SSCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8_SSCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8_SSCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8A8_SSCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8X8_SSCALED), + DEBUG_NAMED_VALUE(PIPE_FORMAT_L8_SRGB), + 
DEBUG_NAMED_VALUE(PIPE_FORMAT_A8L8_SRGB), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8_SRGB), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8A8_SRGB), + DEBUG_NAMED_VALUE(PIPE_FORMAT_R8G8B8X8_SRGB), + DEBUG_NAMED_VALUE(PIPE_FORMAT_A8R8G8B8_SRGB), + DEBUG_NAMED_VALUE(PIPE_FORMAT_X8R8G8B8_SRGB), + DEBUG_NAMED_VALUE(PIPE_FORMAT_B8G8R8A8_SRGB), + DEBUG_NAMED_VALUE(PIPE_FORMAT_B8G8R8X8_SRGB), + DEBUG_NAMED_VALUE(PIPE_FORMAT_X8UB8UG8SR8S_NORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_B6UG5SR5S_NORM), + DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT1_RGB), + DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT1_RGBA), + DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT3_RGBA), + DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT5_RGBA), + DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT1_SRGB), + DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT1_SRGBA), + DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT3_SRGBA), + DEBUG_NAMED_VALUE(PIPE_FORMAT_DXT5_SRGBA), +#endif + DEBUG_NAMED_VALUE_END +}; + +#ifdef DEBUG +void debug_print_format(const char *msg, unsigned fmt ) +{ + debug_printf("%s: %s\n", msg, debug_dump_enum(pipe_format_names, fmt)); +} +#endif + +const char *pf_name( enum pipe_format format ) +{ + return debug_dump_enum(pipe_format_names, format); +} + + +#ifdef DEBUG +void debug_dump_image(const char *prefix, + unsigned format, unsigned cpp, + unsigned width, unsigned height, + unsigned stride, + const void *data) +{ +#ifdef PIPE_SUBSYSTEM_WINDOWS_DISPLAY + static unsigned no = 0; + char filename[256]; + WCHAR wfilename[sizeof(filename)]; + ULONG_PTR iFile = 0; + struct { + unsigned format; + unsigned cpp; + unsigned width; + unsigned height; + } header; + unsigned char *pMap = NULL; + unsigned i; + + util_snprintf(filename, sizeof(filename), "\\??\\c:\\%03u%s.raw", ++no, prefix); + for(i = 0; i < sizeof(filename); ++i) + wfilename[i] = (WCHAR)filename[i]; + + pMap = (unsigned char *)EngMapFile(wfilename, sizeof(header) + height*width*cpp, &iFile); + if(!pMap) + return; + + header.format = format; + header.cpp = cpp; + header.width = width; + header.height = height; + memcpy(pMap, &header, sizeof(header)); + pMap += sizeof(header); + + for(i = 0; i < height; ++i) { + memcpy(pMap, (unsigned char *)data + stride*i, cpp*width); + pMap += cpp*width; + } + + EngUnmapFile(iFile); +#endif +} + +void debug_dump_surface(const char *prefix, + struct pipe_surface *surface) +{ + unsigned surface_usage; + void *data; + + if (!surface) + goto error1; + + /* XXX: force mappable surface */ + surface_usage = surface->usage; + surface->usage |= PIPE_BUFFER_USAGE_CPU_READ; + + data = pipe_surface_map(surface, + PIPE_BUFFER_USAGE_CPU_READ); + if(!data) + goto error2; + + debug_dump_image(prefix, + surface->format, + surface->block.size, + surface->nblocksx, + surface->nblocksy, + surface->stride, + data); + + pipe_surface_unmap(surface); +error2: + surface->usage = surface_usage; +error1: + ; +} + + +#pragma pack(push,2) +struct bmp_file_header { + uint16_t bfType; + uint32_t bfSize; + uint16_t bfReserved1; + uint16_t bfReserved2; + uint32_t bfOffBits; +}; +#pragma pack(pop) + +struct bmp_info_header { + uint32_t biSize; + int32_t biWidth; + int32_t biHeight; + uint16_t biPlanes; + uint16_t biBitCount; + uint32_t biCompression; + uint32_t biSizeImage; + int32_t biXPelsPerMeter; + int32_t biYPelsPerMeter; + uint32_t biClrUsed; + uint32_t biClrImportant; +}; + +struct bmp_rgb_quad { + uint8_t rgbBlue; + uint8_t rgbGreen; + uint8_t rgbRed; + uint8_t rgbAlpha; +}; + +void +debug_dump_surface_bmp(const char *filename, + struct pipe_surface *surface) +{ +#ifndef PIPE_SUBSYSTEM_WINDOWS_MINIPORT + struct util_stream *stream; + unsigned 
surface_usage; + struct bmp_file_header bmfh; + struct bmp_info_header bmih; + float *rgba; + unsigned x, y; + + if (!surface) + goto error1; + + rgba = MALLOC(surface->width*4*sizeof(float)); + if(!rgba) + goto error1; + + bmfh.bfType = 0x4d42; + bmfh.bfSize = 14 + 40 + surface->height*surface->width*4; + bmfh.bfReserved1 = 0; + bmfh.bfReserved2 = 0; + bmfh.bfOffBits = 14 + 40; + + bmih.biSize = 40; + bmih.biWidth = surface->width; + bmih.biHeight = surface->height; + bmih.biPlanes = 1; + bmih.biBitCount = 32; + bmih.biCompression = 0; + bmih.biSizeImage = surface->height*surface->width*4; + bmih.biXPelsPerMeter = 0; + bmih.biYPelsPerMeter = 0; + bmih.biClrUsed = 0; + bmih.biClrImportant = 0; + + stream = util_stream_create(filename, bmfh.bfSize); + if(!stream) + goto error2; + + util_stream_write(stream, &bmfh, 14); + util_stream_write(stream, &bmih, 40); + + /* XXX: force mappable surface */ + surface_usage = surface->usage; + surface->usage |= PIPE_BUFFER_USAGE_CPU_READ; + + y = surface->height; + while(y--) { + pipe_get_tile_rgba(surface, + 0, y, surface->width, 1, + rgba); + for(x = 0; x < surface->width; ++x) + { + struct bmp_rgb_quad pixel; + pixel.rgbRed = float_to_ubyte(rgba[x*4 + 0]); + pixel.rgbGreen = float_to_ubyte(rgba[x*4 + 1]); + pixel.rgbBlue = float_to_ubyte(rgba[x*4 + 2]); + pixel.rgbAlpha = float_to_ubyte(rgba[x*4 + 3]); + util_stream_write(stream, &pixel, 4); + } + } + + surface->usage = surface_usage; + + util_stream_close(stream); +error2: + FREE(rgba); +error1: + ; +#endif +} + +#endif diff --git a/src/gallium/auxiliary/util/p_debug_mem.c b/src/gallium/auxiliary/util/p_debug_mem.c new file mode 100644 index 0000000000..250fd60f63 --- /dev/null +++ b/src/gallium/auxiliary/util/p_debug_mem.c @@ -0,0 +1,311 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Memory debugging. 
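/* Editor's sketch (not part of this diff): typical use of the leak-check
 * hooks defined further down in this file.  Code that allocates through
 * debug_malloc()/debug_free() can bracket a phase like this and have any
 * block that is still live reported with its file/line/function:
 */
#if 0   /* illustration only; do_some_work() is a hypothetical workload */
   {
      unsigned long checkpoint = debug_memory_begin();

      do_some_work();

      debug_memory_end(checkpoint);   /* prints blocks allocated since the
                                       * checkpoint that were never freed */
   }
#endif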
+ * + * @author José Fonseca <jrfonseca@tungstengraphics.com> + */ + +#include "pipe/p_config.h" + +#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) +#include <windows.h> +#include <winddi.h> +#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) +#include <wdm.h> +#else +#include <stdio.h> +#include <stdlib.h> +#endif + +#include "pipe/p_debug.h" +#include "util/u_double_list.h" + + +#define DEBUG_MEMORY_MAGIC 0x6e34090aU + + +#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) && !defined(WINCE) +#define real_malloc(_size) EngAllocMem(0, _size, 'D3AG') +#define real_free(_ptr) EngFreeMem(_ptr) +#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) +#define real_malloc(_size) ExAllocatePool(0, _size) +#define real_free(_ptr) ExFreePool(_ptr) +#else +#define real_malloc(_size) malloc(_size) +#define real_free(_ptr) free(_ptr) +#endif + + +struct debug_memory_header +{ + struct list_head head; + + unsigned long no; + const char *file; + unsigned line; + const char *function; + size_t size; + unsigned magic; +}; + +struct debug_memory_footer +{ + unsigned magic; +}; + + +static struct list_head list = { &list, &list }; + +static unsigned long last_no = 0; + + +static INLINE struct debug_memory_header * +header_from_data(void *data) +{ + if(data) + return (struct debug_memory_header *)((char *)data - sizeof(struct debug_memory_header)); + else + return NULL; +} + +static INLINE void * +data_from_header(struct debug_memory_header *hdr) +{ + if(hdr) + return (void *)((char *)hdr + sizeof(struct debug_memory_header)); + else + return NULL; +} + +static INLINE struct debug_memory_footer * +footer_from_header(struct debug_memory_header *hdr) +{ + if(hdr) + return (struct debug_memory_footer *)((char *)hdr + sizeof(struct debug_memory_header) + hdr->size); + else + return NULL; +} + + +void * +debug_malloc(const char *file, unsigned line, const char *function, + size_t size) +{ + struct debug_memory_header *hdr; + struct debug_memory_footer *ftr; + + hdr = real_malloc(sizeof(*hdr) + size + sizeof(*ftr)); + if(!hdr) { + debug_printf("%s:%u:%s: out of memory when trying to allocate %lu bytes\n", + file, line, function, + (long unsigned)size); + return NULL; + } + + hdr->no = last_no++; + hdr->file = file; + hdr->line = line; + hdr->function = function; + hdr->size = size; + hdr->magic = DEBUG_MEMORY_MAGIC; + + ftr = footer_from_header(hdr); + ftr->magic = DEBUG_MEMORY_MAGIC; + + LIST_ADDTAIL(&hdr->head, &list); + + return data_from_header(hdr); +} + +void +debug_free(const char *file, unsigned line, const char *function, + void *ptr) +{ + struct debug_memory_header *hdr; + struct debug_memory_footer *ftr; + + if(!ptr) + return; + + hdr = header_from_data(ptr); + if(hdr->magic != DEBUG_MEMORY_MAGIC) { + debug_printf("%s:%u:%s: freeing bad or corrupted memory %p\n", + file, line, function, + ptr); + debug_assert(0); + return; + } + + ftr = footer_from_header(hdr); + if(ftr->magic != DEBUG_MEMORY_MAGIC) { + debug_printf("%s:%u:%s: buffer overflow %p\n", + hdr->file, hdr->line, hdr->function, + ptr); + debug_assert(0); + } + + LIST_DEL(&hdr->head); + hdr->magic = 0; + ftr->magic = 0; + + real_free(hdr); +} + +void * +debug_calloc(const char *file, unsigned line, const char *function, + size_t count, size_t size ) +{ + void *ptr = debug_malloc( file, line, function, count * size ); + if( ptr ) + memset( ptr, 0, count * size ); + return ptr; +} + +void * +debug_realloc(const char *file, unsigned line, const char *function, + void *old_ptr, size_t old_size, size_t new_size ) +{ + struct debug_memory_header *old_hdr, *new_hdr; + 
struct debug_memory_footer *old_ftr, *new_ftr; + void *new_ptr; + + if(!old_ptr) + return debug_malloc( file, line, function, new_size ); + + if(!new_size) { + debug_free( file, line, function, old_ptr ); + return NULL; + } + + old_hdr = header_from_data(old_ptr); + if(old_hdr->magic != DEBUG_MEMORY_MAGIC) { + debug_printf("%s:%u:%s: reallocating bad or corrupted memory %p\n", + file, line, function, + old_ptr); + debug_assert(0); + return NULL; + } + + old_ftr = footer_from_header(old_hdr); + if(old_ftr->magic != DEBUG_MEMORY_MAGIC) { + debug_printf("%s:%u:%s: buffer overflow %p\n", + old_hdr->file, old_hdr->line, old_hdr->function, + old_ptr); + debug_assert(0); + } + + /* alloc new */ + new_hdr = real_malloc(sizeof(*new_hdr) + new_size + sizeof(*new_ftr)); + if(!new_hdr) { + debug_printf("%s:%u:%s: out of memory when trying to allocate %lu bytes\n", + file, line, function, + (long unsigned)new_size); + return NULL; + } + new_hdr->no = old_hdr->no; + new_hdr->file = old_hdr->file; + new_hdr->line = old_hdr->line; + new_hdr->function = old_hdr->function; + new_hdr->size = new_size; + new_hdr->magic = DEBUG_MEMORY_MAGIC; + + new_ftr = footer_from_header(new_hdr); + new_ftr->magic = DEBUG_MEMORY_MAGIC; + + LIST_REPLACE(&old_hdr->head, &new_hdr->head); + + /* copy data */ + new_ptr = data_from_header(new_hdr); + memcpy( new_ptr, old_ptr, old_size < new_size ? old_size : new_size ); + + /* free old */ + old_hdr->magic = 0; + old_ftr->magic = 0; + real_free(old_hdr); + + return new_ptr; +} + +unsigned long +debug_memory_begin(void) +{ + return last_no; +} + +void +debug_memory_end(unsigned long start_no) +{ + size_t total_size = 0; + struct list_head *entry; + + if(start_no == last_no) + return; + + entry = list.prev; + for (; entry != &list; entry = entry->prev) { + struct debug_memory_header *hdr; + void *ptr; + struct debug_memory_footer *ftr; + + hdr = LIST_ENTRY(struct debug_memory_header, entry, head); + ptr = data_from_header(hdr); + ftr = footer_from_header(hdr); + + if(hdr->magic != DEBUG_MEMORY_MAGIC) { + debug_printf("%s:%u:%s: bad or corrupted memory %p\n", + hdr->file, hdr->line, hdr->function, + ptr); + debug_assert(0); + } + + if((start_no <= hdr->no && hdr->no < last_no) || + (last_no < start_no && (hdr->no < last_no || start_no <= hdr->no))) { + debug_printf("%s:%u:%s: %u bytes at %p not freed\n", + hdr->file, hdr->line, hdr->function, + hdr->size, ptr); + total_size += hdr->size; + } + + if(ftr->magic != DEBUG_MEMORY_MAGIC) { + debug_printf("%s:%u:%s: buffer overflow %p\n", + hdr->file, hdr->line, hdr->function, + ptr); + debug_assert(0); + } + } + + if(total_size) { + debug_printf("Total of %u KB of system memory apparently leaked\n", + (total_size + 1023)/1024); + } + else { + debug_printf("No memory leaks detected.\n"); + } +} diff --git a/src/gallium/auxiliary/util/p_debug_prof.c b/src/gallium/auxiliary/util/p_debug_prof.c new file mode 100644 index 0000000000..5f9772ef91 --- /dev/null +++ b/src/gallium/auxiliary/util/p_debug_prof.c @@ -0,0 +1,320 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Poor-man profiling. + * + * @author José Fonseca <jrfonseca@tungstengraphics.com> + * + * @sa http://blogs.msdn.com/joshpoley/archive/2008/03/12/poor-man-s-profiler.aspx + * @sa http://www.johnpanzer.com/aci_cuj/index.html + */ + +#include "pipe/p_config.h" + +#if defined(PROFILE) && defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) + +#include <windows.h> +#include <winddi.h> + +#include "pipe/p_debug.h" +#include "util/u_string.h" + + +#define PROFILE_TABLE_SIZE (1024*1024) +#define FILE_NAME_SIZE 256 + +struct debug_profile_entry +{ + uintptr_t caller; + uintptr_t callee; + uint64_t samples; +}; + +static unsigned long enabled = 0; + +static WCHAR wFileName[FILE_NAME_SIZE] = L"\\??\\c:\\00000000.prof"; +static ULONG_PTR iFile = 0; + +static struct debug_profile_entry *table = NULL; +static unsigned long free_table_entries = 0; +static unsigned long max_table_entries = 0; + +uint64_t start_stamp = 0; +uint64_t end_stamp = 0; + + +static void +debug_profile_entry(uintptr_t caller, uintptr_t callee, uint64_t samples) +{ + unsigned hash = ( caller + callee ) & PROFILE_TABLE_SIZE - 1; + + while(1) { + if(table[hash].caller == 0 && table[hash].callee == 0) { + table[hash].caller = caller; + table[hash].callee = callee; + table[hash].samples = samples; + --free_table_entries; + break; + } + else if(table[hash].caller == caller && table[hash].callee == callee) { + table[hash].samples += samples; + break; + } + else { + ++hash; + } + } +} + + +static uintptr_t caller_stack[1024]; +static unsigned last_caller = 0; + + +static int64_t delta(void) { + int64_t result = end_stamp - start_stamp; + if(result > UINT64_C(0xffffffff)) + result = 0; + return result; +} + + +static void __cdecl +debug_profile_enter(uintptr_t callee) +{ + uintptr_t caller = last_caller ? caller_stack[last_caller - 1] : 0; + + if (caller) + debug_profile_entry(caller, 0, delta()); + debug_profile_entry(caller, callee, 1); + caller_stack[last_caller++] = callee; +} + + +static void __cdecl +debug_profile_exit(uintptr_t callee) +{ + debug_profile_entry(callee, 0, delta()); + if(last_caller) + --last_caller; +} + + +/** + * Called at the start of every method or function. 
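/* Editor's note (sketch, not part of this diff): _penter()/_pexit() below are
 * only invoked if the compiler is told to emit a call to them on entry and
 * exit of every function.  With MSVC that is the /Gh (_penter) and /GH
 * (_pexit) switch, so a profiled build would be compiled along the lines of:
 *
 *    cl /Gh /GH /DPROFILE ... p_debug_prof.c <other sources>
 *
 * The /DPROFILE define matches the #if guard at the top of this file; the
 * rest of the command line is hypothetical.
 */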
+ * + * @sa http://msdn.microsoft.com/en-us/library/c63a9b7h.aspx + */ +void __declspec(naked) __cdecl +_penter(void) { + _asm { + push eax + mov eax, [enabled] + test eax, eax + jz skip + + push edx + + rdtsc + mov dword ptr [end_stamp], eax + mov dword ptr [end_stamp+4], edx + + xor eax, eax + mov [enabled], eax + + mov eax, [esp+8] + + push ebx + push ecx + push ebp + push edi + push esi + + push eax + call debug_profile_enter + add esp, 4 + + pop esi + pop edi + pop ebp + pop ecx + pop ebx + + mov eax, 1 + mov [enabled], eax + + rdtsc + mov dword ptr [start_stamp], eax + mov dword ptr [start_stamp+4], edx + + pop edx +skip: + pop eax + ret + } +} + + +/** + * Called at the end of Calls the end of every method or function. + * + * @sa http://msdn.microsoft.com/en-us/library/xc11y76y.aspx + */ +void __declspec(naked) __cdecl +_pexit(void) { + _asm { + push eax + mov eax, [enabled] + test eax, eax + jz skip + + push edx + + rdtsc + mov dword ptr [end_stamp], eax + mov dword ptr [end_stamp+4], edx + + xor eax, eax + mov [enabled], eax + + mov eax, [esp+8] + + push ebx + push ecx + push ebp + push edi + push esi + + push eax + call debug_profile_exit + add esp, 4 + + pop esi + pop edi + pop ebp + pop ecx + pop ebx + + mov eax, 1 + mov [enabled], eax + + rdtsc + mov dword ptr [start_stamp], eax + mov dword ptr [start_stamp+4], edx + + pop edx +skip: + pop eax + ret + } +} + + +/** + * Reference function for calibration. + */ +void __declspec(naked) +__debug_profile_reference(void) { + _asm { + call _penter + call _pexit + ret + } +} + + +void +debug_profile_start(void) +{ + WCHAR *p; + + // increment starting from the less significant digit + p = &wFileName[14]; + while(1) { + if(*p == '9') { + *p-- = '0'; + } + else { + *p += 1; + break; + } + } + + table = EngMapFile(wFileName, + PROFILE_TABLE_SIZE*sizeof(struct debug_profile_entry), + &iFile); + if(table) { + unsigned i; + + free_table_entries = max_table_entries = PROFILE_TABLE_SIZE; + memset(table, 0, PROFILE_TABLE_SIZE*sizeof(struct debug_profile_entry)); + + table[0].caller = (uintptr_t)&__debug_profile_reference; + table[0].callee = 0; + table[0].samples = 0; + --free_table_entries; + + _asm { + push edx + push eax + + rdtsc + mov dword ptr [start_stamp], eax + mov dword ptr [start_stamp+4], edx + + pop edx + pop eax + } + + last_caller = 0; + + enabled = 1; + + for(i = 0; i < 8; ++i) { + _asm { + call __debug_profile_reference + } + } + } +} + + +void +debug_profile_stop(void) +{ + enabled = 0; + + if(iFile) + EngUnmapFile(iFile); + iFile = 0; + table = NULL; + free_table_entries = max_table_entries = 0; +} + +#endif /* PROFILE */ diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c new file mode 100644 index 0000000000..d28201ac8d --- /dev/null +++ b/src/gallium/auxiliary/util/u_blit.c @@ -0,0 +1,520 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Copy/blit pixel rect between surfaces + * + * @author Brian Paul + */ + + +#include "pipe/p_context.h" +#include "pipe/p_debug.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" +#include "pipe/p_shader_tokens.h" + +#include "util/u_blit.h" +#include "util/u_draw_quad.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_simple_shaders.h" + +#include "cso_cache/cso_context.h" + + +struct blit_state +{ + struct pipe_context *pipe; + struct cso_context *cso; + + struct pipe_blend_state blend; + struct pipe_depth_stencil_alpha_state depthstencil; + struct pipe_rasterizer_state rasterizer; + struct pipe_sampler_state sampler; + struct pipe_viewport_state viewport; + + struct pipe_shader_state vert_shader; + struct pipe_shader_state frag_shader; + void *vs; + void *fs; + + struct pipe_buffer *vbuf; /**< quad vertices */ + float vertices[4][2][4]; /**< vertex/texcoords for quad */ +}; + + +/** + * Create state object for blit. + * Intended to be created once and re-used for many blit() calls. 
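/* Editor's sketch (not part of this diff) of the intended usage pattern:
 * create the blit context once alongside the pipe and cso contexts, reuse
 * it for every blit, and destroy it on teardown.  'pipe', 'cso', 'src' and
 * 'dst' are assumed to already exist in the caller.
 */
#if 0   /* illustration only */
   struct blit_state *blitter = util_create_blit(pipe, cso);

   /* 1:1 copy of a 256x256 rect; no filtering is needed since the
    * source and destination rectangles have the same size */
   util_blit_pixels(blitter,
                    src, 0, 0, 256, 256,    /* source corners */
                    dst, 0, 0, 256, 256,    /* destination corners */
                    0.0f,                   /* z */
                    PIPE_TEX_MIPFILTER_NEAREST);

   util_destroy_blit(blitter);
#endif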
+ */ +struct blit_state * +util_create_blit(struct pipe_context *pipe, struct cso_context *cso) +{ + struct blit_state *ctx; + uint i; + + ctx = CALLOC_STRUCT(blit_state); + if (!ctx) + return NULL; + + ctx->pipe = pipe; + ctx->cso = cso; + + /* disabled blending/masking */ + memset(&ctx->blend, 0, sizeof(ctx->blend)); + ctx->blend.rgb_src_factor = PIPE_BLENDFACTOR_ONE; + ctx->blend.alpha_src_factor = PIPE_BLENDFACTOR_ONE; + ctx->blend.rgb_dst_factor = PIPE_BLENDFACTOR_ZERO; + ctx->blend.alpha_dst_factor = PIPE_BLENDFACTOR_ZERO; + ctx->blend.colormask = PIPE_MASK_RGBA; + + /* no-op depth/stencil/alpha */ + memset(&ctx->depthstencil, 0, sizeof(ctx->depthstencil)); + + /* rasterizer */ + memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer)); + ctx->rasterizer.front_winding = PIPE_WINDING_CW; + ctx->rasterizer.cull_mode = PIPE_WINDING_NONE; + ctx->rasterizer.bypass_clipping = 1; + /*ctx->rasterizer.bypass_vs = 1;*/ + ctx->rasterizer.gl_rasterization_rules = 1; + + /* samplers */ + memset(&ctx->sampler, 0, sizeof(ctx->sampler)); + ctx->sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + ctx->sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + ctx->sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + ctx->sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE; + ctx->sampler.min_img_filter = 0; /* set later */ + ctx->sampler.mag_img_filter = 0; /* set later */ + ctx->sampler.normalized_coords = 1; + + /* viewport (identity, we setup vertices in wincoords) */ + ctx->viewport.scale[0] = 1.0; + ctx->viewport.scale[1] = 1.0; + ctx->viewport.scale[2] = 1.0; + ctx->viewport.scale[3] = 1.0; + ctx->viewport.translate[0] = 0.0; + ctx->viewport.translate[1] = 0.0; + ctx->viewport.translate[2] = 0.0; + ctx->viewport.translate[3] = 0.0; + + /* vertex shader */ + { + const uint semantic_names[] = { TGSI_SEMANTIC_POSITION, + TGSI_SEMANTIC_GENERIC }; + const uint semantic_indexes[] = { 0, 0 }; + ctx->vs = util_make_vertex_passthrough_shader(pipe, 2, semantic_names, + semantic_indexes, + &ctx->vert_shader); + } + + /* fragment shader */ + ctx->fs = util_make_fragment_tex_shader(pipe, &ctx->frag_shader); + + ctx->vbuf = pipe_buffer_create(pipe->screen, + 32, + PIPE_BUFFER_USAGE_VERTEX, + sizeof(ctx->vertices)); + if (!ctx->vbuf) { + FREE(ctx); + ctx->pipe->delete_fs_state(ctx->pipe, ctx->fs); + ctx->pipe->delete_vs_state(ctx->pipe, ctx->vs); + return NULL; + } + + /* init vertex data that doesn't change */ + for (i = 0; i < 4; i++) { + ctx->vertices[i][0][3] = 1.0f; /* w */ + ctx->vertices[i][1][2] = 0.0f; /* r */ + ctx->vertices[i][1][3] = 1.0f; /* q */ + } + + return ctx; +} + + +/** + * Destroy a blit context + */ +void +util_destroy_blit(struct blit_state *ctx) +{ + struct pipe_context *pipe = ctx->pipe; + + pipe->delete_vs_state(pipe, ctx->vs); + pipe->delete_fs_state(pipe, ctx->fs); + + FREE((void*) ctx->vert_shader.tokens); + FREE((void*) ctx->frag_shader.tokens); + + pipe_buffer_reference(pipe->screen, &ctx->vbuf, NULL); + + FREE(ctx); +} + + +/** + * Setup vertex data for the textured quad we'll draw. 
+ * Note: y=0=top + */ +static void +setup_vertex_data(struct blit_state *ctx, + float x0, float y0, float x1, float y1, float z) +{ + void *buf; + + ctx->vertices[0][0][0] = x0; + ctx->vertices[0][0][1] = y0; + ctx->vertices[0][0][2] = z; + ctx->vertices[0][1][0] = 0.0f; /*s*/ + ctx->vertices[0][1][1] = 0.0f; /*t*/ + + ctx->vertices[1][0][0] = x1; + ctx->vertices[1][0][1] = y0; + ctx->vertices[1][0][2] = z; + ctx->vertices[1][1][0] = 1.0f; /*s*/ + ctx->vertices[1][1][1] = 0.0f; /*t*/ + + ctx->vertices[2][0][0] = x1; + ctx->vertices[2][0][1] = y1; + ctx->vertices[2][0][2] = z; + ctx->vertices[2][1][0] = 1.0f; + ctx->vertices[2][1][1] = 1.0f; + + ctx->vertices[3][0][0] = x0; + ctx->vertices[3][0][1] = y1; + ctx->vertices[3][0][2] = z; + ctx->vertices[3][1][0] = 0.0f; + ctx->vertices[3][1][1] = 1.0f; + + buf = pipe_buffer_map(ctx->pipe->screen, ctx->vbuf, + PIPE_BUFFER_USAGE_CPU_WRITE); + + memcpy(buf, ctx->vertices, sizeof(ctx->vertices)); + + pipe_buffer_unmap(ctx->pipe->screen, ctx->vbuf); +} + + +/** + * Setup vertex data for the textured quad we'll draw. + * Note: y=0=top + */ +static void +setup_vertex_data_tex(struct blit_state *ctx, + float x0, float y0, float x1, float y1, + float s0, float t0, float s1, float t1, + float z) +{ + void *buf; + + ctx->vertices[0][0][0] = x0; + ctx->vertices[0][0][1] = y0; + ctx->vertices[0][0][2] = z; + ctx->vertices[0][1][0] = s0; /*s*/ + ctx->vertices[0][1][1] = t0; /*t*/ + + ctx->vertices[1][0][0] = x1; + ctx->vertices[1][0][1] = y0; + ctx->vertices[1][0][2] = z; + ctx->vertices[1][1][0] = s1; /*s*/ + ctx->vertices[1][1][1] = t0; /*t*/ + + ctx->vertices[2][0][0] = x1; + ctx->vertices[2][0][1] = y1; + ctx->vertices[2][0][2] = z; + ctx->vertices[2][1][0] = s1; + ctx->vertices[2][1][1] = t1; + + ctx->vertices[3][0][0] = x0; + ctx->vertices[3][0][1] = y1; + ctx->vertices[3][0][2] = z; + ctx->vertices[3][1][0] = s0; + ctx->vertices[3][1][1] = t1; + + buf = pipe_buffer_map(ctx->pipe->screen, ctx->vbuf, + PIPE_BUFFER_USAGE_CPU_WRITE); + + memcpy(buf, ctx->vertices, sizeof(ctx->vertices)); + + pipe_buffer_unmap(ctx->pipe->screen, ctx->vbuf); +} +/** + * Copy pixel block from src surface to dst surface. + * Overlapping regions are acceptable. + * XXX need some control over blitting Z and/or stencil. 
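/* Editor's sketch (not part of this diff): because util_blit_pixels() takes
 * signed corner pairs, a caller can scale and mirror in one call simply by
 * swapping one pair of source coordinates -- the function below then swaps
 * the destination corners internally so the quad is drawn flipped.  Sizes
 * differ here on purpose so the filtered texture path is taken ('blitter',
 * 'src', 'dst' and the *_w/*_h values are assumed to exist in the caller):
 */
#if 0   /* illustration only */
   util_blit_pixels(blitter,
                    src, 0, src_h, src_w, 0,   /* srcY0 > srcY1: flip in Y */
                    dst, 0, 0, dst_w, dst_h,   /* scaled to the dst rect */
                    0.0f, PIPE_TEX_MIPFILTER_LINEAR);
#endif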
+ */ +void +util_blit_pixels(struct blit_state *ctx, + struct pipe_surface *src, + int srcX0, int srcY0, + int srcX1, int srcY1, + struct pipe_surface *dst, + int dstX0, int dstY0, + int dstX1, int dstY1, + float z, uint filter) +{ + struct pipe_context *pipe = ctx->pipe; + struct pipe_screen *screen = pipe->screen; + struct pipe_texture texTemp, *tex; + struct pipe_surface *texSurf; + struct pipe_framebuffer_state fb; + const int srcW = abs(srcX1 - srcX0); + const int srcH = abs(srcY1 - srcY0); + const int srcLeft = MIN2(srcX0, srcX1); + const int srcTop = MIN2(srcY0, srcY1); + + assert(filter == PIPE_TEX_MIPFILTER_NEAREST || + filter == PIPE_TEX_MIPFILTER_LINEAR); + + if (srcLeft != srcX0) { + /* left-right flip */ + int tmp = dstX0; + dstX0 = dstX1; + dstX1 = tmp; + } + + if (srcTop != srcY0) { + /* up-down flip */ + int tmp = dstY0; + dstY0 = dstY1; + dstY1 = tmp; + } + + assert(screen->is_format_supported(screen, src->format, PIPE_TEXTURE_2D, + PIPE_TEXTURE_USAGE_SAMPLER, 0)); + assert(screen->is_format_supported(screen, dst->format, PIPE_TEXTURE_2D, + PIPE_TEXTURE_USAGE_SAMPLER, 0)); + + if(dst->format == src->format && (dstX1 - dstX0) == srcW && (dstY1 - dstY0) == srcH) { + /* FIXME: this will most surely fail for overlapping rectangles */ + pipe->surface_copy(pipe, FALSE, + dst, dstX0, dstY0, /* dest */ + src, srcX0, srcY0, /* src */ + srcW, srcH); /* size */ + return; + } + + assert(screen->is_format_supported(screen, dst->format, PIPE_TEXTURE_2D, + PIPE_TEXTURE_USAGE_RENDER_TARGET, 0)); + + /* + * XXX for now we're always creating a temporary texture. + * Strictly speaking that's not always needed. + */ + + /* create temp texture */ + memset(&texTemp, 0, sizeof(texTemp)); + texTemp.target = PIPE_TEXTURE_2D; + texTemp.format = src->format; + texTemp.last_level = 0; + texTemp.width[0] = srcW; + texTemp.height[0] = srcH; + texTemp.depth[0] = 1; + texTemp.compressed = 0; + pf_get_block(src->format, &texTemp.block); + + tex = screen->texture_create(screen, &texTemp); + if (!tex) + return; + + texSurf = screen->get_tex_surface(screen, tex, 0, 0, 0, + PIPE_BUFFER_USAGE_GPU_WRITE); + + /* load temp texture */ + pipe->surface_copy(pipe, FALSE, + texSurf, 0, 0, /* dest */ + src, srcLeft, srcTop, /* src */ + srcW, srcH); /* size */ + + /* free the surface, update the texture if necessary. 
+ */ + screen->tex_surface_release(screen, &texSurf); + + /* save state (restored below) */ + cso_save_blend(ctx->cso); + cso_save_depth_stencil_alpha(ctx->cso); + cso_save_rasterizer(ctx->cso); + cso_save_samplers(ctx->cso); + cso_save_sampler_textures(ctx->cso); + cso_save_framebuffer(ctx->cso); + cso_save_fragment_shader(ctx->cso); + cso_save_vertex_shader(ctx->cso); + cso_save_viewport(ctx->cso); + + /* set misc state we care about */ + cso_set_blend(ctx->cso, &ctx->blend); + cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil); + cso_set_rasterizer(ctx->cso, &ctx->rasterizer); + cso_set_viewport(ctx->cso, &ctx->viewport); + + /* sampler */ + ctx->sampler.min_img_filter = filter; + ctx->sampler.mag_img_filter = filter; + cso_single_sampler(ctx->cso, 0, &ctx->sampler); + cso_single_sampler_done(ctx->cso); + + /* texture */ + cso_set_sampler_textures(ctx->cso, 1, &tex); + + /* shaders */ + cso_set_fragment_shader_handle(ctx->cso, ctx->fs); + cso_set_vertex_shader_handle(ctx->cso, ctx->vs); + + /* drawing dest */ + memset(&fb, 0, sizeof(fb)); + fb.width = dst->width; + fb.height = dst->height; + fb.num_cbufs = 1; + fb.cbufs[0] = dst; + cso_set_framebuffer(ctx->cso, &fb); + + /* draw quad */ + setup_vertex_data(ctx, + (float) dstX0, (float) dstY0, + (float) dstX1, (float) dstY1, z); + + util_draw_vertex_buffer(ctx->pipe, ctx->vbuf, + PIPE_PRIM_TRIANGLE_FAN, + 4, /* verts */ + 2); /* attribs/vert */ + + /* restore state we changed */ + cso_restore_blend(ctx->cso); + cso_restore_depth_stencil_alpha(ctx->cso); + cso_restore_rasterizer(ctx->cso); + cso_restore_samplers(ctx->cso); + cso_restore_sampler_textures(ctx->cso); + cso_restore_framebuffer(ctx->cso); + cso_restore_fragment_shader(ctx->cso); + cso_restore_vertex_shader(ctx->cso); + cso_restore_viewport(ctx->cso); + + screen->texture_release(screen, &tex); +} + +/** + * Copy pixel block from src texture to dst surface. + * Overlapping regions are acceptable. + * + * XXX Should support selection of level. + * XXX need some control over blitting Z and/or stencil. 
+ */ +void +util_blit_pixels_tex(struct blit_state *ctx, + struct pipe_texture *tex, + int srcX0, int srcY0, + int srcX1, int srcY1, + struct pipe_surface *dst, + int dstX0, int dstY0, + int dstX1, int dstY1, + float z, uint filter) +{ + struct pipe_context *pipe = ctx->pipe; + struct pipe_screen *screen = pipe->screen; + struct pipe_framebuffer_state fb; + float s0, t0, s1, t1; + + assert(filter == PIPE_TEX_MIPFILTER_NEAREST || + filter == PIPE_TEX_MIPFILTER_LINEAR); + + assert(tex->width[0] != 0); + assert(tex->height[0] != 0); + + s0 = srcX0 / (float)tex->width[0]; + s1 = srcX1 / (float)tex->width[0]; + t0 = srcY0 / (float)tex->height[0]; + t1 = srcY1 / (float)tex->height[0]; + + assert(screen->is_format_supported(screen, dst->format, PIPE_TEXTURE_2D, + PIPE_TEXTURE_USAGE_RENDER_TARGET, 0)); + + /* save state (restored below) */ + cso_save_blend(ctx->cso); + cso_save_depth_stencil_alpha(ctx->cso); + cso_save_rasterizer(ctx->cso); + cso_save_samplers(ctx->cso); + cso_save_sampler_textures(ctx->cso); + cso_save_framebuffer(ctx->cso); + cso_save_fragment_shader(ctx->cso); + cso_save_vertex_shader(ctx->cso); + cso_save_viewport(ctx->cso); + + /* set misc state we care about */ + cso_set_blend(ctx->cso, &ctx->blend); + cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil); + cso_set_rasterizer(ctx->cso, &ctx->rasterizer); + cso_set_viewport(ctx->cso, &ctx->viewport); + + /* sampler */ + ctx->sampler.min_img_filter = filter; + ctx->sampler.mag_img_filter = filter; + cso_single_sampler(ctx->cso, 0, &ctx->sampler); + cso_single_sampler_done(ctx->cso); + + /* texture */ + cso_set_sampler_textures(ctx->cso, 1, &tex); + + /* shaders */ + cso_set_fragment_shader_handle(ctx->cso, ctx->fs); + cso_set_vertex_shader_handle(ctx->cso, ctx->vs); + + /* drawing dest */ + memset(&fb, 0, sizeof(fb)); + fb.width = dst->width; + fb.height = dst->height; + fb.num_cbufs = 1; + fb.cbufs[0] = dst; + cso_set_framebuffer(ctx->cso, &fb); + + /* draw quad */ + setup_vertex_data_tex(ctx, + (float) dstX0, (float) dstY0, + (float) dstX1, (float) dstY1, + s0, t0, s1, t1, + z); + + util_draw_vertex_buffer(ctx->pipe, ctx->vbuf, + PIPE_PRIM_TRIANGLE_FAN, + 4, /* verts */ + 2); /* attribs/vert */ + + /* restore state we changed */ + cso_restore_blend(ctx->cso); + cso_restore_depth_stencil_alpha(ctx->cso); + cso_restore_rasterizer(ctx->cso); + cso_restore_samplers(ctx->cso); + cso_restore_sampler_textures(ctx->cso); + cso_restore_framebuffer(ctx->cso); + cso_restore_fragment_shader(ctx->cso); + cso_restore_vertex_shader(ctx->cso); + cso_restore_viewport(ctx->cso); +} diff --git a/src/gallium/auxiliary/util/u_blit.h b/src/gallium/auxiliary/util/u_blit.h new file mode 100644 index 0000000000..308075698f --- /dev/null +++ b/src/gallium/auxiliary/util/u_blit.h @@ -0,0 +1,78 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef U_BLIT_H +#define U_BLIT_H + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct pipe_context; +struct pipe_surface; +struct pipe_texture; +struct cso_context; + + +struct blit_state; + + +extern struct blit_state * +util_create_blit(struct pipe_context *pipe, struct cso_context *cso); + +extern void +util_destroy_blit(struct blit_state *ctx); + +extern void +util_blit_pixels(struct blit_state *ctx, + struct pipe_surface *src, + int srcX0, int srcY0, + int srcX1, int srcY1, + struct pipe_surface *dst, + int dstX0, int dstY0, + int dstX1, int dstY1, + float z, uint filter); + +extern void +util_blit_pixels_tex(struct blit_state *ctx, + struct pipe_texture *tex, + int srcX0, int srcY0, + int srcX1, int srcY1, + struct pipe_surface *dst, + int dstX0, int dstY0, + int dstX1, int dstY1, + float z, uint filter); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c new file mode 100644 index 0000000000..d9f2f8fc28 --- /dev/null +++ b/src/gallium/auxiliary/util/u_cpu_detect.c @@ -0,0 +1,506 @@ +/************************************************************************** + * + * Copyright 2008 Dennis Smit + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Based on the work of Eric Anholt <anholt@FreeBSD.org> + */ + +/* FIXME: clean this entire file up */ + +#include "u_cpu_detect.h" + +#ifdef __linux__ +#define OS_LINUX +#endif +#ifdef WIN32 +#define OS_WIN32 +#endif + +#if defined(ARCH_POWERPC) +#if defined(OS_DARWIN) +#include <sys/sysctl.h> +#else +#include <signal.h> +#include <setjmp.h> +#endif +#endif + +#if defined(OS_NETBSD) || defined(OS_OPENBSD) +#include <sys/param.h> +#include <sys/sysctl.h> +#include <machine/cpu.h> +#endif + +#if defined(OS_FREEBSD) +#include <sys/types.h> +#include <sys/sysctl.h> +#endif + +#if defined(OS_LINUX) +#include <signal.h> +#endif + +#if defined(OS_WIN32) +#include <windows.h> +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> + + +static struct cpu_detect_caps __cpu_detect_caps; +static int __cpu_detect_initialized = 0; + +static int has_cpuid(void); +static int cpuid(unsigned int ax, unsigned int *p); + +/* The sigill handlers */ +#if defined(ARCH_X86) /* x86 (linux katmai handler check thing) */ +#if defined(OS_LINUX) && defined(_POSIX_SOURCE) && defined(X86_FXSR_MAGIC) +static void sigill_handler_sse(int signal, struct sigcontext sc) +{ + /* Both the "xorps %%xmm0,%%xmm0" and "divps %xmm0,%%xmm1" + * instructions are 3 bytes long. We must increment the instruction + * pointer manually to avoid repeated execution of the offending + * instruction. + * + * If the SIGILL is caused by a divide-by-zero when unmasked + * exceptions aren't supported, the SIMD FPU status and control + * word will be restored at the end of the test, so we don't need + * to worry about doing it here. Besides, we may not be able to... + */ + sc.eip += 3; + + __cpu_detect_caps.hasSSE=0; +} + +static void sigfpe_handler_sse(int signal, struct sigcontext sc) +{ + if (sc.fpstate->magic != 0xffff) { + /* Our signal context has the extended FPU state, so reset the + * divide-by-zero exception mask and clear the divide-by-zero + * exception bit. + */ + sc.fpstate->mxcsr |= 0x00000200; + sc.fpstate->mxcsr &= 0xfffffffb; + } else { + /* If we ever get here, we're completely hosed. 
+ */ + } +} +#endif +#endif /* OS_LINUX && _POSIX_SOURCE && X86_FXSR_MAGIC */ + +#if defined(OS_WIN32) +LONG CALLBACK win32_sig_handler_sse(EXCEPTION_POINTERS* ep) +{ + if(ep->ExceptionRecord->ExceptionCode==EXCEPTION_ILLEGAL_INSTRUCTION){ + ep->ContextRecord->Eip +=3; + __cpu_detect_caps.hasSSE=0; + return EXCEPTION_CONTINUE_EXECUTION; + } + return EXCEPTION_CONTINUE_SEARCH; +} +#endif /* OS_WIN32 */ + + +#if defined(ARCH_POWERPC) && !defined(OS_DARWIN) +static sigjmp_buf __lv_powerpc_jmpbuf; +static volatile sig_atomic_t __lv_powerpc_canjump = 0; + +static void sigill_handler (int sig); + +static void sigill_handler (int sig) +{ + if (!__lv_powerpc_canjump) { + signal (sig, SIG_DFL); + raise (sig); + } + + __lv_powerpc_canjump = 0; + siglongjmp(__lv_powerpc_jmpbuf, 1); +} + +static void check_os_altivec_support(void) +{ +#if defined(OS_DARWIN) + int sels[2] = {CTL_HW, HW_VECTORUNIT}; + int has_vu = 0; + int len = sizeof (has_vu); + int err; + + err = sysctl(sels, 2, &has_vu, &len, NULL, 0); + + if (err == 0) { + if (has_vu != 0) { + __cpu_detect_caps.hasAltiVec = 1; + } + } +#else /* !OS_DARWIN */ + /* no Darwin, do it the brute-force way */ + /* this is borrowed from the libmpeg2 library */ + signal(SIGILL, sigill_handler); + if (sigsetjmp(__lv_powerpc_jmpbuf, 1)) { + signal(SIGILL, SIG_DFL); + } else { + __lv_powerpc_canjump = 1; + + __asm __volatile + ("mtspr 256, %0\n\t" + "vand %%v0, %%v0, %%v0" + : + : "r" (-1)); + + signal(SIGILL, SIG_DFL); + __cpu_detect_caps.hasAltiVec = 1; + } +#endif +} +#endif + +/* If we're running on a processor that can do SSE, let's see if we + * are allowed to or not. This will catch 2.4.0 or later kernels that + * haven't been configured for a Pentium III but are running on one, + * and RedHat patched 2.2 kernels that have broken exception handling + * support for user space apps that do SSE. + */ +static void check_os_katmai_support(void) +{ +#if defined(ARCH_X86) +#if defined(OS_FREEBSD) + int has_sse=0, ret; + int len = sizeof (has_sse); + + ret = sysctlbyname("hw.instruction_sse", &has_sse, &len, NULL, 0); + if (ret || !has_sse) + __cpu_detect_caps.hasSSE=0; + +#elif defined(OS_NETBSD) || defined(OS_OPENBSD) + int has_sse, has_sse2, ret, mib[2]; + int varlen; + + mib[0] = CTL_MACHDEP; + mib[1] = CPU_SSE; + varlen = sizeof (has_sse); + + ret = sysctl(mib, 2, &has_sse, &varlen, NULL, 0); + if (ret < 0 || !has_sse) { + __cpu_detect_caps.hasSSE = 0; + } else { + __cpu_detect_caps.hasSSE = 1; + } + + mib[1] = CPU_SSE2; + varlen = sizeof (has_sse2); + ret = sysctl(mib, 2, &has_sse2, &varlen, NULL, 0); + if (ret < 0 || !has_sse2) { + __cpu_detect_caps.hasSSE2 = 0; + } else { + __cpu_detect_caps.hasSSE2 = 1; + } + __cpu_detect_caps.hasSSE = 0; /* FIXME ?!?!? */ + +#elif defined(OS_WIN32) + LPTOP_LEVEL_EXCEPTION_FILTER exc_fil; + if (__cpu_detect_caps.hasSSE) { + exc_fil = SetUnhandledExceptionFilter(win32_sig_handler_sse); + __asm __volatile ("xorps %xmm0, %xmm0"); + SetUnhandledExceptionFilter(exc_fil); + } +#elif defined(OS_LINUX) + struct sigaction saved_sigill; + struct sigaction saved_sigfpe; + + /* Save the original signal handlers. + */ + sigaction(SIGILL, NULL, &saved_sigill); + sigaction(SIGFPE, NULL, &saved_sigfpe); + + signal(SIGILL, (void (*)(int))sigill_handler_sse); + signal(SIGFPE, (void (*)(int))sigfpe_handler_sse); + + /* Emulate test for OSFXSR in CR4. The OS will set this bit if it + * supports the extended FPU save and restore required for SSE. 
If + * we execute an SSE instruction on a PIII and get a SIGILL, the OS + * doesn't support Streaming SIMD Exceptions, even if the processor + * does. + */ + if (__cpu_detect_caps.hasSSE) { + __asm __volatile ("xorps %xmm1, %xmm0"); + } + + /* Emulate test for OSXMMEXCPT in CR4. The OS will set this bit if + * it supports unmasked SIMD FPU exceptions. If we unmask the + * exceptions, do a SIMD divide-by-zero and get a SIGILL, the OS + * doesn't support unmasked SIMD FPU exceptions. If we get a SIGFPE + * as expected, we're okay but we need to clean up after it. + * + * Are we being too stringent in our requirement that the OS support + * unmasked exceptions? Certain RedHat 2.2 kernels enable SSE by + * setting CR4.OSFXSR but don't support unmasked exceptions. Win98 + * doesn't even support them. We at least know the user-space SSE + * support is good in kernels that do support unmasked exceptions, + * and therefore to be safe I'm going to leave this test in here. + */ + if (__cpu_detect_caps.hasSSE) { + // test_os_katmai_exception_support(); + } + + /* Restore the original signal handlers. + */ + sigaction(SIGILL, &saved_sigill, NULL); + sigaction(SIGFPE, &saved_sigfpe, NULL); + +#else + /* We can't use POSIX signal handling to test the availability of + * SSE, so we disable it by default. + */ + __cpu_detect_caps.hasSSE = 0; +#endif /* __linux__ */ +#endif +} + + +static int has_cpuid(void) +{ +#if defined(ARCH_X86) + int a, c; + + __asm __volatile + ("pushf\n" + "popl %0\n" + "movl %0, %1\n" + "xorl $0x200000, %0\n" + "push %0\n" + "popf\n" + "pushf\n" + "popl %0\n" + : "=a" (a), "=c" (c) + : + : "cc"); + + return a != c; +#else + return 0; +#endif +} + +static int cpuid(unsigned int ax, unsigned int *p) +{ +#if defined(ARCH_X86) + unsigned int flags; + + __asm __volatile + ("movl %%ebx, %%esi\n\t" + "cpuid\n\t" + "xchgl %%ebx, %%esi" + : "=a" (p[0]), "=S" (p[1]), + "=c" (p[2]), "=d" (p[3]) + : "0" (ax)); + + return 0; +#else + return -1; +#endif +} + +void cpu_detect_initialize() +{ + unsigned int regs[4]; + unsigned int regs2[4]; + + int mib[2], ncpu; + int len; + + memset(&__cpu_detect_caps, 0, sizeof (struct cpu_detect_caps)); + + /* Check for arch type */ +#if defined(ARCH_MIPS) + __cpu_detect_caps.type = CPU_DETECT_TYPE_MIPS; +#elif defined(ARCH_ALPHA) + __cpu_detect_caps.type = CPU_DETECT_TYPE_ALPHA; +#elif defined(ARCH_SPARC) + __cpu_detect_caps.type = CPU_DETECT_TYPE_SPARC; +#elif defined(ARCH_X86) + __cpu_detect_caps.type = CPU_DETECT_TYPE_X86; +#elif defined(ARCH_POWERPC) + __cpu_detect_caps.type = CPU_DETECT_TYPE_POWERPC; +#else + __cpu_detect_caps.type = CPU_DETECT_TYPE_OTHER; +#endif + + /* Count the number of CPUs in system */ +#if !defined(OS_WIN32) && !defined(OS_UNKNOWN) && defined(_SC_NPROCESSORS_ONLN) + __cpu_detect_caps.nrcpu = sysconf(_SC_NPROCESSORS_ONLN); + if (__cpu_detect_caps.nrcpu == -1) + __cpu_detect_caps.nrcpu = 1; + +#elif defined(OS_NETBSD) || defined(OS_FREEBSD) || defined(OS_OPENBSD) + + mib[0] = CTL_HW; + mib[1] = HW_NCPU; + + len = sizeof (ncpu); + sysctl(mib, 2, &ncpu, &len, NULL, 0); + __cpu_detect_caps.nrcpu = ncpu; + +#else + __cpu_detect_caps.nrcpu = 1; +#endif + +#if defined(ARCH_X86) + /* No cpuid, old 486 or lower */ + if (has_cpuid() == 0) + return; + + __cpu_detect_caps.cacheline = 32; + + /* Get max cpuid level */ + cpuid(0x00000000, regs); + + if (regs[0] >= 0x00000001) { + unsigned int cacheline; + + cpuid (0x00000001, regs2); + + __cpu_detect_caps.x86cpuType = (regs2[0] >> 8) & 0xf; + if (__cpu_detect_caps.x86cpuType == 0xf) + 
__cpu_detect_caps.x86cpuType = 8 + ((regs2[0] >> 20) & 255); /* use extended family (P4, IA64) */ + + /* general feature flags */ + __cpu_detect_caps.hasTSC = (regs2[3] & (1 << 8 )) >> 8; /* 0x0000010 */ + __cpu_detect_caps.hasMMX = (regs2[3] & (1 << 23 )) >> 23; /* 0x0800000 */ + __cpu_detect_caps.hasSSE = (regs2[3] & (1 << 25 )) >> 25; /* 0x2000000 */ + __cpu_detect_caps.hasSSE2 = (regs2[3] & (1 << 26 )) >> 26; /* 0x4000000 */ + __cpu_detect_caps.hasSSE3 = (regs2[2] & (1)); /* 0x0000001 */ + __cpu_detect_caps.hasSSSE3 = (regs2[2] & (1 << 9 )) >> 9; /* 0x0000020 */ + __cpu_detect_caps.hasMMX2 = __cpu_detect_caps.hasSSE; /* SSE cpus supports mmxext too */ + + cacheline = ((regs2[1] >> 8) & 0xFF) * 8; + if (cacheline > 0) + __cpu_detect_caps.cacheline = cacheline; + } + + cpuid(0x80000000, regs); + + if (regs[0] >= 0x80000001) { + + cpuid(0x80000001, regs2); + + __cpu_detect_caps.hasMMX |= (regs2[3] & (1 << 23 )) >> 23; /* 0x0800000 */ + __cpu_detect_caps.hasMMX2 |= (regs2[3] & (1 << 22 )) >> 22; /* 0x400000 */ + __cpu_detect_caps.has3DNow = (regs2[3] & (1 << 31 )) >> 31; /* 0x80000000 */ + __cpu_detect_caps.has3DNowExt = (regs2[3] & (1 << 30 )) >> 30; + } + + if (regs[0] >= 0x80000006) { + cpuid(0x80000006, regs2); + __cpu_detect_caps.cacheline = regs2[2] & 0xFF; + } + + +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_CYGWIN) || defined(OS_OPENBSD) + if (__cpu_detect_caps.hasSSE) + check_os_katmai_support(); + + if (!__cpu_detect_caps.hasSSE) { + __cpu_detect_caps.hasSSE2 = 0; + __cpu_detect_caps.hasSSE3 = 0; + __cpu_detect_caps.hasSSSE3 = 0; + } +#else + __cpu_detect_caps.hasSSE = 0; + __cpu_detect_caps.hasSSE2 = 0; + __cpu_detect_caps.hasSSE3 = 0; + __cpu_detect_caps.hasSSSE3 = 0; +#endif +#endif /* ARCH_X86 */ + +#if defined(ARCH_POWERPC) + check_os_altivec_support(); +#endif /* ARCH_POWERPC */ + + __cpu_detect_initialized = 1; +} + +struct cpu_detect_caps *cpu_detect_get_caps() +{ + return &__cpu_detect_caps; +} + +/* The getters and setters for feature flags */ +int cpu_detect_get_tsc() +{ + return __cpu_detect_caps.hasTSC; +} + +int cpu_detect_get_mmx() +{ + return __cpu_detect_caps.hasMMX; +} + +int cpu_detect_get_mmx2() +{ + return __cpu_detect_caps.hasMMX2; +} + +int cpu_detect_get_sse() +{ + return __cpu_detect_caps.hasSSE; +} + +int cpu_detect_get_sse2() +{ + return __cpu_detect_caps.hasSSE2; +} + +int cpu_detect_get_sse3() +{ + return __cpu_detect_caps.hasSSE3; +} + +int cpu_detect_get_ssse3() +{ + return __cpu_detect_caps.hasSSSE3; +} + +int cpu_detect_get_3dnow() +{ + return __cpu_detect_caps.has3DNow; +} + +int cpu_detect_get_3dnow2() +{ + return __cpu_detect_caps.has3DNowExt; +} + +int cpu_detect_get_altivec() +{ + return __cpu_detect_caps.hasAltiVec; +} + diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h new file mode 100644 index 0000000000..1612d49286 --- /dev/null +++ b/src/gallium/auxiliary/util/u_cpu_detect.h @@ -0,0 +1,78 @@ +/************************************************************************** + * + * Copyright 2008 Dennis Smit + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + ***************************************************************************/ + +/* + * Based on the work of Eric Anholt <anholt@FreeBSD.org> + */ + +#ifndef _CPU_DETECT_H +#define _CPU_DETECT_H + +typedef enum { + CPU_DETECT_TYPE_MIPS, + CPU_DETECT_TYPE_ALPHA, + CPU_DETECT_TYPE_SPARC, + CPU_DETECT_TYPE_X86, + CPU_DETECT_TYPE_POWERPC, + CPU_DETECT_TYPE_OTHER +} cpu_detect_type; + +struct cpu_detect_caps { + cpu_detect_type type; + int nrcpu; + + /* Feature flags */ + int x86cpuType; + int cacheline; + + int hasTSC; + int hasMMX; + int hasMMX2; + int hasSSE; + int hasSSE2; + int hasSSE3; + int hasSSSE3; + int has3DNow; + int has3DNowExt; + int hasAltiVec; +}; + +/* prototypes */ +void cpu_detect_initialize(void); +struct cpu_detect_caps *cpu_detect_get_caps(void); + +int cpu_detect_get_tsc(void); +int cpu_detect_get_mmx(void); +int cpu_detect_get_mmx2(void); +int cpu_detect_get_sse(void); +int cpu_detect_get_sse2(void); +int cpu_detect_get_sse3(void); +int cpu_detect_get_ssse3(void); +int cpu_detect_get_3dnow(void); +int cpu_detect_get_3dnow2(void); +int cpu_detect_get_altivec(void); + +#endif /* _CPU_DETECT_H */ diff --git a/src/gallium/auxiliary/util/u_double_list.h b/src/gallium/auxiliary/util/u_double_list.h new file mode 100644 index 0000000000..d108d92e52 --- /dev/null +++ b/src/gallium/auxiliary/util/u_double_list.h @@ -0,0 +1,99 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND. USA. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + **************************************************************************/ + +/** + * \file + * List macros heavily inspired by the Linux kernel + * list handling. No list looping yet. + * + * Is not threadsafe, so common operations need to + * be protected using an external mutex. + */ + +#ifndef _U_DOUBLE_LIST_H_ +#define _U_DOUBLE_LIST_H_ + + +#include <stddef.h> + + +struct list_head +{ + struct list_head *prev; + struct list_head *next; +}; + + +#define LIST_INITHEAD(__item) \ + do { \ + (__item)->prev = (__item); \ + (__item)->next = (__item); \ + } while (0) + +#define LIST_ADD(__item, __list) \ + do { \ + (__item)->prev = (__list); \ + (__item)->next = (__list)->next; \ + (__list)->next->prev = (__item); \ + (__list)->next = (__item); \ + } while (0) + +#define LIST_ADDTAIL(__item, __list) \ + do { \ + (__item)->next = (__list); \ + (__item)->prev = (__list)->prev; \ + (__list)->prev->next = (__item); \ + (__list)->prev = (__item); \ + } while(0) + +#define LIST_REPLACE(__from, __to) \ + do { \ + (__to)->prev = (__from)->prev; \ + (__to)->next = (__from)->next; \ + (__from)->next->prev = (__to); \ + (__from)->prev->next = (__to); \ + } while (0) + +#define LIST_DEL(__item) \ + do { \ + (__item)->prev->next = (__item)->next; \ + (__item)->next->prev = (__item)->prev; \ + } while(0) + +#define LIST_DELINIT(__item) \ + do { \ + (__item)->prev->next = (__item)->next; \ + (__item)->next->prev = (__item)->prev; \ + (__item)->next = (__item); \ + (__item)->prev = (__item); \ + } while(0) + +#define LIST_ENTRY(__type, __item, __field) \ + ((__type *)(((char *)(__item)) - offsetof(__type, __field))) + + +#endif /*_U_DOUBLE_LIST_H_*/ diff --git a/src/gallium/auxiliary/util/u_draw_quad.c b/src/gallium/auxiliary/util/u_draw_quad.c new file mode 100644 index 0000000000..8ecae71b64 --- /dev/null +++ b/src/gallium/auxiliary/util/u_draw_quad.c @@ -0,0 +1,132 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" +#include "util/u_draw_quad.h" + + +/** + * Draw a simple vertex buffer / primitive. + * Limited to float[4] vertex attribs, tightly packed. + */ +void +util_draw_vertex_buffer(struct pipe_context *pipe, + struct pipe_buffer *vbuf, + uint prim_type, + uint num_verts, + uint num_attribs) +{ + struct pipe_vertex_buffer vbuffer; + struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS]; + uint i; + + assert(num_attribs <= PIPE_MAX_ATTRIBS); + + /* tell pipe about the vertex buffer */ + vbuffer.buffer = vbuf; + vbuffer.pitch = num_attribs * 4 * sizeof(float); /* vertex size */ + vbuffer.buffer_offset = 0; + pipe->set_vertex_buffers(pipe, 1, &vbuffer); + + /* tell pipe about the vertex attributes */ + for (i = 0; i < num_attribs; i++) { + velements[i].src_offset = i * 4 * sizeof(float); + velements[i].vertex_buffer_index = 0; + velements[i].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT; + velements[i].nr_components = 4; + } + pipe->set_vertex_elements(pipe, num_attribs, velements); + + /* draw */ + pipe->draw_arrays(pipe, prim_type, 0, num_verts); +} + + + +/** + * Draw screen-aligned textured quad. + * Note: this function allocs/destroys a vertex buffer and isn't especially + * efficient. + */ +void +util_draw_texquad(struct pipe_context *pipe, + float x0, float y0, float x1, float y1, float z) +{ + struct pipe_buffer *vbuf; + uint numAttribs = 2, vertexBytes, i, j; + + vertexBytes = 4 * (4 * numAttribs * sizeof(float)); + + /* XXX create one-time */ + vbuf = pipe_buffer_create(pipe->screen, 32, + PIPE_BUFFER_USAGE_VERTEX, vertexBytes); + if (vbuf) { + float *v = (float *) pipe_buffer_map(pipe->screen, vbuf, + PIPE_BUFFER_USAGE_CPU_WRITE); + if (v) { + /* + * Load vertex buffer + */ + for (i = j = 0; i < 4; i++) { + v[j + 2] = z; /* z */ + v[j + 3] = 1.0; /* w */ + v[j + 6] = 0.0; /* r */ + v[j + 7] = 1.0; /* q */ + j += 8; + } + + v[0] = x0; + v[1] = y0; + v[4] = 0.0; /*s*/ + v[5] = 0.0; /*t*/ + + v[8] = x1; + v[9] = y0; + v[12] = 1.0; + v[13] = 0.0; + + v[16] = x1; + v[17] = y1; + v[20] = 1.0; + v[21] = 1.0; + + v[24] = x0; + v[25] = y1; + v[28] = 0.0; + v[29] = 1.0; + + pipe_buffer_unmap(pipe->screen, vbuf); + util_draw_vertex_buffer(pipe, vbuf, PIPE_PRIM_TRIANGLE_FAN, 4, 2); + } + + pipe_buffer_reference(pipe->screen, &vbuf, NULL); + } +} diff --git a/src/gallium/auxiliary/util/u_draw_quad.h b/src/gallium/auxiliary/util/u_draw_quad.h new file mode 100644 index 0000000000..ec4862ead3 --- /dev/null +++ b/src/gallium/auxiliary/util/u_draw_quad.h @@ -0,0 +1,54 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef U_DRAWQUAD_H +#define U_DRAWQUAD_H + + +#ifdef __cplusplus +extern "C" { +#endif + +struct pipe_buffer; + +extern void +util_draw_vertex_buffer(struct pipe_context *pipe, + struct pipe_buffer *vbuf, + uint prim_type, uint num_verts, uint num_attribs); + + +extern void +util_draw_texquad(struct pipe_context *pipe, + float x0, float y0, float x1, float y1, float z); + + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c new file mode 100644 index 0000000000..9d305ad763 --- /dev/null +++ b/src/gallium/auxiliary/util/u_gen_mipmap.c @@ -0,0 +1,952 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/** + * @file + * Mipmap generation utility + * + * @author Brian Paul + */ + + +#include "pipe/p_context.h" +#include "pipe/p_debug.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" +#include "pipe/p_shader_tokens.h" + +#include "util/u_memory.h" +#include "util/u_draw_quad.h" +#include "util/u_gen_mipmap.h" +#include "util/u_simple_shaders.h" + +#include "tgsi/tgsi_build.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_parse.h" + +#include "cso_cache/cso_context.h" + + +struct gen_mipmap_state +{ + struct pipe_context *pipe; + struct cso_context *cso; + + struct pipe_blend_state blend; + struct pipe_depth_stencil_alpha_state depthstencil; + struct pipe_rasterizer_state rasterizer; + struct pipe_sampler_state sampler; + struct pipe_viewport_state viewport; + + struct pipe_shader_state vert_shader; + struct pipe_shader_state frag_shader; + void *vs; + void *fs; + + struct pipe_buffer *vbuf; /**< quad vertices */ + float vertices[4][2][4]; /**< vertex/texcoords for quad */ +}; + + + +enum dtype +{ + UBYTE, + UBYTE_3_3_2, + USHORT, + USHORT_4_4_4_4, + USHORT_5_6_5, + USHORT_1_5_5_5_REV, + UINT, + FLOAT, + HALF_FLOAT +}; + + +typedef ushort half_float; + + +#if 0 +extern half_float +float_to_half(float f); + +extern float +half_to_float(half_float h); +#endif + + +/** + * Average together two rows of a source image to produce a single new + * row in the dest image. It's legal for the two source rows to point + * to the same data. The source width must be equal to either the + * dest width or two times the dest width. + * \param datatype GL_UNSIGNED_BYTE, GL_UNSIGNED_SHORT, GL_FLOAT, etc. + * \param comps number of components per pixel (1..4) + */ +static void +do_row(enum dtype datatype, uint comps, int srcWidth, + const void *srcRowA, const void *srcRowB, + int dstWidth, void *dstRow) +{ + const uint k0 = (srcWidth == dstWidth) ? 0 : 1; + const uint colStride = (srcWidth == dstWidth) ? 
1 : 2; + + assert(comps >= 1); + assert(comps <= 4); + + /* This assertion is no longer valid with non-power-of-2 textures + assert(srcWidth == dstWidth || srcWidth == 2 * dstWidth); + */ + + if (datatype == UBYTE && comps == 4) { + uint i, j, k; + const ubyte(*rowA)[4] = (const ubyte(*)[4]) srcRowA; + const ubyte(*rowB)[4] = (const ubyte(*)[4]) srcRowB; + ubyte(*dst)[4] = (ubyte(*)[4]) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4; + dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4; + dst[i][2] = (rowA[j][2] + rowA[k][2] + rowB[j][2] + rowB[k][2]) / 4; + dst[i][3] = (rowA[j][3] + rowA[k][3] + rowB[j][3] + rowB[k][3]) / 4; + } + } + else if (datatype == UBYTE && comps == 3) { + uint i, j, k; + const ubyte(*rowA)[3] = (const ubyte(*)[3]) srcRowA; + const ubyte(*rowB)[3] = (const ubyte(*)[3]) srcRowB; + ubyte(*dst)[3] = (ubyte(*)[3]) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4; + dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4; + dst[i][2] = (rowA[j][2] + rowA[k][2] + rowB[j][2] + rowB[k][2]) / 4; + } + } + else if (datatype == UBYTE && comps == 2) { + uint i, j, k; + const ubyte(*rowA)[2] = (const ubyte(*)[2]) srcRowA; + const ubyte(*rowB)[2] = (const ubyte(*)[2]) srcRowB; + ubyte(*dst)[2] = (ubyte(*)[2]) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) >> 2; + dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) >> 2; + } + } + else if (datatype == UBYTE && comps == 1) { + uint i, j, k; + const ubyte *rowA = (const ubyte *) srcRowA; + const ubyte *rowB = (const ubyte *) srcRowB; + ubyte *dst = (ubyte *) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) >> 2; + } + } + + else if (datatype == USHORT && comps == 4) { + uint i, j, k; + const ushort(*rowA)[4] = (const ushort(*)[4]) srcRowA; + const ushort(*rowB)[4] = (const ushort(*)[4]) srcRowB; + ushort(*dst)[4] = (ushort(*)[4]) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4; + dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4; + dst[i][2] = (rowA[j][2] + rowA[k][2] + rowB[j][2] + rowB[k][2]) / 4; + dst[i][3] = (rowA[j][3] + rowA[k][3] + rowB[j][3] + rowB[k][3]) / 4; + } + } + else if (datatype == USHORT && comps == 3) { + uint i, j, k; + const ushort(*rowA)[3] = (const ushort(*)[3]) srcRowA; + const ushort(*rowB)[3] = (const ushort(*)[3]) srcRowB; + ushort(*dst)[3] = (ushort(*)[3]) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + dst[i][0] = (rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4; + dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4; + dst[i][2] = (rowA[j][2] + rowA[k][2] + rowB[j][2] + rowB[k][2]) / 4; + } + } + else if (datatype == USHORT && comps == 2) { + uint i, j, k; + const ushort(*rowA)[2] = (const ushort(*)[2]) srcRowA; + const ushort(*rowB)[2] = (const ushort(*)[2]) srcRowB; + ushort(*dst)[2] = (ushort(*)[2]) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + dst[i][0] = 
(rowA[j][0] + rowA[k][0] + rowB[j][0] + rowB[k][0]) / 4; + dst[i][1] = (rowA[j][1] + rowA[k][1] + rowB[j][1] + rowB[k][1]) / 4; + } + } + else if (datatype == USHORT && comps == 1) { + uint i, j, k; + const ushort *rowA = (const ushort *) srcRowA; + const ushort *rowB = (const ushort *) srcRowB; + ushort *dst = (ushort *) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) / 4; + } + } + + else if (datatype == FLOAT && comps == 4) { + uint i, j, k; + const float(*rowA)[4] = (const float(*)[4]) srcRowA; + const float(*rowB)[4] = (const float(*)[4]) srcRowB; + float(*dst)[4] = (float(*)[4]) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + dst[i][0] = (rowA[j][0] + rowA[k][0] + + rowB[j][0] + rowB[k][0]) * 0.25F; + dst[i][1] = (rowA[j][1] + rowA[k][1] + + rowB[j][1] + rowB[k][1]) * 0.25F; + dst[i][2] = (rowA[j][2] + rowA[k][2] + + rowB[j][2] + rowB[k][2]) * 0.25F; + dst[i][3] = (rowA[j][3] + rowA[k][3] + + rowB[j][3] + rowB[k][3]) * 0.25F; + } + } + else if (datatype == FLOAT && comps == 3) { + uint i, j, k; + const float(*rowA)[3] = (const float(*)[3]) srcRowA; + const float(*rowB)[3] = (const float(*)[3]) srcRowB; + float(*dst)[3] = (float(*)[3]) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + dst[i][0] = (rowA[j][0] + rowA[k][0] + + rowB[j][0] + rowB[k][0]) * 0.25F; + dst[i][1] = (rowA[j][1] + rowA[k][1] + + rowB[j][1] + rowB[k][1]) * 0.25F; + dst[i][2] = (rowA[j][2] + rowA[k][2] + + rowB[j][2] + rowB[k][2]) * 0.25F; + } + } + else if (datatype == FLOAT && comps == 2) { + uint i, j, k; + const float(*rowA)[2] = (const float(*)[2]) srcRowA; + const float(*rowB)[2] = (const float(*)[2]) srcRowB; + float(*dst)[2] = (float(*)[2]) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + dst[i][0] = (rowA[j][0] + rowA[k][0] + + rowB[j][0] + rowB[k][0]) * 0.25F; + dst[i][1] = (rowA[j][1] + rowA[k][1] + + rowB[j][1] + rowB[k][1]) * 0.25F; + } + } + else if (datatype == FLOAT && comps == 1) { + uint i, j, k; + const float *rowA = (const float *) srcRowA; + const float *rowB = (const float *) srcRowB; + float *dst = (float *) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + dst[i] = (rowA[j] + rowA[k] + rowB[j] + rowB[k]) * 0.25F; + } + } + +#if 0 + else if (datatype == HALF_FLOAT && comps == 4) { + uint i, j, k, comp; + const half_float(*rowA)[4] = (const half_float(*)[4]) srcRowA; + const half_float(*rowB)[4] = (const half_float(*)[4]) srcRowB; + half_float(*dst)[4] = (half_float(*)[4]) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + for (comp = 0; comp < 4; comp++) { + float aj, ak, bj, bk; + aj = half_to_float(rowA[j][comp]); + ak = half_to_float(rowA[k][comp]); + bj = half_to_float(rowB[j][comp]); + bk = half_to_float(rowB[k][comp]); + dst[i][comp] = float_to_half((aj + ak + bj + bk) * 0.25F); + } + } + } + else if (datatype == HALF_FLOAT && comps == 3) { + uint i, j, k, comp; + const half_float(*rowA)[3] = (const half_float(*)[3]) srcRowA; + const half_float(*rowB)[3] = (const half_float(*)[3]) srcRowB; + half_float(*dst)[3] = (half_float(*)[3]) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + for (comp = 0; comp < 3; comp++) { + float aj, ak, bj, bk; + aj = half_to_float(rowA[j][comp]); + ak = 
half_to_float(rowA[k][comp]); + bj = half_to_float(rowB[j][comp]); + bk = half_to_float(rowB[k][comp]); + dst[i][comp] = float_to_half((aj + ak + bj + bk) * 0.25F); + } + } + } + else if (datatype == HALF_FLOAT && comps == 2) { + uint i, j, k, comp; + const half_float(*rowA)[2] = (const half_float(*)[2]) srcRowA; + const half_float(*rowB)[2] = (const half_float(*)[2]) srcRowB; + half_float(*dst)[2] = (half_float(*)[2]) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + for (comp = 0; comp < 2; comp++) { + float aj, ak, bj, bk; + aj = half_to_float(rowA[j][comp]); + ak = half_to_float(rowA[k][comp]); + bj = half_to_float(rowB[j][comp]); + bk = half_to_float(rowB[k][comp]); + dst[i][comp] = float_to_half((aj + ak + bj + bk) * 0.25F); + } + } + } + else if (datatype == HALF_FLOAT && comps == 1) { + uint i, j, k; + const half_float *rowA = (const half_float *) srcRowA; + const half_float *rowB = (const half_float *) srcRowB; + half_float *dst = (half_float *) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + float aj, ak, bj, bk; + aj = half_to_float(rowA[j]); + ak = half_to_float(rowA[k]); + bj = half_to_float(rowB[j]); + bk = half_to_float(rowB[k]); + dst[i] = float_to_half((aj + ak + bj + bk) * 0.25F); + } + } +#endif + + else if (datatype == UINT && comps == 1) { + uint i, j, k; + const uint *rowA = (const uint *) srcRowA; + const uint *rowB = (const uint *) srcRowB; + uint *dst = (uint *) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + dst[i] = rowA[j] / 4 + rowA[k] / 4 + rowB[j] / 4 + rowB[k] / 4; + } + } + + else if (datatype == USHORT_5_6_5 && comps == 3) { + uint i, j, k; + const ushort *rowA = (const ushort *) srcRowA; + const ushort *rowB = (const ushort *) srcRowB; + ushort *dst = (ushort *) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + const int rowAr0 = rowA[j] & 0x1f; + const int rowAr1 = rowA[k] & 0x1f; + const int rowBr0 = rowB[j] & 0x1f; + const int rowBr1 = rowB[k] & 0x1f; + const int rowAg0 = (rowA[j] >> 5) & 0x3f; + const int rowAg1 = (rowA[k] >> 5) & 0x3f; + const int rowBg0 = (rowB[j] >> 5) & 0x3f; + const int rowBg1 = (rowB[k] >> 5) & 0x3f; + const int rowAb0 = (rowA[j] >> 11) & 0x1f; + const int rowAb1 = (rowA[k] >> 11) & 0x1f; + const int rowBb0 = (rowB[j] >> 11) & 0x1f; + const int rowBb1 = (rowB[k] >> 11) & 0x1f; + const int red = (rowAr0 + rowAr1 + rowBr0 + rowBr1) >> 2; + const int green = (rowAg0 + rowAg1 + rowBg0 + rowBg1) >> 2; + const int blue = (rowAb0 + rowAb1 + rowBb0 + rowBb1) >> 2; + dst[i] = (blue << 11) | (green << 5) | red; + } + } + else if (datatype == USHORT_4_4_4_4 && comps == 4) { + uint i, j, k; + const ushort *rowA = (const ushort *) srcRowA; + const ushort *rowB = (const ushort *) srcRowB; + ushort *dst = (ushort *) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + const int rowAr0 = rowA[j] & 0xf; + const int rowAr1 = rowA[k] & 0xf; + const int rowBr0 = rowB[j] & 0xf; + const int rowBr1 = rowB[k] & 0xf; + const int rowAg0 = (rowA[j] >> 4) & 0xf; + const int rowAg1 = (rowA[k] >> 4) & 0xf; + const int rowBg0 = (rowB[j] >> 4) & 0xf; + const int rowBg1 = (rowB[k] >> 4) & 0xf; + const int rowAb0 = (rowA[j] >> 8) & 0xf; + const int rowAb1 = (rowA[k] >> 8) & 0xf; + const int rowBb0 = (rowB[j] >> 8) & 0xf; + const int rowBb1 = (rowB[k] >> 8) & 0xf; + const int rowAa0 = (rowA[j] >> 12) & 0xf; + const int rowAa1 
= (rowA[k] >> 12) & 0xf; + const int rowBa0 = (rowB[j] >> 12) & 0xf; + const int rowBa1 = (rowB[k] >> 12) & 0xf; + const int red = (rowAr0 + rowAr1 + rowBr0 + rowBr1) >> 2; + const int green = (rowAg0 + rowAg1 + rowBg0 + rowBg1) >> 2; + const int blue = (rowAb0 + rowAb1 + rowBb0 + rowBb1) >> 2; + const int alpha = (rowAa0 + rowAa1 + rowBa0 + rowBa1) >> 2; + dst[i] = (alpha << 12) | (blue << 8) | (green << 4) | red; + } + } + else if (datatype == USHORT_1_5_5_5_REV && comps == 4) { + uint i, j, k; + const ushort *rowA = (const ushort *) srcRowA; + const ushort *rowB = (const ushort *) srcRowB; + ushort *dst = (ushort *) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + const int rowAr0 = rowA[j] & 0x1f; + const int rowAr1 = rowA[k] & 0x1f; + const int rowBr0 = rowB[j] & 0x1f; + const int rowBr1 = rowB[k] & 0x1f; + const int rowAg0 = (rowA[j] >> 5) & 0x1f; + const int rowAg1 = (rowA[k] >> 5) & 0x1f; + const int rowBg0 = (rowB[j] >> 5) & 0x1f; + const int rowBg1 = (rowB[k] >> 5) & 0x1f; + const int rowAb0 = (rowA[j] >> 10) & 0x1f; + const int rowAb1 = (rowA[k] >> 10) & 0x1f; + const int rowBb0 = (rowB[j] >> 10) & 0x1f; + const int rowBb1 = (rowB[k] >> 10) & 0x1f; + const int rowAa0 = (rowA[j] >> 15) & 0x1; + const int rowAa1 = (rowA[k] >> 15) & 0x1; + const int rowBa0 = (rowB[j] >> 15) & 0x1; + const int rowBa1 = (rowB[k] >> 15) & 0x1; + const int red = (rowAr0 + rowAr1 + rowBr0 + rowBr1) >> 2; + const int green = (rowAg0 + rowAg1 + rowBg0 + rowBg1) >> 2; + const int blue = (rowAb0 + rowAb1 + rowBb0 + rowBb1) >> 2; + const int alpha = (rowAa0 + rowAa1 + rowBa0 + rowBa1) >> 2; + dst[i] = (alpha << 15) | (blue << 10) | (green << 5) | red; + } + } + else if (datatype == UBYTE_3_3_2 && comps == 3) { + uint i, j, k; + const ubyte *rowA = (const ubyte *) srcRowA; + const ubyte *rowB = (const ubyte *) srcRowB; + ubyte *dst = (ubyte *) dstRow; + for (i = j = 0, k = k0; i < (uint) dstWidth; + i++, j += colStride, k += colStride) { + const int rowAr0 = rowA[j] & 0x3; + const int rowAr1 = rowA[k] & 0x3; + const int rowBr0 = rowB[j] & 0x3; + const int rowBr1 = rowB[k] & 0x3; + const int rowAg0 = (rowA[j] >> 2) & 0x7; + const int rowAg1 = (rowA[k] >> 2) & 0x7; + const int rowBg0 = (rowB[j] >> 2) & 0x7; + const int rowBg1 = (rowB[k] >> 2) & 0x7; + const int rowAb0 = (rowA[j] >> 5) & 0x7; + const int rowAb1 = (rowA[k] >> 5) & 0x7; + const int rowBb0 = (rowB[j] >> 5) & 0x7; + const int rowBb1 = (rowB[k] >> 5) & 0x7; + const int red = (rowAr0 + rowAr1 + rowBr0 + rowBr1) >> 2; + const int green = (rowAg0 + rowAg1 + rowBg0 + rowBg1) >> 2; + const int blue = (rowAb0 + rowAb1 + rowBb0 + rowBb1) >> 2; + dst[i] = (blue << 5) | (green << 2) | red; + } + } + else { + debug_printf("bad format in do_row()"); + } +} + + +static void +format_to_type_comps(enum pipe_format pformat, + enum dtype *datatype, uint *comps) +{ + switch (pformat) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_X8R8G8B8_UNORM: + case PIPE_FORMAT_B8G8R8A8_UNORM: + case PIPE_FORMAT_B8G8R8X8_UNORM: + *datatype = UBYTE; + *comps = 4; + return; + case PIPE_FORMAT_A1R5G5B5_UNORM: + *datatype = USHORT_1_5_5_5_REV; + *comps = 4; + return; + case PIPE_FORMAT_A4R4G4B4_UNORM: + *datatype = USHORT_4_4_4_4; + *comps = 4; + return; + case PIPE_FORMAT_R5G6B5_UNORM: + *datatype = USHORT_5_6_5; + *comps = 3; + return; + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_A8_UNORM: + case PIPE_FORMAT_I8_UNORM: + *datatype = UBYTE; + *comps = 1; + return; + case PIPE_FORMAT_A8L8_UNORM: + *datatype = UBYTE; + *comps = 
2; + return; + default: + assert(0); + *datatype = UBYTE; + *comps = 0; + break; + } +} + + +static void +reduce_1d(enum pipe_format pformat, + int srcWidth, const ubyte *srcPtr, + int dstWidth, ubyte *dstPtr) +{ + enum dtype datatype; + uint comps; + + format_to_type_comps(pformat, &datatype, &comps); + + /* we just duplicate the input row, kind of hack, saves code */ + do_row(datatype, comps, + srcWidth, srcPtr, srcPtr, + dstWidth, dstPtr); +} + + +/** + * Strides are in bytes. If zero, it'll be computed as width * bpp. + */ +static void +reduce_2d(enum pipe_format pformat, + int srcWidth, int srcHeight, + int srcRowStride, const ubyte *srcPtr, + int dstWidth, int dstHeight, + int dstRowStride, ubyte *dstPtr) +{ + enum dtype datatype; + uint comps; + const int bpt = pf_get_size(pformat); + const ubyte *srcA, *srcB; + ubyte *dst; + int row; + + format_to_type_comps(pformat, &datatype, &comps); + + if (!srcRowStride) + srcRowStride = bpt * srcWidth; + + if (!dstRowStride) + dstRowStride = bpt * dstWidth; + + /* Compute src and dst pointers */ + srcA = srcPtr; + if (srcHeight > 1) + srcB = srcA + srcRowStride; + else + srcB = srcA; + dst = dstPtr; + + for (row = 0; row < dstHeight; row++) { + do_row(datatype, comps, + srcWidth, srcA, srcB, + dstWidth, dst); + srcA += 2 * srcRowStride; + srcB += 2 * srcRowStride; + dst += dstRowStride; + } +} + + +static void +make_1d_mipmap(struct gen_mipmap_state *ctx, + struct pipe_texture *pt, + uint face, uint baseLevel, uint lastLevel) +{ + struct pipe_context *pipe = ctx->pipe; + struct pipe_screen *screen = pipe->screen; + const uint zslice = 0; + uint dstLevel; + + for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) { + const uint srcLevel = dstLevel - 1; + struct pipe_surface *srcSurf, *dstSurf; + void *srcMap, *dstMap; + + srcSurf = screen->get_tex_surface(screen, pt, face, srcLevel, zslice, + PIPE_BUFFER_USAGE_CPU_READ); + + dstSurf = screen->get_tex_surface(screen, pt, face, dstLevel, zslice, + PIPE_BUFFER_USAGE_CPU_WRITE); + + srcMap = ((ubyte *) pipe_buffer_map(screen, srcSurf->buffer, + PIPE_BUFFER_USAGE_CPU_READ) + + srcSurf->offset); + dstMap = ((ubyte *) pipe_buffer_map(screen, dstSurf->buffer, + PIPE_BUFFER_USAGE_CPU_WRITE) + + dstSurf->offset); + + reduce_1d(pt->format, + srcSurf->width, srcMap, + dstSurf->width, dstMap); + + pipe_buffer_unmap(screen, srcSurf->buffer); + pipe_buffer_unmap(screen, dstSurf->buffer); + + pipe_surface_reference(&srcSurf, NULL); + pipe_surface_reference(&dstSurf, NULL); + } +} + + +static void +make_2d_mipmap(struct gen_mipmap_state *ctx, + struct pipe_texture *pt, + uint face, uint baseLevel, uint lastLevel) +{ + struct pipe_context *pipe = ctx->pipe; + struct pipe_screen *screen = pipe->screen; + const uint zslice = 0; + uint dstLevel; + + assert(pt->block.width == 1); + assert(pt->block.height == 1); + + for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) { + const uint srcLevel = dstLevel - 1; + struct pipe_surface *srcSurf, *dstSurf; + ubyte *srcMap, *dstMap; + + srcSurf = screen->get_tex_surface(screen, pt, face, srcLevel, zslice, + PIPE_BUFFER_USAGE_CPU_READ); + dstSurf = screen->get_tex_surface(screen, pt, face, dstLevel, zslice, + PIPE_BUFFER_USAGE_CPU_WRITE); + + srcMap = ((ubyte *) pipe_buffer_map(screen, srcSurf->buffer, + PIPE_BUFFER_USAGE_CPU_READ) + + srcSurf->offset); + dstMap = ((ubyte *) pipe_buffer_map(screen, dstSurf->buffer, + PIPE_BUFFER_USAGE_CPU_WRITE) + + dstSurf->offset); + + reduce_2d(pt->format, + srcSurf->width, srcSurf->height, + srcSurf->stride, 
srcMap, + dstSurf->width, dstSurf->height, + dstSurf->stride, dstMap); + + pipe_buffer_unmap(screen, srcSurf->buffer); + pipe_buffer_unmap(screen, dstSurf->buffer); + + pipe_surface_reference(&srcSurf, NULL); + pipe_surface_reference(&dstSurf, NULL); + } +} + + +static void +make_3d_mipmap(struct gen_mipmap_state *ctx, + struct pipe_texture *pt, + uint face, uint baseLevel, uint lastLevel) +{ +} + + +static void +fallback_gen_mipmap(struct gen_mipmap_state *ctx, + struct pipe_texture *pt, + uint face, uint baseLevel, uint lastLevel) +{ + switch (pt->target) { + case PIPE_TEXTURE_1D: + make_1d_mipmap(ctx, pt, face, baseLevel, lastLevel); + break; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_CUBE: + make_2d_mipmap(ctx, pt, face, baseLevel, lastLevel); + break; + case PIPE_TEXTURE_3D: + make_3d_mipmap(ctx, pt, face, baseLevel, lastLevel); + break; + default: + assert(0); + } +} + + +/** + * Create a mipmap generation context. + * The idea is to create one of these and re-use it each time we need to + * generate a mipmap. + */ +struct gen_mipmap_state * +util_create_gen_mipmap(struct pipe_context *pipe, + struct cso_context *cso) +{ + struct gen_mipmap_state *ctx; + uint i; + + ctx = CALLOC_STRUCT(gen_mipmap_state); + if (!ctx) + return NULL; + + ctx->pipe = pipe; + ctx->cso = cso; + + /* disabled blending/masking */ + memset(&ctx->blend, 0, sizeof(ctx->blend)); + ctx->blend.rgb_src_factor = PIPE_BLENDFACTOR_ONE; + ctx->blend.alpha_src_factor = PIPE_BLENDFACTOR_ONE; + ctx->blend.rgb_dst_factor = PIPE_BLENDFACTOR_ZERO; + ctx->blend.alpha_dst_factor = PIPE_BLENDFACTOR_ZERO; + ctx->blend.colormask = PIPE_MASK_RGBA; + + /* no-op depth/stencil/alpha */ + memset(&ctx->depthstencil, 0, sizeof(ctx->depthstencil)); + + /* rasterizer */ + memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer)); + ctx->rasterizer.front_winding = PIPE_WINDING_CW; + ctx->rasterizer.cull_mode = PIPE_WINDING_NONE; + ctx->rasterizer.bypass_clipping = 1; + /*ctx->rasterizer.bypass_vs = 1;*/ + ctx->rasterizer.gl_rasterization_rules = 1; + + /* sampler state */ + memset(&ctx->sampler, 0, sizeof(ctx->sampler)); + ctx->sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + ctx->sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + ctx->sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + ctx->sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NEAREST; + ctx->sampler.normalized_coords = 1; + + /* viewport state (identity, verts are in wincoords) */ + ctx->viewport.scale[0] = 1.0; + ctx->viewport.scale[1] = 1.0; + ctx->viewport.scale[2] = 1.0; + ctx->viewport.scale[3] = 1.0; + ctx->viewport.translate[0] = 0.0; + ctx->viewport.translate[1] = 0.0; + ctx->viewport.translate[2] = 0.0; + ctx->viewport.translate[3] = 0.0; + + /* vertex shader */ + { + const uint semantic_names[] = { TGSI_SEMANTIC_POSITION, + TGSI_SEMANTIC_GENERIC }; + const uint semantic_indexes[] = { 0, 0 }; + ctx->vs = util_make_vertex_passthrough_shader(pipe, 2, semantic_names, + semantic_indexes, + &ctx->vert_shader); + } + + /* fragment shader */ + ctx->fs = util_make_fragment_tex_shader(pipe, &ctx->frag_shader); + + ctx->vbuf = pipe_buffer_create(pipe->screen, + 32, + PIPE_BUFFER_USAGE_VERTEX, + sizeof(ctx->vertices)); + if (!ctx->vbuf) { + FREE(ctx); + return NULL; + } + + /* vertex data that doesn't change */ + for (i = 0; i < 4; i++) { + ctx->vertices[i][0][2] = 0.0f; /* z */ + ctx->vertices[i][0][3] = 1.0f; /* w */ + ctx->vertices[i][1][2] = 0.0f; /* r */ + ctx->vertices[i][1][3] = 1.0f; /* q */ + } + + return ctx; +} + + +static void +set_vertex_data(struct gen_mipmap_state *ctx, float 
width, float height) +{ + void *buf; + + ctx->vertices[0][0][0] = 0.0f; /*x*/ + ctx->vertices[0][0][1] = 0.0f; /*y*/ + ctx->vertices[0][1][0] = 0.0f; /*s*/ + ctx->vertices[0][1][1] = 0.0f; /*t*/ + + ctx->vertices[1][0][0] = width; + ctx->vertices[1][0][1] = 0.0f; + ctx->vertices[1][1][0] = 1.0f; + ctx->vertices[1][1][1] = 0.0f; + + ctx->vertices[2][0][0] = width; + ctx->vertices[2][0][1] = height; + ctx->vertices[2][1][0] = 1.0f; + ctx->vertices[2][1][1] = 1.0f; + + ctx->vertices[3][0][0] = 0.0f; + ctx->vertices[3][0][1] = height; + ctx->vertices[3][1][0] = 0.0f; + ctx->vertices[3][1][1] = 1.0f; + + buf = pipe_buffer_map(ctx->pipe->screen, ctx->vbuf, + PIPE_BUFFER_USAGE_CPU_WRITE); + + memcpy(buf, ctx->vertices, sizeof(ctx->vertices)); + + pipe_buffer_unmap(ctx->pipe->screen, ctx->vbuf); +} + + + +/** + * Destroy a mipmap generation context + */ +void +util_destroy_gen_mipmap(struct gen_mipmap_state *ctx) +{ + struct pipe_context *pipe = ctx->pipe; + + pipe->delete_vs_state(pipe, ctx->vs); + pipe->delete_fs_state(pipe, ctx->fs); + + FREE((void*) ctx->vert_shader.tokens); + FREE((void*) ctx->frag_shader.tokens); + + pipe_buffer_reference(pipe->screen, &ctx->vbuf, NULL); + + FREE(ctx); +} + + +/** + * Generate mipmap images. It's assumed all needed texture memory is + * already allocated. + * + * \param pt the texture to generate mipmap levels for + * \param face which cube face to generate mipmaps for (0 for non-cube maps) + * \param baseLevel the first mipmap level to use as a src + * \param lastLevel the last mipmap level to generate + * \param filter the minification filter used to generate mipmap levels with + * \param filter one of PIPE_TEX_FILTER_LINEAR, PIPE_TEX_FILTER_NEAREST + */ +void +util_gen_mipmap(struct gen_mipmap_state *ctx, + struct pipe_texture *pt, + uint face, uint baseLevel, uint lastLevel, uint filter) +{ + struct pipe_context *pipe = ctx->pipe; + struct pipe_screen *screen = pipe->screen; + struct pipe_framebuffer_state fb; + uint dstLevel; + uint zslice = 0; + + /* check if we can render in the texture's format */ + if (!screen->is_format_supported(screen, pt->format, PIPE_TEXTURE_2D, + PIPE_TEXTURE_USAGE_RENDER_TARGET, 0)) { + fallback_gen_mipmap(ctx, pt, face, baseLevel, lastLevel); + return; + } + + /* save state (restored below) */ + cso_save_blend(ctx->cso); + cso_save_depth_stencil_alpha(ctx->cso); + cso_save_rasterizer(ctx->cso); + cso_save_samplers(ctx->cso); + cso_save_sampler_textures(ctx->cso); + cso_save_framebuffer(ctx->cso); + cso_save_fragment_shader(ctx->cso); + cso_save_vertex_shader(ctx->cso); + cso_save_viewport(ctx->cso); + + /* bind our state */ + cso_set_blend(ctx->cso, &ctx->blend); + cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil); + cso_set_rasterizer(ctx->cso, &ctx->rasterizer); + cso_set_viewport(ctx->cso, &ctx->viewport); + + cso_set_fragment_shader_handle(ctx->cso, ctx->fs); + cso_set_vertex_shader_handle(ctx->cso, ctx->vs); + + /* init framebuffer state */ + memset(&fb, 0, sizeof(fb)); + fb.num_cbufs = 1; + + /* set min/mag to same filter for faster sw speed */ + ctx->sampler.mag_img_filter = filter; + ctx->sampler.min_img_filter = filter; + + /* + * XXX for small mipmap levels, it may be faster to use the software + * fallback path... 
+ */ + for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) { + const uint srcLevel = dstLevel - 1; + + struct pipe_surface *surf = + screen->get_tex_surface(screen, pt, face, dstLevel, zslice, + PIPE_BUFFER_USAGE_GPU_WRITE); + + /* + * Setup framebuffer / dest surface + */ + fb.cbufs[0] = surf; + fb.width = pt->width[dstLevel]; + fb.height = pt->height[dstLevel]; + cso_set_framebuffer(ctx->cso, &fb); + + /* + * Setup sampler state + * Note: we should only have to set the min/max LOD clamps to ensure + * we grab texels from the right mipmap level. But some hardware + * has trouble with min clamping so we also set the lod_bias to + * try to work around that. + */ + ctx->sampler.min_lod = ctx->sampler.max_lod = (float) srcLevel; + ctx->sampler.lod_bias = (float) srcLevel; + cso_single_sampler(ctx->cso, 0, &ctx->sampler); + cso_single_sampler_done(ctx->cso); + + cso_set_sampler_textures(ctx->cso, 1, &pt); + + /* quad coords in window coords (bypassing clipping, viewport mapping) */ + set_vertex_data(ctx, + (float) pt->width[dstLevel], + (float) pt->height[dstLevel]); + util_draw_vertex_buffer(ctx->pipe, ctx->vbuf, + PIPE_PRIM_TRIANGLE_FAN, + 4, /* verts */ + 2); /* attribs/vert */ + + pipe->flush(pipe, PIPE_FLUSH_RENDER_CACHE, NULL); + + /* need to signal that the texture has changed _after_ rendering to it */ + pipe_surface_reference( &surf, NULL ); + } + + /* restore state we changed */ + cso_restore_blend(ctx->cso); + cso_restore_depth_stencil_alpha(ctx->cso); + cso_restore_rasterizer(ctx->cso); + cso_restore_samplers(ctx->cso); + cso_restore_sampler_textures(ctx->cso); + cso_restore_framebuffer(ctx->cso); + cso_restore_fragment_shader(ctx->cso); + cso_restore_vertex_shader(ctx->cso); + cso_restore_viewport(ctx->cso); +} diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.h b/src/gallium/auxiliary/util/u_gen_mipmap.h new file mode 100644 index 0000000000..3277024f07 --- /dev/null +++ b/src/gallium/auxiliary/util/u_gen_mipmap.h @@ -0,0 +1,65 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef U_GENMIPMAP_H +#define U_GENMIPMAP_H + +#include "pipe/p_state.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct pipe_context; +struct pipe_texture; +struct cso_context; + +struct gen_mipmap_state; + + +extern struct gen_mipmap_state * +util_create_gen_mipmap(struct pipe_context *pipe, struct cso_context *cso); + + +extern void +util_destroy_gen_mipmap(struct gen_mipmap_state *ctx); + + + +extern void +util_gen_mipmap(struct gen_mipmap_state *ctx, + struct pipe_texture *pt, + uint face, uint baseLevel, uint lastLevel, uint filter); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gallium/auxiliary/util/u_handle_table.c b/src/gallium/auxiliary/util/u_handle_table.c new file mode 100644 index 0000000000..2d15932ce3 --- /dev/null +++ b/src/gallium/auxiliary/util/u_handle_table.c @@ -0,0 +1,293 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Generic handle table implementation. + * + * @author José Fonseca <jrfonseca@tungstengraphics.com> + */ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" + +#include "util/u_memory.h" +#include "util/u_handle_table.h" + + +#define HANDLE_TABLE_INITIAL_SIZE 16 + + +struct handle_table +{ + /** Object array. 
Empty handles have a null object */ + void **objects; + + /** Number of objects the handle can currently hold */ + unsigned size; + /** Number of consecutive objects allocated at the start of the table */ + unsigned filled; + + /** Optional object destructor */ + void (*destroy)(void *object); +}; + + +struct handle_table * +handle_table_create(void) +{ + struct handle_table *ht; + + ht = MALLOC_STRUCT(handle_table); + if(!ht) + return NULL; + + ht->objects = (void **)CALLOC(HANDLE_TABLE_INITIAL_SIZE, sizeof(void *)); + if(!ht->objects) { + FREE(ht); + return NULL; + } + + ht->size = HANDLE_TABLE_INITIAL_SIZE; + ht->filled = 0; + + ht->destroy = NULL; + + return ht; +} + + +void +handle_table_set_destroy(struct handle_table *ht, + void (*destroy)(void *object)) +{ + assert(ht); + ht->destroy = destroy; +} + + +/** + * Resize the table if necessary + */ +static INLINE int +handle_table_resize(struct handle_table *ht, + unsigned minimum_size) +{ + unsigned new_size; + void **new_objects; + + if(ht->size > minimum_size) + return ht->size; + + new_size = ht->size; + while(!(new_size > minimum_size)) + new_size *= 2; + assert(new_size); + + new_objects = (void **)REALLOC((void *)ht->objects, + ht->size*sizeof(void *), + new_size*sizeof(void *)); + if(!new_objects) + return 0; + + memset(new_objects + ht->size, 0, (new_size - ht->size)*sizeof(void *)); + + ht->size = new_size; + ht->objects = new_objects; + + return ht->size; +} + + +static INLINE void +handle_table_clear(struct handle_table *ht, + unsigned index) +{ + void *object; + + /* The order here is important so that the object being destroyed is not + * present in the table when seen by the destroy callback, because the + * destroy callback may directly or indirectly call the other functions in + * this module. 
+ */ + + object = ht->objects[index]; + if(object) { + ht->objects[index] = NULL; + + if(ht->destroy) + ht->destroy(object); + } +} + + +unsigned +handle_table_add(struct handle_table *ht, + void *object) +{ + unsigned index; + unsigned handle; + + assert(ht); + assert(object); + if(!object) + return 0; + + /* linear search for an empty handle */ + while(ht->filled < ht->size) { + if(!ht->objects[ht->filled]) + break; + ++ht->filled; + } + + index = ht->filled; + handle = index + 1; + + /* check integer overflow */ + if(!handle) + return 0; + + /* grow the table if necessary */ + if(!handle_table_resize(ht, index)) + return 0; + + assert(!ht->objects[index]); + ht->objects[index] = object; + ++ht->filled; + + return handle; +} + + +unsigned +handle_table_set(struct handle_table *ht, + unsigned handle, + void *object) +{ + unsigned index; + + assert(ht); + assert(handle); + if(!handle) + return 0; + + assert(object); + if(!object) + return 0; + + index = handle - 1; + + /* grow the table if necessary */ + if(!handle_table_resize(ht, index)) + return 0; + + handle_table_clear(ht, index); + + ht->objects[index] = object; + + return handle; +} + + +void * +handle_table_get(struct handle_table *ht, + unsigned handle) +{ + void *object; + + assert(ht); + assert(handle); + if(!handle || handle > ht->size) + return NULL; + + object = ht->objects[handle - 1]; + + return object; +} + + +void +handle_table_remove(struct handle_table *ht, + unsigned handle) +{ + void *object; + unsigned index; + + assert(ht); + assert(handle); + if(!handle || handle > ht->size) + return; + + index = handle - 1; + object = ht->objects[index]; + if(!object) + return; + + handle_table_clear(ht, index); + + if(index < ht->filled) + ht->filled = index; +} + + +unsigned +handle_table_get_next_handle(struct handle_table *ht, + unsigned handle) +{ + unsigned index; + + for(index = handle; index < ht->size; ++index) { + if(ht->objects[index]) + return index + 1; + } + + return 0; +} + + +unsigned +handle_table_get_first_handle(struct handle_table *ht) +{ + return handle_table_get_next_handle(ht, 0); +} + + +void +handle_table_destroy(struct handle_table *ht) +{ + unsigned index; + assert(ht); + + if(ht->destroy) + for(index = 0; index < ht->size; ++index) + handle_table_clear(ht, index); + + FREE(ht->objects); + FREE(ht); +} + diff --git a/src/gallium/auxiliary/util/u_handle_table.h b/src/gallium/auxiliary/util/u_handle_table.h new file mode 100644 index 0000000000..d080135c9f --- /dev/null +++ b/src/gallium/auxiliary/util/u_handle_table.h @@ -0,0 +1,116 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
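With the implementation complete, a minimal usage sketch of the handle-table API (illustrative only; the object and destructor arguments are hypothetical):

   #include "pipe/p_debug.h"
   #include "util/u_handle_table.h"

   /* Illustrative sketch only -- not part of the original commit. */
   static void
   example_handle_table(void *object, void (*destroy_object)(void *))
   {
      struct handle_table *ht = handle_table_create();
      unsigned handle;

      handle_table_set_destroy(ht, destroy_object);  /* optional destructor    */

      handle = handle_table_add(ht, object);         /* returns 0 on failure   */
      assert(handle_table_get(ht, handle) == object);

      handle_table_remove(ht, handle);               /* invokes destroy_object */
      handle_table_destroy(ht);                      /* frees remaining entries */
   }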
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Generic handle table. + * + * @author José Fonseca <jrfonseca@tungstengraphics.com> + */ + +#ifndef U_HANDLE_TABLE_H_ +#define U_HANDLE_TABLE_H_ + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * Abstract data type to map integer handles to objects. + * + * Also referred as "pointer array". + */ +struct handle_table; + + +struct handle_table * +handle_table_create(void); + + +/** + * Set an optional destructor callback. + * + * If set, it will be called during handle_table_remove and + * handle_table_destroy calls. + */ +void +handle_table_set_destroy(struct handle_table *ht, + void (*destroy)(void *object)); + + +/** + * Add a new object. + * + * Returns a zero handle on failure (out of memory). + */ +unsigned +handle_table_add(struct handle_table *ht, + void *object); + +/** + * Returns zero on failure (out of memory). + */ +unsigned +handle_table_set(struct handle_table *ht, + unsigned handle, + void *object); + +/** + * Fetch an existing object. + * + * Returns NULL for an invalid handle. + */ +void * +handle_table_get(struct handle_table *ht, + unsigned handle); + + +void +handle_table_remove(struct handle_table *ht, + unsigned handle); + + +void +handle_table_destroy(struct handle_table *ht); + + +unsigned +handle_table_get_first_handle(struct handle_table *ht); + + +unsigned +handle_table_get_next_handle(struct handle_table *ht, + unsigned handle); + + +#ifdef __cplusplus +} +#endif + +#endif /* U_HANDLE_TABLE_H_ */ diff --git a/src/gallium/auxiliary/util/u_hash_table.c b/src/gallium/auxiliary/util/u_hash_table.c new file mode 100644 index 0000000000..0bc8de9632 --- /dev/null +++ b/src/gallium/auxiliary/util/u_hash_table.c @@ -0,0 +1,280 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * General purpose hash table implementation. + * + * Just uses the cso_hash for now, but it might be better switch to a linear + * probing hash table implementation at some point -- as it is said they have + * better lookup and cache performance and it appears to be possible to write + * a lock-free implementation of such hash tables . + * + * @author José Fonseca <jrfonseca@tungstengraphics.com> + */ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" + +#include "cso_cache/cso_hash.h" + +#include "util/u_memory.h" +#include "util/u_hash_table.h" + + +struct hash_table +{ + struct cso_hash *cso; + + /** Hash function */ + unsigned (*hash)(void *key); + + /** Compare two keys */ + int (*compare)(void *key1, void *key2); + + /* TODO: key, value destructors? */ +}; + + +struct hash_table_item +{ + void *key; + void *value; +}; + + +static INLINE struct hash_table_item * +hash_table_item(struct cso_hash_iter iter) +{ + return (struct hash_table_item *)cso_hash_iter_data(iter); +} + + +struct hash_table * +hash_table_create(unsigned (*hash)(void *key), + int (*compare)(void *key1, void *key2)) +{ + struct hash_table *ht; + + ht = MALLOC_STRUCT(hash_table); + if(!ht) + return NULL; + + ht->cso = cso_hash_create(); + if(!ht->cso) { + FREE(ht); + return NULL; + } + + ht->hash = hash; + ht->compare = compare; + + return ht; +} + + +static INLINE struct cso_hash_iter +hash_table_find_iter(struct hash_table *ht, + void *key, + unsigned key_hash) +{ + struct cso_hash_iter iter; + struct hash_table_item *item; + + iter = cso_hash_find(ht->cso, key_hash); + while (!cso_hash_iter_is_null(iter)) { + item = (struct hash_table_item *)cso_hash_iter_data(iter); + if (!ht->compare(item->key, key)) + break; + iter = cso_hash_iter_next(iter); + } + + return iter; +} + + +static INLINE struct hash_table_item * +hash_table_find_item(struct hash_table *ht, + void *key, + unsigned key_hash) +{ + struct cso_hash_iter iter; + struct hash_table_item *item; + + iter = cso_hash_find(ht->cso, key_hash); + while (!cso_hash_iter_is_null(iter)) { + item = (struct hash_table_item *)cso_hash_iter_data(iter); + if (!ht->compare(item->key, key)) + return item; + iter = cso_hash_iter_next(iter); + } + + return NULL; +} + + +enum pipe_error +hash_table_set(struct hash_table *ht, + void *key, + void *value) +{ + unsigned key_hash; + struct hash_table_item *item; + struct cso_hash_iter iter; + + assert(ht); + + key_hash = ht->hash(key); + + item = hash_table_find_item(ht, key, key_hash); + if(item) { + /* TODO: key/value destruction? 
*/ + item->value = value; + return PIPE_OK; + } + + item = MALLOC_STRUCT(hash_table_item); + if(!item) + return PIPE_ERROR_OUT_OF_MEMORY; + + item->key = key; + item->value = value; + + iter = cso_hash_insert(ht->cso, key_hash, item); + if(cso_hash_iter_is_null(iter)) { + FREE(item); + return PIPE_ERROR_OUT_OF_MEMORY; + } + + return PIPE_OK; +} + + +void * +hash_table_get(struct hash_table *ht, + void *key) +{ + unsigned key_hash; + struct hash_table_item *item; + + assert(ht); + + key_hash = ht->hash(key); + + item = hash_table_find_item(ht, key, key_hash); + if(!item) + return NULL; + + return item->value; +} + + +void +hash_table_remove(struct hash_table *ht, + void *key) +{ + unsigned key_hash; + struct cso_hash_iter iter; + struct hash_table_item *item; + + assert(ht); + + key_hash = ht->hash(key); + + iter = hash_table_find_iter(ht, key, key_hash); + if(cso_hash_iter_is_null(iter)) + return; + + item = hash_table_item(iter); + assert(item); + FREE(item); + + cso_hash_erase(ht->cso, iter); +} + + +void +hash_table_clear(struct hash_table *ht) +{ + struct cso_hash_iter iter; + struct hash_table_item *item; + + assert(ht); + + iter = cso_hash_first_node(ht->cso); + while (!cso_hash_iter_is_null(iter)) { + item = (struct hash_table_item *)cso_hash_take(ht->cso, cso_hash_iter_key(iter)); + FREE(item); + iter = cso_hash_first_node(ht->cso); + } +} + + +enum pipe_error +hash_table_foreach(struct hash_table *ht, + enum pipe_error (*callback)(void *key, void *value, void *data), + void *data) +{ + struct cso_hash_iter iter; + struct hash_table_item *item; + enum pipe_error result; + + assert(ht); + + iter = cso_hash_first_node(ht->cso); + while (!cso_hash_iter_is_null(iter)) { + item = (struct hash_table_item *)cso_hash_iter_data(iter); + result = callback(item->key, item->value, data); + if(result != PIPE_OK) + return result; + iter = cso_hash_iter_next(iter); + } + + return PIPE_OK; +} + + +void +hash_table_destroy(struct hash_table *ht) +{ + struct cso_hash_iter iter; + struct hash_table_item *item; + + assert(ht); + + iter = cso_hash_first_node(ht->cso); + while (!cso_hash_iter_is_null(iter)) { + item = (struct hash_table_item *)cso_hash_iter_data(iter); + FREE(item); + iter = cso_hash_iter_next(iter); + } + + cso_hash_delete(ht->cso); + + FREE(ht); +} diff --git a/src/gallium/auxiliary/util/u_hash_table.h b/src/gallium/auxiliary/util/u_hash_table.h new file mode 100644 index 0000000000..feee881582 --- /dev/null +++ b/src/gallium/auxiliary/util/u_hash_table.h @@ -0,0 +1,95 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * General purpose hash table. + * + * @author José Fonseca <jrfonseca@tungstengraphics.com> + */ + +#ifndef U_HASH_TABLE_H_ +#define U_HASH_TABLE_H_ + + +#include "pipe/p_error.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * Generic purpose hash table. + */ +struct hash_table; + + +/** + * Create an hash table. + * + * @param hash hash function + * @param compare should return 0 for two equal keys. + */ +struct hash_table * +hash_table_create(unsigned (*hash)(void *key), + int (*compare)(void *key1, void *key2)); + + +enum pipe_error +hash_table_set(struct hash_table *ht, + void *key, + void *value); + +void * +hash_table_get(struct hash_table *ht, + void *key); + + +void +hash_table_remove(struct hash_table *ht, + void *key); + + +void +hash_table_clear(struct hash_table *ht); + + +enum pipe_error +hash_table_foreach(struct hash_table *ht, + enum pipe_error (*callback)(void *key, void *value, void *data), + void *data); + +void +hash_table_destroy(struct hash_table *ht); + + +#ifdef __cplusplus +} +#endif + +#endif /* U_HASH_TABLE_H_ */ diff --git a/src/gallium/auxiliary/util/u_keymap.c b/src/gallium/auxiliary/util/u_keymap.c new file mode 100644 index 0000000000..01b17ddb1b --- /dev/null +++ b/src/gallium/auxiliary/util/u_keymap.c @@ -0,0 +1,309 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Key lookup/associative container. + * + * Like Jose's u_hash_table, based on CSO cache code for now. 
+ * + * Author: Brian Paul + */ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "pipe/p_error.h" + +#include "cso_cache/cso_hash.h" + +#include "util/u_memory.h" +#include "util/u_keymap.h" + + +struct keymap +{ + struct cso_hash *cso; + unsigned key_size; + unsigned max_entries; /* XXX not obeyed net */ + unsigned num_entries; + keymap_delete_func delete_func; +}; + + +struct keymap_item +{ + void *key, *value; +}; + + +/** + * This the default key-delete function used when the client doesn't + * provide one. + */ +static void +default_delete_func(const struct keymap *map, + const void *key, void *data, void *user) +{ + FREE((void*) data); +} + + +static INLINE struct keymap_item * +hash_table_item(struct cso_hash_iter iter) +{ + return (struct keymap_item *) cso_hash_iter_data(iter); +} + + +/** + * Return 4-byte hash key for a block of bytes. + */ +static unsigned +hash(const void *key, unsigned keySize) +{ + unsigned i, hash; + + keySize /= 4; /* convert from bytes to uints */ + + hash = 0; + for (i = 0; i < keySize; i++) { + hash ^= (i + 1) * ((const unsigned *) key)[i]; + } + + /*hash = hash ^ (hash >> 11) ^ (hash >> 22);*/ + + return hash; +} + + +/** + * Create a new map. + * \param keySize size of the keys in bytes + * \param maxEntries max number of entries to allow (~0 = infinity) + * \param deleteFunc optional callback to call when entries + * are deleted/replaced + */ +struct keymap * +util_new_keymap(unsigned keySize, unsigned maxEntries, + keymap_delete_func deleteFunc) +{ + struct keymap *map = MALLOC_STRUCT(keymap); + if (!map) + return NULL; + + map->cso = cso_hash_create(); + if (!map->cso) { + FREE(map); + return NULL; + } + + map->max_entries = maxEntries; + map->num_entries = 0; + map->key_size = keySize; + map->delete_func = deleteFunc ? deleteFunc : default_delete_func; + + return map; +} + + +/** + * Delete/free a keymap and all entries. The deleteFunc that was given at + * create time will be called for each entry. + * \param user user-provided pointer passed through to the delete callback + */ +void +util_delete_keymap(struct keymap *map, void *user) +{ + util_keymap_remove_all(map, user); + cso_hash_delete(map->cso); + FREE(map); +} + + +static INLINE struct cso_hash_iter +hash_table_find_iter(const struct keymap *map, const void *key, + unsigned key_hash) +{ + struct cso_hash_iter iter; + struct keymap_item *item; + + iter = cso_hash_find(map->cso, key_hash); + while (!cso_hash_iter_is_null(iter)) { + item = (struct keymap_item *) cso_hash_iter_data(iter); + if (!memcmp(item->key, key, map->key_size)) + break; + iter = cso_hash_iter_next(iter); + } + + return iter; +} + + +static INLINE struct keymap_item * +hash_table_find_item(const struct keymap *map, const void *key, + unsigned key_hash) +{ + struct cso_hash_iter iter = hash_table_find_iter(map, key, key_hash); + if (cso_hash_iter_is_null(iter)) { + return NULL; + } + else { + return hash_table_item(iter); + } +} + + +/** + * Insert a new key + data pointer into the table. + * Note: we create a copy of the key, but not the data! + * If the key is already present in the table, replace the existing + * entry (calling the delete callback on the previous entry). + * If the maximum capacity of the map is reached an old entry + * will be deleted (the delete callback will be called). 
+ */ +boolean +util_keymap_insert(struct keymap *map, const void *key, + const void *data, void *user) +{ + unsigned key_hash; + struct keymap_item *item; + struct cso_hash_iter iter; + + assert(map); + + key_hash = hash(key, map->key_size); + + item = hash_table_find_item(map, key, key_hash); + if (item) { + /* call delete callback for old entry/item */ + map->delete_func(map, item->key, item->value, user); + item->value = (void *) data; + return TRUE; + } + + item = MALLOC_STRUCT(keymap_item); + if (!item) + return FALSE; + + item->key = mem_dup(key, map->key_size); + item->value = (void *) data; + + iter = cso_hash_insert(map->cso, key_hash, item); + if (cso_hash_iter_is_null(iter)) { + FREE(item); + return FALSE; + } + + map->num_entries++; + + return TRUE; +} + + +/** + * Look up a key in the map and return the associated data pointer. + */ +const void * +util_keymap_lookup(const struct keymap *map, const void *key) +{ + unsigned key_hash; + struct keymap_item *item; + + assert(map); + + key_hash = hash(key, map->key_size); + + item = hash_table_find_item(map, key, key_hash); + if (!item) + return NULL; + + return item->value; +} + + +/** + * Remove an entry from the map. + * The delete callback will be called if the given key/entry is found. + * \param user passed to the delete callback as the last param. + */ +void +util_keymap_remove(struct keymap *map, const void *key, void *user) +{ + unsigned key_hash; + struct cso_hash_iter iter; + struct keymap_item *item; + + assert(map); + + key_hash = hash(key, map->key_size); + + iter = hash_table_find_iter(map, key, key_hash); + if (cso_hash_iter_is_null(iter)) + return; + + item = hash_table_item(iter); + assert(item); + map->delete_func(map, item->key, item->value, user); + FREE(item->key); + FREE(item); + + map->num_entries--; + + cso_hash_erase(map->cso, iter); +} + + +/** + * Remove all entries from the map, calling the delete callback for each. + * \param user passed to the delete callback as the last param. + */ +void +util_keymap_remove_all(struct keymap *map, void *user) +{ + struct cso_hash_iter iter; + struct keymap_item *item; + + assert(map); + + iter = cso_hash_first_node(map->cso); + while (!cso_hash_iter_is_null(iter)) { + item = (struct keymap_item *) + cso_hash_take(map->cso, cso_hash_iter_key(iter)); + map->delete_func(map, item->key, item->value, user); + FREE(item->key); + FREE(item); + iter = cso_hash_first_node(map->cso); + } +} + + +extern void +util_keymap_info(const struct keymap *map) +{ + debug_printf("Keymap %p: %u of max %u entries\n", + (void *) map, map->num_entries, map->max_entries); +} diff --git a/src/gallium/auxiliary/util/u_keymap.h b/src/gallium/auxiliary/util/u_keymap.h new file mode 100644 index 0000000000..8d60a76fc3 --- /dev/null +++ b/src/gallium/auxiliary/util/u_keymap.h @@ -0,0 +1,68 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
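A minimal usage sketch of the keymap API above; struct example_key and the data pointer are assumptions. The map copies the key (hashed in 4-byte words and compared with memcmp, so a padding-free key whose size is a multiple of four works best) but not the data, and the default delete callback simply FREEs the data pointer:

   #include "pipe/p_debug.h"
   #include "util/u_keymap.h"

   /* Illustrative sketch only -- not part of the original commit. */
   struct example_key {
      unsigned id;
      unsigned kind;
   };

   static void
   example_keymap(const struct example_key *key, void *data)
   {
      struct keymap *map = util_new_keymap(sizeof *key, ~0U, NULL);

      if (util_keymap_insert(map, key, data, NULL))
         assert(util_keymap_lookup(map, key) == data);

      util_keymap_remove(map, key, NULL);   /* invokes the delete callback */
      util_delete_keymap(map, NULL);
   }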
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef U_KEYMAP_H +#define U_KEYMAP_H + +#include "pipe/p_compiler.h" + + +/** opaque keymap type */ +struct keymap; + + +/** Delete/callback function type */ +typedef void (*keymap_delete_func)(const struct keymap *map, + const void *key, void *data, + void *user); + + +extern struct keymap * +util_new_keymap(unsigned keySize, unsigned maxEntries, + keymap_delete_func deleteFunc); + +extern void +util_delete_keymap(struct keymap *map, void *user); + +extern boolean +util_keymap_insert(struct keymap *map, const void *key, + const void *data, void *user); + +extern const void * +util_keymap_lookup(const struct keymap *map, const void *key); + +extern void +util_keymap_remove(struct keymap *map, const void *key, void *user); + +extern void +util_keymap_remove_all(struct keymap *map, void *user); + +extern void +util_keymap_info(const struct keymap *map); + + +#endif /* U_KEYMAP_H */ diff --git a/src/gallium/auxiliary/util/u_linear.c b/src/gallium/auxiliary/util/u_linear.c new file mode 100644 index 0000000000..a76704ffc7 --- /dev/null +++ b/src/gallium/auxiliary/util/u_linear.c @@ -0,0 +1,69 @@ + +#include "pipe/p_debug.h" +#include "u_linear.h" + +void +pipe_linear_to_tile(size_t src_stride, void *src_ptr, + struct pipe_tile_info *t, void *dst_ptr) +{ + int x, y, z; + char *ptr; + size_t bytes = t->cols * t->block.size; + + + assert(pipe_linear_check_tile(t)); + + /* lets write lineary to the tiled buffer */ + for (y = 0; y < t->tiles_y; y++) { + for (x = 0; x < t->tiles_x; x++) { + /* this inner loop could be replace with SSE magic */ + ptr = (char*)src_ptr + src_stride * t->rows * y + bytes * x; + for (z = 0; z < t->rows; z++) { + memcpy(dst_ptr, ptr, bytes); + dst_ptr += bytes; + ptr += src_stride; + } + } + } +} + +void pipe_linear_from_tile(struct pipe_tile_info *t, void *src_ptr, + size_t dst_stride, void *dst_ptr) +{ + int x, y, z; + char *ptr; + size_t bytes = t->cols * t->block.size; + + /* lets read lineary from the tiled buffer */ + for (y = 0; y < t->tiles_y; y++) { + for (x = 0; x < t->tiles_x; x++) { + /* this inner loop could be replace with SSE magic */ + ptr = (char*)dst_ptr + dst_stride * t->rows * y + bytes * x; + for (z = 0; z < t->rows; z++) { + memcpy(ptr, src_ptr, bytes); + src_ptr += bytes; + ptr += dst_stride; + } + } + } +} + +void +pipe_linear_fill_info(struct 
pipe_tile_info *t, + struct pipe_format_block *block, + unsigned tile_width, unsigned tile_height, + unsigned tiles_x, unsigned tiles_y) +{ + t->block = *block; + + t->tile.width = tile_width; + t->tile.height = tile_height; + t->cols = t->tile.width / t->block.width; + t->rows = t->tile.height / t->block.height; + t->tile.size = t->cols * t->rows * t->block.size; + + t->tiles_x = tiles_x; + t->tiles_y = tiles_y; + t->stride = t->cols * t->tiles_x * t->block.size; + t->size = t->tiles_x * t->tiles_y * t->tile.size; +} diff --git a/src/gallium/auxiliary/util/u_linear.h b/src/gallium/auxiliary/util/u_linear.h new file mode 100644 index 0000000000..e337cfd770 --- /dev/null +++ b/src/gallium/auxiliary/util/u_linear.h @@ -0,0 +1,60 @@ + +#ifndef U_LINEAR_H +#define U_LINEAR_H + +#include "pipe/p_format.h" +struct pipe_tile_info +{ + unsigned size; + unsigned stride; + + /* The number of tiles */ + unsigned tiles_x; + unsigned tiles_y; + + /* size of each tile expressed in blocks */ + unsigned cols; + unsigned rows; + + /* Describe the tile in pixels */ + struct pipe_format_block tile; + + /* Describe each block within the tile */ + struct pipe_format_block block; +}; + +void pipe_linear_to_tile(size_t src_stride, void *src_ptr, + struct pipe_tile_info *t, void *dst_ptr); + +void pipe_linear_from_tile(struct pipe_tile_info *t, void *src_ptr, + size_t dst_stride, void *dst_ptr); + +/** + * Convenience function to fillout a pipe_tile_info struct. + * @t info to fill out. + * @block block info about pixel layout + * @tile_width the width of the tile in pixels + * @tile_height the height of the tile in pixels + * @tiles_x number of tiles in x axis + * @tiles_y number of tiles in y axis + */ +void pipe_linear_fill_info(struct pipe_tile_info *t, + struct pipe_format_block *block, + unsigned tile_width, unsigned tile_height, + unsigned tiles_x, unsigned tiles_y); + +static INLINE boolean pipe_linear_check_tile(struct pipe_tile_info *t) +{ + if (t->tile.size != t->block.size * t->cols * t->rows) + return FALSE; + + if (t->stride != t->block.size * t->cols * t->tiles_x) + return FALSE; + + if (t->size < t->stride * t->rows * t->tiles_y) + return FALSE; + + return TRUE; +} + +#endif /* U_LINEAR_H */ diff --git a/src/gallium/auxiliary/util/u_math.c b/src/gallium/auxiliary/util/u_math.c new file mode 100644 index 0000000000..2811475fa0 --- /dev/null +++ b/src/gallium/auxiliary/util/u_math.c @@ -0,0 +1,73 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
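A minimal sketch of the tiling helpers above: describe the layout with pipe_linear_fill_info(), then copy between the linear and tiled layouts. The concrete sizes and the linear/tiled buffers are made-up example values:

   #include "pipe/p_debug.h"
   #include "util/u_linear.h"

   /* Illustrative sketch only: a 64x64, 4-byte-per-pixel linear image
    * split into 4x4 tiles of 16x16 pixels. */
   static void
   example_tiling(void *linear, void *tiled)
   {
      struct pipe_format_block block;
      struct pipe_tile_info t;

      block.size = 4;      /* bytes per block (one RGBA8 pixel) */
      block.width = 1;     /* pixels per block                  */
      block.height = 1;

      pipe_linear_fill_info(&t, &block, 16, 16, 4, 4);
      assert(pipe_linear_check_tile(&t));

      pipe_linear_to_tile(64 * block.size, linear, &t, tiled);    /* linear -> tiled */
      pipe_linear_from_tile(&t, tiled, 64 * block.size, linear);  /* tiled -> linear */
   }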
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +#include "util/u_math.h" + + +/** 2^x, for x in [-1.0, 1.0) */ +float pow2_table[POW2_TABLE_SIZE]; + + +static void +init_pow2_table(void) +{ + int i; + for (i = 0; i < POW2_TABLE_SIZE; i++) + pow2_table[i] = (float) pow(2.0, (i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE); +} + + +/** log2(x), for x in [1.0, 2.0) */ +float log2_table[LOG2_TABLE_SIZE]; + + +static void +init_log2_table(void) +{ + unsigned i; + for (i = 0; i < LOG2_TABLE_SIZE; i++) + log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SCALE)); +} + + +/** + * One time init for math utilities. + */ +void +util_init_math(void) +{ + static boolean initialized = FALSE; + if (!initialized) { + init_pow2_table(); + init_log2_table(); + initialized = TRUE; + } +} + + diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h new file mode 100644 index 0000000000..aa4fa17b59 --- /dev/null +++ b/src/gallium/auxiliary/util/u_math.h @@ -0,0 +1,433 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * Math utilities and approximations for common math functions. + * Reduced precision is usually acceptable in shaders... + * + * "fast" is used in the names of functions which are low-precision, + * or at least lower-precision than the normal C lib functions. 
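The table-based approximations in this header require a one-time call to util_init_math() before use; a brief sketch (the values in the comments are approximate):

   #include "util/u_math.h"

   /* Illustrative sketch only -- not part of the original commit. */
   static void
   example_fast_math(void)
   {
      float a, b, c;

      util_init_math();                 /* one-time: fills pow2_table / log2_table */

      a = util_fast_exp2(3.5f);         /* ~11.31                     */
      b = util_fast_log2(8.0f);         /* ~3.0                       */
      c = util_fast_pow(2.0f, 10.0f);   /* ~1024.0, reduced precision */
      (void) a; (void) b; (void) c;
   }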
+ */ + + +#ifndef U_MATH_H +#define U_MATH_H + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +#if defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) +__inline double ceil(double val) +{ + double ceil_val; + + if((val - (long) val) == 0) { + ceil_val = val; + } + else { + if(val > 0) { + ceil_val = (long) val + 1; + } + else { + ceil_val = (long) val; + } + } + + return ceil_val; +} + +#ifndef PIPE_SUBSYSTEM_WINDOWS_CE_OGL +__inline double floor(double val) +{ + double floor_val; + + if((val - (long) val) == 0) { + floor_val = val; + } + else { + if(val > 0) { + floor_val = (long) val; + } + else { + floor_val = (long) val - 1; + } + } + + return floor_val; +} +#endif + +#pragma function(pow) +__inline double __cdecl pow(double val, double exponent) +{ + /* XXX */ + assert(0); + return 0; +} + +#pragma function(log) +__inline double __cdecl log(double val) +{ + /* XXX */ + assert(0); + return 0; +} + +#pragma function(atan2) +__inline double __cdecl atan2(double val) +{ + /* XXX */ + assert(0); + return 0; +} +#else +#include <math.h> +#include <stdarg.h> +#endif + + +#if defined(_MSC_VER) + +#if _MSC_VER < 1400 && !defined(__cplusplus) || defined(PIPE_SUBSYSTEM_WINDOWS_CE) + +static INLINE float cosf( float f ) +{ + return (float) cos( (double) f ); +} + +static INLINE float sinf( float f ) +{ + return (float) sin( (double) f ); +} + +static INLINE float ceilf( float f ) +{ + return (float) ceil( (double) f ); +} + +static INLINE float floorf( float f ) +{ + return (float) floor( (double) f ); +} + +static INLINE float powf( float f, float g ) +{ + return (float) pow( (double) f, (double) g ); +} + +static INLINE float sqrtf( float f ) +{ + return (float) sqrt( (double) f ); +} + +static INLINE float fabsf( float f ) +{ + return (float) fabs( (double) f ); +} + +static INLINE float logf( float f ) +{ + return (float) log( (double) f ); +} + +#else +/* Work-around an extra semi-colon in VS 2005 logf definition */ +#ifdef logf +#undef logf +#define logf(x) ((float)log((double)(x))) +#endif /* logf */ +#endif + +static INLINE double log2( double x ) +{ + const double invln2 = 1.442695041; + return log( x ) * invln2; +} + +#endif /* _MSC_VER */ + + + + + +#define POW2_TABLE_SIZE_LOG2 9 +#define POW2_TABLE_SIZE (1 << POW2_TABLE_SIZE_LOG2) +#define POW2_TABLE_OFFSET (POW2_TABLE_SIZE/2) +#define POW2_TABLE_SCALE ((float)(POW2_TABLE_SIZE/2)) +extern float pow2_table[POW2_TABLE_SIZE]; + + + +extern void +util_init_math(void); + + +union fi { + float f; + int32_t i; + uint32_t ui; +}; + + +/** + * Fast version of 2^x + * Identity: exp2(a + b) = exp2(a) * exp2(b) + * Let ipart = int(x) + * Let fpart = x - ipart; + * So, exp2(x) = exp2(ipart) * exp2(fpart) + * Compute exp2(ipart) with i << ipart + * Compute exp2(fpart) with lookup table. + */ +static INLINE float +util_fast_exp2(float x) +{ + int32_t ipart; + float fpart, mpart; + union fi epart; + + if(x > 129.00000f) + return 3.402823466e+38f; + + if(x < -126.99999f) + return 0.0f; + + ipart = (int32_t) x; + fpart = x - (float) ipart; + + /* same as + * epart.f = (float) (1 << ipart) + * but faster and without integer overflow for ipart > 31 */ + epart.i = (ipart + 127 ) << 23; + + mpart = pow2_table[POW2_TABLE_OFFSET + (int)(fpart * POW2_TABLE_SCALE)]; + + return epart.f * mpart; +} + + +/** + * Fast approximation to exp(x). 
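To make the decomposition above concrete, a worked example (editor's illustration): for x = 3.5, ipart = 3 and fpart = 0.5, so exp2(3.5) = exp2(3) * exp2(0.5) ≈ 8 * 1.41421 ≈ 11.314. The integer part is built directly as a float by writing (3 + 127) << 23 into the exponent field, which is exactly the bit pattern of 8.0f, and pow2_table[] supplies the 2^0.5 factor for the fractional part.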
+ */ +static INLINE float +util_fast_exp(float x) +{ + const float k = 1.44269f; /* = log2(e) */ + return util_fast_exp2(k * x); +} + + +#define LOG2_TABLE_SIZE_LOG2 16 +#define LOG2_TABLE_SCALE (1 << LOG2_TABLE_SIZE_LOG2) +#define LOG2_TABLE_SIZE (LOG2_TABLE_SCALE + 1) +extern float log2_table[LOG2_TABLE_SIZE]; + + +static INLINE float +util_fast_log2(float x) +{ + union fi num; + float epart, mpart; + num.f = x; + epart = (float)(((num.i & 0x7f800000) >> 23) - 127); + /* mpart = log2_table[mantissa*LOG2_TABLE_SCALE + 0.5] */ + mpart = log2_table[((num.i & 0x007fffff) + (1 << (22 - LOG2_TABLE_SIZE_LOG2))) >> (23 - LOG2_TABLE_SIZE_LOG2)]; + return epart + mpart; +} + + +static INLINE float +util_fast_pow(float x, float y) +{ + return util_fast_exp2(util_fast_log2(x) * y); +} + + + +/** + * Floor(x), returned as int. + */ +static INLINE int +util_ifloor(float f) +{ + int ai, bi; + double af, bf; + union fi u; + af = (3 << 22) + 0.5 + (double)f; + bf = (3 << 22) + 0.5 - (double)f; + u.f = (float) af; ai = u.i; + u.f = (float) bf; bi = u.i; + return (ai - bi) >> 1; +} + + +/** + * Round float to nearest int. + */ +static INLINE int +util_iround(float f) +{ +#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_X86) + int r; + __asm__ ("fistpl %0" : "=m" (r) : "t" (f) : "st"); + return r; +#elif defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86) + int r; + _asm { + fld f + fistp r + } + return r; +#else + if (f >= 0.0f) + return (int) (f + 0.5f); + else + return (int) (f - 0.5f); +#endif +} + + + +#if defined(PIPE_CC_MSVC) && defined(PIPE_ARCH_X86) +/** + * Find first bit set in word. Least significant bit is 1. + * Return 0 if no bits set. + */ +static INLINE +unsigned ffs( unsigned u ) +{ + unsigned i; + + if( u == 0 ) { + return 0; + } + + __asm bsf eax, [u] + __asm inc eax + __asm mov [i], eax + + return i; +} +#endif + + +/** + * Return float bits. + */ +static INLINE unsigned +fui( float f ) +{ + union fi fi; + fi.f = f; + return fi.ui; +} + + + +static INLINE float +ubyte_to_float(ubyte ub) +{ + return (float) ub * (1.0f / 255.0f); +} + + +/** + * Convert float in [0,1] to ubyte in [0,255] with clamping. + */ +static INLINE ubyte +float_to_ubyte(float f) +{ + const int ieee_0996 = 0x3f7f0000; /* 0.996 or so */ + union fi tmp; + + tmp.f = f; + if (tmp.i < 0) { + return (ubyte) 0; + } + else if (tmp.i >= ieee_0996) { + return (ubyte) 255; + } + else { + tmp.f = tmp.f * (255.0f/256.0f) + 32768.0f; + return (ubyte) tmp.i; + } +} + + + +#define CLAMP( X, MIN, MAX ) ( (X)<(MIN) ? (MIN) : ((X)>(MAX) ? (MAX) : (X)) ) + +#define MIN2( A, B ) ( (A)<(B) ? (A) : (B) ) +#define MAX2( A, B ) ( (A)>(B) ? 
(A) : (B) ) + + +static INLINE int +align(int value, int alignment) +{ + return (value + alignment - 1) & ~(alignment - 1); +} + + +#ifndef COPY_4V +#define COPY_4V( DST, SRC ) \ +do { \ + (DST)[0] = (SRC)[0]; \ + (DST)[1] = (SRC)[1]; \ + (DST)[2] = (SRC)[2]; \ + (DST)[3] = (SRC)[3]; \ +} while (0) +#endif + + +#ifndef COPY_4FV +#define COPY_4FV( DST, SRC ) COPY_4V(DST, SRC) +#endif + + +#ifndef ASSIGN_4V +#define ASSIGN_4V( DST, V0, V1, V2, V3 ) \ +do { \ + (DST)[0] = (V0); \ + (DST)[1] = (V1); \ + (DST)[2] = (V2); \ + (DST)[3] = (V3); \ +} while (0) +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* U_MATH_H */ diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h new file mode 100644 index 0000000000..1a6b596421 --- /dev/null +++ b/src/gallium/auxiliary/util/u_memory.h @@ -0,0 +1,234 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
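A few quick sanity examples for the small helpers at the end of this header (editor's illustration; align() assumes a power-of-two alignment):

   #include "pipe/p_debug.h"
   #include "util/u_math.h"

   /* Illustrative sketch only -- not part of the original commit. */
   static void
   example_math_helpers(void)
   {
      assert(align(13, 8) == 16);            /* round up to an 8-byte boundary */
      assert(align(16, 8) == 16);

      assert(float_to_ubyte(0.0f) == 0);
      assert(float_to_ubyte(1.0f) == 255);

      /* util_iround() may round .5 either way depending on which code path
       * (x87 vs. plain C) is compiled in, so avoid exact halves in tests. */
      assert(util_iround(2.25f) == 2);
   }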
+ * + **************************************************************************/ + + +/** + * Memory functions + */ + + +#ifndef U_MEMORY_H +#define U_MEMORY_H + + +#include "util/u_pointer.h" +#include "pipe/p_debug.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Define ENOMEM for WINCE */ +#if (_WIN32_WCE < 600) +#ifndef ENOMEM +#define ENOMEM 12 +#endif +#endif + + +#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) && defined(DEBUG) + +/* memory debugging */ + +#include "pipe/p_debug.h" + +#define MALLOC( _size ) \ + debug_malloc( __FILE__, __LINE__, __FUNCTION__, _size ) +#define CALLOC( _count, _size ) \ + debug_calloc(__FILE__, __LINE__, __FUNCTION__, _count, _size ) +#define FREE( _ptr ) \ + debug_free( __FILE__, __LINE__, __FUNCTION__, _ptr ) +#define REALLOC( _ptr, _old_size, _size ) \ + debug_realloc( __FILE__, __LINE__, __FUNCTION__, _ptr, _old_size, _size ) + +#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) + +void * __stdcall +EngAllocMem( + unsigned long Flags, + unsigned long MemSize, + unsigned long Tag ); + +void __stdcall +EngFreeMem( + void *Mem ); + +#define MALLOC( _size ) EngAllocMem( 0, _size, 'D3AG' ) +#define _FREE( _ptr ) EngFreeMem( _ptr ) + +#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) + +void * +ExAllocatePool( + unsigned long PoolType, + size_t NumberOfBytes); + +void +ExFreePool(void *P); + +#define MALLOC(_size) ExAllocatePool(0, _size) +#define _FREE(_ptr) ExFreePool(_ptr) + +#else + +#define MALLOC( SIZE ) malloc( SIZE ) +#define CALLOC( COUNT, SIZE ) calloc( COUNT, SIZE ) +#define FREE( PTR ) free( PTR ) +#define REALLOC( OLDPTR, OLDSIZE, NEWSIZE ) realloc( OLDPTR, NEWSIZE ) + +#endif + + +#ifndef CALLOC +static INLINE void * +CALLOC( unsigned count, unsigned size ) +{ + void *ptr = MALLOC( count * size ); + if( ptr ) { + memset( ptr, 0, count * size ); + } + return ptr; +} +#endif /* !CALLOC */ + +#ifndef FREE +static INLINE void +FREE( void *ptr ) +{ + if( ptr ) { + _FREE( ptr ); + } +} +#endif /* !FREE */ + +#ifndef REALLOC +static INLINE void * +REALLOC( void *old_ptr, unsigned old_size, unsigned new_size ) +{ + void *new_ptr = NULL; + + if (new_size != 0) { + unsigned copy_size = old_size < new_size ? old_size : new_size; + new_ptr = MALLOC( new_size ); + if (new_ptr && old_ptr && copy_size) { + memcpy( new_ptr, old_ptr, copy_size ); + } + } + + FREE( old_ptr ); + return new_ptr; +} +#endif /* !REALLOC */ + + +#define MALLOC_STRUCT(T) (struct T *) MALLOC(sizeof(struct T)) + +#define CALLOC_STRUCT(T) (struct T *) CALLOC(1, sizeof(struct T)) + +#define CALLOC_VARIANT_LENGTH_STRUCT(T,more_size) ((struct T *) CALLOC(1, sizeof(struct T) + more_size)) + + +/** + * Return memory on given byte alignment + */ +static INLINE void * +align_malloc(size_t bytes, uint alignment) +{ +#if defined(HAVE_POSIX_MEMALIGN) + void *mem; + alignment = (alignment + (uint)sizeof(void*) - 1) & ~((uint)sizeof(void*) - 1); + if(posix_memalign(& mem, alignment, bytes) != 0) + return NULL; + return mem; +#else + char *ptr, *buf; + + assert( alignment > 0 ); + + ptr = (char *) MALLOC(bytes + alignment + sizeof(void *)); + if (!ptr) + return NULL; + + buf = (char *) align_pointer( ptr + sizeof(void *), alignment ); + *(char **)(buf - sizeof(void *)) = ptr; + + return buf; +#endif /* defined(HAVE_POSIX_MEMALIGN) */ +} + +/** + * Free memory returned by align_malloc(). 
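A minimal sketch of the allocation helpers above; struct example_ctx is a hypothetical type. Unlike realloc(), REALLOC must be told the old size, and align_malloc() pairs with align_free(), not FREE():

   #include "util/u_memory.h"

   /* Illustrative sketch only -- not part of the original commit. */
   struct example_ctx {
      int dummy;
   };

   static void
   example_memory(void)
   {
      struct example_ctx *ctx;
      void *scratch, *aligned;

      ctx = CALLOC_STRUCT(example_ctx);       /* zeroed allocation          */
      scratch = MALLOC(128);
      scratch = REALLOC(scratch, 128, 256);   /* old size must be supplied  */

      aligned = align_malloc(4096, 64);       /* 64-byte-aligned block      */
      align_free(aligned);                    /* not plain FREE()           */

      FREE(scratch);
      FREE(ctx);
   }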
+ */ +static INLINE void +align_free(void *ptr) +{ +#if defined(HAVE_POSIX_MEMALIGN) + FREE(ptr); +#else + void **cubbyHole = (void **) ((char *) ptr - sizeof(void *)); + void *realAddr = *cubbyHole; + FREE(realAddr); +#endif /* defined(HAVE_POSIX_MEMALIGN) */ +} + + +/** + * Duplicate a block of memory. + */ +static INLINE void * +mem_dup(const void *src, uint size) +{ + void *dup = MALLOC(size); + if (dup) + memcpy(dup, src, size); + return dup; +} + + +/** + * Number of elements in an array. + */ +#ifndef Elements +#define Elements(x) (sizeof(x)/sizeof((x)[0])) +#endif + + +/** + * Offset of a field in a struct, in bytes. + */ +#define Offset(TYPE, MEMBER) ((unsigned)&(((TYPE *)NULL)->MEMBER)) + + + +#ifdef __cplusplus +} +#endif + + +#endif /* U_MEMORY_H */ diff --git a/src/gallium/auxiliary/util/u_mm.c b/src/gallium/auxiliary/util/u_mm.c new file mode 100644 index 0000000000..45ce257b5e --- /dev/null +++ b/src/gallium/auxiliary/util/u_mm.c @@ -0,0 +1,287 @@ +/************************************************************************** + * + * Copyright (C) 1999 Wittawat Yamwong + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * WITTAWAT YAMWONG, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE + * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" + +#include "util/u_memory.h" +#include "util/u_mm.h" + + +void +u_mmDumpMemInfo(const struct mem_block *heap) +{ + debug_printf("Memory heap %p:\n", (void *)heap); + if (heap == 0) { + debug_printf(" heap == 0\n"); + } else { + const struct mem_block *p; + + for(p = heap->next; p != heap; p = p->next) { + debug_printf(" Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size, + p->free ? 'F':'.', + p->reserved ? 'R':'.'); + } + + debug_printf("\nFree list:\n"); + + for(p = heap->next_free; p != heap; p = p->next_free) { + debug_printf(" FREE Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size, + p->free ? 'F':'.', + p->reserved ? 
'R':'.'); + } + + } + debug_printf("End of memory blocks\n"); +} + +struct mem_block * +u_mmInit(int ofs, int size) +{ + struct mem_block *heap, *block; + + if (size <= 0) + return NULL; + + heap = CALLOC_STRUCT(mem_block); + if (!heap) + return NULL; + + block = CALLOC_STRUCT(mem_block); + if (!block) { + FREE(heap); + return NULL; + } + + heap->next = block; + heap->prev = block; + heap->next_free = block; + heap->prev_free = block; + + block->heap = heap; + block->next = heap; + block->prev = heap; + block->next_free = heap; + block->prev_free = heap; + + block->ofs = ofs; + block->size = size; + block->free = 1; + + return heap; +} + + +static struct mem_block * +SliceBlock(struct mem_block *p, + int startofs, int size, + int reserved, int alignment) +{ + struct mem_block *newblock; + + /* break left [p, newblock, p->next], then p = newblock */ + if (startofs > p->ofs) { + newblock = CALLOC_STRUCT(mem_block); + if (!newblock) + return NULL; + newblock->ofs = startofs; + newblock->size = p->size - (startofs - p->ofs); + newblock->free = 1; + newblock->heap = p->heap; + + newblock->next = p->next; + newblock->prev = p; + p->next->prev = newblock; + p->next = newblock; + + newblock->next_free = p->next_free; + newblock->prev_free = p; + p->next_free->prev_free = newblock; + p->next_free = newblock; + + p->size -= newblock->size; + p = newblock; + } + + /* break right, also [p, newblock, p->next] */ + if (size < p->size) { + newblock = CALLOC_STRUCT(mem_block); + if (!newblock) + return NULL; + newblock->ofs = startofs + size; + newblock->size = p->size - size; + newblock->free = 1; + newblock->heap = p->heap; + + newblock->next = p->next; + newblock->prev = p; + p->next->prev = newblock; + p->next = newblock; + + newblock->next_free = p->next_free; + newblock->prev_free = p; + p->next_free->prev_free = newblock; + p->next_free = newblock; + + p->size = size; + } + + /* p = middle block */ + p->free = 0; + + /* Remove p from the free list: + */ + p->next_free->prev_free = p->prev_free; + p->prev_free->next_free = p->next_free; + + p->next_free = 0; + p->prev_free = 0; + + p->reserved = reserved; + return p; +} + + +struct mem_block * +u_mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch) +{ + struct mem_block *p; + const int mask = (1 << align2)-1; + int startofs = 0; + int endofs; + + assert(size >= 0); + assert(align2 >= 0); + assert(align2 <= 12); /* sanity check, 2^12 (4KB) enough? 
*/ + + if (!heap || align2 < 0 || size <= 0) + return NULL; + + for (p = heap->next_free; p != heap; p = p->next_free) { + assert(p->free); + + startofs = (p->ofs + mask) & ~mask; + if ( startofs < startSearch ) { + startofs = startSearch; + } + endofs = startofs+size; + if (endofs <= (p->ofs+p->size)) + break; + } + + if (p == heap) + return NULL; + + assert(p->free); + p = SliceBlock(p,startofs,size,0,mask+1); + + return p; +} + + +struct mem_block * +u_mmFindBlock(struct mem_block *heap, int start) +{ + struct mem_block *p; + + for (p = heap->next; p != heap; p = p->next) { + if (p->ofs == start) + return p; + } + + return NULL; +} + + +static INLINE int +Join2Blocks(struct mem_block *p) +{ + /* XXX there should be some assertions here */ + + /* NOTE: heap->free == 0 */ + + if (p->free && p->next->free) { + struct mem_block *q = p->next; + + assert(p->ofs + p->size == q->ofs); + p->size += q->size; + + p->next = q->next; + q->next->prev = p; + + q->next_free->prev_free = q->prev_free; + q->prev_free->next_free = q->next_free; + + FREE(q); + return 1; + } + return 0; +} + +int +u_mmFreeMem(struct mem_block *b) +{ + if (!b) + return 0; + + if (b->free) { + debug_printf("block already free\n"); + return -1; + } + if (b->reserved) { + debug_printf("block is reserved\n"); + return -1; + } + + b->free = 1; + b->next_free = b->heap->next_free; + b->prev_free = b->heap; + b->next_free->prev_free = b; + b->prev_free->next_free = b; + + Join2Blocks(b); + if (b->prev != b->heap) + Join2Blocks(b->prev); + + return 0; +} + + +void +u_mmDestroy(struct mem_block *heap) +{ + struct mem_block *p; + + if (!heap) + return; + + for (p = heap->next; p != heap; ) { + struct mem_block *next = p->next; + FREE(p); + p = next; + } + + FREE(heap); +} diff --git a/src/gallium/auxiliary/util/u_mm.h b/src/gallium/auxiliary/util/u_mm.h new file mode 100644 index 0000000000..ce20e48763 --- /dev/null +++ b/src/gallium/auxiliary/util/u_mm.h @@ -0,0 +1,91 @@ +/************************************************************************** + * + * Copyright (C) 1999 Wittawat Yamwong + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * KEITH WHITWELL, OR ANY OTHER CONTRIBUTORS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE + * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * @file + * Memory manager code. Primarily used by device drivers to manage texture + * heaps, etc. 
+ */ + + +#ifndef _U_MM_H_ +#define _U_MM_H_ + + +struct mem_block { + struct mem_block *next, *prev; + struct mem_block *next_free, *prev_free; + struct mem_block *heap; + int ofs,size; + unsigned int free:1; + unsigned int reserved:1; +}; + + + +/** + * input: total size in bytes + * return: a heap pointer if OK, NULL if error + */ +extern struct mem_block *u_mmInit(int ofs, int size); + +/** + * Allocate 'size' bytes with 2^align2 bytes alignment, + * restrict the search to free memory after 'startSearch' + * depth and back buffers should be in different 4mb banks + * to get better page hits if possible + * input: size = size of block + * align2 = 2^align2 bytes alignment + * startSearch = linear offset from start of heap to begin search + * return: pointer to the allocated block, 0 if error + */ +extern struct mem_block *u_mmAllocMem(struct mem_block *heap, int size, int align2, + int startSearch); + +/** + * Free block starts at offset + * input: pointer to a block + * return: 0 if OK, -1 if error + */ +extern int u_mmFreeMem(struct mem_block *b); + +/** + * Free block starts at offset + * input: pointer to a heap, start offset + * return: pointer to a block + */ +extern struct mem_block *u_mmFindBlock(struct mem_block *heap, int start); + +/** + * destroy MM + */ +extern void u_mmDestroy(struct mem_block *mmInit); + +/** + * For debuging purpose. + */ +extern void u_mmDumpMemInfo(const struct mem_block *mmInit); + +#endif diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h new file mode 100644 index 0000000000..e0e8aa8e9f --- /dev/null +++ b/src/gallium/auxiliary/util/u_pack_color.h @@ -0,0 +1,472 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Functions to produce packed colors/Z from floats. + */ + + +#ifndef U_PACK_COLOR_H +#define U_PACK_COLOR_H + + +#include "pipe/p_compiler.h" +#include "pipe/p_format.h" +#include "util/u_math.h" + + +/** + * Pack ubyte R,G,B,A into dest pixel. 
+ */ +static INLINE void +util_pack_color_ub(ubyte r, ubyte g, ubyte b, ubyte a, + enum pipe_format format, void *dest) +{ + switch (format) { + case PIPE_FORMAT_R8G8B8A8_UNORM: + { + uint *d = (uint *) dest; + *d = (r << 24) | (g << 16) | (b << 8) | a; + } + return; + case PIPE_FORMAT_R8G8B8X8_UNORM: + { + uint *d = (uint *) dest; + *d = (r << 24) | (g << 16) | (b << 8) | 0xff; + } + return; + case PIPE_FORMAT_A8R8G8B8_UNORM: + { + uint *d = (uint *) dest; + *d = (a << 24) | (r << 16) | (g << 8) | b; + } + return; + case PIPE_FORMAT_X8R8G8B8_UNORM: + { + uint *d = (uint *) dest; + *d = (0xff << 24) | (r << 16) | (g << 8) | b; + } + return; + case PIPE_FORMAT_B8G8R8A8_UNORM: + { + uint *d = (uint *) dest; + *d = (b << 24) | (g << 16) | (r << 8) | a; + } + return; + case PIPE_FORMAT_B8G8R8X8_UNORM: + { + uint *d = (uint *) dest; + *d = (b << 24) | (g << 16) | (r << 8) | 0xff; + } + return; + case PIPE_FORMAT_R5G6B5_UNORM: + { + ushort *d = (ushort *) dest; + *d = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3); + } + return; + case PIPE_FORMAT_A1R5G5B5_UNORM: + { + ushort *d = (ushort *) dest; + *d = ((a & 0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3); + } + return; + case PIPE_FORMAT_A4R4G4B4_UNORM: + { + ushort *d = (ushort *) dest; + *d = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4); + } + return; + case PIPE_FORMAT_A8_UNORM: + { + ubyte *d = (ubyte *) dest; + *d = a; + } + return; + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_I8_UNORM: + { + ubyte *d = (ubyte *) dest; + *d = r; + } + return; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + { + float *d = (float *) dest; + d[0] = (float)r / 255.0f; + d[1] = (float)g / 255.0f; + d[2] = (float)b / 255.0f; + d[3] = (float)a / 255.0f; + } + return; + case PIPE_FORMAT_R32G32B32_FLOAT: + { + float *d = (float *) dest; + d[0] = (float)r / 255.0f; + d[1] = (float)g / 255.0f; + d[2] = (float)b / 255.0f; + } + return; + + /* XXX lots more cases to add */ + default: + debug_print_format("gallium: unhandled format in util_pack_color_ub()", format); + assert(0); + } +} + + +/** + * Unpack RGBA from a packed pixel, returning values as ubytes in [0,255]. 
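+ * Components narrower than 8 bits are widened by replicating their high bits
+ * into the low bits; e.g. the 5-bit red field R of PIPE_FORMAT_R5G6B5_UNORM
+ * comes back as (R << 3) | (R >> 2), so 0x1f expands to 0xff.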
+ */ +static INLINE void +util_unpack_color_ub(enum pipe_format format, const void *src, + ubyte *r, ubyte *g, ubyte *b, ubyte *a) +{ + switch (format) { + case PIPE_FORMAT_R8G8B8A8_UNORM: + { + uint p = ((const uint *) src)[0]; + *r = (ubyte) ((p >> 24) & 0xff); + *g = (ubyte) ((p >> 16) & 0xff); + *b = (ubyte) ((p >> 8) & 0xff); + *a = (ubyte) ((p >> 0) & 0xff); + } + return; + case PIPE_FORMAT_R8G8B8X8_UNORM: + { + uint p = ((const uint *) src)[0]; + *r = (ubyte) ((p >> 24) & 0xff); + *g = (ubyte) ((p >> 16) & 0xff); + *b = (ubyte) ((p >> 8) & 0xff); + *a = (ubyte) 0xff; + } + return; + case PIPE_FORMAT_A8R8G8B8_UNORM: + { + uint p = ((const uint *) src)[0]; + *r = (ubyte) ((p >> 16) & 0xff); + *g = (ubyte) ((p >> 8) & 0xff); + *b = (ubyte) ((p >> 0) & 0xff); + *a = (ubyte) ((p >> 24) & 0xff); + } + return; + case PIPE_FORMAT_X8R8G8B8_UNORM: + { + uint p = ((const uint *) src)[0]; + *r = (ubyte) ((p >> 16) & 0xff); + *g = (ubyte) ((p >> 8) & 0xff); + *b = (ubyte) ((p >> 0) & 0xff); + *a = (ubyte) 0xff; + } + return; + case PIPE_FORMAT_B8G8R8A8_UNORM: + { + uint p = ((const uint *) src)[0]; + *r = (ubyte) ((p >> 8) & 0xff); + *g = (ubyte) ((p >> 16) & 0xff); + *b = (ubyte) ((p >> 24) & 0xff); + *a = (ubyte) ((p >> 0) & 0xff); + } + return; + case PIPE_FORMAT_B8G8R8X8_UNORM: + { + uint p = ((const uint *) src)[0]; + *r = (ubyte) ((p >> 8) & 0xff); + *g = (ubyte) ((p >> 16) & 0xff); + *b = (ubyte) ((p >> 24) & 0xff); + *a = (ubyte) 0xff; + } + return; + case PIPE_FORMAT_R5G6B5_UNORM: + { + ushort p = ((const ushort *) src)[0]; + *r = (ubyte) (((p >> 8) & 0xf8) | ((p >> 13) & 0x7)); + *g = (ubyte) (((p >> 3) & 0xfc) | ((p >> 9) & 0x3)); + *b = (ubyte) (((p << 3) & 0xf8) | ((p >> 2) & 0x7)); + *a = (ubyte) 0xff; + } + return; + case PIPE_FORMAT_A1R5G5B5_UNORM: + { + ushort p = ((const ushort *) src)[0]; + *r = (ubyte) (((p >> 7) & 0xf8) | ((p >> 12) & 0x7)); + *g = (ubyte) (((p >> 2) & 0xf8) | ((p >> 7) & 0x7)); + *b = (ubyte) (((p << 3) & 0xf8) | ((p >> 2) & 0x7)); + *a = (ubyte) (0xff * (p >> 15)); + } + return; + case PIPE_FORMAT_A4R4G4B4_UNORM: + { + ushort p = ((const ushort *) src)[0]; + *r = (ubyte) (((p >> 4) & 0xf0) | ((p >> 8) & 0xf)); + *g = (ubyte) (((p >> 0) & 0xf0) | ((p >> 4) & 0xf)); + *b = (ubyte) (((p << 4) & 0xf0) | ((p >> 0) & 0xf)); + *a = (ubyte) (((p >> 8) & 0xf0) | ((p >> 12) & 0xf)); + } + return; + case PIPE_FORMAT_A8_UNORM: + { + ubyte p = ((const ubyte *) src)[0]; + *r = *g = *b = (ubyte) 0xff; + *a = p; + } + return; + case PIPE_FORMAT_L8_UNORM: + { + ubyte p = ((const ubyte *) src)[0]; + *r = *g = *b = p; + *a = (ubyte) 0xff; + } + return; + case PIPE_FORMAT_I8_UNORM: + { + ubyte p = ((const ubyte *) src)[0]; + *r = *g = *b = *a = p; + } + return; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + { + const float *p = (const float *) src; + *r = float_to_ubyte(p[0]); + *g = float_to_ubyte(p[1]); + *b = float_to_ubyte(p[2]); + *a = float_to_ubyte(p[3]); + } + return; + case PIPE_FORMAT_R32G32B32_FLOAT: + { + const float *p = (const float *) src; + *r = float_to_ubyte(p[0]); + *g = float_to_ubyte(p[1]); + *b = float_to_ubyte(p[2]); + *a = (ubyte) 0xff; + } + return; + + case PIPE_FORMAT_R32G32_FLOAT: + { + const float *p = (const float *) src; + *r = float_to_ubyte(p[0]); + *g = float_to_ubyte(p[1]); + *b = *a = (ubyte) 0xff; + } + return; + + case PIPE_FORMAT_R32_FLOAT: + { + const float *p = (const float *) src; + *r = float_to_ubyte(p[0]); + *g = *b = *a = (ubyte) 0xff; + } + return; + + /* XXX lots more cases to add */ + default: + debug_print_format("gallium: unhandled 
format in util_unpack_color_ub()", + format); + assert(0); + } +} + + + +/** + * Note rgba outside [0,1] will be clamped for int pixel formats. + */ +static INLINE void +util_pack_color(const float rgba[4], enum pipe_format format, void *dest) +{ + ubyte r, g, b, a; + + if (pf_size_x(format) <= 8) { + /* format uses 8-bit components or less */ + r = float_to_ubyte(rgba[0]); + g = float_to_ubyte(rgba[1]); + b = float_to_ubyte(rgba[2]); + a = float_to_ubyte(rgba[3]); + } + + switch (format) { + case PIPE_FORMAT_R8G8B8A8_UNORM: + { + uint *d = (uint *) dest; + *d = (r << 24) | (g << 16) | (b << 8) | a; + } + return; + case PIPE_FORMAT_R8G8B8X8_UNORM: + { + uint *d = (uint *) dest; + *d = (r << 24) | (g << 16) | (b << 8) | 0xff; + } + return; + case PIPE_FORMAT_A8R8G8B8_UNORM: + { + uint *d = (uint *) dest; + *d = (a << 24) | (r << 16) | (g << 8) | b; + } + return; + case PIPE_FORMAT_X8R8G8B8_UNORM: + { + uint *d = (uint *) dest; + *d = (0xff << 24) | (r << 16) | (g << 8) | b; + } + return; + case PIPE_FORMAT_B8G8R8A8_UNORM: + { + uint *d = (uint *) dest; + *d = (b << 24) | (g << 16) | (r << 8) | a; + } + return; + case PIPE_FORMAT_B8G8R8X8_UNORM: + { + uint *d = (uint *) dest; + *d = (b << 24) | (g << 16) | (r << 8) | 0xff; + } + return; + case PIPE_FORMAT_R5G6B5_UNORM: + { + ushort *d = (ushort *) dest; + *d = ((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3); + } + return; + case PIPE_FORMAT_A1R5G5B5_UNORM: + { + ushort *d = (ushort *) dest; + *d = ((a & 0x80) << 8) | ((r & 0xf8) << 7) | ((g & 0xf8) << 2) | (b >> 3); + } + return; + case PIPE_FORMAT_A4R4G4B4_UNORM: + { + ushort *d = (ushort *) dest; + *d = ((a & 0xf0) << 8) | ((r & 0xf0) << 4) | ((g & 0xf0) << 0) | (b >> 4); + } + return; + case PIPE_FORMAT_A8_UNORM: + { + ubyte *d = (ubyte *) dest; + *d = a; + } + return; + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_I8_UNORM: + { + ubyte *d = (ubyte *) dest; + *d = r; + } + return; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + { + float *d = (float *) dest; + d[0] = rgba[0]; + d[1] = rgba[1]; + d[2] = rgba[2]; + d[3] = rgba[3]; + } + return; + case PIPE_FORMAT_R32G32B32_FLOAT: + { + float *d = (float *) dest; + d[0] = rgba[0]; + d[1] = rgba[1]; + d[2] = rgba[2]; + } + return; + /* XXX lots more cases to add */ + default: + debug_print_format("gallium: unhandled format in util_pack_color()", format); + assert(0); + } +} + + +/** + * Note: it's assumed that z is in [0,1] + */ +static INLINE uint +util_pack_z(enum pipe_format format, double z) +{ + if (z == 0.0) + return 0; + + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + if (z == 1.0) + return 0xffff; + return (uint) (z * 0xffff); + case PIPE_FORMAT_Z32_UNORM: + /* special-case to avoid overflow */ + if (z == 1.0) + return 0xffffffff; + return (uint) (z * 0xffffffff); + case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + if (z == 1.0) + return 0xffffff; + return (uint) (z * 0xffffff); + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_Z24X8_UNORM: + if (z == 1.0) + return 0xffffff00; + return ((uint) (z * 0xffffff)) << 8; + default: + debug_print_format("gallium: unhandled format in util_pack_z()", format); + assert(0); + return 0; + } +} + + +/** + * Pack 4 ubytes into a 4-byte word + */ +static INLINE unsigned +pack_ub4(ubyte b0, ubyte b1, ubyte b2, ubyte b3) +{ + return ((((unsigned int)b0) << 0) | + (((unsigned int)b1) << 8) | + (((unsigned int)b2) << 16) | + (((unsigned int)b3) << 24)); +} + + +/** + * Pack/convert 4 floats into one 4-byte word. 
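+ * Built on pack_ub4() above, so the first argument lands in the least
+ * significant byte of the result; e.g. pack_ui32_float4(1.0, 0.0, 0.0, 1.0)
+ * yields 0xff0000ff (assuming float_to_ubyte() maps 0.0 and 1.0 to 0 and 255).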
+ */ +static INLINE unsigned +pack_ui32_float4(float a, float b, float c, float d) +{ + return pack_ub4( float_to_ubyte(a), + float_to_ubyte(b), + float_to_ubyte(c), + float_to_ubyte(d) ); +} + + + +#endif /* U_PACK_COLOR_H */ diff --git a/src/gallium/auxiliary/util/u_pointer.h b/src/gallium/auxiliary/util/u_pointer.h new file mode 100644 index 0000000000..e1af9f11cb --- /dev/null +++ b/src/gallium/auxiliary/util/u_pointer.h @@ -0,0 +1,107 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef U_POINTER_H +#define U_POINTER_H + +#include "pipe/p_compiler.h" + +#ifdef __cplusplus +extern "C" { +#endif + +static INLINE intptr_t +pointer_to_intptr( const void *p ) +{ + union { + const void *p; + intptr_t i; + } pi; + pi.p = p; + return pi.i; +} + +static INLINE void * +intptr_to_pointer( intptr_t i ) +{ + union { + void *p; + intptr_t i; + } pi; + pi.i = i; + return pi.p; +} + +static INLINE uintptr_t +pointer_to_uintptr( const void *ptr ) +{ + union { + const void *p; + uintptr_t u; + } pu; + pu.p = ptr; + return pu.u; +} + +static INLINE void * +uintptr_to_pointer( uintptr_t u ) +{ + union { + void *p; + uintptr_t u; + } pu; + pu.u = u; + return pu.p; +} + +/** + * Return a pointer aligned to next multiple of N bytes. + */ +static INLINE void * +align_pointer( const void *unaligned, uintptr_t alignment ) +{ + uintptr_t aligned = (pointer_to_uintptr( unaligned ) + alignment - 1) & ~(alignment - 1); + return uintptr_to_pointer( aligned ); +} + + +/** + * Return a pointer aligned to next multiple of 16 bytes. + */ +static INLINE void * +align16( void *unaligned ) +{ + return align_pointer( unaligned, 16 ); +} + + + +#ifdef __cplusplus +} +#endif + +#endif /* U_POINTER_H */ diff --git a/src/gallium/auxiliary/util/u_prim.h b/src/gallium/auxiliary/util/u_prim.h new file mode 100644 index 0000000000..e45e84ded2 --- /dev/null +++ b/src/gallium/auxiliary/util/u_prim.h @@ -0,0 +1,122 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef U_PRIM_H
+#define U_PRIM_H
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "pipe/p_defines.h"
+
+static INLINE boolean u_validate_pipe_prim( unsigned pipe_prim, unsigned nr )
+{
+ boolean ok = TRUE;
+
+ switch (pipe_prim) {
+ case PIPE_PRIM_POINTS:
+ ok = (nr >= 1);
+ break;
+ case PIPE_PRIM_LINES:
+ ok = (nr >= 2);
+ break;
+ case PIPE_PRIM_LINE_STRIP:
+ case PIPE_PRIM_LINE_LOOP:
+ ok = (nr >= 2);
+ break;
+ case PIPE_PRIM_TRIANGLES:
+ ok = (nr >= 3);
+ break;
+ case PIPE_PRIM_TRIANGLE_STRIP:
+ case PIPE_PRIM_TRIANGLE_FAN:
+ case PIPE_PRIM_POLYGON:
+ ok = (nr >= 3);
+ break;
+ case PIPE_PRIM_QUADS:
+ ok = (nr >= 4);
+ break;
+ case PIPE_PRIM_QUAD_STRIP:
+ ok = (nr >= 4);
+ break;
+ default:
+ ok = 0;
+ break;
+ }
+
+ return ok;
+}
+
+
+static INLINE boolean u_trim_pipe_prim( unsigned pipe_prim, unsigned *nr )
+{
+ boolean ok = TRUE;
+
+ switch (pipe_prim) {
+ case PIPE_PRIM_POINTS:
+ ok = (*nr >= 1);
+ break;
+ case PIPE_PRIM_LINES:
+ ok = (*nr >= 2);
+ *nr -= (*nr % 2);
+ break;
+ case PIPE_PRIM_LINE_STRIP:
+ case PIPE_PRIM_LINE_LOOP:
+ ok = (*nr >= 2);
+ break;
+ case PIPE_PRIM_TRIANGLES:
+ ok = (*nr >= 3);
+ *nr -= (*nr % 3);
+ break;
+ case PIPE_PRIM_TRIANGLE_STRIP:
+ case PIPE_PRIM_TRIANGLE_FAN:
+ case PIPE_PRIM_POLYGON:
+ ok = (*nr >= 3);
+ break;
+ case PIPE_PRIM_QUADS:
+ ok = (*nr >= 4);
+ *nr -= (*nr % 4);
+ break;
+ case PIPE_PRIM_QUAD_STRIP:
+ ok = (*nr >= 4);
+ *nr -= (*nr % 2);
+ break;
+ default:
+ ok = 0;
+ break;
+ }
+
+ if (!ok)
+ *nr = 0;
+
+ return ok;
+}
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* U_PRIM_H */
diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c
new file mode 100644
index 0000000000..30f32413d7
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_rect.c
@@ -0,0 +1,329 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Rectangle-related helper functions. + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_format.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/u_rect.h" + + +/** + * Copy 2D rect from one place to another. + * Position and sizes are in pixels. + * src_pitch may be negative to do vertical flip of pixels from source. + */ +void +pipe_copy_rect(ubyte * dst, + const struct pipe_format_block *block, + unsigned dst_stride, + unsigned dst_x, + unsigned dst_y, + unsigned width, + unsigned height, + const ubyte * src, + int src_stride, + unsigned src_x, + int src_y) +{ + unsigned i; + int src_stride_pos = src_stride < 0 ? 
-src_stride : src_stride; + + assert(block->size > 0); + assert(block->width > 0); + assert(block->height > 0); + assert(src_x >= 0); + assert(src_y >= 0); + assert(dst_x >= 0); + assert(dst_y >= 0); + + dst_x /= block->width; + dst_y /= block->height; + width = (width + block->width - 1)/block->width; + height = (height + block->height - 1)/block->height; + src_x /= block->width; + src_y /= block->height; + + dst += dst_x * block->size; + src += src_x * block->size; + dst += dst_y * dst_stride; + src += src_y * src_stride_pos; + width *= block->size; + + if (width == dst_stride && width == src_stride) + memcpy(dst, src, height * width); + else { + for (i = 0; i < height; i++) { + memcpy(dst, src, width); + dst += dst_stride; + src += src_stride; + } + } +} + +void +pipe_fill_rect(ubyte * dst, + const struct pipe_format_block *block, + unsigned dst_stride, + unsigned dst_x, + unsigned dst_y, + unsigned width, + unsigned height, + uint32_t value) +{ + unsigned i, j; + unsigned width_size; + + assert(block->size > 0); + assert(block->width > 0); + assert(block->height > 0); + assert(dst_x >= 0); + assert(dst_y >= 0); + + dst_x /= block->width; + dst_y /= block->height; + width = (width + block->width - 1)/block->width; + height = (height + block->height - 1)/block->height; + + dst += dst_x * block->size; + dst += dst_y * dst_stride; + width_size = width * block->size; + + switch (block->size) { + case 1: + if(dst_stride == width_size) + memset(dst, (ubyte) value, height * width_size); + else { + for (i = 0; i < height; i++) { + memset(dst, (ubyte) value, width_size); + dst += dst_stride; + } + } + break; + case 2: + for (i = 0; i < height; i++) { + uint16_t *row = (uint16_t *)dst; + for (j = 0; j < width; j++) + *row++ = (uint16_t) value; + dst += dst_stride; + } + break; + case 4: + for (i = 0; i < height; i++) { + uint32_t *row = (uint32_t *)dst; + for (j = 0; j < width; j++) + *row++ = value; + dst += dst_stride; + } + break; + default: + assert(0); + break; + } +} + + + +/** + * Fallback function for pipe->surface_copy(). + * Note: (X,Y)=(0,0) is always the upper-left corner. + * if do_flip, flip the image vertically on its way from src rect to dst rect. + * XXX should probably put this in new u_surface.c file... 
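+ * (The flip is implemented by handing pipe_copy_rect() a negative source
+ * stride together with an adjusted source Y, as in the call below.)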
+ */ +void +util_surface_copy(struct pipe_context *pipe, + boolean do_flip, + struct pipe_surface *dst, + unsigned dst_x, unsigned dst_y, + struct pipe_surface *src, + unsigned src_x, unsigned src_y, + unsigned w, unsigned h) +{ + struct pipe_screen *screen = pipe->screen; + struct pipe_surface *new_src = NULL, *new_dst = NULL; + void *dst_map; + const void *src_map; + + assert(dst->block.size == src->block.size); + assert(dst->block.width == src->block.width); + assert(dst->block.height == src->block.height); + + if ((src->usage & PIPE_BUFFER_USAGE_CPU_READ) == 0) { + /* Need to create new src surface which is CPU readable */ + assert(src->texture); + if (!src->texture) + return; + new_src = screen->get_tex_surface(screen, + src->texture, + src->face, + src->level, + src->zslice, + PIPE_BUFFER_USAGE_CPU_READ); + src = new_src; + } + + if ((dst->usage & PIPE_BUFFER_USAGE_CPU_WRITE) == 0) { + /* Need to create new dst surface which is CPU writable */ + assert(dst->texture); + if (!dst->texture) + return; + new_dst = screen->get_tex_surface(screen, + dst->texture, + dst->face, + dst->level, + dst->zslice, + PIPE_BUFFER_USAGE_CPU_WRITE); + dst = new_dst; + } + + src_map = pipe->screen->surface_map(screen, + src, PIPE_BUFFER_USAGE_CPU_READ); + dst_map = pipe->screen->surface_map(screen, + dst, PIPE_BUFFER_USAGE_CPU_WRITE); + + assert(src_map); + assert(dst_map); + + if (src_map && dst_map) { + /* If do_flip, invert src_y position and pass negative src stride */ + pipe_copy_rect(dst_map, + &dst->block, + dst->stride, + dst_x, dst_y, + w, h, + src_map, + do_flip ? -(int) src->stride : src->stride, + src_x, + do_flip ? w - src_y : src_y); + } + + pipe->screen->surface_unmap(pipe->screen, src); + pipe->screen->surface_unmap(pipe->screen, dst); + + if (new_src) + screen->tex_surface_release(screen, &new_src); + if (new_dst) + screen->tex_surface_release(screen, &new_dst); +} + + + +static void * +get_pointer(struct pipe_surface *dst, void *dst_map, unsigned x, unsigned y) +{ + return (char *)dst_map + + y / dst->block.height * dst->stride + + x / dst->block.width * dst->block.size; +} + + +#define UBYTE_TO_USHORT(B) ((B) | ((B) << 8)) + + +/** + * Fallback for pipe->surface_fill() function. + * XXX should probably put this in new u_surface.c file... 
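+ * The 'value' argument is a pixel already packed in dst's format, e.g. the
+ * result of util_pack_color() from u_pack_color.h.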
+ */ +void +util_surface_fill(struct pipe_context *pipe, + struct pipe_surface *dst, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height, unsigned value) +{ + struct pipe_screen *screen = pipe->screen; + struct pipe_surface *new_dst = NULL; + void *dst_map; + + if ((dst->usage & PIPE_BUFFER_USAGE_CPU_WRITE) == 0) { + /* Need to create new dst surface which is CPU writable */ + assert(dst->texture); + if (!dst->texture) + return; + new_dst = screen->get_tex_surface(screen, + dst->texture, + dst->face, + dst->level, + dst->zslice, + PIPE_BUFFER_USAGE_CPU_WRITE); + dst = new_dst; + } + + dst_map = pipe->screen->surface_map(screen, + dst, PIPE_BUFFER_USAGE_CPU_WRITE); + + assert(dst_map); + + if (dst_map) { + assert(dst->stride > 0); + + switch (dst->block.size) { + case 1: + case 2: + case 4: + pipe_fill_rect(dst_map, &dst->block, dst->stride, + dstx, dsty, width, height, value); + break; + case 8: + { + /* expand the 4-byte clear value to an 8-byte value */ + ushort *row = (ushort *) get_pointer(dst, dst_map, dstx, dsty); + ushort val0 = UBYTE_TO_USHORT((value >> 0) & 0xff); + ushort val1 = UBYTE_TO_USHORT((value >> 8) & 0xff); + ushort val2 = UBYTE_TO_USHORT((value >> 16) & 0xff); + ushort val3 = UBYTE_TO_USHORT((value >> 24) & 0xff); + unsigned i, j; + val0 = (val0 << 8) | val0; + val1 = (val1 << 8) | val1; + val2 = (val2 << 8) | val2; + val3 = (val3 << 8) | val3; + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + row[j*4+0] = val0; + row[j*4+1] = val1; + row[j*4+2] = val2; + row[j*4+3] = val3; + } + row += dst->stride/2; + } + } + break; + default: + assert(0); + break; + } + } + + pipe->screen->surface_unmap(pipe->screen, dst); + + if (new_dst) + screen->tex_surface_release(screen, &new_dst); +} diff --git a/src/gallium/auxiliary/util/u_rect.h b/src/gallium/auxiliary/util/u_rect.h new file mode 100644 index 0000000000..59e842e16d --- /dev/null +++ b/src/gallium/auxiliary/util/u_rect.h @@ -0,0 +1,72 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * Pipe copy/fill rect helpers. 
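+ *
+ * A minimal sketch of clearing a 16x16 region of a mapped 32bpp surface
+ * (the dst pointer, stride and clear value here are illustrative only):
+ *
+ *   struct pipe_format_block block;
+ *   block.size = 4;
+ *   block.width = 1;
+ *   block.height = 1;
+ *   pipe_fill_rect(dst, &block, dst_stride, 0, 0, 16, 16, 0xffffffff);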
+ */ + + +#ifndef U_RECT_H +#define U_RECT_H + + +#include "pipe/p_format.h" + +struct pipe_context; +struct pipe_surface; + + +extern void +pipe_copy_rect(ubyte * dst, const struct pipe_format_block *block, + unsigned dst_stride, unsigned dst_x, unsigned dst_y, + unsigned width, unsigned height, const ubyte * src, + int src_stride, unsigned src_x, int src_y); + +extern void +pipe_fill_rect(ubyte * dst, const struct pipe_format_block *block, + unsigned dst_stride, unsigned dst_x, unsigned dst_y, + unsigned width, unsigned height, uint32_t value); + + +extern void +util_surface_copy(struct pipe_context *pipe, + boolean do_flip, + struct pipe_surface *dst, + unsigned dst_x, unsigned dst_y, + struct pipe_surface *src, + unsigned src_x, unsigned src_y, + unsigned w, unsigned h); + +extern void +util_surface_fill(struct pipe_context *pipe, + struct pipe_surface *dst, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height, unsigned value); + + +#endif /* U_RECT_H */ diff --git a/src/gallium/auxiliary/util/u_simple_list.h b/src/gallium/auxiliary/util/u_simple_list.h new file mode 100644 index 0000000000..f5f43b0faa --- /dev/null +++ b/src/gallium/auxiliary/util/u_simple_list.h @@ -0,0 +1,197 @@ +/** + * \file simple_list.h + * Simple macros for type-safe, intrusive lists. + * + * Intended to work with a list sentinal which is created as an empty + * list. Insert & delete are O(1). + * + * \author + * (C) 1997, Keith Whitwell + */ + +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +#ifndef _U_SIMPLE_LIST_H_ +#define _U_SIMPLE_LIST_H_ + +/** + * Remove an element from list. + * + * \param elem element to remove. + */ +#define remove_from_list(elem) \ +do { \ + (elem)->next->prev = (elem)->prev; \ + (elem)->prev->next = (elem)->next; \ +} while (0) + +/** + * Insert an element to the list head. + * + * \param list list. + * \param elem element to insert. + */ +#define insert_at_head(list, elem) \ +do { \ + (elem)->prev = list; \ + (elem)->next = (list)->next; \ + (list)->next->prev = elem; \ + (list)->next = elem; \ +} while(0) + +/** + * Insert an element to the list tail. + * + * \param list list. + * \param elem element to insert. + */ +#define insert_at_tail(list, elem) \ +do { \ + (elem)->next = list; \ + (elem)->prev = (list)->prev; \ + (list)->prev->next = elem; \ + (list)->prev = elem; \ +} while(0) + +/** + * Move an element to the list head. 
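+ * Equivalent to remove_from_list() followed by insert_at_head(); see the
+ * macro body below.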
+ * + * \param list list. + * \param elem element to move. + */ +#define move_to_head(list, elem) \ +do { \ + remove_from_list(elem); \ + insert_at_head(list, elem); \ +} while (0) + +/** + * Move an element to the list tail. + * + * \param list list. + * \param elem element to move. + */ +#define move_to_tail(list, elem) \ +do { \ + remove_from_list(elem); \ + insert_at_tail(list, elem); \ +} while (0) + +/** + * Make a empty list empty. + * + * \param sentinal list (sentinal element). + */ +#define make_empty_list(sentinal) \ +do { \ + (sentinal)->next = sentinal; \ + (sentinal)->prev = sentinal; \ +} while (0) + +/** + * Get list first element. + * + * \param list list. + * + * \return pointer to first element. + */ +#define first_elem(list) ((list)->next) + +/** + * Get list last element. + * + * \param list list. + * + * \return pointer to last element. + */ +#define last_elem(list) ((list)->prev) + +/** + * Get next element. + * + * \param elem element. + * + * \return pointer to next element. + */ +#define next_elem(elem) ((elem)->next) + +/** + * Get previous element. + * + * \param elem element. + * + * \return pointer to previous element. + */ +#define prev_elem(elem) ((elem)->prev) + +/** + * Test whether element is at end of the list. + * + * \param list list. + * \param elem element. + * + * \return non-zero if element is at end of list, or zero otherwise. + */ +#define at_end(list, elem) ((elem) == (list)) + +/** + * Test if a list is empty. + * + * \param list list. + * + * \return non-zero if list empty, or zero otherwise. + */ +#define is_empty_list(list) ((list)->next == (list)) + +/** + * Walk through the elements of a list. + * + * \param ptr pointer to the current element. + * \param list list. + * + * \note It should be followed by a { } block or a single statement, as in a \c + * for loop. + */ +#define foreach(ptr, list) \ + for( ptr=(list)->next ; ptr!=list ; ptr=(ptr)->next ) + +/** + * Walk through the elements of a list. + * + * Same as #foreach but lets you unlink the current value during a list + * traversal. Useful for freeing a list, element by element. + * + * \param ptr pointer to the current element. + * \param t temporary pointer. + * \param list list. + * + * \note It should be followed by a { } block or a single statement, as in a \c + * for loop. + */ +#define foreach_s(ptr, t, list) \ + for(ptr=(list)->next,t=(ptr)->next; list != ptr; ptr=t, t=(t)->next) + +#endif /* _U_SIMPLE_LIST_H_ */ diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c new file mode 100644 index 0000000000..f06d13c2c4 --- /dev/null +++ b/src/gallium/auxiliary/util/u_simple_shaders.c @@ -0,0 +1,361 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Simple vertex/fragment shader generators. + * + * @author Brian Paul + */ + + +#include "pipe/p_context.h" +#include "pipe/p_debug.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" +#include "pipe/p_shader_tokens.h" + +#include "util/u_memory.h" +#include "util/u_simple_shaders.h" + +#include "tgsi/tgsi_build.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_parse.h" + + + +/** + * Make simple vertex pass-through shader. + */ +void * +util_make_vertex_passthrough_shader(struct pipe_context *pipe, + uint num_attribs, + const uint *semantic_names, + const uint *semantic_indexes, + struct pipe_shader_state *shader) + +{ + uint maxTokens = 100; + struct tgsi_token *tokens; + struct tgsi_header *header; + struct tgsi_processor *processor; + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + const uint procType = TGSI_PROCESSOR_VERTEX; + uint ti, i; + + tokens = (struct tgsi_token *) MALLOC(maxTokens * sizeof(tokens[0])); + + /* shader header + */ + *(struct tgsi_version *) &tokens[0] = tgsi_build_version(); + + header = (struct tgsi_header *) &tokens[1]; + *header = tgsi_build_header(); + + processor = (struct tgsi_processor *) &tokens[2]; + *processor = tgsi_build_processor( procType, header ); + + ti = 3; + + /* declare inputs */ + for (i = 0; i < num_attribs; i++) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_INPUT; + + decl.Declaration.Semantic = 1; + decl.Semantic.SemanticName = semantic_names[i]; + decl.Semantic.SemanticIndex = semantic_indexes[i]; + + decl.DeclarationRange.First = + decl.DeclarationRange.Last = i; + ti += tgsi_build_full_declaration(&decl, + &tokens[ti], + header, + maxTokens - ti); + } + + /* declare outputs */ + for (i = 0; i < num_attribs; i++) { + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_OUTPUT; + decl.Declaration.Semantic = 1; + decl.Semantic.SemanticName = semantic_names[i]; + decl.Semantic.SemanticIndex = semantic_indexes[i]; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = i; + ti += tgsi_build_full_declaration(&decl, + &tokens[ti], + header, + maxTokens - ti); + } + + /* emit MOV instructions */ + for (i = 0; i < num_attribs; i++) { + /* MOVE out[i], in[i]; */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_MOV; + inst.Instruction.NumDstRegs = 1; + inst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_OUTPUT; + inst.FullDstRegisters[0].DstRegister.Index = i; + inst.Instruction.NumSrcRegs = 1; + inst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_INPUT; + inst.FullSrcRegisters[0].SrcRegister.Index = i; + ti += tgsi_build_full_instruction(&inst, + &tokens[ti], + header, + maxTokens - ti ); + } + + /* END instruction */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_END; + inst.Instruction.NumDstRegs = 0; + inst.Instruction.NumSrcRegs = 0; + ti += 
tgsi_build_full_instruction(&inst, + &tokens[ti], + header, + maxTokens - ti ); + +#if 0 /*debug*/ + tgsi_dump(tokens, 0); +#endif + + shader->tokens = tokens; + /*shader->num_tokens = ti;*/ + + return pipe->create_vs_state(pipe, shader); +} + + + + +/** + * Make simple fragment texture shader: + * TEX OUT[0], IN[0], SAMP[0], 2D; + * END; + */ +void * +util_make_fragment_tex_shader(struct pipe_context *pipe, + struct pipe_shader_state *shader) +{ + uint maxTokens = 100; + struct tgsi_token *tokens; + struct tgsi_header *header; + struct tgsi_processor *processor; + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + const uint procType = TGSI_PROCESSOR_FRAGMENT; + uint ti; + + tokens = (struct tgsi_token *) MALLOC(maxTokens * sizeof(tokens[0])); + + /* shader header + */ + *(struct tgsi_version *) &tokens[0] = tgsi_build_version(); + + header = (struct tgsi_header *) &tokens[1]; + *header = tgsi_build_header(); + + processor = (struct tgsi_processor *) &tokens[2]; + *processor = tgsi_build_processor( procType, header ); + + ti = 3; + + /* declare TEX[0] input */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_INPUT; + /* XXX this could be linear... */ + decl.Declaration.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE; + decl.Declaration.Semantic = 1; + decl.Semantic.SemanticName = TGSI_SEMANTIC_GENERIC; + decl.Semantic.SemanticIndex = 0; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = 0; + ti += tgsi_build_full_declaration(&decl, + &tokens[ti], + header, + maxTokens - ti); + + /* declare color[0] output */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_OUTPUT; + decl.Declaration.Semantic = 1; + decl.Semantic.SemanticName = TGSI_SEMANTIC_COLOR; + decl.Semantic.SemanticIndex = 0; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = 0; + ti += tgsi_build_full_declaration(&decl, + &tokens[ti], + header, + maxTokens - ti); + + /* declare sampler */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_SAMPLER; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = 0; + ti += tgsi_build_full_declaration(&decl, + &tokens[ti], + header, + maxTokens - ti); + + /* TEX instruction */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_TEX; + inst.Instruction.NumDstRegs = 1; + inst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_OUTPUT; + inst.FullDstRegisters[0].DstRegister.Index = 0; + inst.Instruction.NumSrcRegs = 2; + inst.InstructionExtTexture.Texture = TGSI_TEXTURE_2D; + inst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_INPUT; + inst.FullSrcRegisters[0].SrcRegister.Index = 0; + inst.FullSrcRegisters[1].SrcRegister.File = TGSI_FILE_SAMPLER; + inst.FullSrcRegisters[1].SrcRegister.Index = 0; + ti += tgsi_build_full_instruction(&inst, + &tokens[ti], + header, + maxTokens - ti ); + + /* END instruction */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_END; + inst.Instruction.NumDstRegs = 0; + inst.Instruction.NumSrcRegs = 0; + ti += tgsi_build_full_instruction(&inst, + &tokens[ti], + header, + maxTokens - ti ); + +#if 0 /*debug*/ + tgsi_dump(tokens, 0); +#endif + + shader->tokens = tokens; + /*shader->num_tokens = ti;*/ + + return pipe->create_fs_state(pipe, shader); +} + + + + + +/** + * Make simple fragment color pass-through shader. 
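+ * In TGSI terms it is simply:
+ *  MOV OUT[0], IN[0];
+ *  END;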
+ */ +void * +util_make_fragment_passthrough_shader(struct pipe_context *pipe, + struct pipe_shader_state *shader) +{ + uint maxTokens = 40; + struct tgsi_token *tokens; + struct tgsi_header *header; + struct tgsi_processor *processor; + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + const uint procType = TGSI_PROCESSOR_FRAGMENT; + uint ti; + + tokens = (struct tgsi_token *) MALLOC(maxTokens * sizeof(tokens[0])); + + /* shader header + */ + *(struct tgsi_version *) &tokens[0] = tgsi_build_version(); + + header = (struct tgsi_header *) &tokens[1]; + *header = tgsi_build_header(); + + processor = (struct tgsi_processor *) &tokens[2]; + *processor = tgsi_build_processor( procType, header ); + + ti = 3; + + /* declare input */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_INPUT; + decl.Declaration.Semantic = 1; + decl.Semantic.SemanticName = TGSI_SEMANTIC_COLOR; + decl.Semantic.SemanticIndex = 0; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = 0; + ti += tgsi_build_full_declaration(&decl, + &tokens[ti], + header, + maxTokens - ti); + + /* declare output */ + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_OUTPUT; + decl.Declaration.Semantic = 1; + decl.Semantic.SemanticName = TGSI_SEMANTIC_COLOR; + decl.Semantic.SemanticIndex = 0; + decl.DeclarationRange.First = + decl.DeclarationRange.Last = 0; + ti += tgsi_build_full_declaration(&decl, + &tokens[ti], + header, + maxTokens - ti); + + + /* MOVE out[0], in[0]; */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_MOV; + inst.Instruction.NumDstRegs = 1; + inst.FullDstRegisters[0].DstRegister.File = TGSI_FILE_OUTPUT; + inst.FullDstRegisters[0].DstRegister.Index = 0; + inst.Instruction.NumSrcRegs = 1; + inst.FullSrcRegisters[0].SrcRegister.File = TGSI_FILE_INPUT; + inst.FullSrcRegisters[0].SrcRegister.Index = 0; + ti += tgsi_build_full_instruction(&inst, + &tokens[ti], + header, + maxTokens - ti ); + + /* END instruction */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_END; + inst.Instruction.NumDstRegs = 0; + inst.Instruction.NumSrcRegs = 0; + ti += tgsi_build_full_instruction(&inst, + &tokens[ti], + header, + maxTokens - ti ); + + assert(ti < maxTokens); + +#if 0 /*debug*/ + tgsi_dump(tokens, 0); +#endif + + shader->tokens = tokens; + /*shader->num_tokens = ti;*/ + + return pipe->create_fs_state(pipe, shader); +} + diff --git a/src/gallium/auxiliary/util/u_simple_shaders.h b/src/gallium/auxiliary/util/u_simple_shaders.h new file mode 100644 index 0000000000..8ca4977d71 --- /dev/null +++ b/src/gallium/auxiliary/util/u_simple_shaders.h @@ -0,0 +1,68 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef U_SIMPLE_SHADERS_H +#define U_SIMPLE_SHADERS_H + + +#include "pipe/p_compiler.h" + + +struct pipe_context; +struct pipe_shader_state; + + +#ifdef __cplusplus +extern "C" { +#endif + + +extern void * +util_make_vertex_passthrough_shader(struct pipe_context *pipe, + uint num_attribs, + const uint *semantic_names, + const uint *semantic_indexes, + struct pipe_shader_state *shader); + + +extern void * +util_make_fragment_tex_shader(struct pipe_context *pipe, + struct pipe_shader_state *shader); + + +extern void * +util_make_fragment_passthrough_shader(struct pipe_context *pipe, + struct pipe_shader_state *shader); + + +#ifdef __cplusplus +} +#endif + + +#endif diff --git a/src/gallium/auxiliary/util/u_snprintf.c b/src/gallium/auxiliary/util/u_snprintf.c new file mode 100644 index 0000000000..0d54299b28 --- /dev/null +++ b/src/gallium/auxiliary/util/u_snprintf.c @@ -0,0 +1,1497 @@ +/* + * Copyright (c) 1995 Patrick Powell. + * + * This code is based on code written by Patrick Powell <papowell@astart.com>. + * It may be used for any purpose as long as this notice remains intact on all + * source code distributions. + */ + +/* + * Copyright (c) 2008 Holger Weiss. + * + * This version of the code is maintained by Holger Weiss <holger@jhweiss.de>. + * My changes to the code may freely be used, modified and/or redistributed for + * any purpose. It would be nice if additions and fixes to this file (including + * trivial code cleanups) would be sent back in order to let me include them in + * the version available at <http://www.jhweiss.de/software/snprintf.html>. + * However, this is not a requirement for using or redistributing (possibly + * modified) versions of this file, nor is leaving this notice intact mandatory. + */ + +/* + * History + * + * 2008-01-20 Holger Weiss <holger@jhweiss.de> for C99-snprintf 1.1: + * + * Fixed the detection of infinite floating point values on IRIX (and + * possibly other systems) and applied another few minor cleanups. + * + * 2008-01-06 Holger Weiss <holger@jhweiss.de> for C99-snprintf 1.0: + * + * Added a lot of new features, fixed many bugs, and incorporated various + * improvements done by Andrew Tridgell <tridge@samba.org>, Russ Allbery + * <rra@stanford.edu>, Hrvoje Niksic <hniksic@xemacs.org>, Damien Miller + * <djm@mindrot.org>, and others for the Samba, INN, Wget, and OpenSSH + * projects. 
The additions include: support the "e", "E", "g", "G", and + * "F" conversion specifiers (and use conversion style "f" or "F" for the + * still unsupported "a" and "A" specifiers); support the "hh", "ll", "j", + * "t", and "z" length modifiers; support the "#" flag and the (non-C99) + * "'" flag; use localeconv(3) (if available) to get both the current + * locale's decimal point character and the separator between groups of + * digits; fix the handling of various corner cases of field width and + * precision specifications; fix various floating point conversion bugs; + * handle infinite and NaN floating point values; don't attempt to write to + * the output buffer (which may be NULL) if a size of zero was specified; + * check for integer overflow of the field width, precision, and return + * values and during the floating point conversion; use the OUTCHAR() macro + * instead of a function for better performance; provide asprintf(3) and + * vasprintf(3) functions; add new test cases. The replacement functions + * have been renamed to use an "rpl_" prefix, the function calls in the + * main project (and in this file) must be redefined accordingly for each + * replacement function which is needed (by using Autoconf or other means). + * Various other minor improvements have been applied and the coding style + * was cleaned up for consistency. + * + * 2007-07-23 Holger Weiss <holger@jhweiss.de> for Mutt 1.5.13: + * + * C99 compliant snprintf(3) and vsnprintf(3) functions return the number + * of characters that would have been written to a sufficiently sized + * buffer (excluding the '\0'). The original code simply returned the + * length of the resulting output string, so that's been fixed. + * + * 1998-03-05 Michael Elkins <me@mutt.org> for Mutt 0.90.8: + * + * The original code assumed that both snprintf(3) and vsnprintf(3) were + * missing. Some systems only have snprintf(3) but not vsnprintf(3), so + * the code is now broken down under HAVE_SNPRINTF and HAVE_VSNPRINTF. + * + * 1998-01-27 Thomas Roessler <roessler@does-not-exist.org> for Mutt 0.89i: + * + * The PGP code was using unsigned hexadecimal formats. Unfortunately, + * unsigned formats simply didn't work. + * + * 1997-10-22 Brandon Long <blong@fiction.net> for Mutt 0.87.1: + * + * Ok, added some minimal floating point support, which means this probably + * requires libm on most operating systems. Don't yet support the exponent + * (e,E) and sigfig (g,G). Also, fmtint() was pretty badly broken, it just + * wasn't being exercised in ways which showed it, so that's been fixed. + * Also, formatted the code to Mutt conventions, and removed dead code left + * over from the original. Also, there is now a builtin-test, run with: + * gcc -DTEST_SNPRINTF -o snprintf snprintf.c -lm && ./snprintf + * + * 2996-09-15 Brandon Long <blong@fiction.net> for Mutt 0.43: + * + * This was ugly. It is still ugly. I opted out of floating point + * numbers, but the formatter understands just about everything from the + * normal C string format, at least as far as I can tell from the Solaris + * 2.5 printf(3S) man page. + */ + +/* + * ToDo + * + * - Add wide character support. + * - Add support for "%a" and "%A" conversions. + * - Create test routines which predefine the expected results. 
Our test cases + * usually expose bugs in system implementations rather than in ours :-) + */ + +/* + * Usage + * + * 1) The following preprocessor macros should be defined to 1 if the feature or + * file in question is available on the target system (by using Autoconf or + * other means), though basic functionality should be available as long as + * HAVE_STDARG_H and HAVE_STDLIB_H are defined correctly: + * + * HAVE_VSNPRINTF + * HAVE_SNPRINTF + * HAVE_VASPRINTF + * HAVE_ASPRINTF + * HAVE_STDARG_H + * HAVE_STDDEF_H + * HAVE_STDINT_H + * HAVE_STDLIB_H + * HAVE_INTTYPES_H + * HAVE_LOCALE_H + * HAVE_LOCALECONV + * HAVE_LCONV_DECIMAL_POINT + * HAVE_LCONV_THOUSANDS_SEP + * HAVE_LONG_DOUBLE + * HAVE_LONG_LONG_INT + * HAVE_UNSIGNED_LONG_LONG_INT + * HAVE_INTMAX_T + * HAVE_UINTMAX_T + * HAVE_UINTPTR_T + * HAVE_PTRDIFF_T + * HAVE_VA_COPY + * HAVE___VA_COPY + * + * 2) The calls to the functions which should be replaced must be redefined + * throughout the project files (by using Autoconf or other means): + * + * #define vsnprintf rpl_vsnprintf + * #define snprintf rpl_snprintf + * #define vasprintf rpl_vasprintf + * #define asprintf rpl_asprintf + * + * 3) The required replacement functions should be declared in some header file + * included throughout the project files: + * + * #if HAVE_CONFIG_H + * #include <config.h> + * #endif + * #if HAVE_STDARG_H + * #include <stdarg.h> + * #if !HAVE_VSNPRINTF + * int rpl_vsnprintf(char *, size_t, const char *, va_list); + * #endif + * #if !HAVE_SNPRINTF + * int rpl_snprintf(char *, size_t, const char *, ...); + * #endif + * #if !HAVE_VASPRINTF + * int rpl_vasprintf(char **, const char *, va_list); + * #endif + * #if !HAVE_ASPRINTF + * int rpl_asprintf(char **, const char *, ...); + * #endif + * #endif + * + * Autoconf macros for handling step 1 and step 2 are available at + * <http://www.jhweiss.de/software/snprintf.html>. + */ + +#include "pipe/p_config.h" + +#if HAVE_CONFIG_H +#include <config.h> +#else +#ifdef WIN32 +#define vsnprintf util_vsnprintf +#define snprintf util_snprintf +#define HAVE_VSNPRINTF 0 +#define HAVE_SNPRINTF 0 +#define HAVE_VASPRINTF 1 /* not needed */ +#define HAVE_ASPRINTF 1 /* not needed */ +#define HAVE_STDARG_H 1 +#define HAVE_STDDEF_H 1 +#define HAVE_STDINT_H 0 +#define HAVE_STDLIB_H 1 +#define HAVE_INTTYPES_H 0 +#define HAVE_LOCALE_H 0 +#define HAVE_LOCALECONV 0 +#define HAVE_LCONV_DECIMAL_POINT 0 +#define HAVE_LCONV_THOUSANDS_SEP 0 +#define HAVE_LONG_DOUBLE 0 +#define HAVE_LONG_LONG_INT 1 +#define HAVE_UNSIGNED_LONG_LONG_INT 1 +#define HAVE_INTMAX_T 0 +#define HAVE_UINTMAX_T 0 +#define HAVE_UINTPTR_T 1 +#define HAVE_PTRDIFF_T 1 +#define HAVE_VA_COPY 0 +#define HAVE___VA_COPY 0 +#else +#define HAVE_VSNPRINTF 1 +#define HAVE_SNPRINTF 1 +#define HAVE_VASPRINTF 1 +#define HAVE_ASPRINTF 1 +#endif +#endif /* HAVE_CONFIG_H */ + +#if !HAVE_SNPRINTF || !HAVE_VSNPRINTF || !HAVE_ASPRINTF || !HAVE_VASPRINTF +#include <stdio.h> /* For NULL, size_t, vsnprintf(3), and vasprintf(3). */ +#ifdef VA_START +#undef VA_START +#endif /* defined(VA_START) */ +#ifdef VA_SHIFT +#undef VA_SHIFT +#endif /* defined(VA_SHIFT) */ +#if HAVE_STDARG_H +#include <stdarg.h> +#define VA_START(ap, last) va_start(ap, last) +#define VA_SHIFT(ap, value, type) /* No-op for ANSI C. */ +#else /* Assume <varargs.h> is available. */ +#include <varargs.h> +#define VA_START(ap, last) va_start(ap) /* "last" is ignored. 
*/ +#define VA_SHIFT(ap, value, type) value = va_arg(ap, type) +#endif /* HAVE_STDARG_H */ + +#if !HAVE_VASPRINTF +#if HAVE_STDLIB_H +#include <stdlib.h> /* For malloc(3). */ +#endif /* HAVE_STDLIB_H */ +#ifdef VA_COPY +#undef VA_COPY +#endif /* defined(VA_COPY) */ +#ifdef VA_END_COPY +#undef VA_END_COPY +#endif /* defined(VA_END_COPY) */ +#if HAVE_VA_COPY +#define VA_COPY(dest, src) va_copy(dest, src) +#define VA_END_COPY(ap) va_end(ap) +#elif HAVE___VA_COPY +#define VA_COPY(dest, src) __va_copy(dest, src) +#define VA_END_COPY(ap) va_end(ap) +#else +#define VA_COPY(dest, src) (void)mymemcpy(&dest, &src, sizeof(va_list)) +#define VA_END_COPY(ap) /* No-op. */ +#define NEED_MYMEMCPY 1 +static void *mymemcpy(void *, void *, size_t); +#endif /* HAVE_VA_COPY */ +#endif /* !HAVE_VASPRINTF */ + +#if !HAVE_VSNPRINTF +#include <limits.h> /* For *_MAX. */ +#if HAVE_INTTYPES_H +#include <inttypes.h> /* For intmax_t (if not defined in <stdint.h>). */ +#endif /* HAVE_INTTYPES_H */ +#if HAVE_LOCALE_H +#include <locale.h> /* For localeconv(3). */ +#endif /* HAVE_LOCALE_H */ +#if HAVE_STDDEF_H +#include <stddef.h> /* For ptrdiff_t. */ +#endif /* HAVE_STDDEF_H */ +#if HAVE_STDINT_H +#include <stdint.h> /* For intmax_t. */ +#endif /* HAVE_STDINT_H */ + +/* Support for unsigned long long int. We may also need ULLONG_MAX. */ +#ifndef ULONG_MAX /* We may need ULONG_MAX as a fallback. */ +#ifdef UINT_MAX +#define ULONG_MAX UINT_MAX +#else +#define ULONG_MAX INT_MAX +#endif /* defined(UINT_MAX) */ +#endif /* !defined(ULONG_MAX) */ +#ifdef ULLONG +#undef ULLONG +#endif /* defined(ULLONG) */ +#if HAVE_UNSIGNED_LONG_LONG_INT +#define ULLONG unsigned long long int +#ifndef ULLONG_MAX +#define ULLONG_MAX ULONG_MAX +#endif /* !defined(ULLONG_MAX) */ +#else +#define ULLONG unsigned long int +#ifdef ULLONG_MAX +#undef ULLONG_MAX +#endif /* defined(ULLONG_MAX) */ +#define ULLONG_MAX ULONG_MAX +#endif /* HAVE_LONG_LONG_INT */ + +/* Support for uintmax_t. We also need UINTMAX_MAX. */ +#ifdef UINTMAX_T +#undef UINTMAX_T +#endif /* defined(UINTMAX_T) */ +#if HAVE_UINTMAX_T || defined(uintmax_t) +#define UINTMAX_T uintmax_t +#ifndef UINTMAX_MAX +#define UINTMAX_MAX ULLONG_MAX +#endif /* !defined(UINTMAX_MAX) */ +#else +#define UINTMAX_T ULLONG +#ifdef UINTMAX_MAX +#undef UINTMAX_MAX +#endif /* defined(UINTMAX_MAX) */ +#define UINTMAX_MAX ULLONG_MAX +#endif /* HAVE_UINTMAX_T || defined(uintmax_t) */ + +/* Support for long double. */ +#ifndef LDOUBLE +#if HAVE_LONG_DOUBLE +#define LDOUBLE long double +#else +#define LDOUBLE double +#endif /* HAVE_LONG_DOUBLE */ +#endif /* !defined(LDOUBLE) */ + +/* Support for long long int. */ +#ifndef LLONG +#if HAVE_LONG_LONG_INT +#define LLONG long long int +#else +#define LLONG long int +#endif /* HAVE_LONG_LONG_INT */ +#endif /* !defined(LLONG) */ + +/* Support for intmax_t. */ +#ifndef INTMAX_T +#if HAVE_INTMAX_T || defined(intmax_t) +#define INTMAX_T intmax_t +#else +#define INTMAX_T LLONG +#endif /* HAVE_INTMAX_T || defined(intmax_t) */ +#endif /* !defined(INTMAX_T) */ + +/* Support for uintptr_t. */ +#ifndef UINTPTR_T +#if HAVE_UINTPTR_T || defined(uintptr_t) +#define UINTPTR_T uintptr_t +#else +#define UINTPTR_T unsigned long int +#endif /* HAVE_UINTPTR_T || defined(uintptr_t) */ +#endif /* !defined(UINTPTR_T) */ + +/* WinCE5.0 does not have uintptr_t defined */ +#if (_WIN32_WCE < 600) +#ifdef UINTPTR_T +#undef UINTPTR_T +#endif +#define UINTPTR_T unsigned long int +#endif + + +/* Support for ptrdiff_t. 
*/ +#ifndef PTRDIFF_T +#if HAVE_PTRDIFF_T || defined(ptrdiff_t) +#define PTRDIFF_T ptrdiff_t +#else +#define PTRDIFF_T long int +#endif /* HAVE_PTRDIFF_T || defined(ptrdiff_t) */ +#endif /* !defined(PTRDIFF_T) */ + +/* + * We need an unsigned integer type corresponding to ptrdiff_t (cf. C99: + * 7.19.6.1, 7). However, we'll simply use PTRDIFF_T and convert it to an + * unsigned type if necessary. This should work just fine in practice. + */ +#ifndef UPTRDIFF_T +#define UPTRDIFF_T PTRDIFF_T +#endif /* !defined(UPTRDIFF_T) */ + +/* + * We need a signed integer type corresponding to size_t (cf. C99: 7.19.6.1, 7). + * However, we'll simply use size_t and convert it to a signed type if + * necessary. This should work just fine in practice. + */ +#ifndef SSIZE_T +#define SSIZE_T size_t +#endif /* !defined(SSIZE_T) */ + +/* Either ERANGE or E2BIG should be available everywhere. */ +#ifndef ERANGE +#define ERANGE E2BIG +#endif /* !defined(ERANGE) */ +#ifndef EOVERFLOW +#define EOVERFLOW ERANGE +#endif /* !defined(EOVERFLOW) */ + +/* + * Buffer size to hold the octal string representation of UINT128_MAX without + * nul-termination ("3777777777777777777777777777777777777777777"). + */ +#ifdef MAX_CONVERT_LENGTH +#undef MAX_CONVERT_LENGTH +#endif /* defined(MAX_CONVERT_LENGTH) */ +#define MAX_CONVERT_LENGTH 43 + +/* Format read states. */ +#define PRINT_S_DEFAULT 0 +#define PRINT_S_FLAGS 1 +#define PRINT_S_WIDTH 2 +#define PRINT_S_DOT 3 +#define PRINT_S_PRECISION 4 +#define PRINT_S_MOD 5 +#define PRINT_S_CONV 6 + +/* Format flags. */ +#define PRINT_F_MINUS (1 << 0) +#define PRINT_F_PLUS (1 << 1) +#define PRINT_F_SPACE (1 << 2) +#define PRINT_F_NUM (1 << 3) +#define PRINT_F_ZERO (1 << 4) +#define PRINT_F_QUOTE (1 << 5) +#define PRINT_F_UP (1 << 6) +#define PRINT_F_UNSIGNED (1 << 7) +#define PRINT_F_TYPE_G (1 << 8) +#define PRINT_F_TYPE_E (1 << 9) + +/* Conversion flags. */ +#define PRINT_C_CHAR 1 +#define PRINT_C_SHORT 2 +#define PRINT_C_LONG 3 +#define PRINT_C_LLONG 4 +#define PRINT_C_LDOUBLE 5 +#define PRINT_C_SIZE 6 +#define PRINT_C_PTRDIFF 7 +#define PRINT_C_INTMAX 8 + +#ifndef MAX +#define MAX(x, y) ((x >= y) ? 
x : y) +#endif /* !defined(MAX) */ +#ifndef CHARTOINT +#define CHARTOINT(ch) (ch - '0') +#endif /* !defined(CHARTOINT) */ +#ifndef ISDIGIT +#define ISDIGIT(ch) ('0' <= (unsigned char)ch && (unsigned char)ch <= '9') +#endif /* !defined(ISDIGIT) */ +#ifndef ISNAN +#define ISNAN(x) (x != x) +#endif /* !defined(ISNAN) */ +#ifndef ISINF +#define ISINF(x) (x != 0.0 && x + x == x) +#endif /* !defined(ISINF) */ + +#ifdef OUTCHAR +#undef OUTCHAR +#endif /* defined(OUTCHAR) */ +#define OUTCHAR(str, len, size, ch) \ +do { \ + if (len + 1 < size) \ + str[len] = ch; \ + (len)++; \ +} while (/* CONSTCOND */ 0) + +static void fmtstr(char *, size_t *, size_t, const char *, int, int, int); +static void fmtint(char *, size_t *, size_t, INTMAX_T, int, int, int, int); +static void fmtflt(char *, size_t *, size_t, LDOUBLE, int, int, int, int *); +static void printsep(char *, size_t *, size_t); +static int getnumsep(int); +static int getexponent(LDOUBLE); +static int convert(UINTMAX_T, char *, size_t, int, int); +static UINTMAX_T cast(LDOUBLE); +static UINTMAX_T myround(LDOUBLE); +static LDOUBLE mypow10(int); + +int +util_vsnprintf(char *str, size_t size, const char *format, va_list args) +{ + LDOUBLE fvalue; + INTMAX_T value; + unsigned char cvalue; + const char *strvalue; + INTMAX_T *intmaxptr; + PTRDIFF_T *ptrdiffptr; + SSIZE_T *sizeptr; + LLONG *llongptr; + long int *longptr; + int *intptr; + short int *shortptr; + signed char *charptr; + size_t len = 0; + int overflow = 0; + int base = 0; + int cflags = 0; + int flags = 0; + int width = 0; + int precision = -1; + int state = PRINT_S_DEFAULT; + char ch = *format++; + + /* + * C99 says: "If `n' is zero, nothing is written, and `s' may be a null + * pointer." (7.19.6.5, 2) We're forgiving and allow a NULL pointer + * even if a size larger than zero was specified. At least NetBSD's + * snprintf(3) does the same, as well as other versions of this file. + * (Though some of these versions will write to a non-NULL buffer even + * if a size of zero was specified, which violates the standard.) + */ + if (str == NULL && size != 0) + size = 0; + + while (ch != '\0') + switch (state) { + case PRINT_S_DEFAULT: + if (ch == '%') + state = PRINT_S_FLAGS; + else + OUTCHAR(str, len, size, ch); + ch = *format++; + break; + case PRINT_S_FLAGS: + switch (ch) { + case '-': + flags |= PRINT_F_MINUS; + ch = *format++; + break; + case '+': + flags |= PRINT_F_PLUS; + ch = *format++; + break; + case ' ': + flags |= PRINT_F_SPACE; + ch = *format++; + break; + case '#': + flags |= PRINT_F_NUM; + ch = *format++; + break; + case '0': + flags |= PRINT_F_ZERO; + ch = *format++; + break; + case '\'': /* SUSv2 flag (not in C99). */ + flags |= PRINT_F_QUOTE; + ch = *format++; + break; + default: + state = PRINT_S_WIDTH; + break; + } + break; + case PRINT_S_WIDTH: + if (ISDIGIT(ch)) { + ch = CHARTOINT(ch); + if (width > (INT_MAX - ch) / 10) { + overflow = 1; + goto out; + } + width = 10 * width + ch; + ch = *format++; + } else if (ch == '*') { + /* + * C99 says: "A negative field width argument is + * taken as a `-' flag followed by a positive + * field width." 
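Note that OUTCHAR() only stores a character while len + 1 < size yet always bumps len, so util_vsnprintf() keeps counting after the buffer is exhausted and can return the length the output would have needed. Callers can use that to size a buffer in two passes; a minimal sketch (not part of the patch; the helper name and format string are made up):

#include "util/u_memory.h"
#include "util/u_string.h"

/* Hypothetical helper: measure first, then format into a right-sized buffer.
 * The result is allocated with MALLOC() and should be released with FREE(). */
static char *
format_dup(const char *name, int value)
{
   char probe[1];   /* deliberately too small; only the return value matters */
   int needed = util_snprintf(probe, sizeof(probe), "%s = %d", name, value);
   char *buf;

   if (needed < 0)
      return NULL;
   buf = MALLOC((size_t)needed + 1);            /* + 1 for the terminating nul */
   if (buf)
      util_snprintf(buf, (size_t)needed + 1, "%s = %d", name, value);
   return buf;
}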
(7.19.6.1, 5) + */ + if ((width = va_arg(args, int)) < 0) { + flags |= PRINT_F_MINUS; + width = -width; + } + ch = *format++; + state = PRINT_S_DOT; + } else + state = PRINT_S_DOT; + break; + case PRINT_S_DOT: + if (ch == '.') { + state = PRINT_S_PRECISION; + ch = *format++; + } else + state = PRINT_S_MOD; + break; + case PRINT_S_PRECISION: + if (precision == -1) + precision = 0; + if (ISDIGIT(ch)) { + ch = CHARTOINT(ch); + if (precision > (INT_MAX - ch) / 10) { + overflow = 1; + goto out; + } + precision = 10 * precision + ch; + ch = *format++; + } else if (ch == '*') { + /* + * C99 says: "A negative precision argument is + * taken as if the precision were omitted." + * (7.19.6.1, 5) + */ + if ((precision = va_arg(args, int)) < 0) + precision = -1; + ch = *format++; + state = PRINT_S_MOD; + } else + state = PRINT_S_MOD; + break; + case PRINT_S_MOD: + switch (ch) { + case 'h': + ch = *format++; + if (ch == 'h') { /* It's a char. */ + ch = *format++; + cflags = PRINT_C_CHAR; + } else + cflags = PRINT_C_SHORT; + break; + case 'l': + ch = *format++; + if (ch == 'l') { /* It's a long long. */ + ch = *format++; + cflags = PRINT_C_LLONG; + } else + cflags = PRINT_C_LONG; + break; + case 'L': + cflags = PRINT_C_LDOUBLE; + ch = *format++; + break; + case 'j': + cflags = PRINT_C_INTMAX; + ch = *format++; + break; + case 't': + cflags = PRINT_C_PTRDIFF; + ch = *format++; + break; + case 'z': + cflags = PRINT_C_SIZE; + ch = *format++; + break; + } + state = PRINT_S_CONV; + break; + case PRINT_S_CONV: + switch (ch) { + case 'd': + /* FALLTHROUGH */ + case 'i': + switch (cflags) { + case PRINT_C_CHAR: + value = (signed char)va_arg(args, int); + break; + case PRINT_C_SHORT: + value = (short int)va_arg(args, int); + break; + case PRINT_C_LONG: + value = va_arg(args, long int); + break; + case PRINT_C_LLONG: + value = va_arg(args, LLONG); + break; + case PRINT_C_SIZE: + value = va_arg(args, SSIZE_T); + break; + case PRINT_C_INTMAX: + value = va_arg(args, INTMAX_T); + break; + case PRINT_C_PTRDIFF: + value = va_arg(args, PTRDIFF_T); + break; + default: + value = va_arg(args, int); + break; + } + fmtint(str, &len, size, value, 10, width, + precision, flags); + break; + case 'X': + flags |= PRINT_F_UP; + /* FALLTHROUGH */ + case 'x': + base = 16; + /* FALLTHROUGH */ + case 'o': + if (base == 0) + base = 8; + /* FALLTHROUGH */ + case 'u': + if (base == 0) + base = 10; + flags |= PRINT_F_UNSIGNED; + switch (cflags) { + case PRINT_C_CHAR: + value = (unsigned char)va_arg(args, + unsigned int); + break; + case PRINT_C_SHORT: + value = (unsigned short int)va_arg(args, + unsigned int); + break; + case PRINT_C_LONG: + value = va_arg(args, unsigned long int); + break; + case PRINT_C_LLONG: + value = va_arg(args, ULLONG); + break; + case PRINT_C_SIZE: + value = va_arg(args, size_t); + break; + case PRINT_C_INTMAX: + value = va_arg(args, UINTMAX_T); + break; + case PRINT_C_PTRDIFF: + value = va_arg(args, UPTRDIFF_T); + break; + default: + value = va_arg(args, unsigned int); + break; + } + fmtint(str, &len, size, value, base, width, + precision, flags); + break; + case 'A': + /* Not yet supported, we'll use "%F". */ + /* FALLTHROUGH */ + case 'F': + flags |= PRINT_F_UP; + case 'a': + /* Not yet supported, we'll use "%f". 
*/ + /* FALLTHROUGH */ + case 'f': + if (cflags == PRINT_C_LDOUBLE) + fvalue = va_arg(args, LDOUBLE); + else + fvalue = va_arg(args, double); + fmtflt(str, &len, size, fvalue, width, + precision, flags, &overflow); + if (overflow) + goto out; + break; + case 'E': + flags |= PRINT_F_UP; + /* FALLTHROUGH */ + case 'e': + flags |= PRINT_F_TYPE_E; + if (cflags == PRINT_C_LDOUBLE) + fvalue = va_arg(args, LDOUBLE); + else + fvalue = va_arg(args, double); + fmtflt(str, &len, size, fvalue, width, + precision, flags, &overflow); + if (overflow) + goto out; + break; + case 'G': + flags |= PRINT_F_UP; + /* FALLTHROUGH */ + case 'g': + flags |= PRINT_F_TYPE_G; + if (cflags == PRINT_C_LDOUBLE) + fvalue = va_arg(args, LDOUBLE); + else + fvalue = va_arg(args, double); + /* + * If the precision is zero, it is treated as + * one (cf. C99: 7.19.6.1, 8). + */ + if (precision == 0) + precision = 1; + fmtflt(str, &len, size, fvalue, width, + precision, flags, &overflow); + if (overflow) + goto out; + break; + case 'c': + cvalue = (unsigned char)va_arg(args, int); + OUTCHAR(str, len, size, cvalue); + break; + case 's': + strvalue = va_arg(args, char *); + fmtstr(str, &len, size, strvalue, width, + precision, flags); + break; + case 'p': + /* + * C99 says: "The value of the pointer is + * converted to a sequence of printing + * characters, in an implementation-defined + * manner." (C99: 7.19.6.1, 8) + */ + if ((strvalue = va_arg(args, void *)) == NULL) + /* + * We use the glibc format. BSD prints + * "0x0", SysV "0". + */ + fmtstr(str, &len, size, "(nil)", width, + -1, flags); + else { + /* + * We use the BSD/glibc format. SysV + * omits the "0x" prefix (which we emit + * using the PRINT_F_NUM flag). + */ + flags |= PRINT_F_NUM; + flags |= PRINT_F_UNSIGNED; + fmtint(str, &len, size, + (UINTPTR_T)strvalue, 16, width, + precision, flags); + } + break; + case 'n': + switch (cflags) { + case PRINT_C_CHAR: + charptr = va_arg(args, signed char *); + *charptr = (signed char)len; + break; + case PRINT_C_SHORT: + shortptr = va_arg(args, short int *); + *shortptr = (short int)len; + break; + case PRINT_C_LONG: + longptr = va_arg(args, long int *); + *longptr = (long int)len; + break; + case PRINT_C_LLONG: + llongptr = va_arg(args, LLONG *); + *llongptr = (LLONG)len; + break; + case PRINT_C_SIZE: + /* + * C99 says that with the "z" length + * modifier, "a following `n' conversion + * specifier applies to a pointer to a + * signed integer type corresponding to + * size_t argument." (7.19.6.1, 7) + */ + sizeptr = va_arg(args, SSIZE_T *); + *sizeptr = len; + break; + case PRINT_C_INTMAX: + intmaxptr = va_arg(args, INTMAX_T *); + *intmaxptr = len; + break; + case PRINT_C_PTRDIFF: + ptrdiffptr = va_arg(args, PTRDIFF_T *); + *ptrdiffptr = len; + break; + default: + intptr = va_arg(args, int *); + *intptr = len; + break; + } + break; + case '%': /* Print a "%" character verbatim. */ + OUTCHAR(str, len, size, ch); + break; + default: /* Skip other characters. */ + break; + } + ch = *format++; + state = PRINT_S_DEFAULT; + base = cflags = flags = width = 0; + precision = -1; + break; + } +out: + if (len < size) + str[len] = '\0'; + else if (size > 0) + str[size - 1] = '\0'; + + if (overflow || len >= INT_MAX) { + return -1; + } + return (int)len; +} + +static void +fmtstr(char *str, size_t *len, size_t size, const char *value, int width, + int precision, int flags) +{ + int padlen, strln; /* Amount to pad. */ + int noprecision = (precision == -1); + + if (value == NULL) /* We're forgiving. 
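A few illustrative calls (a sketch, not part of the patch) showing how the width, precision and '-' handling implemented by fmtstr() below plays out for "%s"; the comments give the output this implementation produces:

#include "util/u_string.h"

static void
width_precision_demo(void)
{
   char buf[32];

   util_snprintf(buf, sizeof(buf), "[%10s]", "abc");      /* "[       abc]" */
   util_snprintf(buf, sizeof(buf), "[%-10s]", "abc");     /* "[abc       ]" */
   util_snprintf(buf, sizeof(buf), "[%.2s]", "abcdef");   /* "[ab]" */
   util_snprintf(buf, sizeof(buf), "[%s]", (char *)NULL); /* "[(null)]" -- the forgiving NULL case handled here */
}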
*/ + value = "(null)"; + + /* If a precision was specified, don't read the string past it. */ + for (strln = 0; value[strln] != '\0' && + (noprecision || strln < precision); strln++) + continue; + + if ((padlen = width - strln) < 0) + padlen = 0; + if (flags & PRINT_F_MINUS) /* Left justify. */ + padlen = -padlen; + + while (padlen > 0) { /* Leading spaces. */ + OUTCHAR(str, *len, size, ' '); + padlen--; + } + while (*value != '\0' && (noprecision || precision-- > 0)) { + OUTCHAR(str, *len, size, *value); + value++; + } + while (padlen < 0) { /* Trailing spaces. */ + OUTCHAR(str, *len, size, ' '); + padlen++; + } +} + +static void +fmtint(char *str, size_t *len, size_t size, INTMAX_T value, int base, int width, + int precision, int flags) +{ + UINTMAX_T uvalue; + char iconvert[MAX_CONVERT_LENGTH]; + char sign = 0; + char hexprefix = 0; + int spadlen = 0; /* Amount to space pad. */ + int zpadlen = 0; /* Amount to zero pad. */ + int pos; + int separators = (flags & PRINT_F_QUOTE); + int noprecision = (precision == -1); + + if (flags & PRINT_F_UNSIGNED) + uvalue = value; + else { + uvalue = (value >= 0) ? value : -value; + if (value < 0) + sign = '-'; + else if (flags & PRINT_F_PLUS) /* Do a sign. */ + sign = '+'; + else if (flags & PRINT_F_SPACE) + sign = ' '; + } + + pos = convert(uvalue, iconvert, sizeof(iconvert), base, + flags & PRINT_F_UP); + + if (flags & PRINT_F_NUM && uvalue != 0) { + /* + * C99 says: "The result is converted to an `alternative form'. + * For `o' conversion, it increases the precision, if and only + * if necessary, to force the first digit of the result to be a + * zero (if the value and precision are both 0, a single 0 is + * printed). For `x' (or `X') conversion, a nonzero result has + * `0x' (or `0X') prefixed to it." (7.19.6.1, 6) + */ + switch (base) { + case 8: + if (precision <= pos) + precision = pos + 1; + break; + case 16: + hexprefix = (flags & PRINT_F_UP) ? 'X' : 'x'; + break; + } + } + + if (separators) /* Get the number of group separators we'll print. */ + separators = getnumsep(pos); + + zpadlen = precision - pos - separators; + spadlen = width /* Minimum field width. */ + - separators /* Number of separators. */ + - MAX(precision, pos) /* Number of integer digits. */ + - ((sign != 0) ? 1 : 0) /* Will we print a sign? */ + - ((hexprefix != 0) ? 2 : 0); /* Will we print a prefix? */ + + if (zpadlen < 0) + zpadlen = 0; + if (spadlen < 0) + spadlen = 0; + + /* + * C99 says: "If the `0' and `-' flags both appear, the `0' flag is + * ignored. For `d', `i', `o', `u', `x', and `X' conversions, if a + * precision is specified, the `0' flag is ignored." (7.19.6.1, 6) + */ + if (flags & PRINT_F_MINUS) /* Left justify. */ + spadlen = -spadlen; + else if (flags & PRINT_F_ZERO && noprecision) { + zpadlen += spadlen; + spadlen = 0; + } + while (spadlen > 0) { /* Leading spaces. */ + OUTCHAR(str, *len, size, ' '); + spadlen--; + } + if (sign != 0) /* Sign. */ + OUTCHAR(str, *len, size, sign); + if (hexprefix != 0) { /* A "0x" or "0X" prefix. */ + OUTCHAR(str, *len, size, '0'); + OUTCHAR(str, *len, size, hexprefix); + } + while (zpadlen > 0) { /* Leading zeros. */ + OUTCHAR(str, *len, size, '0'); + zpadlen--; + } + while (pos > 0) { /* The actual digits. */ + pos--; + OUTCHAR(str, *len, size, iconvert[pos]); + if (separators > 0 && pos > 0 && pos % 3 == 0) + printsep(str, len, size); + } + while (spadlen < 0) { /* Trailing spaces. 
*/ + OUTCHAR(str, *len, size, ' '); + spadlen++; + } +} + +static void +fmtflt(char *str, size_t *len, size_t size, LDOUBLE fvalue, int width, + int precision, int flags, int *overflow) +{ + LDOUBLE ufvalue; + UINTMAX_T intpart; + UINTMAX_T fracpart; + UINTMAX_T mask; + const char *infnan = NULL; + char iconvert[MAX_CONVERT_LENGTH]; + char fconvert[MAX_CONVERT_LENGTH]; + char econvert[4]; /* "e-12" (without nul-termination). */ + char esign = 0; + char sign = 0; + int leadfraczeros = 0; + int exponent = 0; + int emitpoint = 0; + int omitzeros = 0; + int omitcount = 0; + int padlen = 0; + int epos = 0; + int fpos = 0; + int ipos = 0; + int separators = (flags & PRINT_F_QUOTE); + int estyle = (flags & PRINT_F_TYPE_E); +#if HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT + struct lconv *lc = localeconv(); +#endif /* HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT */ + + /* + * AIX' man page says the default is 0, but C99 and at least Solaris' + * and NetBSD's man pages say the default is 6, and sprintf(3) on AIX + * defaults to 6. + */ + if (precision == -1) + precision = 6; + + if (fvalue < 0.0) + sign = '-'; + else if (flags & PRINT_F_PLUS) /* Do a sign. */ + sign = '+'; + else if (flags & PRINT_F_SPACE) + sign = ' '; + + if (ISNAN(fvalue)) + infnan = (flags & PRINT_F_UP) ? "NAN" : "nan"; + else if (ISINF(fvalue)) + infnan = (flags & PRINT_F_UP) ? "INF" : "inf"; + + if (infnan != NULL) { + if (sign != 0) + iconvert[ipos++] = sign; + while (*infnan != '\0') + iconvert[ipos++] = *infnan++; + fmtstr(str, len, size, iconvert, width, ipos, flags); + return; + } + + /* "%e" (or "%E") or "%g" (or "%G") conversion. */ + if (flags & PRINT_F_TYPE_E || flags & PRINT_F_TYPE_G) { + if (flags & PRINT_F_TYPE_G) { + /* + * For "%g" (and "%G") conversions, the precision + * specifies the number of significant digits, which + * includes the digits in the integer part. The + * conversion will or will not be using "e-style" (like + * "%e" or "%E" conversions) depending on the precision + * and on the exponent. However, the exponent can be + * affected by rounding the converted value, so we'll + * leave this decision for later. Until then, we'll + * assume that we're going to do an "e-style" conversion + * (in order to get the exponent calculated). For + * "e-style", the precision must be decremented by one. + */ + precision--; + /* + * For "%g" (and "%G") conversions, trailing zeros are + * removed from the fractional portion of the result + * unless the "#" flag was specified. + */ + if (!(flags & PRINT_F_NUM)) + omitzeros = 1; + } + exponent = getexponent(fvalue); + estyle = 1; + } + +again: + /* + * Sorry, we only support 9, 19, or 38 digits (that is, the number of + * digits of the 32-bit, the 64-bit, or the 128-bit UINTMAX_MAX value + * minus one) past the decimal point due to our conversion method. + */ + switch (sizeof(UINTMAX_T)) { + case 16: + if (precision > 38) + precision = 38; + break; + case 8: + if (precision > 19) + precision = 19; + break; + default: + if (precision > 9) + precision = 9; + break; + } + + ufvalue = (fvalue >= 0.0) ? fvalue : -fvalue; + if (estyle) /* We want exactly one integer digit. */ + ufvalue /= mypow10(exponent); + + if ((intpart = cast(ufvalue)) == UINTMAX_MAX) { + *overflow = 1; + return; + } + + /* + * Factor of ten with the number of digits needed for the fractional + * part. For example, if the precision is 3, the mask will be 1000. 
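A stand-alone sketch of that integer/fraction split (not part of the patch; plain double stands in for LDOUBLE and the sample value is arbitrary):

#include <stdio.h>

int
main(void)
{
   double value = 2.5625;
   int precision = 3;
   double mask = 1000.0;                                  /* 10^precision */
   unsigned long intpart = (unsigned long)value;          /* 2 */
   unsigned long fracpart =
      (unsigned long)(mask * (value - intpart) + 0.5);    /* round(562.5) = 563 */

   if (fracpart >= (unsigned long)mask) {   /* the carry case described below */
      intpart++;
      fracpart = 0;
   }
   printf("%lu.%03lu\n", intpart, fracpart);   /* prints "2.563" */
   return 0;
}

The "%03lu" here plays the same role as the leadfraczeros padding emitted by fmtflt(): without it a fractional part like 7 would print as ".7" instead of ".007".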
+ */ +#if defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) + mask = (unsigned long)mypow10(precision); +#else + mask = (UINTMAX_T)mypow10(precision); +#endif + /* + * We "cheat" by converting the fractional part to integer by + * multiplying by a factor of ten. + */ + if ((fracpart = myround(mask * (ufvalue - intpart))) >= mask) { + /* + * For example, ufvalue = 2.99962, intpart = 2, and mask = 1000 + * (because precision = 3). Now, myround(1000 * 0.99962) will + * return 1000. So, the integer part must be incremented by one + * and the fractional part must be set to zero. + */ + intpart++; + fracpart = 0; + if (estyle && intpart == 10) { + /* + * The value was rounded up to ten, but we only want one + * integer digit if using "e-style". So, the integer + * part must be set to one and the exponent must be + * incremented by one. + */ + intpart = 1; + exponent++; + } + } + + /* + * Now that we know the real exponent, we can check whether or not to + * use "e-style" for "%g" (and "%G") conversions. If we don't need + * "e-style", the precision must be adjusted and the integer and + * fractional parts must be recalculated from the original value. + * + * C99 says: "Let P equal the precision if nonzero, 6 if the precision + * is omitted, or 1 if the precision is zero. Then, if a conversion + * with style `E' would have an exponent of X: + * + * - if P > X >= -4, the conversion is with style `f' (or `F') and + * precision P - (X + 1). + * + * - otherwise, the conversion is with style `e' (or `E') and precision + * P - 1." (7.19.6.1, 8) + * + * Note that we had decremented the precision by one. + */ + if (flags & PRINT_F_TYPE_G && estyle && + precision + 1 > exponent && exponent >= -4) { + precision -= exponent; + estyle = 0; + goto again; + } + + if (estyle) { + if (exponent < 0) { + exponent = -exponent; + esign = '-'; + } else + esign = '+'; + + /* + * Convert the exponent. The sizeof(econvert) is 4. So, the + * econvert buffer can hold e.g. "e+99" and "e-99". We don't + * support an exponent which contains more than two digits. + * Therefore, the following stores are safe. + */ + epos = convert(exponent, econvert, 2, 10, 0); + /* + * C99 says: "The exponent always contains at least two digits, + * and only as many more digits as necessary to represent the + * exponent." (7.19.6.1, 8) + */ + if (epos == 1) + econvert[epos++] = '0'; + econvert[epos++] = esign; + econvert[epos++] = (flags & PRINT_F_UP) ? 'E' : 'e'; + } + + /* Convert the integer part and the fractional part. */ + ipos = convert(intpart, iconvert, sizeof(iconvert), 10, 0); + if (fracpart != 0) /* convert() would return 1 if fracpart == 0. */ + fpos = convert(fracpart, fconvert, sizeof(fconvert), 10, 0); + + leadfraczeros = precision - fpos; + + if (omitzeros) { + if (fpos > 0) /* Omit trailing fractional part zeros. */ + while (omitcount < fpos && fconvert[omitcount] == '0') + omitcount++; + else { /* The fractional part is zero, omit it completely. */ + omitcount = precision; + leadfraczeros = 0; + } + precision -= omitcount; + } + + /* + * Print a decimal point if either the fractional part is non-zero + * and/or the "#" flag was specified. + */ + if (precision > 0 || flags & PRINT_F_NUM) + emitpoint = 1; + if (separators) /* Get the number of group separators we'll print. */ + separators = getnumsep(ipos); + + padlen = width /* Minimum field width. */ + - ipos /* Number of integer digits. */ + - epos /* Number of exponent characters. */ + - precision /* Number of fractional digits. */ + - separators /* Number of group separators. 
*/ + - (emitpoint ? 1 : 0) /* Will we print a decimal point? */ + - ((sign != 0) ? 1 : 0); /* Will we print a sign character? */ + + if (padlen < 0) + padlen = 0; + + /* + * C99 says: "If the `0' and `-' flags both appear, the `0' flag is + * ignored." (7.19.6.1, 6) + */ + if (flags & PRINT_F_MINUS) /* Left justifty. */ + padlen = -padlen; + else if (flags & PRINT_F_ZERO && padlen > 0) { + if (sign != 0) { /* Sign. */ + OUTCHAR(str, *len, size, sign); + sign = 0; + } + while (padlen > 0) { /* Leading zeros. */ + OUTCHAR(str, *len, size, '0'); + padlen--; + } + } + while (padlen > 0) { /* Leading spaces. */ + OUTCHAR(str, *len, size, ' '); + padlen--; + } + if (sign != 0) /* Sign. */ + OUTCHAR(str, *len, size, sign); + while (ipos > 0) { /* Integer part. */ + ipos--; + OUTCHAR(str, *len, size, iconvert[ipos]); + if (separators > 0 && ipos > 0 && ipos % 3 == 0) + printsep(str, len, size); + } + if (emitpoint) { /* Decimal point. */ +#if HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT + if (lc->decimal_point != NULL && *lc->decimal_point != '\0') + OUTCHAR(str, *len, size, *lc->decimal_point); + else /* We'll always print some decimal point character. */ +#endif /* HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT */ + OUTCHAR(str, *len, size, '.'); + } + while (leadfraczeros > 0) { /* Leading fractional part zeros. */ + OUTCHAR(str, *len, size, '0'); + leadfraczeros--; + } + while (fpos > omitcount) { /* The remaining fractional part. */ + fpos--; + OUTCHAR(str, *len, size, fconvert[fpos]); + } + while (epos > 0) { /* Exponent. */ + epos--; + OUTCHAR(str, *len, size, econvert[epos]); + } + while (padlen < 0) { /* Trailing spaces. */ + OUTCHAR(str, *len, size, ' '); + padlen++; + } +} + +static void +printsep(char *str, size_t *len, size_t size) +{ +#if HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP + struct lconv *lc = localeconv(); + int i; + + if (lc->thousands_sep != NULL) + for (i = 0; lc->thousands_sep[i] != '\0'; i++) + OUTCHAR(str, *len, size, lc->thousands_sep[i]); + else +#endif /* HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP */ + OUTCHAR(str, *len, size, ','); +} + +static int +getnumsep(int digits) +{ + int separators = (digits - ((digits % 3 == 0) ? 1 : 0)) / 3; +#if HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP + int strln; + struct lconv *lc = localeconv(); + + /* We support an arbitrary separator length (including zero). */ + if (lc->thousands_sep != NULL) { + for (strln = 0; lc->thousands_sep[strln] != '\0'; strln++) + continue; + separators *= strln; + } +#endif /* HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP */ + return separators; +} + +static int +getexponent(LDOUBLE value) +{ + LDOUBLE tmp = (value >= 0.0) ? value : -value; + int exponent = 0; + + /* + * We check for 99 > exponent > -99 in order to work around possible + * endless loops which could happen (at least) in the second loop (at + * least) if we're called with an infinite value. However, we checked + * for infinity before calling this function using our ISINF() macro, so + * this might be somewhat paranoid. + */ + while (tmp < 1.0 && tmp > 0.0 && --exponent > -99) + tmp *= 10; + while (tmp >= 10.0 && ++exponent < 99) + tmp /= 10; + + return exponent; +} + +static int +convert(UINTMAX_T value, char *buf, size_t size, int base, int caps) +{ + const char *digits = caps ? "0123456789ABCDEF" : "0123456789abcdef"; + size_t pos = 0; + + /* We return an unterminated buffer with the digits in reverse order. 
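For example (a sketch, not part of the patch), converting 0xBEEF with base 16 and caps set fills the buffer with 'F', 'E', 'E', 'B' and returns 4; fmtint() and fmtflt() then walk the buffer backwards to emit "BEEF". A tiny stand-alone version of the same loop:

#include <stdio.h>

int
main(void)
{
   const char *digits = "0123456789ABCDEF";
   unsigned long value = 0xBEEF;
   char buf[16];
   int pos = 0;

   do {
      buf[pos++] = digits[value % 16];   /* least significant digit first */
      value /= 16;
   } while (value != 0);

   while (pos > 0)                       /* print back to front, as the callers do */
      putchar(buf[--pos]);
   putchar('\n');                        /* prints "BEEF" */
   return 0;
}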
*/ + do { + buf[pos++] = digits[value % base]; + value /= base; + } while (value != 0 && pos < size); + + return (int)pos; +} + +static UINTMAX_T +cast(LDOUBLE value) +{ + UINTMAX_T result; + + /* + * We check for ">=" and not for ">" because if UINTMAX_MAX cannot be + * represented exactly as an LDOUBLE value (but is less than LDBL_MAX), + * it may be increased to the nearest higher representable value for the + * comparison (cf. C99: 6.3.1.4, 2). It might then equal the LDOUBLE + * value although converting the latter to UINTMAX_T would overflow. + */ + if (value >= UINTMAX_MAX) + return UINTMAX_MAX; + +#if defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) + result = (unsigned long)value; +#else + result = (UINTMAX_T)value; +#endif + /* + * At least on NetBSD/sparc64 3.0.2 and 4.99.30, casting long double to + * an integer type converts e.g. 1.9 to 2 instead of 1 (which violates + * the standard). Sigh. + */ + return (result <= value) ? result : result - 1; +} + +static UINTMAX_T +myround(LDOUBLE value) +{ + UINTMAX_T intpart = cast(value); + + return ((value -= intpart) < 0.5) ? intpart : intpart + 1; +} + +static LDOUBLE +mypow10(int exponent) +{ + LDOUBLE result = 1; + + while (exponent > 0) { + result *= 10; + exponent--; + } + while (exponent < 0) { + result /= 10; + exponent++; + } + return result; +} +#endif /* !HAVE_VSNPRINTF */ + +#if !HAVE_VASPRINTF +#if NEED_MYMEMCPY +void * +mymemcpy(void *dst, void *src, size_t len) +{ + const char *from = src; + char *to = dst; + + /* No need for optimization, we use this only to replace va_copy(3). */ + while (len-- > 0) + *to++ = *from++; + return dst; +} +#endif /* NEED_MYMEMCPY */ + +int +util_vasprintf(char **ret, const char *format, va_list ap) +{ + size_t size; + int len; + va_list aq; + + VA_COPY(aq, ap); + len = vsnprintf(NULL, 0, format, aq); + VA_END_COPY(aq); + if (len < 0 || (*ret = malloc(size = len + 1)) == NULL) + return -1; + return vsnprintf(*ret, size, format, ap); +} +#endif /* !HAVE_VASPRINTF */ + +#if !HAVE_SNPRINTF +#if HAVE_STDARG_H +int +util_snprintf(char *str, size_t size, const char *format, ...) +#else +int +util_snprintf(va_alist) va_dcl +#endif /* HAVE_STDARG_H */ +{ +#if !HAVE_STDARG_H + char *str; + size_t size; + char *format; +#endif /* HAVE_STDARG_H */ + va_list ap; + int len; + + VA_START(ap, format); + VA_SHIFT(ap, str, char *); + VA_SHIFT(ap, size, size_t); + VA_SHIFT(ap, format, const char *); + len = vsnprintf(str, size, format, ap); + va_end(ap); + return len; +} +#endif /* !HAVE_SNPRINTF */ + +#if !HAVE_ASPRINTF +#if HAVE_STDARG_H +int +util_asprintf(char **ret, const char *format, ...) +#else +int +util_asprintf(va_alist) va_dcl +#endif /* HAVE_STDARG_H */ +{ +#if !HAVE_STDARG_H + char **ret; + char *format; +#endif /* HAVE_STDARG_H */ + va_list ap; + int len; + + VA_START(ap, format); + VA_SHIFT(ap, ret, char **); + VA_SHIFT(ap, format, const char *); + len = vasprintf(ret, format, ap); + va_end(ap); + return len; +} +#endif /* !HAVE_ASPRINTF */ +#else /* Dummy declaration to avoid empty translation unit warnings. */ +int main(void); +#endif /* !HAVE_SNPRINTF || !HAVE_VSNPRINTF || !HAVE_ASPRINTF || [...] */ + + +/* vim: set joinspaces textwidth=80: */ diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h new file mode 100644 index 0000000000..e2a8491e62 --- /dev/null +++ b/src/gallium/auxiliary/util/u_sse.h @@ -0,0 +1,77 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. 
+ * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * SSE intrinsics portability header. + * + * Although the SSE intrinsics are support by all modern x86 and x86-64 + * compilers, there are some intrisincs missing in some implementations + * (especially older MSVC versions). This header abstracts that away. + */ + +#ifndef U_SSE_H_ +#define U_SSE_H_ + +#include "pipe/p_config.h" + +#if defined(PIPE_ARCH_SSE) + +#include <xmmintrin.h> +#include <emmintrin.h> + + +/* MSVC before VC8 does not support the _mm_castxxx_yyy */ +#if defined(_MSC_VER) && _MSC_VER < 1500 + +union __declspec(align(16)) m128_types { + __m128 m128; + __m128i m128i; + __m128d m128d; +}; + +static __inline __m128 +_mm_castsi128_ps(__m128i a) +{ + union m128_types u; + u.m128i = a; + return u.m128; +} + +static __inline __m128i +_mm_castps_si128(__m128 a) +{ + union m128_types u; + u.m128 = a; + return u.m128i; +} + +#endif /* defined(_MSC_VER) && _MSC_VER < 1500 */ + +#endif /* PIPE_ARCH_X86 || PIPE_ARCH_X86_64 */ + +#endif /* U_SSE_H_ */ diff --git a/src/gallium/auxiliary/util/u_stream.h b/src/gallium/auxiliary/util/u_stream.h new file mode 100644 index 0000000000..a9d0f0121a --- /dev/null +++ b/src/gallium/auxiliary/util/u_stream.h @@ -0,0 +1,61 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Cross-platform sequential access stream abstraction. + */ + +#ifndef U_STREAM_H +#define U_STREAM_H + + +#include "pipe/p_compiler.h" + + +struct util_stream; + + +/** + * Create a stream + * @param filename relative or absolute path (necessary for windows) + * @param optional maximum file size (0 for a growable size). + */ +struct util_stream * +util_stream_create(const char *filename, size_t max_size); + +boolean +util_stream_write(struct util_stream *stream, const void *data, size_t size); + +void +util_stream_flush(struct util_stream *stream); + +void +util_stream_close(struct util_stream *stream); + + +#endif /* U_STREAM_H */ diff --git a/src/gallium/auxiliary/util/u_stream_stdc.c b/src/gallium/auxiliary/util/u_stream_stdc.c new file mode 100644 index 0000000000..ca80bef0f3 --- /dev/null +++ b/src/gallium/auxiliary/util/u_stream_stdc.c @@ -0,0 +1,106 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Stream implementation based on the Standard C Library. + */ + +#include "pipe/p_config.h" + +#if defined(PIPE_OS_LINUX) || defined(PIPE_SUBSYSTEM_WINDOWS_USER) + +#include <stdio.h> + +#include "util/u_memory.h" + +#include "u_stream.h" + + +struct util_stream +{ + FILE *file; +}; + + +struct util_stream * +util_stream_create(const char *filename, size_t max_size) +{ + struct util_stream *stream; + + (void)max_size; + + stream = CALLOC_STRUCT(util_stream); + if(!stream) + goto error1; + + stream->file = fopen(filename, "w"); + if(!stream->file) + goto error2; + + return stream; + +error2: + FREE(stream); +error1: + return NULL; +} + + +boolean +util_stream_write(struct util_stream *stream, const void *data, size_t size) +{ + if(!stream) + return FALSE; + + return fwrite(data, size, 1, stream->file) == size ? 
TRUE : FALSE; +} + + +void +util_stream_flush(struct util_stream *stream) +{ + if(!stream) + return; + + fflush(stream->file); +} + + +void +util_stream_close(struct util_stream *stream) +{ + if(!stream) + return; + + fclose(stream->file); + + FREE(stream); +} + + +#endif diff --git a/src/gallium/auxiliary/util/u_stream_wd.c b/src/gallium/auxiliary/util/u_stream_wd.c new file mode 100644 index 0000000000..864489e775 --- /dev/null +++ b/src/gallium/auxiliary/util/u_stream_wd.c @@ -0,0 +1,224 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Stream implementation for the Windows Display driver. 
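Both the stdio back-end above and the display-driver back-end that follows implement the same four entry points declared in u_stream.h, so callers stay platform independent. A minimal usage sketch (not part of the patch; the file name and payload are made up):

#include "util/u_stream.h"

static void
dump_blob(const void *data, size_t size)
{
   /* A max_size of 0 asks for a growable stream. */
   struct util_stream *s = util_stream_create("gallium_dump.bin", 0);

   if (!s)
      return;
   if (util_stream_write(s, data, size))
      util_stream_flush(s);
   util_stream_close(s);
}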
+ */ + +#include "pipe/p_config.h" + +#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) + +#include <windows.h> +#include <winddi.h> + +#include "util/u_memory.h" +#include "util/u_string.h" + +#include "u_stream.h" + + +#define MAP_FILE_SIZE (4*1024*1024) + + +struct util_stream +{ + char filename[MAX_PATH + 1]; + WCHAR wFileName[MAX_PATH + 1]; + boolean growable; + size_t map_size; + ULONG_PTR iFile; + char *pMap; + size_t written; + unsigned suffix; +}; + + +static INLINE boolean +util_stream_map(struct util_stream *stream) +{ + ULONG BytesInUnicodeString; + static char filename[MAX_PATH + 1]; + unsigned filename_len; + + if(stream->growable) + filename_len = util_snprintf(filename, + sizeof(filename), + "%s.%04x", + stream->filename, + stream->suffix++); + else + filename_len = util_snprintf(filename, + sizeof(filename), + "%s", + stream->filename); + + EngMultiByteToUnicodeN( + stream->wFileName, + sizeof(stream->wFileName), + &BytesInUnicodeString, + filename, + filename_len); + + stream->pMap = EngMapFile(stream->wFileName, stream->map_size, &stream->iFile); + if(!stream->pMap) + return FALSE; + + memset(stream->pMap, 0, stream->map_size); + stream->written = 0; + + return TRUE; +} + + +static INLINE void +util_stream_unmap(struct util_stream *stream) +{ + EngUnmapFile(stream->iFile); + if(stream->written < stream->map_size) { + /* Truncate file size */ + stream->pMap = EngMapFile(stream->wFileName, stream->written, &stream->iFile); + if(stream->pMap) + EngUnmapFile(stream->iFile); + } + + stream->pMap = NULL; +} + + +static INLINE void +util_stream_full_qualified_filename(char *dst, size_t size, const char *src) +{ + boolean need_drive, need_root; + + if((('A' <= src[0] && src[0] <= 'Z') || ('a' <= src[0] && src[0] <= 'z')) && src[1] == ':') { + need_drive = FALSE; + need_root = src[2] == '\\' ? FALSE : TRUE; + } + else { + need_drive = TRUE; + need_root = src[0] == '\\' ? FALSE : TRUE; + } + + util_snprintf(dst, size, + "\\??\\%s%s%s", + need_drive ? "C:" : "", + need_root ? 
"\\" : "", + src); +} + + +struct util_stream * +util_stream_create(const char *filename, size_t max_size) +{ + struct util_stream *stream; + + stream = CALLOC_STRUCT(util_stream); + if(!stream) + goto error1; + + util_stream_full_qualified_filename(stream->filename, + sizeof(stream->filename), + filename); + + if(max_size) { + stream->growable = FALSE; + stream->map_size = max_size; + } + else { + stream->growable = TRUE; + stream->map_size = MAP_FILE_SIZE; + } + + if(!util_stream_map(stream)) + goto error2; + + return stream; + +error2: + FREE(stream); +error1: + return NULL; +} + + +static INLINE void +util_stream_copy(struct util_stream *stream, const char *data, size_t size) +{ + assert(stream->written + size <= stream->map_size); + memcpy(stream->pMap + stream->written, data, size); + stream->written += size; +} + + +boolean +util_stream_write(struct util_stream *stream, const void *data, size_t size) +{ + if(!stream) + return FALSE; + + if(!stream->pMap) + return FALSE; + + while(stream->written + size > stream->map_size) { + size_t step = stream->map_size - stream->written; + util_stream_copy(stream, data, step); + data = (const char *)data + step; + size -= step; + + util_stream_unmap(stream); + if(!stream->growable || !util_stream_map(stream)) + return FALSE; + } + + util_stream_copy(stream, data, size); + + return TRUE; +} + + +void +util_stream_flush(struct util_stream *stream) +{ + (void)stream; +} + + +void +util_stream_close(struct util_stream *stream) +{ + if(!stream) + return; + + util_stream_unmap(stream); + + FREE(stream); +} + + +#endif diff --git a/src/gallium/auxiliary/util/u_string.h b/src/gallium/auxiliary/util/u_string.h new file mode 100644 index 0000000000..08c89bbf77 --- /dev/null +++ b/src/gallium/auxiliary/util/u_string.h @@ -0,0 +1,220 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Platform independent functions for string manipulation. 
+ * + * @author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + +#ifndef U_STRING_H_ +#define U_STRING_H_ + +#if !defined(WIN32) && !defined(XF86_LIBC_H) +#include <stdio.h> +#endif +#include <stddef.h> +#include <stdarg.h> + +#include "pipe/p_compiler.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifdef WIN32 + +int util_vsnprintf(char *, size_t, const char *, va_list); +int util_snprintf(char *str, size_t size, const char *format, ...); + +static INLINE void +util_vsprintf(char *str, const char *format, va_list ap) +{ + util_vsnprintf(str, (size_t)-1, format, ap); +} + +static INLINE void +util_sprintf(char *str, const char *format, ...) +{ + va_list ap; + va_start(ap, format); + util_vsnprintf(str, (size_t)-1, format, ap); + va_end(ap); +} + +static INLINE char * +util_strchr(const char *s, char c) +{ + while(*s) { + if(*s == c) + return (char *)s; + ++s; + } + return NULL; +} + +static INLINE char* +util_strncat(char *dst, const char *src, size_t n) +{ + char *p = dst + strlen(dst); + const char *q = src; + size_t i; + + for (i = 0; i < n && *q != '\0'; ++i) + *p++ = *q++; + *p = '\0'; + + return dst; +} + +static INLINE int +util_strcmp(const char *s1, const char *s2) +{ + unsigned char u1, u2; + + while (1) { + u1 = (unsigned char) *s1++; + u2 = (unsigned char) *s2++; + if (u1 != u2) + return u1 - u2; + if (u1 == '\0') + return 0; + } + return 0; +} + +static INLINE int +util_strncmp(const char *s1, const char *s2, size_t n) +{ + unsigned char u1, u2; + + while (n-- > 0) { + u1 = (unsigned char) *s1++; + u2 = (unsigned char) *s2++; + if (u1 != u2) + return u1 - u2; + if (u1 == '\0') + return 0; + } + return 0; +} + +static INLINE char * +util_strstr(const char *haystack, const char *needle) +{ + const char *p = haystack; + int len = strlen(needle); + + for (; (p = util_strchr(p, *needle)) != 0; p++) { + if (util_strncmp(p, needle, len) == 0) { + return (char *)p; + } + } + return NULL; +} + +static INLINE void * +util_memmove(void *dest, const void *src, size_t n) +{ + char *p = (char *)dest; + const char *q = (const char *)src; + if (dest < src) { + while (n--) + *p++ = *q++; + } + else + { + p += n; + q += n; + while (n--) + *--p = *--q; + } + return dest; +} + + +#else + +#define util_vsnprintf vsnprintf +#define util_snprintf snprintf +#define util_vsprintf vsprintf +#define util_sprintf sprintf +#define util_strchr strchr +#define util_strcmp strcmp +#define util_strncmp strncmp +#define util_strncat strncat +#define util_strstr strstr +#define util_memmove memmove + +#endif + + +/** + * Printable string buffer + */ +struct util_strbuf +{ + char *str; + char *ptr; + size_t left; +}; + + +static INLINE void +util_strbuf_init(struct util_strbuf *sbuf, char *str, size_t size) +{ + sbuf->str = str; + sbuf->str[0] = 0; + sbuf->ptr = sbuf->str; + sbuf->left = size; +} + + +static INLINE void +util_strbuf_printf(struct util_strbuf *sbuf, const char *format, ...) 
+{ + if(sbuf->left > 1) { + size_t written; + va_list ap; + va_start(ap, format); + written = util_vsnprintf(sbuf->ptr, sbuf->left, format, ap); + va_end(ap); + sbuf->ptr += written; + sbuf->left -= written; + } +} + + + +#ifdef __cplusplus +} +#endif + +#endif /* U_STRING_H_ */ diff --git a/src/gallium/auxiliary/util/u_tile.c b/src/gallium/auxiliary/util/u_tile.c new file mode 100644 index 0000000000..32f6b072a0 --- /dev/null +++ b/src/gallium/auxiliary/util/u_tile.c @@ -0,0 +1,1198 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * RGBA/float tile get/put functions. + * Usable both by drivers and state trackers. + * Surfaces should already be in a mapped state. + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_rect.h" +#include "util/u_tile.h" + + +/** + * Move raw block of pixels from surface to user memory. + * This should be usable by any hw driver that has mappable surfaces. + */ +void +pipe_get_tile_raw(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + void *dst, int dst_stride) +{ + const void *src; + + if (dst_stride == 0) + dst_stride = pf_get_nblocksx(&ps->block, w) * ps->block.size; + + if (pipe_clip_tile(x, y, &w, &h, ps)) + return; + + src = pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_READ); + assert(src); + if(!src) + return; + + pipe_copy_rect(dst, &ps->block, dst_stride, 0, 0, w, h, src, ps->stride, x, y); + + pipe_surface_unmap(ps); +} + + +/** + * Move raw block of pixels from user memory to surface. + * This should be usable by any hw driver that has mappable surfaces. 
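A sketch of how a caller might pull a tile of raw pixels out of a surface with pipe_get_tile_raw() above (not part of the patch; a simple 1x1-block format and a 16x16 tile are assumed so the size computation stays trivial):

#include "util/u_memory.h"
#include "util/u_tile.h"

static void *
read_tile(struct pipe_surface *ps, uint x, uint y)
{
   const uint w = 16, h = 16;
   /* For a 1x1-block (non-compressed) format each pixel takes block.size bytes. */
   void *data = MALLOC(w * h * ps->block.size);

   if (data)
      pipe_get_tile_raw(ps, x, y, w, h, data, 0 /* let the helper pick a tight stride */);
   return data;
}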
+ */ +void +pipe_put_tile_raw(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + const void *src, int src_stride) +{ + void *dst; + + if (src_stride == 0) + src_stride = pf_get_nblocksx(&ps->block, w) * ps->block.size; + + if (pipe_clip_tile(x, y, &w, &h, ps)) + return; + + dst = pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_WRITE); + assert(dst); + if(!dst) + return; + + pipe_copy_rect(dst, &ps->block, ps->stride, x, y, w, h, src, src_stride, 0, 0); + + pipe_surface_unmap(ps); +} + + + + +/** Convert short in [-32768,32767] to GLfloat in [-1.0,1.0] */ +#define SHORT_TO_FLOAT(S) ((2.0F * (S) + 1.0F) * (1.0F/65535.0F)) + +#define UNCLAMPED_FLOAT_TO_SHORT(us, f) \ + us = ( (short) ( CLAMP((f), -1.0, 1.0) * 32767.0F) ) + + + +/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/ + +static void +a8r8g8b8_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + const unsigned pixel = *src++; + pRow[0] = ubyte_to_float((pixel >> 16) & 0xff); + pRow[1] = ubyte_to_float((pixel >> 8) & 0xff); + pRow[2] = ubyte_to_float((pixel >> 0) & 0xff); + pRow[3] = ubyte_to_float((pixel >> 24) & 0xff); + } + p += dst_stride; + } +} + + +static void +a8r8g8b8_put_tile_rgba(unsigned *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r, g, b, a; + r = float_to_ubyte(pRow[0]); + g = float_to_ubyte(pRow[1]); + b = float_to_ubyte(pRow[2]); + a = float_to_ubyte(pRow[3]); + *dst++ = (a << 24) | (r << 16) | (g << 8) | b; + } + p += src_stride; + } +} + + +/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/ + +static void +x8r8g8b8_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + const unsigned pixel = *src++; + pRow[0] = ubyte_to_float((pixel >> 16) & 0xff); + pRow[1] = ubyte_to_float((pixel >> 8) & 0xff); + pRow[2] = ubyte_to_float((pixel >> 0) & 0xff); + pRow[3] = ubyte_to_float(0xff); + } + p += dst_stride; + } +} + + +static void +x8r8g8b8_put_tile_rgba(unsigned *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r, g, b; + r = float_to_ubyte(pRow[0]); + g = float_to_ubyte(pRow[1]); + b = float_to_ubyte(pRow[2]); + *dst++ = (0xff << 24) | (r << 16) | (g << 8) | b; + } + p += src_stride; + } +} + + +/*** PIPE_FORMAT_B8G8R8A8_UNORM ***/ + +static void +b8g8r8a8_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + const unsigned pixel = *src++; + pRow[0] = ubyte_to_float((pixel >> 8) & 0xff); + pRow[1] = ubyte_to_float((pixel >> 16) & 0xff); + pRow[2] = ubyte_to_float((pixel >> 24) & 0xff); + pRow[3] = ubyte_to_float((pixel >> 0) & 0xff); + } + p += dst_stride; + } +} + + +static void +b8g8r8a8_put_tile_rgba(unsigned *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r, g, b, a; + r = float_to_ubyte(pRow[0]); + g = float_to_ubyte(pRow[1]); + b = 
float_to_ubyte(pRow[2]); + a = float_to_ubyte(pRow[3]); + *dst++ = (b << 24) | (g << 16) | (r << 8) | a; + } + p += src_stride; + } +} + + +/*** PIPE_FORMAT_A1R5G5B5_UNORM ***/ + +static void +a1r5g5b5_get_tile_rgba(const ushort *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + const ushort pixel = *src++; + pRow[0] = ((pixel >> 10) & 0x1f) * (1.0f / 31.0f); + pRow[1] = ((pixel >> 5) & 0x1f) * (1.0f / 31.0f); + pRow[2] = ((pixel ) & 0x1f) * (1.0f / 31.0f); + pRow[3] = ((pixel >> 15) ) * 1.0f; + } + p += dst_stride; + } +} + + +static void +a1r5g5b5_put_tile_rgba(ushort *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r, g, b, a; + r = float_to_ubyte(pRow[0]); + g = float_to_ubyte(pRow[1]); + b = float_to_ubyte(pRow[2]); + a = float_to_ubyte(pRow[3]); + r = r >> 3; /* 5 bits */ + g = g >> 3; /* 5 bits */ + b = b >> 3; /* 5 bits */ + a = a >> 7; /* 1 bit */ + *dst++ = (a << 15) | (r << 10) | (g << 5) | b; + } + p += src_stride; + } +} + + +/*** PIPE_FORMAT_A4R4G4B4_UNORM ***/ + +static void +a4r4g4b4_get_tile_rgba(const ushort *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + const ushort pixel = *src++; + pRow[0] = ((pixel >> 8) & 0xf) * (1.0f / 15.0f); + pRow[1] = ((pixel >> 4) & 0xf) * (1.0f / 15.0f); + pRow[2] = ((pixel ) & 0xf) * (1.0f / 15.0f); + pRow[3] = ((pixel >> 12) ) * (1.0f / 15.0f); + } + p += dst_stride; + } +} + + +static void +a4r4g4b4_put_tile_rgba(ushort *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r, g, b, a; + r = float_to_ubyte(pRow[0]); + g = float_to_ubyte(pRow[1]); + b = float_to_ubyte(pRow[2]); + a = float_to_ubyte(pRow[3]); + r >>= 4; + g >>= 4; + b >>= 4; + a >>= 4; + *dst++ = (a << 12) | (r << 16) | (g << 4) | b; + } + p += src_stride; + } +} + + +/*** PIPE_FORMAT_R5G6B5_UNORM ***/ + +static void +r5g6b5_get_tile_rgba(const ushort *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + const ushort pixel = *src++; + pRow[0] = ((pixel >> 11) & 0x1f) * (1.0f / 31.0f); + pRow[1] = ((pixel >> 5) & 0x3f) * (1.0f / 63.0f); + pRow[2] = ((pixel ) & 0x1f) * (1.0f / 31.0f); + pRow[3] = 1.0f; + } + p += dst_stride; + } +} + + +static void +r5g6b5_put_tile_rgba(ushort *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + uint r = (uint) (CLAMP(pRow[0], 0.0, 1.0) * 31.0); + uint g = (uint) (CLAMP(pRow[1], 0.0, 1.0) * 63.0); + uint b = (uint) (CLAMP(pRow[2], 0.0, 1.0) * 31.0); + *dst++ = (r << 11) | (g << 5) | (b); + } + p += src_stride; + } +} + + + +/*** PIPE_FORMAT_Z16_UNORM ***/ + +/** + * Return each Z value as four floats in [0,1]. 
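For the depth formats the conversion is just a normalisation broadcast to all four channels so the result still looks like an RGBA texel. As a worked instance (not from the patch): a 16-bit depth value d becomes d * (1.0f / 65535.0f) in r, g, b and a, so d = 0xffff maps to exactly 1.0f and d = 0x8000 to roughly 0.5f.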
+ */ +static void +z16_get_tile_rgba(const ushort *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + const float scale = 1.0f / 65535.0f; + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = *src++ * scale; + } + p += dst_stride; + } +} + + + + +/*** PIPE_FORMAT_L8_UNORM ***/ + +static void +l8_get_tile_rgba(const ubyte *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, src++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = ubyte_to_float(*src); + pRow[3] = 1.0; + } + p += dst_stride; + } +} + + +static void +l8_put_tile_rgba(ubyte *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r; + r = float_to_ubyte(pRow[0]); + *dst++ = (ubyte) r; + } + p += src_stride; + } +} + + + +/*** PIPE_FORMAT_A8_UNORM ***/ + +static void +a8_get_tile_rgba(const ubyte *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, src++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = 0.0; + pRow[3] = ubyte_to_float(*src); + } + p += dst_stride; + } +} + + +static void +a8_put_tile_rgba(ubyte *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned a; + a = float_to_ubyte(pRow[3]); + *dst++ = (ubyte) a; + } + p += src_stride; + } +} + + + +/*** PIPE_FORMAT_R16_SNORM ***/ + +static void +r16_get_tile_rgba(const short *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, src++, pRow += 4) { + pRow[0] = SHORT_TO_FLOAT(src[0]); + pRow[1] = + pRow[2] = 0.0; + pRow[3] = 1.0; + } + p += dst_stride; + } +} + + +static void +r16_put_tile_rgba(short *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, dst++, pRow += 4) { + UNCLAMPED_FLOAT_TO_SHORT(dst[0], pRow[0]); + } + p += src_stride; + } +} + + +/*** PIPE_FORMAT_R16G16B16A16_SNORM ***/ + +static void +r16g16b16a16_get_tile_rgba(const short *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, src += 4, pRow += 4) { + pRow[0] = SHORT_TO_FLOAT(src[0]); + pRow[1] = SHORT_TO_FLOAT(src[1]); + pRow[2] = SHORT_TO_FLOAT(src[2]); + pRow[3] = SHORT_TO_FLOAT(src[3]); + } + p += dst_stride; + } +} + + +static void +r16g16b16a16_put_tile_rgba(short *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, dst += 4, pRow += 4) { + UNCLAMPED_FLOAT_TO_SHORT(dst[0], pRow[0]); + UNCLAMPED_FLOAT_TO_SHORT(dst[1], pRow[1]); + UNCLAMPED_FLOAT_TO_SHORT(dst[2], pRow[2]); + UNCLAMPED_FLOAT_TO_SHORT(dst[3], pRow[3]); + } + p += src_stride; + } +} + + + +/*** PIPE_FORMAT_I8_UNORM ***/ + +static void +i8_get_tile_rgba(const ubyte *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; 
i++) { + float *pRow = p; + for (j = 0; j < w; j++, src++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = ubyte_to_float(*src); + } + p += dst_stride; + } +} + + +static void +i8_put_tile_rgba(ubyte *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r; + r = float_to_ubyte(pRow[0]); + *dst++ = (ubyte) r; + } + p += src_stride; + } +} + + +/*** PIPE_FORMAT_A8L8_UNORM ***/ + +static void +a8l8_get_tile_rgba(const ushort *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + ushort p = *src++; + pRow[0] = + pRow[1] = + pRow[2] = ubyte_to_float(p & 0xff); + pRow[3] = ubyte_to_float(p >> 8); + } + p += dst_stride; + } +} + + +static void +a8l8_put_tile_rgba(ushort *dst, + unsigned w, unsigned h, + const float *p, + unsigned src_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + const float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + unsigned r, a; + r = float_to_ubyte(pRow[0]); + a = float_to_ubyte(pRow[3]); + *dst++ = (a << 8) | r; + } + p += src_stride; + } +} + + + + +/*** PIPE_FORMAT_Z32_UNORM ***/ + +/** + * Return each Z value as four floats in [0,1]. + */ +static void +z32_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + const double scale = 1.0 / (double) 0xffffffff; + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = (float) (*src++ * scale); + } + p += dst_stride; + } +} + + +/*** PIPE_FORMAT_S8Z24_UNORM ***/ + +/** + * Return Z component as four float in [0,1]. Stencil part ignored. + */ +static void +s8z24_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + const double scale = 1.0 / ((1 << 24) - 1); + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = (float) (scale * (*src++ & 0xffffff)); + } + p += dst_stride; + } +} + + +/*** PIPE_FORMAT_Z24S8_UNORM ***/ + +/** + * Return Z component as four float in [0,1]. Stencil part ignored. + */ +static void +z24s8_get_tile_rgba(const unsigned *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + const double scale = 1.0 / ((1 << 24) - 1); + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = (float) (scale * (*src++ >> 8)); + } + p += dst_stride; + } +} + + +/*** PIPE_FORMAT_Z32_FLOAT ***/ + +/** + * Return each Z value as four floats in [0,1]. + */ +static void +z32f_get_tile_rgba(const float *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride) +{ + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + for (j = 0; j < w; j++, pRow += 4) { + pRow[0] = + pRow[1] = + pRow[2] = + pRow[3] = *src++; + } + p += dst_stride; + } +} + + +/*** PIPE_FORMAT_YCBCR / PIPE_FORMAT_YCBCR_REV ***/ + +/** + * Convert YCbCr (or YCrCb) to RGBA. 
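+ * The data is 4:2:2 subsampled: each pair of packed texels carries two
+ * luma (Y) values but shares one Cb/Cr pair, so the loop below converts
+ * two pixels at a time.  The conversion uses the usual video-range
+ * coefficients; for the even pixel:
+ *
+ *   R = 1.164 * (Y0 - 16) + 1.596 * (Cr - 128)
+ *   G = 1.164 * (Y0 - 16) - 0.813 * (Cr - 128) - 0.391 * (Cb - 128)
+ *   B = 1.164 * (Y0 - 16) + 2.018 * (Cb - 128)
+ *
+ * with the result then scaled by 1/255 into [0,1].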
+ */ +static void +ycbcr_get_tile_rgba(const ushort *src, + unsigned w, unsigned h, + float *p, + unsigned dst_stride, + boolean rev) +{ + const float scale = 1.0f / 255.0f; + unsigned i, j; + + for (i = 0; i < h; i++) { + float *pRow = p; + /* do two texels at a time */ + for (j = 0; j < (w & ~1); j += 2, src += 2) { + const ushort t0 = src[0]; + const ushort t1 = src[1]; + const ubyte y0 = (t0 >> 8) & 0xff; /* luminance */ + const ubyte y1 = (t1 >> 8) & 0xff; /* luminance */ + ubyte cb, cr; + float r, g, b; + + if (rev) { + cb = t1 & 0xff; /* chroma U */ + cr = t0 & 0xff; /* chroma V */ + } + else { + cb = t0 & 0xff; /* chroma U */ + cr = t1 & 0xff; /* chroma V */ + } + + /* even pixel: y0,cr,cb */ + r = 1.164f * (y0-16) + 1.596f * (cr-128); + g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128); + b = 1.164f * (y0-16) + 2.018f * (cb-128); + pRow[0] = r * scale; + pRow[1] = g * scale; + pRow[2] = b * scale; + pRow[3] = 1.0f; + pRow += 4; + + /* odd pixel: use y1,cr,cb */ + r = 1.164f * (y1-16) + 1.596f * (cr-128); + g = 1.164f * (y1-16) - 0.813f * (cr-128) - 0.391f * (cb-128); + b = 1.164f * (y1-16) + 2.018f * (cb-128); + pRow[0] = r * scale; + pRow[1] = g * scale; + pRow[2] = b * scale; + pRow[3] = 1.0f; + pRow += 4; + + } + /* do the last texel */ + if (w & 1) { + const ushort t0 = src[0]; + const ushort t1 = src[1]; + const ubyte y0 = (t0 >> 8) & 0xff; /* luminance */ + ubyte cb, cr; + float r, g, b; + + if (rev) { + cb = t1 & 0xff; /* chroma U */ + cr = t0 & 0xff; /* chroma V */ + } + else { + cb = t0 & 0xff; /* chroma U */ + cr = t1 & 0xff; /* chroma V */ + } + + /* even pixel: y0,cr,cb */ + r = 1.164f * (y0-16) + 1.596f * (cr-128); + g = 1.164f * (y0-16) - 0.813f * (cr-128) - 0.391f * (cb-128); + b = 1.164f * (y0-16) + 2.018f * (cb-128); + pRow[0] = r * scale; + pRow[1] = g * scale; + pRow[2] = b * scale; + pRow[3] = 1.0f; + pRow += 4; + } + p += dst_stride; + } +} + + +void +pipe_tile_raw_to_rgba(enum pipe_format format, + void *src, + uint w, uint h, + float *dst, unsigned dst_stride) +{ + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + a8r8g8b8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_X8R8G8B8_UNORM: + x8r8g8b8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + b8g8r8a8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_A1R5G5B5_UNORM: + a1r5g5b5_get_tile_rgba((ushort *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_A4R4G4B4_UNORM: + a4r4g4b4_get_tile_rgba((ushort *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_R5G6B5_UNORM: + r5g6b5_get_tile_rgba((ushort *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_L8_UNORM: + l8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_A8_UNORM: + a8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_I8_UNORM: + i8_get_tile_rgba((ubyte *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_A8L8_UNORM: + a8l8_get_tile_rgba((ushort *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_R16_SNORM: + r16_get_tile_rgba((short *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_R16G16B16A16_SNORM: + r16g16b16a16_get_tile_rgba((short *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_Z16_UNORM: + z16_get_tile_rgba((ushort *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_Z32_UNORM: + z32_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_S8Z24_UNORM: + 
case PIPE_FORMAT_X8Z24_UNORM: + s8z24_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_Z24S8_UNORM: + z24s8_get_tile_rgba((unsigned *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_Z32_FLOAT: + z32f_get_tile_rgba((float *) src, w, h, dst, dst_stride); + break; + case PIPE_FORMAT_YCBCR: + ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, FALSE); + break; + case PIPE_FORMAT_YCBCR_REV: + ycbcr_get_tile_rgba((ushort *) src, w, h, dst, dst_stride, TRUE); + break; + default: + assert(0); + } +} + + +void +pipe_get_tile_rgba(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + float *p) +{ + unsigned dst_stride = w * 4; + void *packed; + + if (pipe_clip_tile(x, y, &w, &h, ps)) + return; + + packed = MALLOC(pf_get_nblocks(&ps->block, w, h) * ps->block.size); + + if (!packed) + return; + + if(ps->format == PIPE_FORMAT_YCBCR || ps->format == PIPE_FORMAT_YCBCR_REV) + assert((x & 1) == 0); + + pipe_get_tile_raw(ps, x, y, w, h, packed, 0); + + pipe_tile_raw_to_rgba(ps->format, packed, w, h, p, dst_stride); + + FREE(packed); +} + + +void +pipe_put_tile_rgba(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + const float *p) +{ + unsigned src_stride = w * 4; + void *packed; + + if (pipe_clip_tile(x, y, &w, &h, ps)) + return; + + packed = MALLOC(pf_get_nblocks(&ps->block, w, h) * ps->block.size); + + if (!packed) + return; + + switch (ps->format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + a8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_X8R8G8B8_UNORM: + x8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + b8g8r8a8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_A1R5G5B5_UNORM: + a1r5g5b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_R5G6B5_UNORM: + r5g6b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_R8G8B8A8_UNORM: + assert(0); + break; + case PIPE_FORMAT_A4R4G4B4_UNORM: + a4r4g4b4_put_tile_rgba((ushort *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_L8_UNORM: + l8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_A8_UNORM: + a8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_I8_UNORM: + i8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_A8L8_UNORM: + a8l8_put_tile_rgba((ushort *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_R16_SNORM: + r16_put_tile_rgba((short *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_R16G16B16A16_SNORM: + r16g16b16a16_put_tile_rgba((short *) packed, w, h, p, src_stride); + break; + case PIPE_FORMAT_Z16_UNORM: + /*z16_put_tile_rgba((ushort *) packed, w, h, p, src_stride);*/ + break; + case PIPE_FORMAT_Z32_UNORM: + /*z32_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/ + break; + case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + /*s8z24_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/ + break; + case PIPE_FORMAT_Z24S8_UNORM: + /*z24s8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/ + break; + default: + assert(0); + } + + pipe_put_tile_raw(ps, x, y, w, h, packed, 0); + + FREE(packed); +} + + +/** + * Get a block of Z values, converted to 32-bit range. 
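+ * Narrower depth formats are widened by bit replication rather than a
+ * plain shift, so the full 32-bit range is covered: a 16-bit value z
+ * becomes (z << 16) | z and a 24-bit value becomes (z << 8) | (z & 0xff),
+ * which maps 0xffff and 0xffffff to exactly 0xffffffff.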
+ */ +void +pipe_get_tile_z(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + uint *z) +{ + const uint dstStride = w; + ubyte *map; + uint *pDest = z; + uint i, j; + + if (pipe_clip_tile(x, y, &w, &h, ps)) + return; + + map = (ubyte *)pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_READ); + if (!map) { + assert(0); + return; + } + + switch (ps->format) { + case PIPE_FORMAT_Z32_UNORM: + { + const uint *pSrc + = (const uint *)(map + y * ps->stride + x*4); + for (i = 0; i < h; i++) { + memcpy(pDest, pSrc, 4 * w); + pDest += dstStride; + pSrc += ps->stride/4; + } + } + break; + case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + { + const uint *pSrc + = (const uint *)(map + y * ps->stride + x*4); + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + /* convert 24-bit Z to 32-bit Z */ + pDest[j] = (pSrc[j] << 8) | (pSrc[j] & 0xff); + } + pDest += dstStride; + pSrc += ps->stride/4; + } + } + break; + case PIPE_FORMAT_Z16_UNORM: + { + const ushort *pSrc + = (const ushort *)(map + y * ps->stride + x*2); + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + /* convert 16-bit Z to 32-bit Z */ + pDest[j] = (pSrc[j] << 16) | pSrc[j]; + } + pDest += dstStride; + pSrc += ps->stride/2; + } + } + break; + default: + assert(0); + } + + pipe_surface_unmap(ps); +} + + +void +pipe_put_tile_z(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + const uint *zSrc) +{ + const uint srcStride = w; + const uint *pSrc = zSrc; + ubyte *map; + uint i, j; + + if (pipe_clip_tile(x, y, &w, &h, ps)) + return; + + map = (ubyte *)pipe_surface_map(ps, PIPE_BUFFER_USAGE_CPU_WRITE); + if (!map) { + assert(0); + return; + } + + switch (ps->format) { + case PIPE_FORMAT_Z32_UNORM: + { + uint *pDest = (uint *) (map + y * ps->stride + x*4); + for (i = 0; i < h; i++) { + memcpy(pDest, pSrc, 4 * w); + pDest += ps->stride/4; + pSrc += srcStride; + } + } + break; + case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + { + uint *pDest = (uint *) (map + y * ps->stride + x*4); + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + /* convert 32-bit Z to 24-bit Z (0 stencil) */ + pDest[j] = pSrc[j] >> 8; + } + pDest += ps->stride/4; + pSrc += srcStride; + } + } + break; + case PIPE_FORMAT_Z16_UNORM: + { + ushort *pDest = (ushort *) (map + y * ps->stride + x*2); + for (i = 0; i < h; i++) { + for (j = 0; j < w; j++) { + /* convert 32-bit Z to 16-bit Z */ + pDest[j] = pSrc[j] >> 16; + } + pDest += ps->stride/2; + pSrc += srcStride; + } + } + break; + default: + assert(0); + } + + pipe_surface_unmap(ps); +} + + diff --git a/src/gallium/auxiliary/util/u_tile.h b/src/gallium/auxiliary/util/u_tile.h new file mode 100644 index 0000000000..a8ac805308 --- /dev/null +++ b/src/gallium/auxiliary/util/u_tile.h @@ -0,0 +1,101 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef P_TILE_H +#define P_TILE_H + +#include "pipe/p_compiler.h" + +struct pipe_surface; + + +/** + * Clip tile against surface dims. + * \return TRUE if tile is totally clipped, FALSE otherwise + */ +static INLINE boolean +pipe_clip_tile(uint x, uint y, uint *w, uint *h, const struct pipe_surface *ps) +{ + if (x >= ps->width) + return TRUE; + if (y >= ps->height) + return TRUE; + if (x + *w > ps->width) + *w = ps->width - x; + if (y + *h > ps->height) + *h = ps->height - y; + return FALSE; +} + +#ifdef __cplusplus +extern "C" { +#endif + +void +pipe_get_tile_raw(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + void *p, int dst_stride); + +void +pipe_put_tile_raw(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + const void *p, int src_stride); + + +void +pipe_get_tile_rgba(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + float *p); + +void +pipe_put_tile_rgba(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + const float *p); + + +void +pipe_get_tile_z(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + uint *z); + +void +pipe_put_tile_z(struct pipe_surface *ps, + uint x, uint y, uint w, uint h, + const uint *z); + +void +pipe_tile_raw_to_rgba(enum pipe_format format, + void *src, + uint w, uint h, + float *dst, unsigned dst_stride); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gallium/auxiliary/util/u_time.c b/src/gallium/auxiliary/util/u_time.c new file mode 100644 index 0000000000..57b80e5604 --- /dev/null +++ b/src/gallium/auxiliary/util/u_time.c @@ -0,0 +1,220 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * OS independent time-manipulation functions. + * + * @author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + + +#include "pipe/p_config.h" + +#if defined(PIPE_OS_LINUX) +#include <sys/time.h> +#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) +#include <windows.h> +#include <winddi.h> +#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) +#include <windows.h> +extern VOID KeQuerySystemTime(PLARGE_INTEGER); +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE) +#include <windows.h> +#else +#error Unsupported OS +#endif + +#include "util/u_time.h" + + +#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) || defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE) + +static int64_t frequency = 0; + +static INLINE void +util_time_get_frequency(void) +{ + if(!frequency) { +#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) + LONGLONG temp; + EngQueryPerformanceFrequency(&temp); + frequency = temp; +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE) + LARGE_INTEGER temp; + QueryPerformanceFrequency(&temp); + frequency = temp.QuadPart; +#endif + } +} +#endif + + +void +util_time_get(struct util_time *t) +{ +#if defined(PIPE_OS_LINUX) + gettimeofday(&t->tv, NULL); +#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) + LONGLONG temp; + EngQueryPerformanceCounter(&temp); + t->counter = temp; +#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) + /* Updated every 10 miliseconds, measured in units of 100 nanoseconds. 
+ * http://msdn.microsoft.com/en-us/library/ms801642.aspx */ + LARGE_INTEGER temp; + KeQuerySystemTime(&temp); + t->counter = temp.QuadPart; +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE) + LARGE_INTEGER temp; + QueryPerformanceCounter(&temp); + t->counter = temp.QuadPart; +#endif +} + + +void +util_time_add(const struct util_time *t1, + int64_t usecs, + struct util_time *t2) +{ +#if defined(PIPE_OS_LINUX) + t2->tv.tv_sec = t1->tv.tv_sec + usecs / 1000000; + t2->tv.tv_usec = t1->tv.tv_usec + usecs % 1000000; +#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) || defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE) + util_time_get_frequency(); + t2->counter = t1->counter + (usecs * frequency + INT64_C(999999))/INT64_C(1000000); +#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) + /* 1 tick = 100 nano seconds. */ + t2->counter = t1->counter + usecs * 10; +#elif + LARGE_INTEGER temp; + LONGLONG freq; + freq = temp.QuadPart; + t2->counter = t1->counter + (usecs * freq)/1000000L; +#endif +} + + +int64_t +util_time_diff(const struct util_time *t1, + const struct util_time *t2) +{ +#if defined(PIPE_OS_LINUX) + return (t2->tv.tv_usec - t1->tv.tv_usec) + + (t2->tv.tv_sec - t1->tv.tv_sec)*1000000; +#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) || defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE) + util_time_get_frequency(); + return (t2->counter - t1->counter)*INT64_C(1000000)/frequency; +#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) + return (t2->counter - t1->counter)/10; +#endif +} + + + +uint64_t +util_time_micros( void ) +{ + struct util_time t1; + + util_time_get(&t1); + +#if defined(PIPE_OS_LINUX) + return t1.tv.tv_usec + t1.tv.tv_sec*1000000LL; +#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) || defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE) + util_time_get_frequency(); + return t1.counter*INT64_C(1000000)/frequency; +#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) + return t1.counter/10; +#endif +} + + + +/** + * Compare two time values. + * + * Not publicly available because it does not take in account wrap-arounds. + * Use util_time_timeout instead. 
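+ *
+ * A typical polling loop built on util_time_timeout() might look like
+ * this (sketch only, timeout_usecs being caller supplied):
+ *
+ *   struct util_time start, end, now;
+ *   util_time_get(&start);
+ *   util_time_add(&start, timeout_usecs, &end);
+ *   do {
+ *      util_time_get(&now);
+ *   } while (!util_time_timeout(&start, &end, &now));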
+ */ +static INLINE int +util_time_compare(const struct util_time *t1, + const struct util_time *t2) +{ +#if defined(PIPE_OS_LINUX) + if (t1->tv.tv_sec < t2->tv.tv_sec) + return -1; + else if(t1->tv.tv_sec > t2->tv.tv_sec) + return 1; + else if (t1->tv.tv_usec < t2->tv.tv_usec) + return -1; + else if(t1->tv.tv_usec > t2->tv.tv_usec) + return 1; + else + return 0; +#elif defined(PIPE_OS_WINDOWS) + if (t1->counter < t2->counter) + return -1; + else if(t1->counter > t2->counter) + return 1; + else + return 0; +#endif +} + + +boolean +util_time_timeout(const struct util_time *start, + const struct util_time *end, + const struct util_time *curr) +{ + if(util_time_compare(start, end) <= 0) + return !(util_time_compare(start, curr) <= 0 && util_time_compare(curr, end) < 0); + else + return !(util_time_compare(start, curr) <= 0 || util_time_compare(curr, end) < 0); +} + + +#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) +void util_time_sleep(unsigned usecs) +{ + LONGLONG start, curr, end; + + EngQueryPerformanceCounter(&start); + + if(!frequency) + EngQueryPerformanceFrequency(&frequency); + + end = start + (usecs * frequency + 999999LL)/1000000LL; + + do { + EngQueryPerformanceCounter(&curr); + } while(start <= curr && curr < end || + end < start && (curr < end || start <= curr)); +} +#endif diff --git a/src/gallium/auxiliary/util/u_time.h b/src/gallium/auxiliary/util/u_time.h new file mode 100644 index 0000000000..35d97d16c7 --- /dev/null +++ b/src/gallium/auxiliary/util/u_time.h @@ -0,0 +1,104 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * OS independent time-manipulation functions. + * + * @author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + +#ifndef U_TIME_H_ +#define U_TIME_H_ + + +#include "pipe/p_config.h" + +#if defined(PIPE_OS_LINUX) +#include <time.h> /* timeval */ +#include <unistd.h> /* usleep */ +#endif + +#include "pipe/p_compiler.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * Time abstraction. + * + * Do not access this structure directly. Use the provided function instead. 
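+ *
+ * For instance, elapsed time is normally measured like this (sketch only,
+ * do_work() standing in for whatever is being timed):
+ *
+ *   struct util_time t1, t2;
+ *   int64_t elapsed_us;
+ *   util_time_get(&t1);
+ *   do_work();
+ *   util_time_get(&t2);
+ *   elapsed_us = util_time_diff(&t1, &t2);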
+ */ +struct util_time +{ +#if defined(PIPE_OS_LINUX) + struct timeval tv; +#else + int64_t counter; +#endif +}; + + +void +util_time_get(struct util_time *t); + +void +util_time_add(const struct util_time *t1, + int64_t usecs, + struct util_time *t2); + +uint64_t +util_time_micros( void ); + +int64_t +util_time_diff(const struct util_time *t1, + const struct util_time *t2); + +/** + * Returns non-zero when the timeout expires. + */ +boolean +util_time_timeout(const struct util_time *start, + const struct util_time *end, + const struct util_time *curr); + +#if defined(PIPE_OS_LINUX) +#define util_time_sleep usleep +#else +void +util_time_sleep(unsigned usecs); +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* U_TIME_H_ */ diff --git a/src/gallium/auxiliary/util/u_timed_winsys.c b/src/gallium/auxiliary/util/u_timed_winsys.c new file mode 100644 index 0000000000..8beb3b4c88 --- /dev/null +++ b/src/gallium/auxiliary/util/u_timed_winsys.c @@ -0,0 +1,346 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * + **************************************************************************/ +/* + * Authors: Keith Whitwell <keithw-at-tungstengraphics-dot-com> + */ + +#include "pipe/p_winsys.h" +#include "u_timed_winsys.h" +#include "util/u_memory.h" +#include "util/u_time.h" + + +struct timed_winsys { + struct pipe_winsys base; + struct pipe_winsys *backend; + uint64_t last_dump; + struct { + const char *name_key; + double total; + unsigned calls; + } funcs[13]; +}; + + +static struct timed_winsys *timed_winsys( struct pipe_winsys *winsys ) +{ + return (struct timed_winsys *)winsys; +} + + +static uint64_t time_start( void ) +{ + return util_time_micros(); +} + + +static void time_display( struct pipe_winsys *winsys ) +{ + struct timed_winsys *tws = timed_winsys(winsys); + unsigned i; + double overall = 0; + + for (i = 0; i < Elements(tws->funcs); i++) { + if (tws->funcs[i].name_key) { + debug_printf("*** %-25s %5.3fms (%d calls, avg %.3fms)\n", + tws->funcs[i].name_key, + tws->funcs[i].total, + tws->funcs[i].calls, + tws->funcs[i].total / tws->funcs[i].calls); + overall += tws->funcs[i].total; + tws->funcs[i].calls = 0; + tws->funcs[i].total = 0; + } + } + + debug_printf("*** %-25s %5.3fms\n", + "OVERALL WINSYS", + overall); +} + +static void time_finish( struct pipe_winsys *winsys, + long long startval, + unsigned idx, + const char *name ) +{ + struct timed_winsys *tws = timed_winsys(winsys); + uint64_t endval = util_time_micros(); + double elapsed = (endval - startval)/1000.0; + + if (endval - startval > 1000LL) + debug_printf("*** %s %.3f\n", name, elapsed ); + + assert( tws->funcs[idx].name_key == name || + tws->funcs[idx].name_key == NULL); + + tws->funcs[idx].name_key = name; + tws->funcs[idx].total += elapsed; + tws->funcs[idx].calls++; + + if (endval - tws->last_dump > 10LL * 1000LL * 1000LL) { + time_display( winsys ); + tws->last_dump = endval; + } +} + + +/* Pipe has no concept of pools, but the psb driver passes a flag that + * can be mapped onto pools in the backend. 
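+ *
+ * Every wrapper below follows the same pattern: sample the clock with
+ * time_start(), call through to the wrapped winsys, then account the
+ * elapsed time under a fixed slot via time_finish().  A winsys layer
+ * would typically enable the profiling wrapper with something like
+ * (illustrative only, "real_winsys" being whatever it normally creates):
+ *
+ *   struct pipe_winsys *winsys = u_timed_winsys_create(real_winsys);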
+ */ +static struct pipe_buffer * +timed_buffer_create(struct pipe_winsys *winsys, + unsigned alignment, + unsigned usage, + unsigned size ) +{ + struct pipe_winsys *backend = timed_winsys(winsys)->backend; + uint64_t start = time_start(); + + struct pipe_buffer *buf = backend->buffer_create( backend, alignment, usage, size ); + + time_finish(winsys, start, 0, __FUNCTION__); + + return buf; +} + + + + +static struct pipe_buffer * +timed_user_buffer_create(struct pipe_winsys *winsys, + void *data, + unsigned bytes) +{ + struct pipe_winsys *backend = timed_winsys(winsys)->backend; + uint64_t start = time_start(); + + struct pipe_buffer *buf = backend->user_buffer_create( backend, data, bytes ); + + time_finish(winsys, start, 1, __FUNCTION__); + + return buf; +} + + +static void * +timed_buffer_map(struct pipe_winsys *winsys, + struct pipe_buffer *buf, + unsigned flags) +{ + struct pipe_winsys *backend = timed_winsys(winsys)->backend; + uint64_t start = time_start(); + + void *map = backend->buffer_map( backend, buf, flags ); + + time_finish(winsys, start, 2, __FUNCTION__); + + return map; +} + + +static void +timed_buffer_unmap(struct pipe_winsys *winsys, + struct pipe_buffer *buf) +{ + struct pipe_winsys *backend = timed_winsys(winsys)->backend; + uint64_t start = time_start(); + + backend->buffer_unmap( backend, buf ); + + time_finish(winsys, start, 3, __FUNCTION__); +} + + +static void +timed_buffer_destroy(struct pipe_winsys *winsys, + struct pipe_buffer *buf) +{ + struct pipe_winsys *backend = timed_winsys(winsys)->backend; + uint64_t start = time_start(); + + backend->buffer_destroy( backend, buf ); + + time_finish(winsys, start, 4, __FUNCTION__); +} + + +static void +timed_flush_frontbuffer( struct pipe_winsys *winsys, + struct pipe_surface *surf, + void *context_private) +{ + struct pipe_winsys *backend = timed_winsys(winsys)->backend; + uint64_t start = time_start(); + + backend->flush_frontbuffer( backend, surf, context_private ); + + time_finish(winsys, start, 5, __FUNCTION__); +} + + + + +static struct pipe_surface * +timed_surface_alloc(struct pipe_winsys *winsys) +{ + struct pipe_winsys *backend = timed_winsys(winsys)->backend; + uint64_t start = time_start(); + + struct pipe_surface *surf = backend->surface_alloc( backend ); + + time_finish(winsys, start, 6, __FUNCTION__); + + return surf; +} + + + +static int +timed_surface_alloc_storage(struct pipe_winsys *winsys, + struct pipe_surface *surf, + unsigned width, unsigned height, + enum pipe_format format, + unsigned flags, + unsigned tex_usage) +{ + struct pipe_winsys *backend = timed_winsys(winsys)->backend; + uint64_t start = time_start(); + + int ret = backend->surface_alloc_storage( backend, surf, width, height, + format, flags, tex_usage ); + + time_finish(winsys, start, 7, __FUNCTION__); + + return ret; +} + + +static void +timed_surface_release(struct pipe_winsys *winsys, struct pipe_surface **s) +{ + struct pipe_winsys *backend = timed_winsys(winsys)->backend; + uint64_t start = time_start(); + + backend->surface_release( backend, s ); + + time_finish(winsys, start, 8, __FUNCTION__); +} + + + +static const char * +timed_get_name( struct pipe_winsys *winsys ) +{ + struct pipe_winsys *backend = timed_winsys(winsys)->backend; + uint64_t start = time_start(); + + const char *ret = backend->get_name( backend ); + + time_finish(winsys, start, 9, __FUNCTION__); + + return ret; +} + +static void +timed_fence_reference(struct pipe_winsys *winsys, + struct pipe_fence_handle **ptr, + struct pipe_fence_handle *fence) +{ + struct 
pipe_winsys *backend = timed_winsys(winsys)->backend; + uint64_t start = time_start(); + + backend->fence_reference( backend, ptr, fence ); + + time_finish(winsys, start, 10, __FUNCTION__); +} + + +static int +timed_fence_signalled( struct pipe_winsys *winsys, + struct pipe_fence_handle *fence, + unsigned flag ) +{ + struct pipe_winsys *backend = timed_winsys(winsys)->backend; + uint64_t start = time_start(); + + int ret = backend->fence_signalled( backend, fence, flag ); + + time_finish(winsys, start, 11, __FUNCTION__); + + return ret; +} + +static int +timed_fence_finish( struct pipe_winsys *winsys, + struct pipe_fence_handle *fence, + unsigned flag ) +{ + struct pipe_winsys *backend = timed_winsys(winsys)->backend; + uint64_t start = time_start(); + + int ret = backend->fence_finish( backend, fence, flag ); + + time_finish(winsys, start, 12, __FUNCTION__); + + return ret; +} + +static void +timed_winsys_destroy( struct pipe_winsys *winsys ) +{ + struct pipe_winsys *backend = timed_winsys(winsys)->backend; + backend->destroy( backend ); + FREE(winsys); +} + + + +struct pipe_winsys *u_timed_winsys_create( struct pipe_winsys *backend ) +{ + struct timed_winsys *ws = CALLOC_STRUCT(timed_winsys); + + ws->base.user_buffer_create = timed_user_buffer_create; + ws->base.buffer_map = timed_buffer_map; + ws->base.buffer_unmap = timed_buffer_unmap; + ws->base.buffer_destroy = timed_buffer_destroy; + ws->base.buffer_create = timed_buffer_create; + ws->base.flush_frontbuffer = timed_flush_frontbuffer; + ws->base.get_name = timed_get_name; + ws->base.surface_alloc = timed_surface_alloc; + ws->base.surface_alloc_storage = timed_surface_alloc_storage; + ws->base.surface_release = timed_surface_release; + ws->base.fence_reference = timed_fence_reference; + ws->base.fence_signalled = timed_fence_signalled; + ws->base.fence_finish = timed_fence_finish; + ws->base.destroy = timed_winsys_destroy; + + ws->backend = backend; + + return &ws->base; +} + diff --git a/src/gallium/auxiliary/util/u_timed_winsys.h b/src/gallium/auxiliary/util/u_timed_winsys.h new file mode 100644 index 0000000000..542365112d --- /dev/null +++ b/src/gallium/auxiliary/util/u_timed_winsys.h @@ -0,0 +1,41 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * + **************************************************************************/ +/* + * Authors: Keith Whitwell <keithw-at-tungstengraphics-dot-com> + */ + + +#ifndef U_TIMED_WINSYS_H +#define U_TIMED_WINSYS_H + + +struct pipe_winsys; +struct pipe_winsys *u_timed_winsys_create( struct pipe_winsys *backend ); + + +#endif diff --git a/src/gallium/drivers/Makefile b/src/gallium/drivers/Makefile new file mode 100644 index 0000000000..6161cb6ff8 --- /dev/null +++ b/src/gallium/drivers/Makefile @@ -0,0 +1,20 @@ +TOP = ../../.. +include $(TOP)/configs/current + + +SUBDIRS = $(GALLIUM_DRIVER_DIRS) + + +default: subdirs + + +subdirs: + @for dir in $(SUBDIRS) ; do \ + if [ -d $$dir ] ; then \ + (cd $$dir && $(MAKE)) || exit 1 ; \ + fi \ + done + + +clean: + rm -f `find . -name \*.[oa]` diff --git a/src/gallium/drivers/cell/Makefile b/src/gallium/drivers/cell/Makefile new file mode 100644 index 0000000000..47aef7b05f --- /dev/null +++ b/src/gallium/drivers/cell/Makefile @@ -0,0 +1,12 @@ +# Cell Gallium driver Makefile + + +default: + ( cd spu ; make ) + ( cd ppu ; make ) + + + +clean: + ( cd spu ; make clean ) + ( cd ppu ; make clean ) diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h new file mode 100644 index 0000000000..98554d7f52 --- /dev/null +++ b/src/gallium/drivers/cell/common.h @@ -0,0 +1,355 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Types and tokens which are common to the SPU and PPU code. 
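+ *
+ * Commands are passed to the SPUs either directly in a 32-bit mailbox
+ * word or via batch buffers.  The low byte of a mailbox word is the
+ * opcode; for example, cell_batch_flush() packs a batch command as
+ *
+ *   cmd_word = CELL_CMD_BATCH | (batch << 8) | (size << 16);
+ *
+ * and the opcode can be recovered with (cmd_word & CELL_CMD_OPCODE_MASK).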
+ */ + + +#ifndef CELL_COMMON_H +#define CELL_COMMON_H + +#include "pipe/p_compiler.h" +#include "pipe/p_format.h" +#include "pipe/p_state.h" + + +/** The standard assert macro doesn't seem to work reliably */ +#define ASSERT(x) \ + if (!(x)) { \ + ubyte *p = NULL; \ + fprintf(stderr, "%s:%d: %s(): assertion %s failed.\n", \ + __FILE__, __LINE__, __FUNCTION__, #x); \ + *p = 0; \ + exit(1); \ + } + + +/** for sanity checking */ +#define ASSERT_ALIGN16(ptr) \ + ASSERT((((unsigned long) (ptr)) & 0xf) == 0); + + +/** round up value to next multiple of 4 */ +#define ROUNDUP4(k) (((k) + 0x3) & ~0x3) + +/** round up value to next multiple of 8 */ +#define ROUNDUP8(k) (((k) + 0x7) & ~0x7) + +/** round up value to next multiple of 16 */ +#define ROUNDUP16(k) (((k) + 0xf) & ~0xf) + + +#define CELL_MAX_SPUS 8 + +#define CELL_MAX_SAMPLERS 4 +#define CELL_MAX_TEXTURE_LEVELS 12 /* 2k x 2k */ +#define CELL_MAX_CONSTANTS 32 /**< number of float[4] constants */ +#define CELL_MAX_WIDTH 1024 /**< max framebuffer width */ +#define CELL_MAX_HEIGHT 1024 /**< max framebuffer width */ + +#define TILE_SIZE 32 + + +/** + * The low byte of a mailbox word contains the command opcode. + * Remaining higher bytes are command specific. + */ +#define CELL_CMD_OPCODE_MASK 0xff + +#define CELL_CMD_EXIT 1 +#define CELL_CMD_CLEAR_SURFACE 2 +#define CELL_CMD_FINISH 3 +#define CELL_CMD_RENDER 4 +#define CELL_CMD_BATCH 5 +#define CELL_CMD_RELEASE_VERTS 6 +#define CELL_CMD_STATE_FRAMEBUFFER 10 +#define CELL_CMD_STATE_FRAGMENT_OPS 11 +#define CELL_CMD_STATE_SAMPLER 12 +#define CELL_CMD_STATE_TEXTURE 13 +#define CELL_CMD_STATE_VERTEX_INFO 14 +#define CELL_CMD_STATE_VIEWPORT 15 +#define CELL_CMD_STATE_UNIFORMS 16 +#define CELL_CMD_STATE_VS_ARRAY_INFO 17 +#define CELL_CMD_STATE_BIND_VS 18 +#define CELL_CMD_STATE_FRAGMENT_PROGRAM 19 +#define CELL_CMD_STATE_ATTRIB_FETCH 20 +#define CELL_CMD_STATE_FS_CONSTANTS 21 +#define CELL_CMD_STATE_RASTERIZER 22 +#define CELL_CMD_VS_EXECUTE 23 +#define CELL_CMD_FLUSH_BUFFER_RANGE 24 +#define CELL_CMD_FENCE 25 + + +/** Command/batch buffers */ +#define CELL_NUM_BUFFERS 4 +#define CELL_BUFFER_SIZE (4*1024) /**< 16KB would be the max */ + +#define CELL_BUFFER_STATUS_FREE 10 +#define CELL_BUFFER_STATUS_USED 20 + +/** Debug flags */ +#define CELL_DEBUG_CHECKER (1 << 0) +#define CELL_DEBUG_ASM (1 << 1) +#define CELL_DEBUG_SYNC (1 << 2) +#define CELL_DEBUG_FRAGMENT_OPS (1 << 3) +#define CELL_DEBUG_FRAGMENT_OP_FALLBACK (1 << 4) +#define CELL_DEBUG_CMD (1 << 5) +#define CELL_DEBUG_CACHE (1 << 6) + +#define CELL_FENCE_IDLE 0 +#define CELL_FENCE_EMITTED 1 +#define CELL_FENCE_SIGNALLED 2 + +#define CELL_FACING_FRONT 0 +#define CELL_FACING_BACK 1 + +struct cell_fence +{ + /** There's a 16-byte status qword per SPU */ + volatile uint status[CELL_MAX_SPUS][4]; +}; + + +/** + * Fence command sent to SPUs. In response, the SPUs will write + * CELL_FENCE_STATUS_SIGNALLED back to the fence status word in main memory. + */ +struct cell_command_fence +{ + uint64_t opcode; /**< CELL_CMD_FENCE */ + struct cell_fence *fence; +}; + + +/** + * Command to specify per-fragment operations state and generated code. + * Note that this is a variant-length structure, allocated with as + * much memory as needed to hold the generated code; the "code" + * field *must* be the last field in the structure. 
Also, the entire + * length of the structure (including the variant code field) must be + * a multiple of 8 bytes; we require that this structure itself be + * a multiple of 8 bytes, and that the generated code also be a multiple + * of 8 bytes. + * + * Also note that the dsa, blend, blend_color fields are really only needed + * for the fallback/C per-pixel code. They're not used when we generate + * dynamic SPU fragment code (which is the normal case), and will eventually + * be removed from this structure. + */ +struct cell_command_fragment_ops +{ + uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_OPS */ + + /* Fields for the fallback case */ + struct pipe_depth_stencil_alpha_state dsa; + struct pipe_blend_state blend; + struct pipe_blend_color blend_color; + + /* Fields for the generated SPU code */ + unsigned total_code_size; + unsigned front_code_index; + unsigned back_code_index; + /* this field has variant length, and must be the last field in + * the structure + */ + unsigned code[0]; +}; + + +/** Max instructions for fragment programs */ +#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 512 + +/** + * Command to send a fragment program to SPUs. + */ +struct cell_command_fragment_program +{ + uint64_t opcode; /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */ + uint num_inst; /**< Number of instructions */ + unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS]; +}; + + +/** + * Tell SPUs about the framebuffer size, location + */ +struct cell_command_framebuffer +{ + uint64_t opcode; /**< CELL_CMD_STATE_FRAMEBUFFER */ + int width, height; + void *color_start, *depth_start; + enum pipe_format color_format, depth_format; +}; + + +/** + * Tell SPUs about rasterizer state. + */ +struct cell_command_rasterizer +{ + uint64_t opcode; /**< CELL_CMD_STATE_RASTERIZER */ + struct pipe_rasterizer_state rasterizer; +}; + + +/** + * Clear framebuffer to the given value/color. + */ +struct cell_command_clear_surface +{ + uint64_t opcode; /**< CELL_CMD_CLEAR_SURFACE */ + uint surface; /**< Temporary: 0=color, 1=Z */ + uint value; +}; + + +/** + * Array info used by the vertex shader's vertex puller. + */ +struct cell_array_info +{ + uint64_t base; /**< Base address of the 0th element. */ + uint attr; /**< Attribute that this state is for. */ + uint pitch; /**< Byte pitch from one entry to the next. 
*/ + uint size; + uint function_offset; +}; + + +struct cell_attribute_fetch_code +{ + uint64_t base; + uint size; +}; + + +struct cell_buffer_range +{ + uint64_t base; + unsigned size; +}; + + +struct cell_shader_info +{ + uint64_t declarations; + uint64_t instructions; + uint64_t immediates; + + unsigned num_outputs; + unsigned num_declarations; + unsigned num_instructions; + unsigned num_immediates; +}; + + +#define SPU_VERTS_PER_BATCH 64 +struct cell_command_vs +{ + uint64_t opcode; /**< CELL_CMD_VS_EXECUTE */ + uint64_t vOut[SPU_VERTS_PER_BATCH]; + unsigned num_elts; + unsigned elts[SPU_VERTS_PER_BATCH]; + float plane[12][4]; + unsigned nr_planes; + unsigned nr_attrs; +}; + + +struct cell_command_render +{ + uint64_t opcode; /**< CELL_CMD_RENDER */ + uint prim_type; /**< PIPE_PRIM_x */ + uint num_verts; + uint vertex_size; /**< bytes per vertex */ + uint num_indexes; + uint vertex_buf; /**< which cell->buffer[] contains the vertex data */ + float xmin, ymin, xmax, ymax; /* XXX another dummy field */ + uint min_index; + boolean inline_verts; +}; + + +struct cell_command_release_verts +{ + uint64_t opcode; /**< CELL_CMD_RELEASE_VERTS */ + uint vertex_buf; /**< in [0, CELL_NUM_BUFFERS-1] */ +}; + + +struct cell_command_sampler +{ + uint64_t opcode; /**< CELL_CMD_STATE_SAMPLER */ + uint unit; + struct pipe_sampler_state state; +}; + + +struct cell_command_texture +{ + uint64_t opcode; /**< CELL_CMD_STATE_TEXTURE */ + uint target; /**< PIPE_TEXTURE_x */ + uint unit; + void *start[CELL_MAX_TEXTURE_LEVELS]; /**< Address in main memory */ + ushort width[CELL_MAX_TEXTURE_LEVELS]; + ushort height[CELL_MAX_TEXTURE_LEVELS]; + ushort depth[CELL_MAX_TEXTURE_LEVELS]; +}; + + +#define MAX_SPU_FUNCTIONS 12 +/** + * Used to tell the PPU about the address of particular functions in the + * SPU's address space. + */ +struct cell_spu_function_info +{ + uint num; + char names[MAX_SPU_FUNCTIONS][16]; + uint addrs[MAX_SPU_FUNCTIONS]; + char pad[12]; /**< Pad struct to multiple of 16 bytes (256 currently) */ +}; + + +/** This is the object passed to spe_create_thread() */ +struct cell_init_info +{ + unsigned id; + unsigned num_spus; + unsigned debug_flags; /**< mask of CELL_DEBUG_x flags */ + float inv_timebase; /**< 1.0/timebase, for perf measurement */ + + /** Buffers for command batches, vertex/index data */ + ubyte *buffers[CELL_NUM_BUFFERS]; + uint *buffer_status; /**< points at cell_context->buffer_status */ + + struct cell_spu_function_info *spu_functions; +} ALIGN16_ATTRIB; + + +#endif /* CELL_COMMON_H */ diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile new file mode 100644 index 0000000000..9358a47284 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/Makefile @@ -0,0 +1,84 @@ +# Gallium3D Cell driver: PPU code + +# This makefile builds the libcell.a library which gets pulled into +# the main libGL.so library + + +TOP = ../../../../.. +include $(TOP)/configs/current + + +# This is the "top-level" cell PPU driver code, will get pulled into libGL.so +# by the winsys Makefile. +CELL_LIB = ../libcell.a + + +# This is the SPU code. We'd like to be able to put this into the libcell.a +# archive with the PPU code, but nesting .a libs doesn't seem to work. 
+# So, it's pulled into libGL.so in gallium/winsys/xlib/Makefile +SPU_CODE_MODULE = ../spu/g3d_spu.a + + +SOURCES = \ + cell_batch.c \ + cell_clear.c \ + cell_context.c \ + cell_draw_arrays.c \ + cell_fence.c \ + cell_flush.c \ + cell_gen_fragment.c \ + cell_gen_fp.c \ + cell_state_derived.c \ + cell_state_emit.c \ + cell_state_shader.c \ + cell_pipe_state.c \ + cell_screen.c \ + cell_state_vertex.c \ + cell_spu.c \ + cell_surface.c \ + cell_texture.c \ + cell_vbuf.c \ + cell_vertex_fetch.c \ + cell_vertex_shader.c \ + cell_winsys.c + + +OBJECTS = $(SOURCES:.c=.o) \ + +INCLUDE_DIRS = \ + -I$(TOP)/src/mesa \ + -I$(TOP)/src/gallium/include \ + -I$(TOP)/src/gallium/auxiliary \ + -I$(TOP)/src/gallium/drivers + +.c.o: + $(CC) -c $(INCLUDE_DIRS) $(CFLAGS) $< -o $@ + + + +default: $(CELL_LIB) + + +$(CELL_LIB): $(OBJECTS) $(SPU_CODE_MODULE) +# ar -ru $(CELL_LIB) $(OBJECTS) $(SPU_CODE_MODULE) # doesn't work + ar -ru $(CELL_LIB) $(OBJECTS) + +#$(PROG): $(PPU_OBJECTS) +# $(CC) -o $(PROG) $(PPU_OBJECTS) $(SPU_CODE_MODULE) $(PPU_LFLAGS) + + + +clean: + rm -f *.o *~ $(CELL_LIB) + + + +depend: $(SOURCES) + rm -f depend + touch depend + $(MKDEP) $(MKDEP_OPTIONS) $(INCLUDE_DIRS) $(SOURCES) 2> /dev/null + +include depend + + + diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c new file mode 100644 index 0000000000..962775cd33 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_batch.c @@ -0,0 +1,315 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_fence.h" +#include "cell_spu.h" + + + +/** + * Search the buffer pool for an empty/free buffer and return its index. + * Buffers are used for storing vertex data, state and commands which + * will be sent to the SPUs. + * If no empty buffers are available, wait for one. 
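+ * A buffer only counts as free once every SPU has written
+ * CELL_BUFFER_STATUS_FREE into its buffer_status[] slot for that buffer;
+ * until then the loop below keeps polling the candidates in round-robin
+ * order.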
+ * \return buffer index in [0, CELL_NUM_BUFFERS-1] + */ +uint +cell_get_empty_buffer(struct cell_context *cell) +{ + static uint prev_buffer = 0; + uint buf = (prev_buffer + 1) % CELL_NUM_BUFFERS; + uint tries = 0; + + /* Find a buffer that's marked as free by all SPUs */ + while (1) { + uint spu, num_free = 0; + + for (spu = 0; spu < cell->num_spus; spu++) { + if (cell->buffer_status[spu][buf][0] == CELL_BUFFER_STATUS_FREE) { + num_free++; + + if (num_free == cell->num_spus) { + /* found a free buffer, now mark status as used */ + for (spu = 0; spu < cell->num_spus; spu++) { + cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED; + } + /* + printf("PPU: ALLOC BUFFER %u, %u tries\n", buf, tries); + */ + prev_buffer = buf; + + /* release tex buffer associated w/ prev use of this batch buf */ + cell_free_fenced_buffers(cell, &cell->fenced_buffers[buf]); + + return buf; + } + } + else { + break; + } + } + + /* try next buf */ + buf = (buf + 1) % CELL_NUM_BUFFERS; + + tries++; + if (tries == 100) { + /* + printf("PPU WAITING for buffer...\n"); + */ + } + } +} + + +/** + * Append a fence command to the current batch buffer. + * Note that we're sure there's always room for this because of the + * adjusted size check in cell_batch_free_space(). + */ +static void +emit_fence(struct cell_context *cell) +{ + const uint batch = cell->cur_batch; + const uint size = cell->buffer_size[batch]; + struct cell_command_fence *fence_cmd; + struct cell_fence *fence = &cell->fenced_buffers[batch].fence; + uint i; + + /* set fence status to emitted, not yet signalled */ + for (i = 0; i < cell->num_spus; i++) { + fence->status[i][0] = CELL_FENCE_EMITTED; + } + + ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE); + + fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size); + fence_cmd->opcode = CELL_CMD_FENCE; + fence_cmd->fence = fence; + + /* update batch buffer size */ + cell->buffer_size[batch] = size + sizeof(struct cell_command_fence); + assert(sizeof(struct cell_command_fence) % 8 == 0); +} + + +/** + * Flush the current batch buffer to the SPUs. + * An empty buffer will be found and set as the new current batch buffer + * for subsequent commands/data. + */ +void +cell_batch_flush(struct cell_context *cell) +{ + static boolean flushing = FALSE; + uint batch = cell->cur_batch; + uint size = cell->buffer_size[batch]; + uint spu, cmd_word; + + assert(!flushing); + + if (size == 0) + return; + + /* Before we use this batch buffer, make sure any fenced texture buffers + * are released. + */ + if (cell->fenced_buffers[batch].head) { + emit_fence(cell); + size = cell->buffer_size[batch]; + } + + flushing = TRUE; + + assert(batch < CELL_NUM_BUFFERS); + + /* + printf("cell_batch_dispatch: buf %u at %p, size %u\n", + batch, &cell->buffer[batch][0], size); + */ + + /* + * Build "BATCH" command and send to all SPUs. + */ + cmd_word = CELL_CMD_BATCH | (batch << 8) | (size << 16); + + for (spu = 0; spu < cell->num_spus; spu++) { + assert(cell->buffer_status[spu][batch][0] == CELL_BUFFER_STATUS_USED); + send_mbox_message(cell_global.spe_contexts[spu], cmd_word); + } + + /* When the SPUs are done copying the buffer into their locals stores + * they'll write a BUFFER_STATUS_FREE message into the buffer_status[] + * array indicating that the PPU can re-use the buffer. + */ + + batch = cell_get_empty_buffer(cell); + + cell->buffer_size[batch] = 0; /* empty */ + cell->cur_batch = batch; + + flushing = FALSE; +} + + +/** + * Return the number of bytes free in the current batch buffer. 
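+ * Note that the value is deliberately reduced by
+ * sizeof(struct cell_command_fence) so that emit_fence() can always be
+ * appended to the buffer at flush time without overflowing it.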
+ */ +uint +cell_batch_free_space(const struct cell_context *cell) +{ + uint free = CELL_BUFFER_SIZE - cell->buffer_size[cell->cur_batch]; + free -= sizeof(struct cell_command_fence); + return free; +} + + +/** + * Append data to the current batch buffer. + * \param data address of block of bytes to append + * \param bytes size of block of bytes + */ +void +cell_batch_append(struct cell_context *cell, const void *data, uint bytes) +{ + uint size; + + ASSERT(bytes % 8 == 0); + ASSERT(bytes <= CELL_BUFFER_SIZE); + ASSERT(cell->cur_batch >= 0); + +#ifdef ASSERT + { + uint spu; + for (spu = 0; spu < cell->num_spus; spu++) { + ASSERT(cell->buffer_status[spu][cell->cur_batch][0] + == CELL_BUFFER_STATUS_USED); + } + } +#endif + + size = cell->buffer_size[cell->cur_batch]; + + if (bytes > cell_batch_free_space(cell)) { + cell_batch_flush(cell); + size = 0; + } + + ASSERT(size + bytes <= CELL_BUFFER_SIZE); + + memcpy(cell->buffer[cell->cur_batch] + size, data, bytes); + + cell->buffer_size[cell->cur_batch] = size + bytes; +} + + +/** + * Allocate space in the current batch buffer for 'bytes' space. + * \return address in batch buffer to put data + */ +void * +cell_batch_alloc(struct cell_context *cell, uint bytes) +{ + return cell_batch_alloc_aligned(cell, bytes, 1); +} + + +/** + * Same as \sa cell_batch_alloc, but return an address at a particular + * alignment. + */ +void * +cell_batch_alloc_aligned(struct cell_context *cell, uint bytes, + uint alignment) +{ + void *pos; + uint size, padbytes; + + ASSERT(bytes % 8 == 0); + ASSERT(bytes <= CELL_BUFFER_SIZE); + ASSERT(alignment > 0); + ASSERT(cell->cur_batch >= 0); + +#ifdef ASSERT + { + uint spu; + for (spu = 0; spu < cell->num_spus; spu++) { + ASSERT(cell->buffer_status[spu][cell->cur_batch][0] + == CELL_BUFFER_STATUS_USED); + } + } +#endif + + size = cell->buffer_size[cell->cur_batch]; + + padbytes = (alignment - (size % alignment)) % alignment; + + if (padbytes + bytes > cell_batch_free_space(cell)) { + cell_batch_flush(cell); + size = 0; + } + else { + size += padbytes; + } + + ASSERT(size % alignment == 0); + ASSERT(size + bytes <= CELL_BUFFER_SIZE); + + pos = (void *) (cell->buffer[cell->cur_batch] + size); + + cell->buffer_size[cell->cur_batch] = size + bytes; + + return pos; +} + + +/** + * One-time init of batch buffers. + */ +void +cell_init_batch_buffers(struct cell_context *cell) +{ + uint spu, buf; + + /* init command, vertex/index buffer info */ + for (buf = 0; buf < CELL_NUM_BUFFERS; buf++) { + cell->buffer_size[buf] = 0; + + /* init batch buffer status values, + * mark 0th buffer as used, rest as free. + */ + for (spu = 0; spu < cell->num_spus; spu++) { + if (buf == 0) + cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED; + else + cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_FREE; + } + } +} diff --git a/src/gallium/drivers/cell/ppu/cell_batch.h b/src/gallium/drivers/cell/ppu/cell_batch.h new file mode 100644 index 0000000000..f74dd60079 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_batch.h @@ -0,0 +1,61 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_BATCH_H +#define CELL_BATCH_H + +#include "pipe/p_compiler.h" + + +struct cell_context; + + +extern uint +cell_get_empty_buffer(struct cell_context *cell); + +extern void +cell_batch_flush(struct cell_context *cell); + +extern uint +cell_batch_free_space(const struct cell_context *cell); + +extern void +cell_batch_append(struct cell_context *cell, const void *data, uint bytes); + +extern void * +cell_batch_alloc(struct cell_context *cell, uint bytes); + +extern void * +cell_batch_alloc_aligned(struct cell_context *cell, uint bytes, + uint alignment); + +extern void +cell_init_batch_buffers(struct cell_context *cell); + + +#endif /* CELL_BATCH_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c new file mode 100644 index 0000000000..037635e466 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_clear.c @@ -0,0 +1,122 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/** + * Authors + * Brian Paul + */ + +#include <stdio.h> +#include <assert.h> +#include <stdint.h> +#include "pipe/p_inlines.h" +#include "util/u_memory.h" +#include "util/u_pack_color.h" +#include "cell/common.h" +#include "cell_clear.h" +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_flush.h" +#include "cell_spu.h" +#include "cell_state.h" + + +/** + * Convert packed pixel from one format to another. + */ +static unsigned +convert_color(enum pipe_format srcFormat, unsigned srcColor, + enum pipe_format dstFormat) +{ + ubyte r, g, b, a; + unsigned dstColor; + + util_unpack_color_ub(srcFormat, &srcColor, &r, &g, &b, &a); + util_pack_color_ub(r, g, b, a, dstFormat, &dstColor); + + return dstColor; +} + + + +/** + * Called via pipe->clear() + */ +void +cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + struct pipe_screen *screen = pipe->screen; + struct cell_context *cell = cell_context(pipe); + uint surfIndex; + + if (cell->dirty) + cell_update_derived(cell); + + + if (!cell->cbuf_map[0]) + cell->cbuf_map[0] = screen->surface_map(screen, ps, + PIPE_BUFFER_USAGE_GPU_WRITE); + + if (ps == cell->framebuffer.zsbuf) { + /* clear z/stencil buffer */ + surfIndex = 1; + } + else { + /* clear color buffer */ + surfIndex = 0; + + if (ps->format != PIPE_FORMAT_A8R8G8B8_UNORM) { + clearValue = convert_color(PIPE_FORMAT_A8R8G8B8_UNORM, clearValue, + ps->format); + } + } + + + /* Build a CLEAR command and place it in the current batch buffer */ + { + struct cell_command_clear_surface *clr + = (struct cell_command_clear_surface *) + cell_batch_alloc(cell, sizeof(*clr)); + clr->opcode = CELL_CMD_CLEAR_SURFACE; + clr->surface = surfIndex; + clr->value = clearValue; + } + + /* Technically, the surface's contents are now known and cleared, + * so we could set the status to PIPE_SURFACE_STATUS_CLEAR. But + * it turns out it's quite painful to recognize when any particular + * surface goes from PIPE_SURFACE_STATUS_CLEAR to + * PIPE_SURFACE_STATUS_DEFINED (i.e. with known contents), because + * the drawing commands could be operating on numerous draw buffers, + * which we'd have to iterate through to set all their stati... + * For now, we cheat a bit and set the surface's status to DEFINED + * right here. Later we should revisit this and set the status to + * CLEAR here, and find a better place to set the status to DEFINED. + */ + ps->status = PIPE_SURFACE_STATUS_DEFINED; +} diff --git a/src/gallium/drivers/cell/ppu/cell_clear.h b/src/gallium/drivers/cell/ppu/cell_clear.h new file mode 100644 index 0000000000..ff47d43f4c --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_clear.h @@ -0,0 +1,43 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_CLEAR_H +#define CELL_CLEAR_H + + +struct pipe_context; +struct pipe_surface; + + +extern void +cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + + + +#endif /* CELL_CLEAR_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c new file mode 100644 index 0000000000..22d552d8e3 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_context.c @@ -0,0 +1,183 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/** + * Authors + * Brian Paul + */ + + +#include <stdio.h> + +#include "pipe/p_defines.h" +#include "pipe/p_format.h" +#include "util/u_memory.h" +#include "pipe/p_winsys.h" +#include "pipe/p_screen.h" + +#include "draw/draw_context.h" +#include "draw/draw_private.h" + +#include "cell/common.h" +#include "cell_batch.h" +#include "cell_clear.h" +#include "cell_context.h" +#include "cell_draw_arrays.h" +#include "cell_fence.h" +#include "cell_flush.h" +#include "cell_state.h" +#include "cell_surface.h" +#include "cell_spu.h" +#include "cell_pipe_state.h" +#include "cell_texture.h" +#include "cell_vbuf.h" + + + +static void +cell_destroy_context( struct pipe_context *pipe ) +{ + struct cell_context *cell = cell_context(pipe); + + util_delete_keymap(cell->fragment_ops_cache, NULL); + + cell_spu_exit(cell); + + align_free(cell); +} + + +static struct draw_context * +cell_draw_create(struct cell_context *cell) +{ + struct draw_context *draw = draw_create(); + +#if 0 /* broken */ + if (getenv("GALLIUM_CELL_VS")) { + /* plug in SPU-based vertex transformation code */ + draw->shader_queue_flush = cell_vertex_shader_queue_flush; + draw->driver_private = cell; + } +#endif + + return draw; +} + + +static const struct debug_named_value cell_debug_flags[] = { + {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */ + {"asm", CELL_DEBUG_ASM}, /**< dump SPU asm code */ + {"sync", CELL_DEBUG_SYNC}, /**< SPUs do synchronous DMA */ + {"fragops", CELL_DEBUG_FRAGMENT_OPS}, /**< SPUs emit fragment ops debug messages*/ + {"fragopfallback", CELL_DEBUG_FRAGMENT_OP_FALLBACK}, /**< SPUs use reference implementation for fragment ops*/ + {"cmd", CELL_DEBUG_CMD}, /**< SPUs dump command buffer info */ + {"cache", CELL_DEBUG_CACHE}, /**< report texture cache stats on exit */ + {NULL, 0} +}; + + +struct pipe_context * +cell_create_context(struct pipe_screen *screen, + struct cell_winsys *cws) +{ + struct cell_context *cell; + uint i; + + /* some fields need to be 16-byte aligned, so align the whole object */ + cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16); + if (!cell) + return NULL; + + memset(cell, 0, sizeof(*cell)); + + cell->winsys = cws; + cell->pipe.winsys = screen->winsys; + cell->pipe.screen = screen; + cell->pipe.destroy = cell_destroy_context; + + cell->pipe.clear = cell_clear_surface; + cell->pipe.flush = cell_flush; + +#if 0 + cell->pipe.begin_query = cell_begin_query; + cell->pipe.end_query = cell_end_query; + cell->pipe.wait_query = cell_wait_query; +#endif + + cell_init_draw_functions(cell); + cell_init_state_functions(cell); + cell_init_shader_functions(cell); + cell_init_surface_functions(cell); + cell_init_vertex_functions(cell); + + cell->draw = cell_draw_create(cell); + + /* Create cache of fragment ops generated code */ + cell->fragment_ops_cache = + util_new_keymap(sizeof(struct cell_fragment_ops_key), ~0, NULL); + + cell_init_vbuf(cell); + + draw_set_rasterize_stage(cell->draw, cell->vbuf); + + /* convert all points/lines to tris for the time being */ + draw_wide_point_threshold(cell->draw, 0.0); + draw_wide_line_threshold(cell->draw, 0.0); + + /* get env vars or read config file to get debug flags */ + cell->debug_flags = debug_get_flags_option("CELL_DEBUG", + cell_debug_flags, + 0 ); + + for (i = 0; i < CELL_NUM_BUFFERS; i++) + cell_fence_init(&cell->fenced_buffers[i].fence); + + + /* + * SPU stuff + */ + /* This call only works with SDK 3.0. 
Anyone still using 2.1??? */ + cell->num_cells = spe_cpu_info_get(SPE_COUNT_PHYSICAL_CPU_NODES, -1); + cell->num_spus = spe_cpu_info_get(SPE_COUNT_USABLE_SPES, 0); + if (cell->debug_flags) { + printf("Cell: found %d Cell(s) with %u SPUs\n", + cell->num_cells, cell->num_spus); + } + if (getenv("CELL_NUM_SPUS")) { + cell->num_spus = atoi(getenv("CELL_NUM_SPUS")); + assert(cell->num_spus > 0); + } + + cell_start_spus(cell); + + cell_init_batch_buffers(cell); + + /* make sure SPU initializations are done before proceeding */ + cell_flush_int(cell, CELL_FLUSH_WAIT); + + return &cell->pipe; +} diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h new file mode 100644 index 0000000000..eb1397bb3f --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_context.h @@ -0,0 +1,205 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_CONTEXT_H +#define CELL_CONTEXT_H + + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "draw/draw_vertex.h" +#include "draw/draw_vbuf.h" +#include "cell_winsys.h" +#include "cell/common.h" +#include "rtasm/rtasm_ppc_spe.h" +#include "tgsi/tgsi_scan.h" +#include "util/u_keymap.h" + + +struct cell_vbuf_render; + + +/** + * Cell vertex shader state, subclass of pipe_shader_state. + */ +struct cell_vertex_shader_state +{ + struct pipe_shader_state shader; + struct tgsi_shader_info info; + void *draw_data; +}; + + +/** + * Cell fragment shader state, subclass of pipe_shader_state. + */ +struct cell_fragment_shader_state +{ + struct pipe_shader_state shader; + struct tgsi_shader_info info; + struct spe_function code; + void *data; +}; + + +/** + * Key for mapping per-fragment state to cached SPU machine code. + * keymap(cell_fragment_ops_key) => cell_command_fragment_ops + */ +struct cell_fragment_ops_key +{ + struct pipe_blend_state blend; + struct pipe_blend_color blend_color; + struct pipe_depth_stencil_alpha_state dsa; + enum pipe_format color_format; + enum pipe_format zs_format; +}; + + +struct cell_buffer_node; + +/** + * Fenced buffer list. List of buffers which can be unreferenced after + * the fence has been executed/signalled. 
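+ * There is one such list per command/batch buffer; see the
+ * fenced_buffers[] array in cell_context below.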
+ */ +struct cell_buffer_list +{ + struct cell_fence fence ALIGN16_ATTRIB; + struct cell_buffer_node *head; +}; + + +/** + * Per-context state, subclass of pipe_context. + */ +struct cell_context +{ + struct pipe_context pipe; + + struct cell_winsys *winsys; + + const struct pipe_blend_state *blend; + const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS]; + uint num_samplers; + const struct pipe_depth_stencil_alpha_state *depth_stencil; + const struct pipe_rasterizer_state *rasterizer; + const struct cell_vertex_shader_state *vs; + const struct cell_fragment_shader_state *fs; + + struct spe_function logic_op; + + struct pipe_blend_color blend_color; + struct pipe_clip_state clip; + struct pipe_constant_buffer constants[2]; + struct pipe_framebuffer_state framebuffer; + struct pipe_poly_stipple poly_stipple; + struct pipe_scissor_state scissor; + struct cell_texture *texture[PIPE_MAX_SAMPLERS]; + uint num_textures; + struct pipe_viewport_state viewport; + struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; + uint num_vertex_buffers; + struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS]; + uint num_vertex_elements; + + ubyte *cbuf_map[PIPE_MAX_COLOR_BUFS]; + ubyte *zsbuf_map; + + struct pipe_surface *tex_surf; + uint *tex_map; + + uint dirty; + uint dirty_textures; /* bitmask of texture units */ + uint dirty_samplers; /* bitmask of sampler units */ + + /** Cache of code generated for per-fragment ops */ + struct keymap *fragment_ops_cache; + + /** The primitive drawing context */ + struct draw_context *draw; + struct draw_stage *render_stage; + + /** For post-transformed vertex buffering: */ + struct cell_vbuf_render *vbuf_render; + struct draw_stage *vbuf; + + struct vertex_info vertex_info; + + /** Mapped constant buffers */ + void *mapped_constants[PIPE_SHADER_TYPES]; + + struct cell_spu_function_info spu_functions ALIGN16_ATTRIB; + + uint num_cells, num_spus; + + /** Buffers for command batches, vertex/index data */ + uint buffer_size[CELL_NUM_BUFFERS]; + ubyte buffer[CELL_NUM_BUFFERS][CELL_BUFFER_SIZE] ALIGN16_ATTRIB; + + int cur_batch; /**< which buffer is being filled w/ commands */ + + /** [4] to ensure 16-byte alignment for each status word */ + uint buffer_status[CELL_MAX_SPUS][CELL_NUM_BUFFERS][4] ALIGN16_ATTRIB; + + + /** Associated with each command/batch buffer is a list of pipe_buffers + * that are fenced. When the last command in a buffer is executed, the + * fence will be signalled, indicating that any pipe_buffers preceeding + * that fence can be unreferenced (and probably freed). 
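+    * Buffers are added by cell_add_fenced_textures() and released via
+    * cell_free_fenced_buffers() when the batch buffer is re-acquired in
+    * cell_get_empty_buffer().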
+ */ + struct cell_buffer_list fenced_buffers[CELL_NUM_BUFFERS]; + + + struct spe_function attrib_fetch; + unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS]; + + unsigned debug_flags; +}; + + + + +static INLINE struct cell_context * +cell_context(struct pipe_context *pipe) +{ + return (struct cell_context *) pipe; +} + + +extern struct pipe_context * +cell_create_context(struct pipe_screen *screen, struct cell_winsys *cws); + +extern void +cell_vertex_shader_queue_flush(struct draw_context *draw); + + +/* XXX find a better home for this */ +extern void cell_update_vertex_fetch(struct draw_context *draw); + + +#endif /* CELL_CONTEXT_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c new file mode 100644 index 0000000000..880d535320 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c @@ -0,0 +1,191 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Author: + * Brian Paul + * Keith Whitwell + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_context.h" +#include "pipe/p_winsys.h" +#include "pipe/p_inlines.h" + +#include "cell_context.h" +#include "cell_draw_arrays.h" +#include "cell_state.h" +#include "cell_flush.h" + +#include "draw/draw_context.h" + + + +static void +cell_map_constant_buffers(struct cell_context *sp) +{ + struct pipe_winsys *ws = sp->pipe.winsys; + uint i; + for (i = 0; i < 2; i++) { + if (sp->constants[i].size) { + sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + cell_flush_buffer_range(sp, sp->mapped_constants[i], + sp->constants[i].buffer->size); + } + } + + draw_set_mapped_constant_buffer(sp->draw, + sp->mapped_constants[PIPE_SHADER_VERTEX], + sp->constants[PIPE_SHADER_VERTEX].size); +} + +static void +cell_unmap_constant_buffers(struct cell_context *sp) +{ + struct pipe_winsys *ws = sp->pipe.winsys; + uint i; + for (i = 0; i < 2; i++) { + if (sp->constants[i].size) + ws->buffer_unmap(ws, sp->constants[i].buffer); + sp->mapped_constants[i] = NULL; + } +} + + + +/** + * Draw vertex arrays, with optional indexing. 
+ * Basically, map the vertex buffers (and drawing surfaces), then hand off + * the drawing to the 'draw' module. + * + * XXX should the element buffer be specified/bound with a separate function? + */ +static boolean +cell_draw_range_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned min_index, + unsigned max_index, + unsigned mode, unsigned start, unsigned count) +{ + struct cell_context *sp = cell_context(pipe); + struct draw_context *draw = sp->draw; + unsigned i; + + if (sp->dirty) + cell_update_derived( sp ); + +#if 0 + cell_map_surfaces(sp); +#endif + cell_map_constant_buffers(sp); + + /* + * Map vertex buffers + */ + for (i = 0; i < sp->num_vertex_buffers; i++) { + void *buf = pipe_buffer_map(pipe->screen, + sp->vertex_buffer[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + cell_flush_buffer_range(sp, buf, sp->vertex_buffer[i].buffer->size); + draw_set_mapped_vertex_buffer(draw, i, buf); + } + /* Map index buffer, if present */ + if (indexBuffer) { + void *mapped_indexes = pipe_buffer_map(pipe->screen, + indexBuffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes); + } + else { + /* no index/element buffer */ + draw_set_mapped_element_buffer(draw, 0, NULL); + } + + + /* draw! */ + draw_arrays(draw, mode, start, count); + + /* + * unmap vertex/index buffers - will cause draw module to flush + */ + for (i = 0; i < sp->num_vertex_buffers; i++) { + draw_set_mapped_vertex_buffer(draw, i, NULL); + pipe_buffer_unmap(pipe->screen, sp->vertex_buffer[i].buffer); + } + if (indexBuffer) { + draw_set_mapped_element_buffer(draw, 0, NULL); + pipe_buffer_unmap(pipe->screen, indexBuffer); + } + + /* Note: leave drawing surfaces mapped */ + cell_unmap_constant_buffers(sp); + + return TRUE; +} + + +static boolean +cell_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned mode, unsigned start, unsigned count) +{ + return cell_draw_range_elements( pipe, indexBuffer, + indexSize, + 0, 0xffffffff, + mode, start, count ); +} + + +static boolean +cell_draw_arrays(struct pipe_context *pipe, unsigned mode, + unsigned start, unsigned count) +{ + return cell_draw_elements(pipe, NULL, 0, mode, start, count); +} + + +static void +cell_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags) +{ + struct cell_context *cell = cell_context(pipe); + draw_set_edgeflags(cell->draw, edgeflags); +} + + + +void +cell_init_draw_functions(struct cell_context *cell) +{ + cell->pipe.draw_arrays = cell_draw_arrays; + cell->pipe.draw_elements = cell_draw_elements; + cell->pipe.draw_range_elements = cell_draw_range_elements; + cell->pipe.set_edgeflags = cell_set_edgeflags; +} + diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.h b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h new file mode 100644 index 0000000000..148873aa67 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h @@ -0,0 +1,36 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef CELL_DRAW_ARRAYS_H +#define CELL_DRAW_ARRAYS_H + + +extern void +cell_init_draw_functions(struct cell_context *cell); + + +#endif /* CELL_DRAW_ARRAYS_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_fence.c b/src/gallium/drivers/cell/ppu/cell_fence.c new file mode 100644 index 0000000000..867b5dcaa0 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_fence.c @@ -0,0 +1,168 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include <unistd.h> +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_fence.h" +#include "cell_texture.h" + + +void +cell_fence_init(struct cell_fence *fence) +{ + uint i; + ASSERT_ALIGN16(fence->status); + for (i = 0; i < CELL_MAX_SPUS; i++) { + fence->status[i][0] = CELL_FENCE_IDLE; + } +} + + +boolean +cell_fence_signalled(const struct cell_context *cell, + const struct cell_fence *fence) +{ + uint i; + for (i = 0; i < cell->num_spus; i++) { + if (fence->status[i][0] != CELL_FENCE_SIGNALLED) + return FALSE; + /*assert(fence->status[i][0] == CELL_FENCE_EMITTED);*/ + } + return TRUE; +} + + +void +cell_fence_finish(const struct cell_context *cell, + const struct cell_fence *fence) +{ + while (!cell_fence_signalled(cell, fence)) { + usleep(10); + } + +#ifdef DEBUG + { + uint i; + for (i = 0; i < cell->num_spus; i++) { + assert(fence->status[i][0] == CELL_FENCE_SIGNALLED); + } + } +#endif +} + + + + +struct cell_buffer_node +{ + struct pipe_buffer *buffer; + struct cell_buffer_node *next; +}; + + +static void +cell_add_buffer_to_list(struct cell_context *cell, + struct cell_buffer_list *list, + struct pipe_buffer *buffer) +{ + struct pipe_screen *ps = cell->pipe.screen; + struct cell_buffer_node *node = CALLOC_STRUCT(cell_buffer_node); + /* create new list node which references the buffer, insert at head */ + if (node) { + pipe_buffer_reference(ps, &node->buffer, buffer); + node->next = list->head; + list->head = node; + } +} + + +/** + * Wait for completion of the given fence, then unreference any buffers + * on the list. + * This typically unrefs/frees texture buffers after any rendering which uses + * them has completed. + */ +void +cell_free_fenced_buffers(struct cell_context *cell, + struct cell_buffer_list *list) +{ + if (list->head) { + struct pipe_screen *ps = cell->pipe.screen; + struct cell_buffer_node *node; + + cell_fence_finish(cell, &list->fence); + + /* traverse the list, unreferencing buffers, freeing nodes */ + node = list->head; + while (node) { + struct cell_buffer_node *next = node->next; + assert(node->buffer); + pipe_buffer_unmap(ps, node->buffer); +#if 0 + printf("Unref buffer %p\n", node->buffer); + if (node->buffer->refcount == 1) + printf(" Delete!\n"); +#endif + pipe_buffer_reference(ps, &node->buffer, NULL); + FREE(node); + node = next; + } + list->head = NULL; + } +} + + +/** + * This should be called for each render command. + * Any texture buffers that are current bound will be added to a fenced + * list to be freed later when the fence is executed/signalled. 
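+ * All mipmap levels of each currently bound texture (the per-level
+ * ct->tiled_buffer[] buffers) are appended to the list belonging to the
+ * current batch buffer.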
+ */ +void +cell_add_fenced_textures(struct cell_context *cell) +{ + struct cell_buffer_list *list = &cell->fenced_buffers[cell->cur_batch]; + uint i; + + for (i = 0; i < cell->num_textures; i++) { + struct cell_texture *ct = cell->texture[i]; + if (ct) { + uint level; + for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { + if (ct->tiled_buffer[level]) { +#if 0 + printf("Adding texture %p buffer %p to list\n", + ct, ct->tiled_buffer[level]); +#endif + cell_add_buffer_to_list(cell, list, ct->tiled_buffer[level]); + } + } + } + } +} diff --git a/src/gallium/drivers/cell/ppu/cell_fence.h b/src/gallium/drivers/cell/ppu/cell_fence.h new file mode 100644 index 0000000000..536b4ba411 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_fence.h @@ -0,0 +1,57 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_FENCE_H +#define CELL_FENCE_H + + +extern void +cell_fence_init(struct cell_fence *fence); + + +extern boolean +cell_fence_signalled(const struct cell_context *cell, + const struct cell_fence *fence); + + +extern void +cell_fence_finish(const struct cell_context *cell, + const struct cell_fence *fence); + + + +extern void +cell_free_fenced_buffers(struct cell_context *cell, + struct cell_buffer_list *list); + + +extern void +cell_add_fenced_textures(struct cell_context *cell); + + +#endif /* CELL_FENCE_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c new file mode 100644 index 0000000000..a64967b4b9 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_flush.c @@ -0,0 +1,111 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_flush.h" +#include "cell_spu.h" +#include "cell_render.h" +#include "draw/draw_context.h" + + +/** + * Called via pipe->flush() + */ +void +cell_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence) +{ + struct cell_context *cell = cell_context(pipe); + + if (fence) { + *fence = NULL; + /* XXX: Implement real fencing */ + flags |= CELL_FLUSH_WAIT; + } + + if (flags & (PIPE_FLUSH_SWAPBUFFERS | PIPE_FLUSH_RENDER_CACHE)) + flags |= CELL_FLUSH_WAIT; + + draw_flush( cell->draw ); + cell_flush_int(cell, flags); +} + + +/** + * Cell internal flush function. Send the current batch buffer to all SPUs. + * If flags & CELL_FLUSH_WAIT, do not return until the SPUs are idle. + * \param flags bitmask of flags CELL_FLUSH_WAIT, or zero + */ +void +cell_flush_int(struct cell_context *cell, unsigned flags) +{ + static boolean flushing = FALSE; /* recursion catcher */ + uint i; + + ASSERT(!flushing); + flushing = TRUE; + + if (flags & CELL_FLUSH_WAIT) { + uint64_t *cmd = (uint64_t *) cell_batch_alloc(cell, sizeof(uint64_t)); + *cmd = CELL_CMD_FINISH; + } + + cell_batch_flush(cell); + +#if 0 + /* Send CMD_FINISH to all SPUs */ + for (i = 0; i < cell->num_spus; i++) { + send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_FINISH); + } +#endif + + if (flags & CELL_FLUSH_WAIT) { + /* Wait for ack */ + for (i = 0; i < cell->num_spus; i++) { + uint k = wait_mbox_message(cell_global.spe_contexts[i]); + assert(k == CELL_CMD_FINISH); + } + } + + flushing = FALSE; +} + + +void +cell_flush_buffer_range(struct cell_context *cell, void *ptr, + unsigned size) +{ + uint64_t batch[1 + (ROUNDUP8(sizeof(struct cell_buffer_range)) / 8)]; + struct cell_buffer_range *br = (struct cell_buffer_range *) & batch[1]; + + batch[0] = CELL_CMD_FLUSH_BUFFER_RANGE; + br->base = (uintptr_t) ptr; + br->size = size; + cell_batch_append(cell, batch, sizeof(batch)); +} diff --git a/src/gallium/drivers/cell/ppu/cell_flush.h b/src/gallium/drivers/cell/ppu/cell_flush.h new file mode 100644 index 0000000000..509ae6239a --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_flush.h @@ -0,0 +1,45 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. 
+ * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_FLUSH +#define CELL_FLUSH + +#define CELL_FLUSH_WAIT 0x80000000 + +extern void +cell_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence); + +extern void +cell_flush_int(struct cell_context *cell, unsigned flags); + +extern void +cell_flush_buffer_range(struct cell_context *cell, void *ptr, + unsigned size); + +#endif diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c new file mode 100644 index 0000000000..96a1743fc1 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c @@ -0,0 +1,2069 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +/** + * Generate SPU fragment program/shader code. + * + * Note that we generate SOA-style code here. So each TGSI instruction + * operates on four pixels (and is translated into four SPU instructions, + * generally speaking). 
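+ * For example, "ADD dst.xyz, src0, src1" is emitted roughly as
+ *    fa  d_x, s0_x, s1_x
+ *    fa  d_y, s0_y, s1_y
+ *    fa  d_z, s0_z, s1_z
+ * where each SPU register holds one channel's values for the four
+ * fragments being processed (see emit_ADD below).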
+ * + * \author Brian Paul + */ + +#include <math.h> +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_dump.h" +#include "rtasm/rtasm_ppc_spe.h" +#include "util/u_memory.h" +#include "cell_context.h" +#include "cell_gen_fp.h" + + +#define MAX_TEMPS 16 +#define MAX_IMMED 8 + +#define CHAN_X 0 +#define CHAN_Y 1 +#define CHAN_Z 2 +#define CHAN_W 3 + +/** + * Context needed during code generation. + */ +struct codegen +{ + struct cell_context *cell; + int inputs_reg; /**< 1st function parameter */ + int outputs_reg; /**< 2nd function parameter */ + int constants_reg; /**< 3rd function parameter */ + int temp_regs[MAX_TEMPS][4]; /**< maps TGSI temps to SPE registers */ + int imm_regs[MAX_IMMED][4]; /**< maps TGSI immediates to SPE registers */ + + int num_imm; /**< number of immediates */ + + int one_reg; /**< register containing {1.0, 1.0, 1.0, 1.0} */ + + /** Per-instruction temps / intermediate temps */ + int num_itemps; + int itemps[12]; + + /** Current IF/ELSE/ENDIF nesting level */ + int if_nesting; + /** Index of execution mask register */ + int exec_mask_reg; + + /** KIL mask: indicates which fragments have been killed */ + int kill_mask_reg; + + int frame_size; /**< Stack frame size, in words */ + + struct spe_function *f; + boolean error; +}; + + +/** + * Allocate an intermediate temporary register. + */ +static int +get_itemp(struct codegen *gen) +{ + int t = spe_allocate_available_register(gen->f); + assert(gen->num_itemps < Elements(gen->itemps)); + gen->itemps[gen->num_itemps++] = t; + return t; +} + +/** + * Free all intermediate temporary registers. To be called after each + * instruction has been emitted. + */ +static void +free_itemps(struct codegen *gen) +{ + int i; + for (i = 0; i < gen->num_itemps; i++) { + spe_release_register(gen->f, gen->itemps[i]); + } + gen->num_itemps = 0; +} + + +/** + * Return index of an SPE register containing {1.0, 1.0, 1.0, 1.0}. + * The register is allocated and initialized upon the first call. + */ +static int +get_const_one_reg(struct codegen *gen) +{ + if (gen->one_reg <= 0) { + gen->one_reg = spe_allocate_available_register(gen->f); + + spe_indent(gen->f, 4); + spe_comment(gen->f, -4, "INIT CONSTANT 1.0:"); + + /* one = {1.0, 1.0, 1.0, 1.0} */ + spe_load_float(gen->f, gen->one_reg, 1.0f); + + spe_indent(gen->f, -4); + } + + return gen->one_reg; +} + + +/** + * Return index of the pixel execution mask. + * The register is allocated an initialized upon the first call. + * + * The pixel execution mask controls which pixels in a quad are + * modified, according to surrounding conditionals, loops, etc. 
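+ * When if_nesting > 0, store_dest_reg() uses this mask with spe_selb()
+ * to merge newly computed values with the previous register or memory
+ * contents.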
+ */ +static int +get_exec_mask_reg(struct codegen *gen) +{ + if (gen->exec_mask_reg <= 0) { + gen->exec_mask_reg = spe_allocate_available_register(gen->f); + + spe_indent(gen->f, 4); + spe_comment(gen->f, -4, "INIT EXEC MASK = ~0:"); + + /* exec_mask = {~0, ~0, ~0, ~0} */ + spe_load_int(gen->f, gen->exec_mask_reg, ~0); + + spe_indent(gen->f, -4); + } + + return gen->exec_mask_reg; +} + + +static boolean +is_register_src(struct codegen *gen, int channel, + const struct tgsi_full_src_register *src) +{ + int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel); + int sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel); + + if (swizzle > TGSI_SWIZZLE_W || sign_op != TGSI_UTIL_SIGN_KEEP) { + return FALSE; + } + if (src->SrcRegister.File == TGSI_FILE_TEMPORARY || + src->SrcRegister.File == TGSI_FILE_IMMEDIATE) { + return TRUE; + } + return FALSE; +} + + +static boolean +is_memory_dst(struct codegen *gen, int channel, + const struct tgsi_full_dst_register *dst) +{ + if (dst->DstRegister.File == TGSI_FILE_OUTPUT) { + return TRUE; + } + else { + return FALSE; + } +} + + +/** + * Return the index of the SPU temporary containing the named TGSI + * source register. If the TGSI register is a TGSI_FILE_TEMPORARY we + * just return the corresponding SPE register. If the TGIS register + * is TGSI_FILE_INPUT/CONSTANT/IMMEDIATE we allocate a new SPE register + * and emit an SPE load instruction. + */ +static int +get_src_reg(struct codegen *gen, + int channel, + const struct tgsi_full_src_register *src) +{ + int reg = -1; + int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel); + boolean reg_is_itemp = FALSE; + uint sign_op; + + assert(swizzle >= TGSI_SWIZZLE_X); + assert(swizzle <= TGSI_EXTSWIZZLE_ONE); + + if (swizzle == TGSI_EXTSWIZZLE_ONE) { + /* Load const one float and early out */ + reg = get_const_one_reg(gen); + } + else if (swizzle == TGSI_EXTSWIZZLE_ZERO) { + /* Load const zero float and early out */ + reg = get_itemp(gen); + spe_xor(gen->f, reg, reg, reg); + } + else { + assert(swizzle < 4); + + switch (src->SrcRegister.File) { + case TGSI_FILE_TEMPORARY: + reg = gen->temp_regs[src->SrcRegister.Index][swizzle]; + break; + case TGSI_FILE_INPUT: + { + /* offset is measured in quadwords, not bytes */ + int offset = src->SrcRegister.Index * 4 + swizzle; + reg = get_itemp(gen); + reg_is_itemp = TRUE; + /* Load: reg = memory[(machine_reg) + offset] */ + spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16); + } + break; + case TGSI_FILE_IMMEDIATE: + reg = gen->imm_regs[src->SrcRegister.Index][swizzle]; + break; + case TGSI_FILE_CONSTANT: + { + /* offset is measured in quadwords, not bytes */ + int offset = src->SrcRegister.Index * 4 + swizzle; + reg = get_itemp(gen); + reg_is_itemp = TRUE; + /* Load: reg = memory[(machine_reg) + offset] */ + spe_lqd(gen->f, reg, gen->constants_reg, offset * 16); + } + break; + default: + assert(0); + } + } + + /* + * Handle absolute value, negate or set-negative of src register. + */ + sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel); + if (sign_op != TGSI_UTIL_SIGN_KEEP) { + /* + * All sign ops are done by manipulating bit 31, the IEEE float sign bit. 
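+       * andc clears the bit (absolute value) and xor toggles it
+       * (negation), so no floating-point arithmetic is needed.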
+ */ + const int bit31mask_reg = get_itemp(gen); + int result_reg; + + if (reg_is_itemp) { + /* re-use 'reg' for the result */ + result_reg = reg; + } + else { + /* alloc a new reg for the result */ + result_reg = get_itemp(gen); + } + + /* mask with bit 31 set, the rest cleared */ + spe_load_uint(gen->f, bit31mask_reg, (1 << 31)); + + if (sign_op == TGSI_UTIL_SIGN_CLEAR) { + spe_andc(gen->f, result_reg, reg, bit31mask_reg); + } + else if (sign_op == TGSI_UTIL_SIGN_SET) { + spe_and(gen->f, result_reg, reg, bit31mask_reg); + } + else { + assert(sign_op == TGSI_UTIL_SIGN_TOGGLE); + spe_xor(gen->f, result_reg, reg, bit31mask_reg); + } + + reg = result_reg; + } + + return reg; +} + + +/** + * Return the index of an SPE register to use for the given TGSI register. + * If the TGSI register is TGSI_FILE_TEMPORARAY, the index of the + * corresponding SPE register is returned. If the TGSI register is + * TGSI_FILE_OUTPUT we allocate an intermediate temporary register. + * See store_dest_reg() below... + */ +static int +get_dst_reg(struct codegen *gen, + int channel, + const struct tgsi_full_dst_register *dest) +{ + int reg = -1; + + switch (dest->DstRegister.File) { + case TGSI_FILE_TEMPORARY: + if (gen->if_nesting > 0) + reg = get_itemp(gen); + else + reg = gen->temp_regs[dest->DstRegister.Index][channel]; + break; + case TGSI_FILE_OUTPUT: + reg = get_itemp(gen); + break; + default: + assert(0); + } + + return reg; +} + + +/** + * When a TGSI instruction is writing to an output register, this + * function emits the SPE store instruction to store the value_reg. + * \param value_reg the SPE register containing the value to store. + * This would have been returned by get_dst_reg(). + */ +static void +store_dest_reg(struct codegen *gen, + int value_reg, int channel, + const struct tgsi_full_dst_register *dest) +{ + /* + * XXX need to implement dst reg clamping/saturation + */ +#if 0 + switch (inst->Instruction.Saturate) { + case TGSI_SAT_NONE: + break; + case TGSI_SAT_ZERO_ONE: + break; + case TGSI_SAT_MINUS_PLUS_ONE: + break; + default: + assert( 0 ); + } +#endif + + switch (dest->DstRegister.File) { + case TGSI_FILE_TEMPORARY: + if (gen->if_nesting > 0) { + int d_reg = gen->temp_regs[dest->DstRegister.Index][channel]; + int exec_reg = get_exec_mask_reg(gen); + /* Mix d with new value according to exec mask: + * d[i] = mask_reg[i] ? value_reg : d_reg + */ + spe_selb(gen->f, d_reg, d_reg, value_reg, exec_reg); + } + else { + /* we're not inside a condition or loop: do nothing special */ + + } + break; + case TGSI_FILE_OUTPUT: + { + /* offset is measured in quadwords, not bytes */ + int offset = dest->DstRegister.Index * 4 + channel; + if (gen->if_nesting > 0) { + int exec_reg = get_exec_mask_reg(gen); + int curval_reg = get_itemp(gen); + /* First read the current value from memory: + * Load: curval = memory[(machine_reg) + offset] + */ + spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16); + /* Mix curval with newvalue according to exec mask: + * d[i] = mask_reg[i] ? 
value_reg : d_reg + */ + spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg); + /* Store: memory[(machine_reg) + offset] = curval */ + spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16); + } + else { + /* Store: memory[(machine_reg) + offset] = reg */ + spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16); + } + } + break; + default: + assert(0); + } +} + + + +static void +emit_prologue(struct codegen *gen) +{ + gen->frame_size = 1024; /* XXX temporary, should be dynamic */ + + spe_comment(gen->f, -4, "Function prologue:"); + + /* save $lr on stack # stqd $lr,16($sp) */ + spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); + + if (gen->frame_size >= 512) { + /* offset is too large for ai instruction */ + int offset_reg = spe_allocate_available_register(gen->f); + int sp_reg = spe_allocate_available_register(gen->f); + /* offset = -framesize */ + spe_load_int(gen->f, offset_reg, -gen->frame_size); + /* sp = $sp */ + spe_move(gen->f, sp_reg, SPE_REG_SP); + /* $sp = $sp + offset_reg */ + spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg); + /* save $sp in stack frame */ + spe_stqd(gen->f, sp_reg, SPE_REG_SP, 0); + /* clean up */ + spe_release_register(gen->f, offset_reg); + spe_release_register(gen->f, sp_reg); + } + else { + /* save stack pointer # stqd $sp,-frameSize($sp) */ + spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size); + + /* adjust stack pointer # ai $sp,$sp,-frameSize */ + spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size); + } +} + + +static void +emit_epilogue(struct codegen *gen) +{ + const int return_reg = 3; + + spe_comment(gen->f, -4, "Function epilogue:"); + + spe_comment(gen->f, 0, "return the killed mask"); + if (gen->kill_mask_reg > 0) { + /* shader called KIL, return the "alive" mask */ + spe_move(gen->f, return_reg, gen->kill_mask_reg); + } + else { + /* return {0,0,0,0} */ + spe_load_uint(gen->f, return_reg, 0); + } + + spe_comment(gen->f, 0, "restore stack and return"); + if (gen->frame_size >= 512) { + /* offset is too large for ai instruction */ + int offset_reg = spe_allocate_available_register(gen->f); + /* offset = framesize */ + spe_load_int(gen->f, offset_reg, gen->frame_size); + /* $sp = $sp + offset */ + spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg); + /* clean up */ + spe_release_register(gen->f, offset_reg); + } + else { + /* restore stack pointer # ai $sp,$sp,frameSize */ + spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size); + } + + /* restore $lr # lqd $lr,16($sp) */ + spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16); + + /* return from function call */ + spe_bi(gen->f, SPE_REG_RA, 0, 0); +} + + +static boolean +emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, src_reg[4], dst_reg[4]; + + spe_comment(gen->f, -4, "MOV:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + } + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + if (is_register_src(gen, ch, &inst->FullSrcRegisters[0]) && + is_memory_dst(gen, ch, &inst->FullDstRegisters[0])) { + /* special-case: register to memory store */ + store_dest_reg(gen, src_reg[ch], ch, &inst->FullDstRegisters[0]); + } + else { + spe_move(gen->f, dst_reg[ch], src_reg[ch]); + store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]); + } + free_itemps(gen); + } + } + return true; +} + +/** + * Emit addition 
instructions. Recall that a single TGSI_OPCODE_ADD + * becomes (up to) four SPU "fa" instructions because we're doing SOA + * processing. + */ +static boolean +emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], s2_reg[4], d_reg[4]; + + spe_comment(gen->f, -4, "ADD:"); + /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */ + for (ch = 0; ch < 4; ch++) { + /* If the dest R, G, B or A writemask is enabled... */ + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + } + /* Loop over Red/Green/Blue/Alpha channels, do the add, store results */ + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + /* Emit actual SPE instruction: d = s1 + s2 */ + spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */ + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + /* Free any intermediate temps we allocated */ + free_itemps(gen); + } + } + return true; +} + +/** + * Emit subtract. See emit_ADD for comments. + */ +static boolean +emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], s2_reg[4], d_reg[4]; + spe_comment(gen->f, -4, "SUB:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + } + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + /* d = s1 - s2 */ + spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + +/** + * Emit multiply add. See emit_ADD for comments. + */ +static boolean +emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4]; + spe_comment(gen->f, -4, "MAD:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + } + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + /* d = s1 * s2 + s3 */ + spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]); + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + + +/** + * Emit linear interpolate. See emit_ADD for comments. 
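+ * Computes d = s3 + s1 * (s2 - s3) per enabled channel, roughly:
+ *    fs   tmp, s2, s3
+ *    fma  d,   tmp, s1, s3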
+ */ +static boolean +emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4], tmp_reg[4]; + spe_comment(gen->f, -4, "LERP:"); + /* setup/get src/dst/temp regs */ + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + } + + /* d = s3 + s1(s2 - s3) */ + /* do all subtracts, then all fma, then all stores to better pipeline */ + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + spe_fs(gen->f, tmp_reg[ch], s2_reg[ch], s3_reg[ch]); + } + } + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + spe_fma(gen->f, d_reg[ch], tmp_reg[ch], s1_reg[ch], s3_reg[ch]); + } + } + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + } + free_itemps(gen); + return true; +} + +/** + * Emit multiply. See emit_ADD for comments. + */ +static boolean +emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s1_reg[4], s2_reg[4], d_reg[4]; + spe_comment(gen->f, -4, "MUL:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + } + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + /* d = s1 * s2 */ + spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]); + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + +/** + * Emit reciprocal. See emit_ADD for comments. + */ +static boolean +emit_RCP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "RCP:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* d = 1/s1 */ + spe_frest(gen->f, d_reg, s1_reg); + spe_fi(gen->f, d_reg, s1_reg, d_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + +/** + * Emit reciprocal sqrt. See emit_ADD for comments. + */ +static boolean +emit_RSQ(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "RSQ:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* d = 1/s1 */ + spe_frsqest(gen->f, d_reg, s1_reg); + spe_fi(gen->f, d_reg, s1_reg, d_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + +/** + * Emit absolute value. See emit_ADD for comments. 
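+ *
+ * Reference only -- a scalar C model of the per-channel operation (names
+ * illustrative; the generated code handles four pixels per register):
+ *
+ *   static float abs_channel(float s1)
+ *   {
+ *      union { float f; unsigned int u; } v;
+ *      v.f = s1;
+ *      v.u &= ~(1u << 31);       // spe_andc with a bit-31 mask clears the sign
+ *      return v.f;
+ *   }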
+ */ +static boolean +emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + spe_comment(gen->f, -4, "ABS:"); + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + const int bit31mask_reg = get_itemp(gen); + + /* mask with bit 31 set, the rest cleared */ + spe_load_uint(gen->f, bit31mask_reg, (1 << 31)); + + /* d = sign bit cleared in s1 */ + spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + return true; +} + +/** + * Emit 3 component dot product. See emit_ADD for comments. + */ +static boolean +emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + int s1x_reg, s1y_reg, s1z_reg; + int s2x_reg, s2y_reg, s2z_reg; + int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); + + spe_comment(gen->f, -4, "DP3:"); + + s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s2x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + + /* t0 = x0 * x1 */ + spe_fm(gen->f, t0_reg, s1x_reg, s2x_reg); + + /* t1 = y0 * y1 */ + spe_fm(gen->f, t1_reg, s1y_reg, s2y_reg); + + /* t0 = z0 * z1 + t0 */ + spe_fma(gen->f, t0_reg, s1z_reg, s2z_reg, t0_reg); + + /* t0 = t0 + t1 */ + spe_fa(gen->f, t0_reg, t0_reg, t1_reg); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + spe_move(gen->f, d_reg, t0_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + } + } + + free_itemps(gen); + return true; +} + +/** + * Emit 4 component dot product. See emit_ADD for comments. 
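+ *
+ * Reference only -- an equivalent scalar C function (illustrative names):
+ *
+ *   static float dp4(const float a[4], const float b[4])
+ *   {
+ *      float t0 = a[0]*b[0] + a[2]*b[2];   // spe_fm then spe_fma
+ *      float t1 = a[1]*b[1] + a[3]*b[3];   // spe_fm then spe_fma
+ *      return t0 + t1;   // spe_fa; two partial sums keep the FMAs independent
+ *   }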
+ */ +static boolean +emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + int s0x_reg, s0y_reg, s0z_reg, s0w_reg; + int s1x_reg, s1y_reg, s1z_reg, s1w_reg; + int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); + + spe_comment(gen->f, -4, "DP4:"); + + s0x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + s0y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + s0z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + s0w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]); + s1w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]); + + /* t0 = x0 * x1 */ + spe_fm(gen->f, t0_reg, s0x_reg, s1x_reg); + + /* t1 = y0 * y1 */ + spe_fm(gen->f, t1_reg, s0y_reg, s1y_reg); + + /* t0 = z0 * z1 + t0 */ + spe_fma(gen->f, t0_reg, s0z_reg, s1z_reg, t0_reg); + + /* t1 = w0 * w1 + t1 */ + spe_fma(gen->f, t1_reg, s0w_reg, s1w_reg, t1_reg); + + /* t0 = t0 + t1 */ + spe_fa(gen->f, t0_reg, t0_reg, t1_reg); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + spe_move(gen->f, d_reg, t0_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + } + } + + free_itemps(gen); + return true; +} + +/** + * Emit homogeneous dot product. See emit_ADD for comments. + */ +static boolean +emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + /* XXX rewrite this function to look more like DP3/DP4 */ + int ch; + spe_comment(gen->f, -4, "DPH:"); + + int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + int tmp_reg = get_itemp(gen); + + /* t = x0 * x1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + /* t = y0 * y1 + t */ + spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* t = z0 * z1 + t */ + spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]); + /* t = w1 + t */ + spe_fa(gen->f, tmp_reg, s2_reg, tmp_reg); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + spe_move(gen->f, d_reg, tmp_reg); + store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]); + } + } + + free_itemps(gen); + return true; +} + +/** + * Emit 3-component vector normalize. 
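+ *
+ * Reference only -- a scalar C sketch of the computation (illustrative names;
+ * sqrtf() stands in for the frsqest/fi estimate-and-refine pair used below):
+ *
+ *   static void nrm3(float v[3])
+ *   {
+ *      float len2 = v[0]*v[0] + v[1]*v[1] + v[2]*v[2];
+ *      float inv  = 1.0f / sqrtf(len2);
+ *      v[0] *= inv;  v[1] *= inv;  v[2] *= inv;   // W channel is not written
+ *   }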
+ */ +static boolean +emit_NRM3(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + int src_reg[3]; + int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen); + + spe_comment(gen->f, -4, "NRM3:"); + + src_reg[0] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + src_reg[1] = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + src_reg[2] = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + + /* t0 = x * x */ + spe_fm(gen->f, t0_reg, src_reg[0], src_reg[0]); + + /* t1 = y * y */ + spe_fm(gen->f, t1_reg, src_reg[1], src_reg[1]); + + /* t0 = z * z + t0 */ + spe_fma(gen->f, t0_reg, src_reg[2], src_reg[2], t0_reg); + + /* t0 = t0 + t1 */ + spe_fa(gen->f, t0_reg, t0_reg, t1_reg); + + /* t1 = 1.0 / sqrt(t0) */ + spe_frsqest(gen->f, t1_reg, t0_reg); + spe_fi(gen->f, t1_reg, t0_reg, t1_reg); + + for (ch = 0; ch < 3; ch++) { /* NOTE: omit W channel */ + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + /* dst = src[ch] * t1 */ + spe_fm(gen->f, d_reg, src_reg[ch], t1_reg); + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + } + } + + free_itemps(gen); + return true; +} + + +/** + * Emit cross product. See emit_ADD for comments. + */ +static boolean +emit_XPD(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + spe_comment(gen->f, -4, "XPD:"); + + int s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + int tmp_reg = get_itemp(gen); + + /* t = z0 * y1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* t = y0 * z1 - t */ + spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_X)) { + store_dest_reg(gen, tmp_reg, CHAN_X, &inst->FullDstRegisters[0]); + } + + s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]); + /* t = x0 * z1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + /* t = z0 * x1 - t */ + spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Y)) { + store_dest_reg(gen, tmp_reg, CHAN_Y, &inst->FullDstRegisters[0]); + } + + s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]); + /* t = y0 * x1 */ + spe_fm(gen->f, tmp_reg, s1_reg, s2_reg); + + s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]); + s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]); + /* t = x0 * y1 - t */ + spe_fms(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg); + + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << CHAN_Z)) { + store_dest_reg(gen, tmp_reg, CHAN_Z, &inst->FullDstRegisters[0]); + } + + free_itemps(gen); + return true; +} + +/** + * Emit set-if-greater-than. + * Note that the SPE fcgt instruction produces 0x0 and 0xffffffff as + * the result but OpenGL/TGSI needs 0.0 and 1.0 results. + * We can easily convert 0x0/0xffffffff to 0.0/1.0 with a bitwise AND. 
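+ *
+ * Reference only -- why the AND works, in scalar C terms (illustrative):
+ *
+ *   unsigned int mask = (s1 > s2) ? 0xffffffffu : 0x0u;   // spe_fcgt
+ *   unsigned int one  = 0x3f800000u;                      // bit pattern of 1.0f
+ *   unsigned int d    = mask & one;     // 0x3f800000 (1.0f) or 0x0 (0.0f)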
+ */ +static boolean +emit_SGT(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SGT:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 > s2) */ + spe_fcgt(gen->f, d_reg, s1_reg, s2_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = d & one_reg */ + spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit set-if_less-then. See emit_SGT for comments. + */ +static boolean +emit_SLT(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SLT:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 < s2) */ + spe_fcgt(gen->f, d_reg, s2_reg, s1_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = d & one_reg */ + spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit set-if_greater-then-or-equal. See emit_SGT for comments. + */ +static boolean +emit_SGE(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SGE:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 >= s2) */ + spe_fcgt(gen->f, d_reg, s2_reg, s1_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = ~d & one_reg */ + spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit set-if_less-then-or-equal. See emit_SGT for comments. + */ +static boolean +emit_SLE(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SLE:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 <= s2) */ + spe_fcgt(gen->f, d_reg, s1_reg, s2_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = ~d & one_reg */ + spe_andc(gen->f, d_reg, get_const_one_reg(gen), d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit set-if_equal. See emit_SGT for comments. 
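+ *
+ * Reference only (illustrative scalar C; bits_to_float() is a hypothetical
+ * helper standing in for reinterpreting the register bits as floats):
+ *
+ *   unsigned int mask = (s1 == s2) ? 0xffffffffu : 0x0u;   // spe_fceq
+ *   float d = bits_to_float(mask & 0x3f800000u);           // 1.0f or 0.0f
+ *
+ * emit_SNE below is the same except the fceq result is complemented with a
+ * nor before the AND.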
+ */ +static boolean +emit_SEQ(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SEQ:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 == s2) */ + spe_fceq(gen->f, d_reg, s1_reg, s2_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = d & one_reg */ + spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit set-if_not_equal. See emit_SGT for comments. + */ +static boolean +emit_SNE(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "SNE:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* d = (s1 != s2) */ + spe_fceq(gen->f, d_reg, s1_reg, s2_reg); + spe_nor(gen->f, d_reg, d_reg, d_reg); + + /* convert d from 0x0/0xffffffff to 0.0/1.0 */ + /* d = d & one_reg */ + spe_and(gen->f, d_reg, d_reg, get_const_one_reg(gen)); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit compare. See emit_SGT for comments. + */ +static boolean +emit_CMP(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "CMP:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + int zero_reg = get_itemp(gen); + + spe_xor(gen->f, zero_reg, zero_reg, zero_reg); + + /* d = (s1 < 0) ? s2 : s3 */ + spe_fcgt(gen->f, d_reg, zero_reg, s1_reg); + spe_selb(gen->f, d_reg, s3_reg, s2_reg, d_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit trunc. + * Convert float to signed int + * Convert signed int to float + */ +static boolean +emit_TRUNC(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "TRUNC:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + /* Convert float to int */ + spe_cflts(gen->f, d_reg, s1_reg, 0); + + /* Convert int to float */ + spe_csflt(gen->f, d_reg, d_reg, 0); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Emit floor. 
+ * If negative int subtract one + * Convert float to signed int + * Convert signed int to float + */ +static boolean +emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "FLR:"); + + int zero_reg = get_itemp(gen); + spe_xor(gen->f, zero_reg, zero_reg, zero_reg); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + int tmp_reg = get_itemp(gen); + + /* If negative, subtract 1.0 */ + spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg); + spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg); + spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg); + + /* Convert float to int */ + spe_cflts(gen->f, tmp_reg, tmp_reg, 0); + + /* Convert int to float */ + spe_csflt(gen->f, d_reg, tmp_reg, 0); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + +/** + * Compute frac = Input - FLR(Input) + */ +static boolean +emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + + spe_comment(gen->f, -4, "FRC:"); + + int zero_reg = get_itemp(gen); + spe_xor(gen->f, zero_reg, zero_reg, zero_reg); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + int tmp_reg = get_itemp(gen); + + /* If negative, subtract 1.0 */ + spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg); + spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg); + spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg); + + /* Convert float to int */ + spe_cflts(gen->f, tmp_reg, tmp_reg, 0); + + /* Convert int to float */ + spe_csflt(gen->f, tmp_reg, tmp_reg, 0); + + /* d = s1 - FLR(s1) */ + spe_fs(gen->f, d_reg, s1_reg, tmp_reg); + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return true; +} + + +#if 0 +static void +print_functions(struct cell_context *cell) +{ + struct cell_spu_function_info *funcs = &cell->spu_functions; + uint i; + for (i = 0; i < funcs->num; i++) { + printf("SPU func %u: %s at %u\n", + i, funcs->names[i], funcs->addrs[i]); + } +} +#endif + + +static uint +lookup_function(struct cell_context *cell, const char *funcname) +{ + const struct cell_spu_function_info *funcs = &cell->spu_functions; + uint i, addr = 0; + for (i = 0; i < funcs->num; i++) { + if (strcmp(funcs->names[i], funcname) == 0) { + addr = funcs->addrs[i]; + } + } + assert(addr && "spu function not found"); + return addr / 4; /* discard 2 least significant bits */ +} + + +/** + * Emit code to call a SPU function. + * Used to implement instructions like SIN/COS/POW/TEX/etc. + * If scalar, only the X components of the src regs are used, and the + * result is replicated across the dest register's XYZW components. 
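+ *
+ * Reference only -- the intended effect in pseudo-C, where spu_func() is a
+ * stand-in for whatever SPU function 'funcname' names:
+ *
+ *   // scalar == TRUE: call once with the X components, replicate the result
+ *   float r = spu_func(src0.x, src1.x, src2.x);   // up to num_args arguments
+ *   if (writemask & X) dst.x = r;                 // likewise for Y, Z, W
+ *
+ *   // scalar == FALSE: call once per enabled channel with that channel's args
+ *   if (writemask & X) dst.x = spu_func(src0.x, src1.x, src2.x);   // etc.
+ *
+ * Around each actual call the generated code saves the live registers to the
+ * stack frame, passes the arguments in $3 and up, and reads the result from $3.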
+ */ +static boolean +emit_function_call(struct codegen *gen, + const struct tgsi_full_instruction *inst, + char *funcname, uint num_args, boolean scalar) +{ + const uint addr = lookup_function(gen->cell, funcname); + char comment[100]; + int s_regs[3]; + int func_called = FALSE; + uint a, ch; + int retval_reg = -1; + + assert(num_args <= 3); + + snprintf(comment, sizeof(comment), "CALL %s:", funcname); + spe_comment(gen->f, -4, comment); + + if (scalar) { + for (a = 0; a < num_args; a++) { + s_regs[a] = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[a]); + } + /* we'll call the function, put the return value in this register, + * then replicate it across all write-enabled components in d_reg. + */ + retval_reg = spe_allocate_available_register(gen->f); + } + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int d_reg; + ubyte usedRegs[SPE_NUM_REGS]; + uint i, numUsed; + + if (!scalar) { + for (a = 0; a < num_args; a++) { + s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]); + } + } + + d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + if (!scalar || !func_called) { + /* for a scalar function, we'll really only call the function once */ + + numUsed = spe_get_registers_used(gen->f, usedRegs); + assert(numUsed < gen->frame_size / 16 - 2); + + /* save registers to stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + int offset = 2 + i; + spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } + + /* setup function arguments */ + for (a = 0; a < num_args; a++) { + spe_move(gen->f, 3 + a, s_regs[a]); + } + + /* branch to function, save return addr */ + spe_brasl(gen->f, SPE_REG_RA, addr); + + /* save function's return value */ + if (scalar) + spe_move(gen->f, retval_reg, 3); + else + spe_move(gen->f, d_reg, 3); + + /* restore registers from stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + if (reg != d_reg && reg != retval_reg) { + int offset = 2 + i; + spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } + } + + func_called = TRUE; + } + + if (scalar) { + spe_move(gen->f, d_reg, retval_reg); + } + + store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + if (scalar) { + spe_release_register(gen->f, retval_reg); + } + + return true; +} + + +static boolean +emit_TEX(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const uint target = inst->InstructionExtTexture.Texture; + const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index; + uint addr; + int ch; + int coord_regs[4], d_regs[4]; + + switch (target) { + case TGSI_TEXTURE_1D: + case TGSI_TEXTURE_2D: + addr = lookup_function(gen->cell, "spu_tex_2d"); + break; + case TGSI_TEXTURE_3D: + addr = lookup_function(gen->cell, "spu_tex_3d"); + break; + case TGSI_TEXTURE_CUBE: + addr = lookup_function(gen->cell, "spu_tex_cube"); + break; + default: + ASSERT(0 && "unsupported texture target"); + return FALSE; + } + + assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER); + + spe_comment(gen->f, -4, "CALL tex:"); + + /* get src/dst reg info */ + for (ch = 0; ch < 4; ch++) { + coord_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + d_regs[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + } + + { + ubyte usedRegs[SPE_NUM_REGS]; + uint i, numUsed; + + numUsed = spe_get_registers_used(gen->f, usedRegs); + assert(numUsed < gen->frame_size / 16 - 2); + + /* save registers to stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + int offset = 
2 + i; + spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } + + /* setup function arguments (XXX depends on target) */ + for (i = 0; i < 4; i++) { + spe_move(gen->f, 3 + i, coord_regs[i]); + } + spe_load_uint(gen->f, 7, unit); /* sampler unit */ + + /* branch to function, save return addr */ + spe_brasl(gen->f, SPE_REG_RA, addr); + + /* save function's return values (four pixel's colors) */ + for (i = 0; i < 4; i++) { + spe_move(gen->f, d_regs[i], 3 + i); + } + + /* restore registers from stack */ + for (i = 0; i < numUsed; i++) { + uint reg = usedRegs[i]; + if (reg != d_regs[0] && + reg != d_regs[1] && + reg != d_regs[2] && + reg != d_regs[3]) { + int offset = 2 + i; + spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset); + } + } + } + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]); + free_itemps(gen); + } + } + + return TRUE; +} + + +/** + * KILL if any of src reg values are less than zero. + */ +static boolean +emit_KIL(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch; + int s_regs[4], kil_reg = -1, cmp_reg, zero_reg; + + spe_comment(gen->f, -4, "CALL kil:"); + + /* zero = {0,0,0,0} */ + zero_reg = get_itemp(gen); + spe_load_uint(gen->f, zero_reg, 0); + + cmp_reg = get_itemp(gen); + + /* get src regs */ + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + s_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + } + } + + /* test if any src regs are < 0 */ + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + if (kil_reg >= 0) { + /* cmp = 0 > src ? : ~0 : 0 */ + spe_fcgt(gen->f, cmp_reg, zero_reg, s_regs[ch]); + /* kil = kil | cmp */ + spe_or(gen->f, kil_reg, kil_reg, cmp_reg); + } + else { + kil_reg = get_itemp(gen); + /* kil = 0 > src ? : ~0 : 0 */ + spe_fcgt(gen->f, kil_reg, zero_reg, s_regs[ch]); + } + } + } + + if (gen->if_nesting) { + /* may have been a conditional kil */ + spe_and(gen->f, kil_reg, kil_reg, gen->exec_mask_reg); + } + + /* allocate the kill mask reg if needed */ + if (gen->kill_mask_reg <= 0) { + gen->kill_mask_reg = spe_allocate_available_register(gen->f); + spe_move(gen->f, gen->kill_mask_reg, kil_reg); + } + else { + spe_or(gen->f, gen->kill_mask_reg, gen->kill_mask_reg, kil_reg); + } + + free_itemps(gen); + + return TRUE; +} + + + +/** + * Emit max. See emit_SGT for comments. + */ +static boolean +emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4]; + + spe_comment(gen->f, -4, "MAX:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + } + + /* d = (s0 > s1) ? 
s0 : s1 */ + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + spe_fcgt(gen->f, tmp_reg[ch], s0_reg[ch], s1_reg[ch]); + } + } + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]); + } + } + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + } + + free_itemps(gen); + return true; +} + +/** + * Emit max. See emit_SGT for comments. + */ +static boolean +emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4]; + + spe_comment(gen->f, -4, "MIN:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]); + d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + tmp_reg[ch] = get_itemp(gen); + } + } + + /* d = (s1 > s0) ? s0 : s1 */ + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + spe_fcgt(gen->f, tmp_reg[ch], s1_reg[ch], s0_reg[ch]); + } + } + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]); + } + } + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]); + } + } + + free_itemps(gen); + return true; +} + +static boolean +emit_IF(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int channel = 0; + const int exec_reg = get_exec_mask_reg(gen); + + spe_comment(gen->f, -4, "IF:"); + + /* update execution mask with the predicate register */ + int tmp_reg = get_itemp(gen); + int s1_reg = get_src_reg(gen, channel, &inst->FullSrcRegisters[0]); + + /* tmp = (s1_reg == 0) */ + spe_ceqi(gen->f, tmp_reg, s1_reg, 0); + /* tmp = !tmp */ + spe_complement(gen->f, tmp_reg, tmp_reg); + /* exec_mask = exec_mask & tmp */ + spe_and(gen->f, exec_reg, exec_reg, tmp_reg); + + gen->if_nesting++; + + free_itemps(gen); + + return true; +} + + +static boolean +emit_ELSE(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int exec_reg = get_exec_mask_reg(gen); + + spe_comment(gen->f, -4, "ELSE:"); + + /* exec_mask = !exec_mask */ + spe_complement(gen->f, exec_reg, exec_reg); + + return true; +} + + +static boolean +emit_ENDIF(struct codegen *gen, const struct tgsi_full_instruction *inst) +{ + const int exec_reg = get_exec_mask_reg(gen); + + spe_comment(gen->f, -4, "ENDIF:"); + + /* XXX todo: pop execution mask */ + + spe_load_int(gen->f, exec_reg, ~0x0); + + gen->if_nesting--; + return true; +} + + +static boolean +emit_DDX_DDY(struct codegen *gen, const struct tgsi_full_instruction *inst, + boolean ddx) +{ + int ch; + + spe_comment(gen->f, -4, ddx ? 
"DDX:" : "DDY:"); + + for (ch = 0; ch < 4; ch++) { + if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) { + int s_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]); + int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]); + + int t1_reg = get_itemp(gen); + int t2_reg = get_itemp(gen); + + spe_splat_word(gen->f, t1_reg, s_reg, 0); /* upper-left pixel */ + if (ddx) { + spe_splat_word(gen->f, t2_reg, s_reg, 1); /* upper-right pixel */ + } + else { + spe_splat_word(gen->f, t2_reg, s_reg, 2); /* lower-left pixel */ + } + spe_fs(gen->f, d_reg, t2_reg, t1_reg); + + free_itemps(gen); + } + } + + return true; +} + + + + +/** + * Emit END instruction. + * We just return from the shader function at this point. + * + * Note that there may be more code after this that would be + * called by TGSI_OPCODE_CALL. + */ +static boolean +emit_END(struct codegen *gen) +{ + spe_comment(gen->f, -4, "END:"); + emit_epilogue(gen); + return true; +} + + +/** + * Emit code for the given instruction. Just a big switch stmt. + */ +static boolean +emit_instruction(struct codegen *gen, + const struct tgsi_full_instruction *inst) +{ + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_SWZ: + return emit_MOV(gen, inst); + case TGSI_OPCODE_MUL: + return emit_MUL(gen, inst); + case TGSI_OPCODE_ADD: + return emit_ADD(gen, inst); + case TGSI_OPCODE_SUB: + return emit_SUB(gen, inst); + case TGSI_OPCODE_MAD: + return emit_MAD(gen, inst); + case TGSI_OPCODE_LERP: + return emit_LERP(gen, inst); + case TGSI_OPCODE_DP3: + return emit_DP3(gen, inst); + case TGSI_OPCODE_DP4: + return emit_DP4(gen, inst); + case TGSI_OPCODE_DPH: + return emit_DPH(gen, inst); + case TGSI_OPCODE_NRM: + return emit_NRM3(gen, inst); + case TGSI_OPCODE_XPD: + return emit_XPD(gen, inst); + case TGSI_OPCODE_RCP: + return emit_RCP(gen, inst); + case TGSI_OPCODE_RSQ: + return emit_RSQ(gen, inst); + case TGSI_OPCODE_ABS: + return emit_ABS(gen, inst); + case TGSI_OPCODE_SGT: + return emit_SGT(gen, inst); + case TGSI_OPCODE_SLT: + return emit_SLT(gen, inst); + case TGSI_OPCODE_SGE: + return emit_SGE(gen, inst); + case TGSI_OPCODE_SLE: + return emit_SLE(gen, inst); + case TGSI_OPCODE_SEQ: + return emit_SEQ(gen, inst); + case TGSI_OPCODE_SNE: + return emit_SNE(gen, inst); + case TGSI_OPCODE_CMP: + return emit_CMP(gen, inst); + case TGSI_OPCODE_MAX: + return emit_MAX(gen, inst); + case TGSI_OPCODE_MIN: + return emit_MIN(gen, inst); + case TGSI_OPCODE_TRUNC: + return emit_TRUNC(gen, inst); + case TGSI_OPCODE_FLR: + return emit_FLR(gen, inst); + case TGSI_OPCODE_FRC: + return emit_FRC(gen, inst); + case TGSI_OPCODE_END: + return emit_END(gen); + + case TGSI_OPCODE_COS: + return emit_function_call(gen, inst, "spu_cos", 1, TRUE); + case TGSI_OPCODE_SIN: + return emit_function_call(gen, inst, "spu_sin", 1, TRUE); + case TGSI_OPCODE_POW: + return emit_function_call(gen, inst, "spu_pow", 2, TRUE); + case TGSI_OPCODE_EXPBASE2: + return emit_function_call(gen, inst, "spu_exp2", 1, TRUE); + case TGSI_OPCODE_LOGBASE2: + return emit_function_call(gen, inst, "spu_log2", 1, TRUE); + case TGSI_OPCODE_TEX: + /* fall-through for now */ + case TGSI_OPCODE_TXD: + /* fall-through for now */ + case TGSI_OPCODE_TXB: + /* fall-through for now */ + case TGSI_OPCODE_TXL: + /* fall-through for now */ + case TGSI_OPCODE_TXP: + return emit_TEX(gen, inst); + case TGSI_OPCODE_KIL: + return emit_KIL(gen, inst); + + case TGSI_OPCODE_IF: + return emit_IF(gen, inst); + case TGSI_OPCODE_ELSE: + return emit_ELSE(gen, inst); + case TGSI_OPCODE_ENDIF: 
+ return emit_ENDIF(gen, inst); + + case TGSI_OPCODE_DDX: + return emit_DDX_DDY(gen, inst, true); + case TGSI_OPCODE_DDY: + return emit_DDX_DDY(gen, inst, false); + + /* XXX lots more cases to do... */ + + default: + fprintf(stderr, "Cell: unimplemented TGSI instruction %d!\n", + inst->Instruction.Opcode); + return false; + } + + return true; +} + + + +/** + * Emit code for a TGSI immediate value (vector of four floats). + * This involves register allocation and initialization. + * XXX the initialization should be done by a "prepare" stage, not + * per quad execution! + */ +static boolean +emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed) +{ + int ch; + + assert(gen->num_imm < MAX_TEMPS); + + spe_comment(gen->f, -4, "IMMEDIATE:"); + + for (ch = 0; ch < 4; ch++) { + float val = immed->u.ImmediateFloat32[ch].Float; + + if (ch > 0 && val == immed->u.ImmediateFloat32[ch - 1].Float) { + /* re-use previous register */ + gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1]; + } + else { + int reg = spe_allocate_available_register(gen->f); + + if (reg < 0) + return false; + + /* update immediate map */ + gen->imm_regs[gen->num_imm][ch] = reg; + + /* emit initializer instruction */ + spe_load_float(gen->f, reg, val); + } + } + + gen->num_imm++; + + return true; +} + + + +/** + * Emit "code" for a TGSI declaration. + * We only care about TGSI TEMPORARY register declarations at this time. + * For each TGSI TEMPORARY we allocate four SPE registers. + */ +static boolean +emit_declaration(struct cell_context *cell, + struct codegen *gen, const struct tgsi_full_declaration *decl) +{ + int i, ch; + + switch (decl->Declaration.File) { + case TGSI_FILE_TEMPORARY: + for (i = decl->DeclarationRange.First; + i <= decl->DeclarationRange.Last; + i++) { + assert(i < MAX_TEMPS); + for (ch = 0; ch < 4; ch++) { + gen->temp_regs[i][ch] = spe_allocate_available_register(gen->f); + if (gen->temp_regs[i][ch] < 0) + return false; /* out of regs */ + } + + /* XXX if we run out of SPE registers, we need to spill + * to SPU memory. someday... + */ + + { + char buf[100]; + sprintf(buf, "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i, + gen->temp_regs[i][0], gen->temp_regs[i][1], + gen->temp_regs[i][2], gen->temp_regs[i][3]); + spe_comment(gen->f, -4, buf); + } + } + break; + default: + ; /* ignore */ + } + + return true; +} + + + +/** + * Translate TGSI shader code to SPE instructions. This is done when + * the state tracker gives us a new shader (via pipe->create_fs_state()). + * + * \param cell the rendering context (in) + * \param tokens the TGSI shader (in) + * \param f the generated function (out) + */ +boolean +cell_gen_fragment_program(struct cell_context *cell, + const struct tgsi_token *tokens, + struct spe_function *f) +{ + struct tgsi_parse_context parse; + struct codegen gen; + + memset(&gen, 0, sizeof(gen)); + gen.cell = cell; + gen.f = f; + + /* For SPE function calls: reg $3 = first param, $4 = second param, etc. 
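+ * Viewed from the SPU side the generated code is effectively called as
+ * (illustrative prototype only, not a declaration that exists anywhere):
+ *
+ *   kill_mask = shader(inputs, outputs, constants);   // args in $3, $4, $5;
+ *                                                     // result back in $3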
*/ + gen.inputs_reg = 3; /* pointer to inputs array */ + gen.outputs_reg = 4; /* pointer to outputs array */ + gen.constants_reg = 5; /* pointer to constants array */ + + spe_init_func(f, SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE); + spe_allocate_register(f, gen.inputs_reg); + spe_allocate_register(f, gen.outputs_reg); + spe_allocate_register(f, gen.constants_reg); + + if (cell->debug_flags & CELL_DEBUG_ASM) { + spe_print_code(f, true); + spe_indent(f, 8); + printf("Begin %s\n", __FUNCTION__); + tgsi_dump(tokens, 0); + } + + tgsi_parse_init(&parse, tokens); + + emit_prologue(&gen); + + while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + if (!emit_immediate(&gen, &parse.FullToken.FullImmediate)) + gen.error = true; + break; + + case TGSI_TOKEN_TYPE_DECLARATION: + if (!emit_declaration(cell, &gen, &parse.FullToken.FullDeclaration)) + gen.error = true; + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + if (!emit_instruction(&gen, &parse.FullToken.FullInstruction)) + gen.error = true; + break; + + default: + assert(0); + } + } + + if (gen.error) { + /* terminate the SPE code */ + return emit_END(&gen); + } + + if (cell->debug_flags & CELL_DEBUG_ASM) { + printf("cell_gen_fragment_program nr instructions: %d\n", f->num_inst); + printf("End %s\n", __FUNCTION__); + } + + tgsi_parse_free( &parse ); + + return !gen.error; +} diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.h b/src/gallium/drivers/cell/ppu/cell_gen_fp.h new file mode 100644 index 0000000000..99faea7046 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.h @@ -0,0 +1,42 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + + +#ifndef CELL_GEN_FP_H +#define CELL_GEN_FP_H + + + +extern boolean +cell_gen_fragment_program(struct cell_context *cell, + const struct tgsi_token *tokens, + struct spe_function *f); + + +#endif /* CELL_GEN_FP_H */ + diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c new file mode 100644 index 0000000000..2c64eb1bcc --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c @@ -0,0 +1,2101 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +/** + * Generate SPU per-fragment code (actually per-quad code). + * \author Brian Paul + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "rtasm/rtasm_ppc_spe.h" +#include "cell_context.h" +#include "cell_gen_fragment.h" + + + +/** Do extra optimizations? */ +#define OPTIMIZATIONS 1 + + +/** + * Generate SPE code to perform Z/depth testing. + * + * \param dsa Gallium depth/stencil/alpha state to gen code for + * \param f SPE function to append instruction onto. + * \param mask_reg register containing quad/pixel "alive" mask (in/out) + * \param ifragZ_reg register containing integer fragment Z values (in) + * \param ifbZ_reg register containing integer frame buffer Z values (in/out) + * \param zmask_reg register containing result of Z test/comparison (out) + * + * Returns true if the Z-buffer needs to be updated. + */ +static boolean +gen_depth_test(struct spe_function *f, + const struct pipe_depth_stencil_alpha_state *dsa, + int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg) +{ + /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_ + * quantities. This only makes a difference for 32-bit Z values though. 
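+ *
+ * Example of the difference (illustrative values):
+ *
+ *   unsigned int zfrag = 0x80000000, zfb = 0x00000001;
+ *   zfrag > zfb              // unsigned (clgt): true, which is what we want
+ *   (int)zfrag > (int)zfb    // signed (cgt): false, 0x80000000 reads as negative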
+ */ + ASSERT(dsa->depth.enabled); + + switch (dsa->depth.func) { + case PIPE_FUNC_EQUAL: + /* zmask = (ifragZ == ref) */ + spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & zmask) */ + spe_and(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_NOTEQUAL: + /* zmask = (ifragZ == ref) */ + spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & ~zmask) */ + spe_andc(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_GREATER: + /* zmask = (ifragZ > ref) */ + spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & zmask) */ + spe_and(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_LESS: + /* zmask = (ref > ifragZ) */ + spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); + /* mask = (mask & zmask) */ + spe_and(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_LEQUAL: + /* zmask = (ifragZ > ref) */ + spe_clgt(f, zmask_reg, ifragZ_reg, ifbZ_reg); + /* mask = (mask & ~zmask) */ + spe_andc(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_GEQUAL: + /* zmask = (ref > ifragZ) */ + spe_clgt(f, zmask_reg, ifbZ_reg, ifragZ_reg); + /* mask = (mask & ~zmask) */ + spe_andc(f, mask_reg, mask_reg, zmask_reg); + break; + + case PIPE_FUNC_NEVER: + spe_il(f, mask_reg, 0); /* mask = {0,0,0,0} */ + spe_move(f, zmask_reg, mask_reg); /* zmask = mask */ + break; + + case PIPE_FUNC_ALWAYS: + /* mask unchanged */ + spe_il(f, zmask_reg, ~0); /* zmask = {~0,~0,~0,~0} */ + break; + + default: + ASSERT(0); + break; + } + + if (dsa->depth.writemask) { + /* + * If (ztest passed) { + * framebufferZ = fragmentZ; + * } + * OR, + * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ; + */ + spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg); + return true; + } + + return false; +} + + +/** + * Generate SPE code to perform alpha testing. + * + * \param dsa Gallium depth/stencil/alpha state to gen code for + * \param f SPE function to append instruction onto. 
+ * \param mask_reg register containing quad/pixel "alive" mask (in/out) + * \param fragA_reg register containing four fragment alpha values (in) + */ +static void +gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa, + struct spe_function *f, int mask_reg, int fragA_reg) +{ + int ref_reg = spe_allocate_available_register(f); + int amask_reg = spe_allocate_available_register(f); + + ASSERT(dsa->alpha.enabled); + + if ((dsa->alpha.func != PIPE_FUNC_NEVER) && + (dsa->alpha.func != PIPE_FUNC_ALWAYS)) { + /* load/splat the alpha reference float value */ + spe_load_float(f, ref_reg, dsa->alpha.ref); + } + + /* emit code to do the alpha comparison, updating 'mask' */ + switch (dsa->alpha.func) { + case PIPE_FUNC_EQUAL: + /* amask = (fragA == ref) */ + spe_fceq(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & amask) */ + spe_and(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_NOTEQUAL: + /* amask = (fragA == ref) */ + spe_fceq(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & ~amask) */ + spe_andc(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_GREATER: + /* amask = (fragA > ref) */ + spe_fcgt(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & amask) */ + spe_and(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_LESS: + /* amask = (ref > fragA) */ + spe_fcgt(f, amask_reg, ref_reg, fragA_reg); + /* mask = (mask & amask) */ + spe_and(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_LEQUAL: + /* amask = (fragA > ref) */ + spe_fcgt(f, amask_reg, fragA_reg, ref_reg); + /* mask = (mask & ~amask) */ + spe_andc(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_GEQUAL: + /* amask = (ref > fragA) */ + spe_fcgt(f, amask_reg, ref_reg, fragA_reg); + /* mask = (mask & ~amask) */ + spe_andc(f, mask_reg, mask_reg, amask_reg); + break; + + case PIPE_FUNC_NEVER: + spe_il(f, mask_reg, 0); /* mask = [0,0,0,0] */ + break; + + case PIPE_FUNC_ALWAYS: + /* no-op, mask unchanged */ + break; + + default: + ASSERT(0); + break; + } + +#if OPTIMIZATIONS + /* if mask == {0,0,0,0} we're all done, return */ + { + /* re-use amask reg here */ + int tmp_reg = amask_reg; + /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */ + spe_orx(f, tmp_reg, mask_reg); + /* if tmp[0] == 0 then return from function call */ + spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0); + } +#endif + + spe_release_register(f, ref_reg); + spe_release_register(f, amask_reg); +} + +/* This pair of functions is used inline to allocate and deallocate + * optional constant registers. Once a constant is discovered to be + * needed, we will likely need it again, so we don't want to deallocate + * it and have to allocate and load it again unnecessarily. 
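+ *
+ * Typical usage, as in gen_blend() below (sketch only):
+ *
+ *   boolean one_reg_set = false;
+ *   unsigned int one_reg;
+ *   ...
+ *   setup_const_register(f, &one_reg_set, &one_reg, 1.0f);  // 1st use: alloc+load
+ *   setup_const_register(f, &one_reg_set, &one_reg, 1.0f);  // later uses: no-op
+ *   ...
+ *   release_const_register(f, &one_reg_set, one_reg);       // no-op if never set up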
+ */ +static inline void +setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r) +{ + if (*is_already_set) return; + *r = spe_allocate_available_register(f); + *is_already_set = true; +} + +static inline void +release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r) +{ + if (!*is_already_set) return; + spe_release_register(f, r); + *is_already_set = false; +} + +static inline void +setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value) +{ + if (*is_already_set) return; + setup_optional_register(f, is_already_set, r); + spe_load_float(f, *r, value); +} + +static inline void +release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r) +{ + release_optional_register(f, is_already_set, r); +} + +/** + * Generate SPE code to implement the given blend mode for a quad of pixels. + * \param f SPE function to append instruction onto. + * \param fragR_reg register with fragment red values (float) (in/out) + * \param fragG_reg register with fragment green values (float) (in/out) + * \param fragB_reg register with fragment blue values (float) (in/out) + * \param fragA_reg register with fragment alpha values (float) (in/out) + * \param fbRGBA_reg register with packed framebuffer colors (integer) (in) + */ +static void +gen_blend(const struct pipe_blend_state *blend, + const struct pipe_blend_color *blend_color, + struct spe_function *f, + enum pipe_format color_format, + int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg, + int fbRGBA_reg) +{ + int term1R_reg = spe_allocate_available_register(f); + int term1G_reg = spe_allocate_available_register(f); + int term1B_reg = spe_allocate_available_register(f); + int term1A_reg = spe_allocate_available_register(f); + + int term2R_reg = spe_allocate_available_register(f); + int term2G_reg = spe_allocate_available_register(f); + int term2B_reg = spe_allocate_available_register(f); + int term2A_reg = spe_allocate_available_register(f); + + int fbR_reg = spe_allocate_available_register(f); + int fbG_reg = spe_allocate_available_register(f); + int fbB_reg = spe_allocate_available_register(f); + int fbA_reg = spe_allocate_available_register(f); + + int tmp_reg = spe_allocate_available_register(f); + + /* Optional constant registers we might or might not end up using; + * if we do use them, make sure we only allocate them once by + * keeping a flag on each one. + */ + boolean one_reg_set = false; + unsigned int one_reg; + boolean constR_reg_set = false, constG_reg_set = false, + constB_reg_set = false, constA_reg_set = false; + unsigned int constR_reg, constG_reg, constB_reg, constA_reg; + + ASSERT(blend->blend_enable); + + /* Unpack/convert framebuffer colors from four 32-bit packed colors + * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA). + * Each 8-bit color component is expanded into a float in [0.0, 1.0]. 
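+ *
+ * Equivalent scalar view for PIPE_FORMAT_A8R8G8B8_UNORM (illustrative):
+ *
+ *   fbB = (float)( fbRGBA        & 0xff);
+ *   fbG = (float)((fbRGBA >>  8) & 0xff);
+ *   fbR = (float)((fbRGBA >> 16) & 0xff);
+ *   fbA = (float)((fbRGBA >> 24) & 0xff);
+ *   // followed by scaling each value from [0,255] down to [0.0, 1.0]
+ *
+ * The B8G8R8A8 case below is the same idea with the shifts rearranged.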
+ */ + { + int mask_reg = spe_allocate_available_register(f); + + /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */ + spe_load_int(f, mask_reg, 0xff); + + /* XXX there may be more clever ways to implement the following code */ + switch (color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + /* fbB = fbB & mask */ + spe_and(f, fbB_reg, fbRGBA_reg, mask_reg); + /* mask = mask << 8 */ + spe_roti(f, mask_reg, mask_reg, 8); + + /* fbG = fbRGBA & mask */ + spe_and(f, fbG_reg, fbRGBA_reg, mask_reg); + /* fbG = fbG >> 8 */ + spe_roti(f, fbG_reg, fbG_reg, -8); + /* mask = mask << 8 */ + spe_roti(f, mask_reg, mask_reg, 8); + + /* fbR = fbRGBA & mask */ + spe_and(f, fbR_reg, fbRGBA_reg, mask_reg); + /* fbR = fbR >> 16 */ + spe_roti(f, fbR_reg, fbR_reg, -16); + /* mask = mask << 8 */ + spe_roti(f, mask_reg, mask_reg, 8); + + /* fbA = fbRGBA & mask */ + spe_and(f, fbA_reg, fbRGBA_reg, mask_reg); + /* fbA = fbA >> 24 */ + spe_roti(f, fbA_reg, fbA_reg, -24); + break; + + case PIPE_FORMAT_B8G8R8A8_UNORM: + /* fbA = fbA & mask */ + spe_and(f, fbA_reg, fbRGBA_reg, mask_reg); + /* mask = mask << 8 */ + spe_roti(f, mask_reg, mask_reg, 8); + + /* fbR = fbRGBA & mask */ + spe_and(f, fbR_reg, fbRGBA_reg, mask_reg); + /* fbR = fbR >> 8 */ + spe_roti(f, fbR_reg, fbR_reg, -8); + /* mask = mask << 8 */ + spe_roti(f, mask_reg, mask_reg, 8); + + /* fbG = fbRGBA & mask */ + spe_and(f, fbG_reg, fbRGBA_reg, mask_reg); + /* fbG = fbG >> 16 */ + spe_roti(f, fbG_reg, fbG_reg, -16); + /* mask = mask << 8 */ + spe_roti(f, mask_reg, mask_reg, 8); + + /* fbB = fbRGBA & mask */ + spe_and(f, fbB_reg, fbRGBA_reg, mask_reg); + /* fbB = fbB >> 24 */ + spe_roti(f, fbB_reg, fbB_reg, -24); + break; + + default: + ASSERT(0); + } + + /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */ + spe_cuflt(f, fbR_reg, fbR_reg, 8); + spe_cuflt(f, fbG_reg, fbG_reg, 8); + spe_cuflt(f, fbB_reg, fbB_reg, 8); + spe_cuflt(f, fbA_reg, fbA_reg, 8); + + spe_release_register(f, mask_reg); + } + + /* + * Compute Src RGB terms. We're actually looking for the value + * of (the appropriate RGB factors) * (the incoming source RGB color), + * because in some cases (like PIPE_BLENDFACTOR_ONE and + * PIPE_BLENDFACTOR_ZERO) we can avoid doing unnecessary math. 
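+ *
+ * In scalar terms each case below computes (illustrative):
+ *
+ *   term1R = fragR * factorR;    // and likewise for G and B
+ *
+ * The "inverse" factors never need an explicit 1.0 constant because
+ *
+ *   R * (1 - X) == R - R*X == spe_fnms(f, term1R, fragR, X, fragR)
+ *
+ * with fnms(a,b,c,d) computing a = d - b*c, as noted in the cases below.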
+ */ + switch (blend->rgb_src_factor) { + case PIPE_BLENDFACTOR_ONE: + /* factors = (1,1,1), so term = (R,G,B) */ + spe_move(f, term1R_reg, fragR_reg); + spe_move(f, term1G_reg, fragG_reg); + spe_move(f, term1B_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_ZERO: + /* factors = (0,0,0), so term = (0,0,0) */ + spe_load_float(f, term1R_reg, 0.0f); + spe_load_float(f, term1G_reg, 0.0f); + spe_load_float(f, term1B_reg, 0.0f); + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + /* factors = (R,G,B), so term = (R*R, G*G, B*B) */ + spe_fm(f, term1R_reg, fragR_reg, fragR_reg); + spe_fm(f, term1G_reg, fragG_reg, fragG_reg); + spe_fm(f, term1B_reg, fragB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + /* factors = (A,A,A), so term = (R*A, G*A, B*A) */ + spe_fm(f, term1R_reg, fragR_reg, fragA_reg); + spe_fm(f, term1G_reg, fragG_reg, fragA_reg); + spe_fm(f, term1B_reg, fragB_reg, fragA_reg); + break; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factors = (1-R,1-G,1-B), so term = (R*(1-R), G*(1-G), B*(1-B)) + * or in other words term = (R-R*R, G-G*G, B-B*B) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fragR_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fragG_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fragB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* factors = (Rfb,Gfb,Bfb), so term = (R*Rfb, G*Gfb, B*Bfb) */ + spe_fm(f, term1R_reg, fragR_reg, fbR_reg); + spe_fm(f, term1G_reg, fragG_reg, fbG_reg); + spe_fm(f, term1B_reg, fragB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (R*(1-Rfb),G*(1-Gfb),B*(1-Bfb)) + * or term = (R-R*Rfb, G-G*Gfb, B-B*Bfb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fbR_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fbG_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fbB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + /* factors = (1-A,1-A,1-A), so term = (R*(1-A),G*(1-A),B*(1-A)) + * or term = (R-R*A,G-G*A,B-B*A) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fragA_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fragA_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fragA_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + /* factors = (Afb, Afb, Afb), so term = (R*Afb, G*Afb, B*Afb) */ + spe_fm(f, term1R_reg, fragR_reg, fbA_reg); + spe_fm(f, term1G_reg, fragG_reg, fbA_reg); + spe_fm(f, term1B_reg, fragB_reg, fbA_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (R*(1-Afb),G*(1-Afb),B*(1-Afb)) + * or term = (R-R*Afb,G-G*Afb,b-B*Afb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term1R_reg, fragR_reg, fbA_reg, fragR_reg); + spe_fnms(f, term1G_reg, fragG_reg, fbA_reg, fragG_reg); + spe_fnms(f, term1B_reg, fragB_reg, fbA_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]); + /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */ + spe_fm(f, term1R_reg, fragR_reg, constR_reg); + spe_fm(f, term1G_reg, fragG_reg, constG_reg); + spe_fm(f, term1B_reg, fragB_reg, constB_reg); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + /* we'll need the optional 
constant alpha register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
+      spe_fm(f, term1R_reg, fragR_reg, constA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, constA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc))
+       * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, constR_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constG_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional constant alpha register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
+       * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term1R_reg, fragR_reg, constA_reg, fragR_reg);
+      spe_fnms(f, term1G_reg, fragG_reg, constA_reg, fragG_reg);
+      spe_fnms(f, term1B_reg, fragB_reg, constA_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* We'll need the optional {1,1,1,1} register */
+      setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
+      /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so
+       * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
+       * We could expand the term (as a*min(b,c) == min(a*b,a*c)
+       * as long as a is positive), but then we'd have to do three
+       * spe_float_min() functions instead of one, so this is simpler.
+       */
+      /* tmp = 1 - Afb */
+      spe_fs(f, tmp_reg, one_reg, fbA_reg);
+      /* tmp = min(A,tmp) */
+      spe_float_min(f, tmp_reg, fragA_reg, tmp_reg);
+      /* term = R*tmp */
+      spe_fm(f, term1R_reg, fragR_reg, tmp_reg);
+      spe_fm(f, term1G_reg, fragG_reg, tmp_reg);
+      spe_fm(f, term1B_reg, fragB_reg, tmp_reg);
+      break;
+
+   /* These are special D3D cases involving a second color output
+    * from the fragment shader.  I'm not sure we can support them
+    * yet... XXX
+    */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Src Alpha term.  Like the above, we're looking for
+    * the full term A*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
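 *
 * The alpha channel follows the same pattern as the RGB path above: the
 * final alpha is A*alpha_src_factor combined with Afb*alpha_dst_factor
 * by the blend equation further down.  For example, with
 * alpha_src_factor == PIPE_BLENDFACTOR_INV_DST_ALPHA the term below is
 * A*(1-Afb) = A - A*Afb, again a single fnms.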
+ */ + switch (blend->alpha_src_factor) { + case PIPE_BLENDFACTOR_ZERO: + /* factor = 0, so term = 0 */ + spe_load_float(f, term1A_reg, 0.0f); + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* fall through */ + case PIPE_BLENDFACTOR_ONE: + /* factor = 1, so term = A */ + spe_move(f, term1A_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_SRC_COLOR: + /* factor = A, so term = A*A */ + spe_fm(f, term1A_reg, fragA_reg, fragA_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + spe_fm(f, term1A_reg, fragA_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factor = 1-A, so term = A*(1-A) = A-A*A */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term1A_reg, fragA_reg, fragA_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_DST_COLOR: + /* factor = Afb, so term = A*Afb */ + spe_fm(f, term1A_reg, fragA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factor = 1-Afb, so term = A*(1-Afb) = A - A*Afb */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term1A_reg, fragA_reg, fbA_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = Ac, so term = A*Ac */ + spe_fm(f, term1A_reg, fragA_reg, constA_reg); + break; + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: + ASSERT(0); + } + + /* + * Compute Dest RGB term. Like the above, we're looking for + * the full term (Rfb,Gfb,Bfb)*(factor), not just the factor itself, because + * in many cases we can avoid doing unnecessary multiplies. 
+ */ + switch (blend->rgb_dst_factor) { + case PIPE_BLENDFACTOR_ONE: + /* factors = (1,1,1), so term = (Rfb,Gfb,Bfb) */ + spe_move(f, term2R_reg, fbR_reg); + spe_move(f, term2G_reg, fbG_reg); + spe_move(f, term2B_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_ZERO: + /* factor s= (0,0,0), so term = (0,0,0) */ + spe_load_float(f, term2R_reg, 0.0f); + spe_load_float(f, term2G_reg, 0.0f); + spe_load_float(f, term2B_reg, 0.0f); + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + /* factors = (R,G,B), so term = (R*Rfb, G*Gfb, B*Bfb) */ + spe_fm(f, term2R_reg, fbR_reg, fragR_reg); + spe_fm(f, term2G_reg, fbG_reg, fragG_reg); + spe_fm(f, term2B_reg, fbB_reg, fragB_reg); + break; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factors = (1-R,1-G,1-B), so term = (Rfb*(1-R), Gfb*(1-G), Bfb*(1-B)) + * or in other words term = (Rfb-Rfb*R, Gfb-Gfb*G, Bfb-Bfb*B) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fragR_reg, fbR_reg, fbR_reg); + spe_fnms(f, term2G_reg, fragG_reg, fbG_reg, fbG_reg); + spe_fnms(f, term2B_reg, fragB_reg, fbB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + /* factors = (A,A,A), so term = (Rfb*A, Gfb*A, Bfb*A) */ + spe_fm(f, term2R_reg, fbR_reg, fragA_reg); + spe_fm(f, term2G_reg, fbG_reg, fragA_reg); + spe_fm(f, term2B_reg, fbB_reg, fragA_reg); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + /* factors = (1-A,1-A,1-A) so term = (Rfb-Rfb*A,Gfb-Gfb*A,Bfb-Bfb*A) */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2R_reg, fbR_reg, fragA_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, fragA_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, fragA_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* factors = (Rfb,Gfb,Bfb), so term = (Rfb*Rfb, Gfb*Gfb, Bfb*Bfb) */ + spe_fm(f, term2R_reg, fbR_reg, fbR_reg); + spe_fm(f, term2G_reg, fbG_reg, fbG_reg); + spe_fm(f, term2B_reg, fbB_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factors = (1-Rfb,1-Gfb,1-Bfb), so term = (Rfb*(1-Rfb),Gfb*(1-Gfb),Bfb*(1-Bfb)) + * or term = (Rfb-Rfb*Rfb, Gfb-Gfb*Gfb, Bfb-Bfb*Bfb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, fbR_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, fbG_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, fbB_reg, fbB_reg); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: + /* factors = (Afb, Afb, Afb), so term = (Rfb*Afb, Gfb*Afb, Bfb*Afb) */ + spe_fm(f, term2R_reg, fbR_reg, fbA_reg); + spe_fm(f, term2G_reg, fbG_reg, fbA_reg); + spe_fm(f, term2B_reg, fbB_reg, fbA_reg); + break; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + /* factors = (1-Afb, 1-Afb, 1-Afb), so term = (Rfb*(1-Afb),Gfb*(1-Afb),Bfb*(1-Afb)) + * or term = (Rfb-Rfb*Afb,Gfb-Gfb*Afb,Bfb-Bfb*Afb) + * fnms(a,b,c,d) computes a = d - b*c + */ + spe_fnms(f, term2R_reg, fbR_reg, fbA_reg, fbR_reg); + spe_fnms(f, term2G_reg, fbG_reg, fbA_reg, fbG_reg); + spe_fnms(f, term2B_reg, fbB_reg, fbA_reg, fbB_reg); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constant color registers */ + setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]); + setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]); + setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]); + /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */ + spe_fm(f, term2R_reg, fbR_reg, constR_reg); + spe_fm(f, term2G_reg, fbG_reg, constG_reg); + spe_fm(f, term2B_reg, fbB_reg, constB_reg); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + /* we'll need the optional constant alpha register 
 */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
+      spe_fm(f, term2R_reg, fbR_reg, constA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, constA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, constA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* We need the optional constant color registers */
+      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc))
+       * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constR_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constG_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constB_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+      /* We need the optional constant alpha register */
+      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
+       * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
+       * fnms(a,b,c,d) computes a = d - b*c
+       */
+      spe_fnms(f, term2R_reg, fbR_reg, constA_reg, fbR_reg);
+      spe_fnms(f, term2G_reg, fbG_reg, constA_reg, fbG_reg);
+      spe_fnms(f, term2B_reg, fbB_reg, constA_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:  /* not supported for dest RGB */
+      ASSERT(0);
+      break;
+
+   /* These are special D3D cases involving a second color output
+    * from the fragment shader.  I'm not sure we can support them
+    * yet... XXX
+    */
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Dest Alpha term.  Like the above, we're looking for
+    * the full term Afb*factor, not just the factor itself, because
+    * in many cases we can avoid doing unnecessary multiplies.
+ */ + switch (blend->alpha_dst_factor) { + case PIPE_BLENDFACTOR_ONE: + /* factor = 1, so term = Afb */ + spe_move(f, term2A_reg, fbA_reg); + break; + case PIPE_BLENDFACTOR_ZERO: + /* factor = 0, so term = 0 */ + spe_load_float(f, term2A_reg, 0.0f); + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_SRC_COLOR: + /* factor = A, so term = Afb*A */ + spe_fm(f, term2A_reg, fbA_reg, fragA_reg); + break; + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* factor = 1-A, so term = Afb*(1-A) = Afb-Afb*A */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2A_reg, fbA_reg, fragA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_DST_COLOR: + /* factor = Afb, so term = Afb*Afb */ + spe_fm(f, term2A_reg, fbA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* factor = 1-Afb, so term = Afb*(1-Afb) = Afb - Afb*Afb */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2A_reg, fbA_reg, fbA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = Ac, so term = Afb*Ac */ + spe_fm(f, term2A_reg, fbA_reg, constA_reg); + break; + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */ + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* We need the optional constA_reg register */ + setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]); + /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */ + /* fnms(a,b,c,d) computes a = d - b*c */ + spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg); + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: /* not supported for dest alpha */ + ASSERT(0); + break; + + /* These are special D3D cases involving a second color output + * from the fragment shader. I'm not sure we can support them + * yet... XXX + */ + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: + ASSERT(0); + } + + /* + * Combine Src/Dest RGB terms as per the blend equation. 
+ */ + switch (blend->rgb_func) { + case PIPE_BLEND_ADD: + spe_fa(f, fragR_reg, term1R_reg, term2R_reg); + spe_fa(f, fragG_reg, term1G_reg, term2G_reg); + spe_fa(f, fragB_reg, term1B_reg, term2B_reg); + break; + case PIPE_BLEND_SUBTRACT: + spe_fs(f, fragR_reg, term1R_reg, term2R_reg); + spe_fs(f, fragG_reg, term1G_reg, term2G_reg); + spe_fs(f, fragB_reg, term1B_reg, term2B_reg); + break; + case PIPE_BLEND_REVERSE_SUBTRACT: + spe_fs(f, fragR_reg, term2R_reg, term1R_reg); + spe_fs(f, fragG_reg, term2G_reg, term1G_reg); + spe_fs(f, fragB_reg, term2B_reg, term1B_reg); + break; + case PIPE_BLEND_MIN: + spe_float_min(f, fragR_reg, term1R_reg, term2R_reg); + spe_float_min(f, fragG_reg, term1G_reg, term2G_reg); + spe_float_min(f, fragB_reg, term1B_reg, term2B_reg); + break; + case PIPE_BLEND_MAX: + spe_float_max(f, fragR_reg, term1R_reg, term2R_reg); + spe_float_max(f, fragG_reg, term1G_reg, term2G_reg); + spe_float_max(f, fragB_reg, term1B_reg, term2B_reg); + break; + default: + ASSERT(0); + } + + /* + * Combine Src/Dest A term + */ + switch (blend->alpha_func) { + case PIPE_BLEND_ADD: + spe_fa(f, fragA_reg, term1A_reg, term2A_reg); + break; + case PIPE_BLEND_SUBTRACT: + spe_fs(f, fragA_reg, term1A_reg, term2A_reg); + break; + case PIPE_BLEND_REVERSE_SUBTRACT: + spe_fs(f, fragA_reg, term2A_reg, term1A_reg); + break; + case PIPE_BLEND_MIN: + spe_float_min(f, fragA_reg, term1A_reg, term2A_reg); + break; + case PIPE_BLEND_MAX: + spe_float_max(f, fragA_reg, term1A_reg, term2A_reg); + break; + default: + ASSERT(0); + } + + spe_release_register(f, term1R_reg); + spe_release_register(f, term1G_reg); + spe_release_register(f, term1B_reg); + spe_release_register(f, term1A_reg); + + spe_release_register(f, term2R_reg); + spe_release_register(f, term2G_reg); + spe_release_register(f, term2B_reg); + spe_release_register(f, term2A_reg); + + spe_release_register(f, fbR_reg); + spe_release_register(f, fbG_reg); + spe_release_register(f, fbB_reg); + spe_release_register(f, fbA_reg); + + spe_release_register(f, tmp_reg); + + /* Free any optional registers that actually got used */ + release_const_register(f, &one_reg_set, one_reg); + release_const_register(f, &constR_reg_set, constR_reg); + release_const_register(f, &constG_reg_set, constG_reg); + release_const_register(f, &constB_reg_set, constB_reg); + release_const_register(f, &constA_reg_set, constA_reg); +} + + +static void +gen_logicop(const struct pipe_blend_state *blend, + struct spe_function *f, + int fragRGBA_reg, int fbRGBA_reg) +{ + /* We've got four 32-bit RGBA packed pixels in each of + * fragRGBA_reg and fbRGBA_reg, not sets of floating-point + * reds, greens, blues, and alphas. 
+ * */ + ASSERT(blend->logicop_enable); + + switch(blend->logicop_func) { + case PIPE_LOGICOP_CLEAR: /* 0 */ + spe_zero(f, fragRGBA_reg); + break; + case PIPE_LOGICOP_NOR: /* ~(s | d) */ + spe_nor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_AND_INVERTED: /* ~s & d */ + /* andc R, A, B computes R = A & ~B */ + spe_andc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_COPY_INVERTED: /* ~s */ + spe_complement(f, fragRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_AND_REVERSE: /* s & ~d */ + /* andc R, A, B computes R = A & ~B */ + spe_andc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_INVERT: /* ~d */ + /* Note that (A nor A) == ~(A|A) == ~A */ + spe_nor(f, fragRGBA_reg, fbRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_XOR: /* s ^ d */ + spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_NAND: /* ~(s & d) */ + spe_nand(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_AND: /* s & d */ + spe_and(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_EQUIV: /* ~(s ^ d) */ + spe_xor(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + spe_complement(f, fragRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_NOOP: /* d */ + spe_move(f, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_OR_INVERTED: /* ~s | d */ + /* orc R, A, B computes R = A | ~B */ + spe_orc(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg); + break; + case PIPE_LOGICOP_COPY: /* s */ + break; + case PIPE_LOGICOP_OR_REVERSE: /* s | ~d */ + /* orc R, A, B computes R = A | ~B */ + spe_orc(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_OR: /* s | d */ + spe_or(f, fragRGBA_reg, fragRGBA_reg, fbRGBA_reg); + break; + case PIPE_LOGICOP_SET: /* 1 */ + spe_load_int(f, fragRGBA_reg, 0xffffffff); + break; + default: + ASSERT(0); + } +} + + +/** + * Generate code to pack a quad of float colors into four 32-bit integers. + * + * \param f SPE function to append instruction onto. + * \param color_format the dest color packing format + * \param r_reg register containing four red values (in/clobbered) + * \param g_reg register containing four green values (in/clobbered) + * \param b_reg register containing four blue values (in/clobbered) + * \param a_reg register containing four alpha values (in/clobbered) + * \param rgba_reg register to store the packed RGBA colors (out) + */ +static void +gen_pack_colors(struct spe_function *f, + enum pipe_format color_format, + int r_reg, int g_reg, int b_reg, int a_reg, + int rgba_reg) +{ + int rg_reg = spe_allocate_available_register(f); + int ba_reg = spe_allocate_available_register(f); + + /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */ + spe_cfltu(f, r_reg, r_reg, 32); + spe_cfltu(f, g_reg, g_reg, 32); + spe_cfltu(f, b_reg, b_reg, 32); + spe_cfltu(f, a_reg, a_reg, 32); + + /* Shift the most significant bytes to the least significant positions. 
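 * (The cfltu above scaled each float in [0.0,1.0] up to the full 32-bit
 * unsigned range with clamping, so e.g. 1.0 becomes 0xffffffff and the
 * 8-bit color value now sits in the most significant byte; after the
 * shift it is 0xff.)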
+ * I.e.: reg = reg >> 24 + */ + spe_rotmi(f, r_reg, r_reg, -24); + spe_rotmi(f, g_reg, g_reg, -24); + spe_rotmi(f, b_reg, b_reg, -24); + spe_rotmi(f, a_reg, a_reg, -24); + + /* Shift the color bytes according to the surface format */ + if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) { + spe_roti(f, g_reg, g_reg, 8); /* green <<= 8 */ + spe_roti(f, r_reg, r_reg, 16); /* red <<= 16 */ + spe_roti(f, a_reg, a_reg, 24); /* alpha <<= 24 */ + } + else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) { + spe_roti(f, r_reg, r_reg, 8); /* red <<= 8 */ + spe_roti(f, g_reg, g_reg, 16); /* green <<= 16 */ + spe_roti(f, b_reg, b_reg, 24); /* blue <<= 24 */ + } + else { + ASSERT(0); + } + + /* Merge red, green, blue, alpha registers to make packed RGBA colors. + * Eg: after shifting according to color_format we might have: + * R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000} + * G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600} + * B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099} + * A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000} + * OR-ing all those together gives us four packed colors: + * RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699} + */ + spe_or(f, rg_reg, r_reg, g_reg); + spe_or(f, ba_reg, a_reg, b_reg); + spe_or(f, rgba_reg, rg_reg, ba_reg); + + spe_release_register(f, rg_reg); + spe_release_register(f, ba_reg); +} + +static void +gen_colormask(struct spe_function *f, + uint colormask, + enum pipe_format color_format, + int fragRGBA_reg, int fbRGBA_reg) +{ + /* We've got four 32-bit RGBA packed pixels in each of + * fragRGBA_reg and fbRGBA_reg, not sets of floating-point + * reds, greens, blues, and alphas. Further, the pixels + * are packed according to the given color format, not + * necessarily RGBA... + */ + unsigned int r_mask; + unsigned int g_mask; + unsigned int b_mask; + unsigned int a_mask; + + /* Calculate exactly where the bits for any particular color + * end up, so we can mask them correctly. + */ + switch(color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + /* ARGB */ + a_mask = 0xff000000; + r_mask = 0x00ff0000; + g_mask = 0x0000ff00; + b_mask = 0x000000ff; + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + /* BGRA */ + b_mask = 0xff000000; + g_mask = 0x00ff0000; + r_mask = 0x0000ff00; + a_mask = 0x000000ff; + break; + default: + ASSERT(0); + } + + /* For each R, G, B, and A component we're supposed to mask out, + * clear its bits. Then our mask operation later will work + * as expected. + */ + if (!(colormask & PIPE_MASK_R)) { + r_mask = 0; + } + if (!(colormask & PIPE_MASK_G)) { + g_mask = 0; + } + if (!(colormask & PIPE_MASK_B)) { + b_mask = 0; + } + if (!(colormask & PIPE_MASK_A)) { + a_mask = 0; + } + + /* Get a temporary register to hold the mask that will be applied to the fragment */ + int colormask_reg = spe_allocate_available_register(f); + + /* The actual mask we're going to use is an OR of the remaining R, G, B, and A + * masks. Load the result value into our temporary register. + */ + spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask); + + /* Use the mask register to select between the fragment color + * values and the frame buffer color values. Wherever the + * mask has a 0 bit, the current frame buffer color should override + * the fragment color. Wherever the mask has a 1 bit, the + * fragment color should persevere. The Select Bits (selb rt, rA, rB, rM) + * instruction will select bits from its first operand rA wherever the + * the mask bits rM are 0, and from its second operand rB wherever the + * mask bits rM are 1. 
That means that the frame buffer color is the + * first operand, and the fragment color the second. + */ + spe_selb(f, fragRGBA_reg, fbRGBA_reg, fragRGBA_reg, colormask_reg); + + /* Release the temporary register and we're done */ + spe_release_register(f, colormask_reg); +} + +/* This function is annoyingly similar to gen_depth_test(), above, except + * that instead of comparing two varying values (i.e. fragment and buffer), + * we're comparing a varying value with a static value. As such, we have + * access to the Compare Immediate instructions where we don't in + * gen_depth_test(), which is what makes us very different. + * + * There's some added complexity if there's a non-trivial state->mask + * value; then stencil and reference both must be masked + * + * The return value in the stencil_pass_reg is a bitmask of valid + * fragments that also passed the stencil test. The bitmask of valid + * fragments that failed would be found in (fragment_mask_reg & ~stencil_pass_reg). + */ +static void +gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state, + unsigned int stencil_max_value, + unsigned int fragment_mask_reg, unsigned int fbS_reg, + unsigned int stencil_pass_reg) +{ + /* Generate code that puts the set of passing fragments into the stencil_pass_reg + * register, taking into account whether each fragment was active to begin with. + */ + switch (state->func) { + case PIPE_FUNC_EQUAL: + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (s == reference) */ + spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */ + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_NOTEQUAL: + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & ~(s == reference) */ + spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */ + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_LESS: + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (reference < s) */ + spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ((reference&mask) < (s & mask)) */ + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + 
spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_GREATER: + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (reference > s) */ + /* There's no convenient Compare Less Than Immediate instruction, so + * we'll have to do this one the harder way, by loading a register and + * comparing directly. Compare Logical Greater Than Word (clgt) + * treats its operands as unsigned - no sign extension. + */ + unsigned int tmp_reg = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value); + spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + } + else { + /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */ + unsigned int tmp_reg = spe_allocate_available_register(f); + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->value_mask & state->ref_value); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil); + spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_GEQUAL: + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (reference >= s) + * = fragment_mask & ~(s > reference) */ + spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + } + else { + /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */ + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_LEQUAL: + if (state->value_mask == stencil_max_value) { + /* stencil_pass = fragment_mask & (reference <= s) ] + * = fragment_mask & ~(reference > s) */ + /* As above, we have to do this by loading a register */ + unsigned int tmp_reg = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value); + spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + } + else { + /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */ + unsigned int tmp_reg = spe_allocate_available_register(f); + unsigned int tmp_masked_stencil = spe_allocate_available_register(f); + spe_load_uint(f, tmp_reg, state->ref_value & state->value_mask); + spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask); + spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil); + spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg); + spe_release_register(f, tmp_reg); + spe_release_register(f, tmp_masked_stencil); + } + break; + + case PIPE_FUNC_NEVER: + /* stencil_pass = fragment_mask & 0 = 0 */ + spe_load_uint(f, stencil_pass_reg, 0); + break; + + case PIPE_FUNC_ALWAYS: + /* stencil_pass = fragment_mask & 1 = fragment_mask */ + spe_move(f, stencil_pass_reg, fragment_mask_reg); + break; + } + + /* The fragments that passed the stencil test are now in stencil_pass_reg. 
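 * (For example, with func == PIPE_FUNC_LESS, ref_value == 0x20 and the
 * trivial value_mask of 0xff, the test above reduces to
 * stencil_pass = fragment_mask & (0x20 < s), so a stored stencil value
 * of 0x30 passes and 0x10 fails.)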
+ * The fragments that failed would be (fragment_mask_reg & ~stencil_pass_reg). + */ +} + +/* This function generates code that calculates a set of new stencil values + * given the earlier values and the operation to apply. It does not + * apply any tests. It is intended to be called up to 3 times + * (for the stencil fail operation, for the stencil pass-z fail operation, + * and for the stencil pass-z pass operation) to collect up to three + * possible sets of values, and for the caller to combine them based + * on the result of the tests. + * + * stencil_max_value should be (2^n - 1) where n is the number of bits + * in the stencil buffer - in other words, it should be usable as a mask. + */ +static void +gen_stencil_values(struct spe_function *f, unsigned int stencil_op, + unsigned int stencil_ref_value, unsigned int stencil_max_value, + unsigned int fbS_reg, unsigned int newS_reg) +{ + /* The code below assumes that newS_reg and fbS_reg are not the same + * register; if they can be, the calculations below will have to use + * an additional temporary register. For now, mark the assumption + * with an assertion that will fail if they are the same. + */ + ASSERT(fbS_reg != newS_reg); + + /* The code also assumes the the stencil_max_value is of the form + * 2^n-1 and can therefore be used as a mask for the valid bits in + * addition to a maximum. Make sure this is the case as well. + * The clever math below exploits the fact that incrementing a + * binary number serves to flip all the bits of a number starting at + * the LSB and continuing to (and including) the first zero bit + * found. That means that a number and its increment will always + * have at least one bit in common (the high order bit, if nothing + * else) *unless* the number is zero, *or* the number is of a form + * consisting of some number of 1s in the low-order bits followed + * by nothing but 0s in the high-order bits. The latter case + * implies it's of the form 2^n-1. + */ + ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0); + + switch(stencil_op) { + case PIPE_STENCIL_OP_KEEP: + /* newS = S */ + spe_move(f, newS_reg, fbS_reg); + break; + + case PIPE_STENCIL_OP_ZERO: + /* newS = 0 */ + spe_zero(f, newS_reg); + break; + + case PIPE_STENCIL_OP_REPLACE: + /* newS = stencil reference value */ + spe_load_uint(f, newS_reg, stencil_ref_value); + break; + + case PIPE_STENCIL_OP_INCR: { + /* newS = (s == max ? max : s + 1) */ + unsigned int equals_reg = spe_allocate_available_register(f); + + spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value); + /* Add Word Immediate computes rT = rA + 10-bit signed immediate */ + spe_ai(f, newS_reg, fbS_reg, 1); + /* Select from the current value or the new value based on the equality test */ + spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg); + + spe_release_register(f, equals_reg); + break; + } + case PIPE_STENCIL_OP_DECR: { + /* newS = (s == 0 ? 0 : s - 1) */ + unsigned int equals_reg = spe_allocate_available_register(f); + + spe_compare_equal_uint(f, equals_reg, fbS_reg, 0); + /* Add Word Immediate with a (-1) value works */ + spe_ai(f, newS_reg, fbS_reg, -1); + /* Select from the current value or the new value based on the equality test */ + spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg); + + spe_release_register(f, equals_reg); + break; + } + case PIPE_STENCIL_OP_INCR_WRAP: + /* newS = (s == max ? 
0 : s + 1), but since max is 2^n-1, we can + * do a normal add and mask off the correct bits + */ + spe_ai(f, newS_reg, fbS_reg, 1); + spe_and_uint(f, newS_reg, newS_reg, stencil_max_value); + break; + + case PIPE_STENCIL_OP_DECR_WRAP: + /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */ + spe_ai(f, newS_reg, fbS_reg, -1); + spe_and_uint(f, newS_reg, newS_reg, stencil_max_value); + break; + + case PIPE_STENCIL_OP_INVERT: + /* newS = ~s. We take advantage of the mask/max value to invert only + * the valid bits for the field so we don't have to do an extra "and". + */ + spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value); + break; + + default: + ASSERT(0); + } +} + + +/* This function generates code to get all the necessary possible + * stencil values. For each of the output registers (fail_reg, + * zfail_reg, and zpass_reg), it either allocates a new register + * and calculates a new set of values based on the stencil operation, + * or it reuses a register allocation and calculation done for an + * earlier (matching) operation, or it reuses the fbS_reg register + * (if the stencil operation is KEEP, which doesn't change the + * stencil buffer). + * + * Since this function allocates a variable number of registers, + * to avoid incurring complex logic to free them, they should + * be allocated after a spe_allocate_register_set() call + * and released by the corresponding spe_release_register_set() call. + */ +static void +gen_get_stencil_values(struct spe_function *f, const struct pipe_stencil_state *stencil, + const unsigned int depth_enabled, + unsigned int fbS_reg, + unsigned int *fail_reg, unsigned int *zfail_reg, + unsigned int *zpass_reg) +{ + unsigned zfail_op; + + /* Stenciling had better be enabled here */ + ASSERT(stencil->enabled); + + /* If the depth test is not enabled, it is treated as though it always + * passes, which means that the zfail_op is not considered - a + * failing stencil test triggers the fail_op, and a passing one + * triggers the zpass_op + * + * As an optimization, override calculation of the zfail_op values + * if they aren't going to be used. By setting the value of + * the operation to PIPE_STENCIL_OP_KEEP, its value will be assumed + * to match the incoming stencil values, and no calculation will + * be done. 
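 *
 * For example, a common setting is fail_op == KEEP, zfail_op == KEEP and
 * zpass_op == REPLACE; in that case *fail_reg and *zfail_reg simply
 * alias fbS_reg, and only one new register (holding the zpass values)
 * is allocated and computed.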
+ */ + if (depth_enabled) { + zfail_op = stencil->zfail_op; + } + else { + zfail_op = PIPE_STENCIL_OP_KEEP; + } + + /* One-sided or front-facing stencil */ + if (stencil->fail_op == PIPE_STENCIL_OP_KEEP) { + *fail_reg = fbS_reg; + } + else { + *fail_reg = spe_allocate_available_register(f); + gen_stencil_values(f, stencil->fail_op, stencil->ref_value, + 0xff, fbS_reg, *fail_reg); + } + + /* Check the possibly overridden value, not the structure value */ + if (zfail_op == PIPE_STENCIL_OP_KEEP) { + *zfail_reg = fbS_reg; + } + else if (zfail_op == stencil->fail_op) { + *zfail_reg = *fail_reg; + } + else { + *zfail_reg = spe_allocate_available_register(f); + gen_stencil_values(f, stencil->zfail_op, stencil->ref_value, + 0xff, fbS_reg, *zfail_reg); + } + + if (stencil->zpass_op == PIPE_STENCIL_OP_KEEP) { + *zpass_reg = fbS_reg; + } + else if (stencil->zpass_op == stencil->fail_op) { + *zpass_reg = *fail_reg; + } + else if (stencil->zpass_op == zfail_op) { + *zpass_reg = *zfail_reg; + } + else { + *zpass_reg = spe_allocate_available_register(f); + gen_stencil_values(f, stencil->zpass_op, stencil->ref_value, + 0xff, fbS_reg, *zpass_reg); + } +} + +/* Note that fbZ_reg may *not* be set on entry, if in fact + * the depth test is not enabled. This function must not use + * the register if depth is not enabled. + */ +static boolean +gen_stencil_depth_test(struct spe_function *f, + const struct pipe_depth_stencil_alpha_state *dsa, + const uint facing, + const int mask_reg, const int fragZ_reg, + const int fbZ_reg, const int fbS_reg) +{ + /* True if we've generated code that could require writeback to the + * depth and/or stencil buffers + */ + boolean modified_buffers = false; + + boolean need_to_calculate_stencil_values; + boolean need_to_writemask_stencil_values; + + struct pipe_stencil_state *stencil; + + /* Registers. We may or may not actually allocate these, depending + * on whether the state values indicate that we need them. + */ + unsigned int stencil_pass_reg, stencil_fail_reg; + unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values; + unsigned int stencil_writemask_reg; + unsigned int zmask_reg; + unsigned int newS_reg; + + /* Stenciling is quite complex: up to six different configurable stencil + * operations/calculations can be required (three each for front-facing + * and back-facing fragments). Many of those operations will likely + * be identical, so there's good reason to try to avoid calculating + * the same values more than once (which unfortunately makes the code less + * straightforward). + * + * To make register management easier, we start a new + * register set; we can release all the registers in the set at + * once, and avoid having to keep track of exactly which registers + * we allocate. We can still allocate and free registers as + * desired (if we know we no longer need a register), but we don't + * have to spend the complexity to track the more difficult variant + * register usage scenarios. + */ + spe_comment(f, 0, "Allocating stencil register set"); + spe_allocate_register_set(f); + + /* The facing we're given is the fragment facing; it doesn't + * exactly match the stencil facing. If stencil is enabled, + * but two-sided stencil is *not* enabled, we use the same + * stencil settings for both front- and back-facing fragments. + * We only use the "back-facing" stencil for backfacing fragments + * if two-sided stenciling is enabled. 
+ */ + if (facing == CELL_FACING_BACK && dsa->stencil[1].enabled) { + stencil = &dsa->stencil[1]; + } + else { + stencil = &dsa->stencil[0]; + } + + /* Calculate the writemask. If the writemask is trivial (either + * all 0s, meaning that we don't need to calculate any stencil values + * because they're not going to change the stencil anyway, or all 1s, + * meaning that we have to calculate the stencil values but do not + * need to mask them), we can avoid generating code. Don't forget + * that we need to consider backfacing stencil, if enabled. + * + * Note that if the backface stencil is *not* enabled, the backface + * stencil will have the same values as the frontface stencil. + */ + if (stencil->fail_op == PIPE_STENCIL_OP_KEEP && + stencil->zfail_op == PIPE_STENCIL_OP_KEEP && + stencil->zpass_op == PIPE_STENCIL_OP_KEEP) { + need_to_calculate_stencil_values = false; + need_to_writemask_stencil_values = false; + } + else if (stencil->write_mask == 0x0) { + /* All changes are writemasked out, so no need to calculate + * what those changes might be, and no need to write anything back. + */ + need_to_calculate_stencil_values = false; + need_to_writemask_stencil_values = false; + } + else if (stencil->write_mask == 0xff) { + /* Still trivial, but a little less so. We need to write the stencil + * values, but we don't need to mask them. + */ + need_to_calculate_stencil_values = true; + need_to_writemask_stencil_values = false; + } + else { + /* The general case: calculate, mask, and write */ + need_to_calculate_stencil_values = true; + need_to_writemask_stencil_values = true; + + /* While we're here, generate code that calculates what the + * writemask should be. If backface stenciling is enabled, + * and the backface writemask is not the same as the frontface + * writemask, we'll have to generate code that merges the + * two masks into a single effective mask based on fragment facing. + */ + spe_comment(f, 0, "Computing stencil writemask"); + stencil_writemask_reg = spe_allocate_available_register(f); + spe_load_uint(f, stencil_writemask_reg, dsa->stencil[facing].write_mask); + } + + /* At least one-sided stenciling must be on. Generate code that + * runs the stencil test on the basic/front-facing stencil, leaving + * the mask of passing stencil bits in stencil_pass_reg. This mask will + * be used both to mask the set of active pixels, and also to + * determine how the stencil buffer changes. + * + * This test will *not* change the value in mask_reg (because we don't + * yet know whether to apply the two-sided stencil or one-sided stencil). + */ + spe_comment(f, 0, "Running basic stencil test"); + stencil_pass_reg = spe_allocate_available_register(f); + gen_stencil_test(f, stencil, 0xff, mask_reg, fbS_reg, stencil_pass_reg); + + /* Generate code that, given the mask of valid fragments and the + * mask of valid fragments that passed the stencil test, computes + * the mask of valid fragments that failed the stencil test. We + * have to do this before we run a depth test (because the + * depth test should not be performed on fragments that failed the + * stencil test, and because the depth test will update the + * mask of valid fragments based on the results of the depth test). 
+ */ + spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask"); + stencil_fail_reg = spe_allocate_available_register(f); + spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg); + /* Now remove the stenciled-out pixels from the valid fragment mask, + * so we can later use the valid fragment mask in the depth test. + */ + spe_and(f, mask_reg, mask_reg, stencil_pass_reg); + + /* We may not need to calculate stencil values, if the writemask is off */ + if (need_to_calculate_stencil_values) { + /* Generate code that calculates exactly which stencil values we need, + * without calculating the same value twice (say, if two different + * stencil ops have the same value). This code will work for one-sided + * and two-sided stenciling (so that we take into account that operations + * may match between front and back stencils), and will also take into + * account whether the depth test is enabled (if the depth test is off, + * we don't need any of the zfail results, because the depth test always + * is considered to pass if it is disabled). Any register value that + * does not need to be calculated will come back with the same value + * that's in fbS_reg. + * + * This function will allocate a variant number of registers that + * will be released as part of the register set. + */ + spe_comment(f, 0, facing == CELL_FACING_FRONT ? "Computing front-facing stencil values" : "Computing back-facing stencil values"); + gen_get_stencil_values(f, stencil, dsa->depth.enabled, fbS_reg, + &stencil_fail_values, &stencil_pass_depth_fail_values, + &stencil_pass_depth_pass_values); + } + + /* We now have all the stencil values we need. We also need + * the results of the depth test to figure out which + * stencil values will become the new stencil values. (Even if + * we aren't actually calculating stencil values, we need to apply + * the depth test if it's enabled.) + * + * The code generated by gen_depth_test() returns the results of the + * test in the given register, but also alters the mask_reg based + * on the results of the test. + */ + if (dsa->depth.enabled) { + spe_comment(f, 0, "Running stencil depth test"); + zmask_reg = spe_allocate_available_register(f); + modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg); + } + + if (need_to_calculate_stencil_values) { + + /* If we need to writemask the stencil values before going into + * the stencil buffer, we'll have to use a new register to + * hold the new values. If not, we can just keep using the + * current register. + */ + if (need_to_writemask_stencil_values) { + newS_reg = spe_allocate_available_register(f); + spe_comment(f, 0, "Saving current stencil values for writemasking"); + spe_move(f, newS_reg, fbS_reg); + } + else { + newS_reg = fbS_reg; + } + + /* Merge in the selected stencil fail values */ + if (stencil_fail_values != fbS_reg) { + spe_comment(f, 0, "Loading stencil fail values"); + spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg); + modified_buffers = true; + } + + /* Same for the stencil pass/depth fail values. If this calculation + * is not needed (say, if depth test is off), then the + * stencil_pass_depth_fail_values register will be equal to fbS_reg + * and we'll skip the calculation. + */ + if (stencil_pass_depth_fail_values != fbS_reg) { + /* We don't actually have a stencil pass/depth fail mask yet. + * Calculate it here from the stencil passing mask and the + * depth passing mask. Note that zmask_reg *must* have been + * set above if we're here. 
+ */ + unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f); + spe_comment(f, 0, "Loading stencil pass/depth fail values"); + spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg); + + spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask); + + spe_release_register(f, stencil_pass_depth_fail_mask); + modified_buffers = true; + } + + /* Same for the stencil pass/depth pass mask. Note that we + * *can* get here with zmask_reg being unset (if the depth + * test is off but the stencil test is on). In this case, + * we assume the depth test passes, and don't need to mask + * the stencil pass mask with the Z mask. + */ + if (stencil_pass_depth_pass_values != fbS_reg) { + if (dsa->depth.enabled) { + unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f); + /* We'll need a separate register */ + spe_comment(f, 0, "Loading stencil pass/depth pass values"); + spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg); + spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask); + spe_release_register(f, stencil_pass_depth_pass_mask); + } + else { + /* We can use the same stencil-pass register */ + spe_comment(f, 0, "Loading stencil pass values"); + spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg); + } + modified_buffers = true; + } + + /* Almost done. If we need to writemask, do it now, leaving the + * results in the fbS_reg register passed in. If we don't need + * to writemask, then the results are *already* in the fbS_reg, + * so there's nothing more to do. + */ + + if (need_to_writemask_stencil_values && modified_buffers) { + /* The Select Bytes command makes a fine writemask. Where + * the mask is 0, the first (original) values are retained, + * effectively masking out changes. Where the mask is 1, the + * second (new) values are retained, incorporating changes. + */ + spe_comment(f, 0, "Writemasking new stencil values"); + spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg); + } + + } /* done calculating stencil values */ + + /* The stencil and/or depth values have been applied, and the + * mask_reg, fbS_reg, and fbZ_reg values have been updated. + * We're all done, except that we've allocated a fair number + * of registers that we didn't bother tracking. Release all + * those registers as part of the register set, and go home. + */ + spe_comment(f, 0, "Releasing stencil register set"); + spe_release_register_set(f); + + /* Return true if we could have modified the stencil and/or + * depth buffers. + */ + return modified_buffers; +} + + +/** + * Generate SPE code to implement the fragment operations (alpha test, + * depth test, stencil test, blending, colormask, and final + * framebuffer write) as specified by the current context state. + * + * Logically, this code will be called after running the fragment + * shader. But under some circumstances we could run some of this + * code before the fragment shader to cull fragments/quads that are + * totally occluded/discarded. + * + * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now. + * + * See the spu_default_fragment_ops() function to see how the per-fragment + * operations would be done with ordinary C code. + * The code we generate here though has no branches, is SIMD, etc and + * should be much faster. 
+ * + * \param cell the rendering context (in) + * \param facing whether the generated code is for front-facing or + * back-facing fragments + * \param f the generated function (in/out); on input, the function + * must already have been initialized. On exit, whatever + * instructions within the generated function have had + * the fragment ops appended. + */ +void +cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f) +{ + const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil; + const struct pipe_blend_state *blend = cell->blend; + const struct pipe_blend_color *blend_color = &cell->blend_color; + const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format; + + /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */ + const int x_reg = 3; /* uint */ + const int y_reg = 4; /* uint */ + const int color_tile_reg = 5; /* tile_t * */ + const int depth_tile_reg = 6; /* tile_t * */ + const int fragZ_reg = 7; /* vector float */ + const int fragR_reg = 8; /* vector float */ + const int fragG_reg = 9; /* vector float */ + const int fragB_reg = 10; /* vector float */ + const int fragA_reg = 11; /* vector float */ + const int mask_reg = 12; /* vector uint */ + + ASSERT(facing == CELL_FACING_FRONT || facing == CELL_FACING_BACK); + + /* offset of quad from start of tile + * XXX assuming 4-byte pixels for color AND Z/stencil!!!! + */ + int quad_offset_reg; + + int fbRGBA_reg; /**< framebuffer's RGBA colors for quad */ + int fbZS_reg; /**< framebuffer's combined z/stencil values for quad */ + + if (cell->debug_flags & CELL_DEBUG_ASM) { + spe_print_code(f, true); + spe_indent(f, 8); + spe_comment(f, -4, facing == CELL_FACING_FRONT ? "Begin front-facing per-fragment ops": "Begin back-facing per-fragment ops"); + } + + spe_allocate_register(f, x_reg); + spe_allocate_register(f, y_reg); + spe_allocate_register(f, color_tile_reg); + spe_allocate_register(f, depth_tile_reg); + spe_allocate_register(f, fragZ_reg); + spe_allocate_register(f, fragR_reg); + spe_allocate_register(f, fragG_reg); + spe_allocate_register(f, fragB_reg); + spe_allocate_register(f, fragA_reg); + spe_allocate_register(f, mask_reg); + + quad_offset_reg = spe_allocate_available_register(f); + fbRGBA_reg = spe_allocate_available_register(f); + fbZS_reg = spe_allocate_available_register(f); + + /* compute offset of quad from start of tile, in bytes */ + { + int x2_reg = spe_allocate_available_register(f); + int y2_reg = spe_allocate_available_register(f); + + ASSERT(TILE_SIZE == 32); + + spe_comment(f, 0, "Compute quad offset within tile"); + spe_rotmi(f, y2_reg, y_reg, -1); /* y2 = y / 2 */ + spe_rotmi(f, x2_reg, x_reg, -1); /* x2 = x / 2 */ + spe_shli(f, y2_reg, y2_reg, 4); /* y2 *= 16 */ + spe_a(f, quad_offset_reg, y2_reg, x2_reg); /* offset = y2 + x2 */ + spe_shli(f, quad_offset_reg, quad_offset_reg, 4); /* offset *= 16 */ + + spe_release_register(f, x2_reg); + spe_release_register(f, y2_reg); + } + + /* Generate the alpha test, if needed. */ + if (dsa->alpha.enabled) { + gen_alpha_test(dsa, f, mask_reg, fragA_reg); + } + + /* If we need the stencil buffers (because one- or two-sided stencil is + * enabled) or the depth buffer (because the depth test is enabled), + * go grab them. Note that if either one- or two-sided stencil is + * enabled, dsa->stencil[0].enabled will be true. 
+ */ + if (dsa->depth.enabled || dsa->stencil[0].enabled) { + const enum pipe_format zs_format = cell->framebuffer.zsbuf->format; + boolean write_depth_stencil; + + /* We may or may not need to allocate a register for Z or stencil values */ + boolean fbS_reg_set = false, fbZ_reg_set = false; + unsigned int fbS_reg, fbZ_reg = 0; + + spe_comment(f, 0, "Fetching Z/stencil quad from tile"); + + /* fetch quad of depth/stencil values from tile at (x,y) */ + /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */ + /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */ + spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + + /* From the Z/stencil buffer format, pull out the bits we need for + * Z and/or stencil. We'll also convert the incoming fragment Z + * value in fragZ_reg from a floating point value in [0.0..1.0] to + * an unsigned integer value with the appropriate resolution. + * Note that even if depth or stencil is *not* enabled, if it's + * present in the buffer, we pull it out and put it back later; + * otherwise, we can inadvertently destroy the contents of + * buffers we're not supposed to touch (e.g., if the user is + * clearing the depth buffer but not the stencil buffer, a + * quad of constant depth is drawn over the surface; the stencil + * buffer must be maintained). + */ + switch(zs_format) { + + case PIPE_FORMAT_S8Z24_UNORM: /* fall through */ + case PIPE_FORMAT_X8Z24_UNORM: + /* Pull out both Z and stencil */ + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + setup_optional_register(f, &fbS_reg_set, &fbS_reg); + + /* four 24-bit Z values in the low-order bits */ + spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 24-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + + /* four 8-bit stencil values in the high-order bits */ + spe_rotmi(f, fbS_reg, fbZS_reg, -24); + break; + + case PIPE_FORMAT_Z24S8_UNORM: /* fall through */ + case PIPE_FORMAT_Z24X8_UNORM: + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + setup_optional_register(f, &fbS_reg_set, &fbS_reg); + + /* shift by 8 to get the upper 24-bit values */ + spe_rotmi(f, fbS_reg, fbZS_reg, -8); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 24-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -8); + + /* 8-bit stencil in the low-order bits - mask them out */ + spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff); + break; + + case PIPE_FORMAT_Z32_UNORM: + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + /* Copy over 4 32-bit values */ + spe_move(f, fbZ_reg, fbZS_reg); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 32-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + /* No stencil, so can't do anything there */ + break; + + case PIPE_FORMAT_Z16_UNORM: + /* XXX Not sure this is correct, but it was here before, so we're + * going with it for now + */ + setup_optional_register(f, &fbZ_reg_set, &fbZ_reg); + /* Copy over 4 32-bit values */ + spe_move(f, fbZ_reg, fbZS_reg); + + /* Incoming fragZ_reg value is a float in 0.0...1.0; convert + * to a 16-bit unsigned integer + */ + spe_cfltu(f, fragZ_reg, fragZ_reg, 32); + spe_rotmi(f, fragZ_reg, fragZ_reg, -16); + /* No stencil */ + + default: + ASSERT(0); /* invalid format */ + } + + /* If stencil is enabled, use the stencil-specific code + * generator to generate both the stencil and depth (if 
needed) + * tests. Otherwise, if only depth is enabled, generate + * a quick depth test. The test generators themselves will + * report back whether the depth/stencil buffer has to be + * written back. + */ + if (dsa->stencil[0].enabled) { + /* This will perform the stencil and depth tests, and update + * the mask_reg, fbZ_reg, and fbS_reg as required by the + * tests. + */ + ASSERT(fbS_reg_set); + spe_comment(f, 0, "Perform stencil test"); + + /* Note that fbZ_reg may not be set on entry, if stenciling + * is enabled but there's no Z-buffer. The + * gen_stencil_depth_test() function must ignore the + * fbZ_reg register if depth is not enabled. + */ + write_depth_stencil = gen_stencil_depth_test(f, dsa, facing, mask_reg, fragZ_reg, fbZ_reg, fbS_reg); + } + else if (dsa->depth.enabled) { + int zmask_reg = spe_allocate_available_register(f); + ASSERT(fbZ_reg_set); + spe_comment(f, 0, "Perform depth test"); + write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg); + spe_release_register(f, zmask_reg); + } + else { + write_depth_stencil = false; + } + + if (write_depth_stencil) { + /* Merge latest Z and Stencil values into fbZS_reg. + * fbZ_reg has four Z vals in bits [23..0] or bits [15..0]. + * fbS_reg has four 8-bit Z values in bits [7..0]. + */ + spe_comment(f, 0, "Store quad's depth/stencil values in tile"); + if (zs_format == PIPE_FORMAT_S8Z24_UNORM || + zs_format == PIPE_FORMAT_X8Z24_UNORM) { + spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ + } + else if (zs_format == PIPE_FORMAT_Z24S8_UNORM || + zs_format == PIPE_FORMAT_Z24X8_UNORM) { + spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */ + spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */ + } + else if (zs_format == PIPE_FORMAT_Z32_UNORM) { + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ + } + else if (zs_format == PIPE_FORMAT_Z16_UNORM) { + spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */ + } + else if (zs_format == PIPE_FORMAT_S8_UNORM) { + ASSERT(0); /* XXX to do */ + } + else { + ASSERT(0); /* bad zs_format */ + } + + /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */ + spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg); + } + + /* Don't need these any more */ + release_optional_register(f, &fbZ_reg_set, fbZ_reg); + release_optional_register(f, &fbS_reg_set, fbS_reg); + } + + /* Get framebuffer quad/colors. We'll need these for blending, + * color masking, and to obey the quad/pixel mask. + * Load: fbRGBA_reg = memory[color_tile + quad_offset] + * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking + * we could skip this load. + */ + spe_comment(f, 0, "Fetch quad colors from tile"); + spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg); + + if (blend->blend_enable) { + spe_comment(f, 0, "Perform blending"); + gen_blend(blend, blend_color, f, color_format, + fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg); + } + + /* + * Write fragment colors to framebuffer/tile. + * This involves converting the fragment colors from float[4] to the + * tile's specific format and obeying the quad/pixel mask. 
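 *
 * For PIPE_FORMAT_A8R8G8B8_UNORM, gen_pack_colors() produces
 *    packed = (A << 24) | (R << 16) | (G << 8) | B
 * per pixel, and the selb on mask_reg below keeps the new packed color
 * only for fragments whose mask word is all ones; everywhere else the
 * existing framebuffer color is stored back unchanged.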
+ */ + { + int rgba_reg = spe_allocate_available_register(f); + + /* Pack four float colors as four 32-bit int colors */ + spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors"); + gen_pack_colors(f, color_format, + fragR_reg, fragG_reg, fragB_reg, fragA_reg, + rgba_reg); + + if (blend->logicop_enable) { + spe_comment(f, 0, "Compute logic op"); + gen_logicop(blend, f, rgba_reg, fbRGBA_reg); + } + + if (blend->colormask != PIPE_MASK_RGBA) { + spe_comment(f, 0, "Compute color mask"); + gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg); + } + + /* Mix fragment colors with framebuffer colors using the quad/pixel mask: + * if (mask[i]) + * rgba[i] = rgba[i]; + * else + * rgba[i] = framebuffer[i]; + */ + spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg); + + /* Store updated quad in tile: + * memory[color_tile + quad_offset] = rgba_reg; + */ + spe_comment(f, 0, "Store quad colors into color tile"); + spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg); + + spe_release_register(f, rgba_reg); + } + + //printf("gen_fragment_ops nr instructions: %u\n", f->num_inst); + + spe_bi(f, SPE_REG_RA, 0, 0); /* return from function call */ + + spe_release_register(f, fbRGBA_reg); + spe_release_register(f, fbZS_reg); + spe_release_register(f, quad_offset_reg); + + if (cell->debug_flags & CELL_DEBUG_ASM) { + char buffer[1024]; + sprintf(buffer, "End %s-facing per-fragment ops: %d instructions", + facing == CELL_FACING_FRONT ? "front" : "back", f->num_inst); + spe_comment(f, -4, buffer); + } +} diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h new file mode 100644 index 0000000000..21b35d1faf --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h @@ -0,0 +1,38 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#ifndef CELL_GEN_FRAGMENT_H +#define CELL_GEN_FRAGMENT_H + + +extern void +cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f); + + +#endif /* CELL_GEN_FRAGMENT_H */ + diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c new file mode 100644 index 0000000000..81efd137c7 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c @@ -0,0 +1,358 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: + * Keith Whitwell <keith@tungstengraphics.com> + * Brian Paul + */ + +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "draw/draw_context.h" +#include "cell_context.h" +#include "cell_flush.h" +#include "cell_pipe_state.h" +#include "cell_state.h" +#include "cell_texture.h" + + + +static void * +cell_create_blend_state(struct pipe_context *pipe, + const struct pipe_blend_state *blend) +{ + return mem_dup(blend, sizeof(*blend)); +} + + +static void +cell_bind_blend_state(struct pipe_context *pipe, void *blend) +{ + struct cell_context *cell = cell_context(pipe); + + draw_flush(cell->draw); + + cell->blend = (struct pipe_blend_state *) blend; + cell->dirty |= CELL_NEW_BLEND; +} + + +static void +cell_delete_blend_state(struct pipe_context *pipe, void *blend) +{ + FREE(blend); +} + + +static void +cell_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *blend_color) +{ + struct cell_context *cell = cell_context(pipe); + + draw_flush(cell->draw); + + cell->blend_color = *blend_color; + + cell->dirty |= CELL_NEW_BLEND; +} + + + + +static void * +cell_create_depth_stencil_alpha_state(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *dsa) +{ + return mem_dup(dsa, sizeof(*dsa)); +} + + +static void +cell_bind_depth_stencil_alpha_state(struct pipe_context *pipe, + void *dsa) +{ + struct cell_context *cell = cell_context(pipe); + + draw_flush(cell->draw); + + cell->depth_stencil = (struct pipe_depth_stencil_alpha_state *) dsa; + cell->dirty |= CELL_NEW_DEPTH_STENCIL; +} + + +static void +cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *dsa) +{ + FREE(dsa); +} + + +static void +cell_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ + struct cell_context *cell = cell_context(pipe); + + /* pass the clip state to the draw module */ + draw_set_clip_state(cell->draw, clip); +} + + + +/* Called when driver state tracker notices changes to the viewport + * matrix: + */ +static void +cell_set_viewport_state( struct pipe_context *pipe, + const struct pipe_viewport_state *viewport ) +{ + struct cell_context *cell = cell_context(pipe); + + cell->viewport = *viewport; /* struct copy */ + cell->dirty |= CELL_NEW_VIEWPORT; + + /* pass the viewport info to the draw module */ + draw_set_viewport_state(cell->draw, viewport); + + /* Using tnl/ and vf/ modules is temporary while getting started. + * Full pipe will have vertex shader, vertex fetch of its own. 
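A sketch of the constant-state-object pattern this file follows, with invented demo_* names (the real functions are the cell_* ones above): create_*() hands back a private heap copy of the caller's template via mem_dup(), bind_*() stores the pointer and raises a dirty bit after flushing pending drawing, and delete_*() frees the copy.

#include <stdlib.h>
#include <string.h>

/* Editor's sketch of the create/bind/delete pattern used above.  The
 * demo_* names and demo_context type are invented for illustration.
 */
struct demo_blend_state { int blend_enable; };

struct demo_context {
   const struct demo_blend_state *blend;   /* currently bound state */
   unsigned dirty;                         /* dirty-state bitmask */
};

#define DEMO_NEW_BLEND 0x1

static void *
demo_create_blend_state(const struct demo_blend_state *templ)
{
   /* equivalent of mem_dup(): the driver keeps its own private copy */
   void *state = malloc(sizeof(*templ));
   if (state)
      memcpy(state, templ, sizeof(*templ));
   return state;
}

static void
demo_bind_blend_state(struct demo_context *ctx, void *state)
{
   /* binding is a pointer swap plus a dirty flag; the real driver also
    * calls draw_flush() first so queued primitives use the old state.
    */
   ctx->blend = (const struct demo_blend_state *) state;
   ctx->dirty |= DEMO_NEW_BLEND;
}

static void
demo_delete_blend_state(void *state)
{
   free(state);
}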
+ */ +} + + +static void +cell_set_scissor_state( struct pipe_context *pipe, + const struct pipe_scissor_state *scissor ) +{ + struct cell_context *cell = cell_context(pipe); + + memcpy( &cell->scissor, scissor, sizeof(*scissor) ); + cell->dirty |= CELL_NEW_SCISSOR; +} + + +static void +cell_set_polygon_stipple( struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple ) +{ + struct cell_context *cell = cell_context(pipe); + + memcpy( &cell->poly_stipple, stipple, sizeof(*stipple) ); + cell->dirty |= CELL_NEW_STIPPLE; +} + + + +static void * +cell_create_rasterizer_state(struct pipe_context *pipe, + const struct pipe_rasterizer_state *rasterizer) +{ + return mem_dup(rasterizer, sizeof(*rasterizer)); +} + + +static void +cell_bind_rasterizer_state(struct pipe_context *pipe, void *rast) +{ + struct pipe_rasterizer_state *rasterizer = + (struct pipe_rasterizer_state *) rast; + struct cell_context *cell = cell_context(pipe); + + /* pass-through to draw module */ + draw_set_rasterizer_state(cell->draw, rasterizer); + + cell->rasterizer = rasterizer; + + cell->dirty |= CELL_NEW_RASTERIZER; +} + + +static void +cell_delete_rasterizer_state(struct pipe_context *pipe, void *rasterizer) +{ + FREE(rasterizer); +} + + + +static void * +cell_create_sampler_state(struct pipe_context *pipe, + const struct pipe_sampler_state *sampler) +{ + return mem_dup(sampler, sizeof(*sampler)); +} + + +static void +cell_bind_sampler_states(struct pipe_context *pipe, + unsigned num, void **samplers) +{ + struct cell_context *cell = cell_context(pipe); + uint i, changed = 0x0; + + assert(num <= CELL_MAX_SAMPLERS); + + draw_flush(cell->draw); + + for (i = 0; i < CELL_MAX_SAMPLERS; i++) { + struct pipe_sampler_state *new_samp = i < num ? samplers[i] : NULL; + if (cell->sampler[i] != new_samp) { + cell->sampler[i] = new_samp; + changed |= (1 << i); + } + } + + if (changed) { + cell->dirty |= CELL_NEW_SAMPLER; + cell->dirty_samplers |= changed; + } +} + + +static void +cell_delete_sampler_state(struct pipe_context *pipe, + void *sampler) +{ + FREE( sampler ); +} + + + +static void +cell_set_sampler_textures(struct pipe_context *pipe, + unsigned num, struct pipe_texture **texture) +{ + struct cell_context *cell = cell_context(pipe); + uint i, changed = 0x0; + + assert(num <= CELL_MAX_SAMPLERS); + + for (i = 0; i < CELL_MAX_SAMPLERS; i++) { + struct pipe_texture *new_tex = i < num ? 
texture[i] : NULL; + if ((struct pipe_texture *) cell->texture[i] != new_tex) { + pipe_texture_reference((struct pipe_texture **) &cell->texture[i], + new_tex); + changed |= (1 << i); + } + } + + cell->num_textures = num; + + if (changed) { + cell->dirty |= CELL_NEW_TEXTURE; + cell->dirty_textures |= changed; + } +} + + + +static void +cell_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct cell_context *cell = cell_context(pipe); + + if (1 /*memcmp(&cell->framebuffer, fb, sizeof(*fb))*/) { + struct pipe_surface *csurf = fb->cbufs[0]; + struct pipe_surface *zsurf = fb->zsbuf; + uint i; + uint flags = (PIPE_BUFFER_USAGE_GPU_WRITE | + PIPE_BUFFER_USAGE_GPU_READ); + + /* unmap old surfaces */ + for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { + if (cell->framebuffer.cbufs[i] && cell->cbuf_map[i]) { + pipe_surface_unmap(cell->framebuffer.cbufs[i]); + cell->cbuf_map[i] = NULL; + } + } + + if (cell->framebuffer.zsbuf && cell->zsbuf_map) { + pipe_surface_unmap(cell->framebuffer.zsbuf); + cell->zsbuf_map = NULL; + } + + /* Finish any pending rendering to the current surface before + * installing a new surface! + */ + cell_flush_int(cell, CELL_FLUSH_WAIT); + + /* update my state + * (this is also where old surfaces will finally get freed) + */ + cell->framebuffer.width = fb->width; + cell->framebuffer.height = fb->height; + cell->framebuffer.num_cbufs = fb->num_cbufs; + for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { + pipe_surface_reference(&cell->framebuffer.cbufs[i], fb->cbufs[i]); + } + pipe_surface_reference(&cell->framebuffer.zsbuf, fb->zsbuf); + + /* map new surfaces */ + if (csurf) + cell->cbuf_map[0] = pipe_surface_map(csurf, flags); + + if (zsurf) + cell->zsbuf_map = pipe_surface_map(zsurf, flags); + + cell->dirty |= CELL_NEW_FRAMEBUFFER; + } +} + + + +void +cell_init_state_functions(struct cell_context *cell) +{ + cell->pipe.create_blend_state = cell_create_blend_state; + cell->pipe.bind_blend_state = cell_bind_blend_state; + cell->pipe.delete_blend_state = cell_delete_blend_state; + + cell->pipe.create_sampler_state = cell_create_sampler_state; + cell->pipe.bind_sampler_states = cell_bind_sampler_states; + cell->pipe.delete_sampler_state = cell_delete_sampler_state; + + cell->pipe.set_sampler_textures = cell_set_sampler_textures; + + cell->pipe.create_depth_stencil_alpha_state = cell_create_depth_stencil_alpha_state; + cell->pipe.bind_depth_stencil_alpha_state = cell_bind_depth_stencil_alpha_state; + cell->pipe.delete_depth_stencil_alpha_state = cell_delete_depth_stencil_alpha_state; + + cell->pipe.create_rasterizer_state = cell_create_rasterizer_state; + cell->pipe.bind_rasterizer_state = cell_bind_rasterizer_state; + cell->pipe.delete_rasterizer_state = cell_delete_rasterizer_state; + + cell->pipe.set_blend_color = cell_set_blend_color; + cell->pipe.set_clip_state = cell_set_clip_state; + + cell->pipe.set_framebuffer_state = cell_set_framebuffer_state; + + cell->pipe.set_polygon_stipple = cell_set_polygon_stipple; + cell->pipe.set_scissor_state = cell_set_scissor_state; + cell->pipe.set_viewport_state = cell_set_viewport_state; +} diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.h b/src/gallium/drivers/cell/ppu/cell_pipe_state.h new file mode 100644 index 0000000000..1889bd52ff --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.h @@ -0,0 +1,39 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_PIPE_STATE_H +#define CELL_PIPE_STATE_H + + +struct cell_context; + +extern void +cell_init_state_functions(struct cell_context *cell); + + +#endif /* CELL_PIPE_STATE_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c new file mode 100644 index 0000000000..79cb8df82f --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_render.c @@ -0,0 +1,211 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \brief Last stage of 'draw' pipeline: send tris to SPUs. 
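A minimal, self-contained sketch of the subclassing pattern the render stage below uses: the base draw_stage is embedded as the first member of the derived struct, so the draw module's base pointer can be cast back to the driver's own type. All names here are invented for illustration.

/* Editor's sketch of the first-member "base class" idiom. */
struct demo_stage {
   void (*tri)(struct demo_stage *stage);   /* per-primitive hook */
};

struct demo_render_stage {
   struct demo_stage stage;   /* base -- must be the first member */
   int tris_seen;             /* derived-class data */
};

static struct demo_render_stage *
demo_render_stage_cast(struct demo_stage *stage)
{
   /* valid because 'stage' is the first member of demo_render_stage */
   return (struct demo_render_stage *) stage;
}

static void
demo_render_tri(struct demo_stage *stage)
{
   demo_render_stage_cast(stage)->tris_seen++;
}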
+ * \author Brian Paul + */ + +#include "cell_context.h" +#include "cell_render.h" +#include "cell_spu.h" +#include "util/u_memory.h" +#include "draw/draw_private.h" + + +struct render_stage { + struct draw_stage stage; /**< This must be first (base class) */ + + struct cell_context *cell; +}; + + +static INLINE struct render_stage * +render_stage(struct draw_stage *stage) +{ + return (struct render_stage *) stage; +} + + +static void render_begin( struct draw_stage *stage ) +{ +#if 0 + struct render_stage *render = render_stage(stage); + struct cell_context *sp = render->cell; + const struct pipe_shader_state *fs = &render->cell->fs->shader; + render->quad.nr_attrs = render->cell->nr_frag_attrs; + + render->firstFpInput = fs->input_semantic_name[0]; + + sp->quad.first->begin(sp->quad.first); +#endif +} + + +static void render_end( struct draw_stage *stage ) +{ +} + + +static void reset_stipple_counter( struct draw_stage *stage ) +{ + struct render_stage *render = render_stage(stage); + /*render->cell->line_stipple_counter = 0;*/ +} + + +static void +render_point(struct draw_stage *stage, struct prim_header *prim) +{ +} + + +static void +render_line(struct draw_stage *stage, struct prim_header *prim) +{ +} + + +/** Write a vertex into the prim buffer */ +static void +save_vertex(struct cell_prim_buffer *buf, uint pos, + const struct vertex_header *vert) +{ + uint attr, j; + + for (attr = 0; attr < 2; attr++) { + for (j = 0; j < 4; j++) { + buf->vertex[pos][attr][j] = vert->data[attr][j]; + } + } + + /* update bounding box */ + if (vert->data[0][0] < buf->xmin) + buf->xmin = vert->data[0][0]; + if (vert->data[0][0] > buf->xmax) + buf->xmax = vert->data[0][0]; + if (vert->data[0][1] < buf->ymin) + buf->ymin = vert->data[0][1]; + if (vert->data[0][1] > buf->ymax) + buf->ymax = vert->data[0][1]; +} + + +static void +render_tri(struct draw_stage *stage, struct prim_header *prim) +{ + struct render_stage *rs = render_stage(stage); + struct cell_context *cell = rs->cell; + struct cell_prim_buffer *buf = &cell->prim_buffer; + uint i; + + if (buf->num_verts + 3 > CELL_MAX_VERTS) { + cell_flush_prim_buffer(cell); + } + + i = buf->num_verts; + assert(i+2 <= CELL_MAX_VERTS); + save_vertex(buf, i+0, prim->v[0]); + save_vertex(buf, i+1, prim->v[1]); + save_vertex(buf, i+2, prim->v[2]); + buf->num_verts += 3; +} + + +/** + * Send the a RENDER command to all SPUs to have them render the prims + * in the current prim_buffer. + */ +void +cell_flush_prim_buffer(struct cell_context *cell) +{ + uint i; + + if (cell->prim_buffer.num_verts == 0) + return; + + for (i = 0; i < cell->num_spus; i++) { + struct cell_command_render *render = &cell_global.command[i].render; + render->prim_type = PIPE_PRIM_TRIANGLES; + render->num_verts = cell->prim_buffer.num_verts; + render->front_winding = cell->rasterizer->front_winding; + render->vertex_size = cell->vertex_info->size * 4; + render->xmin = cell->prim_buffer.xmin; + render->ymin = cell->prim_buffer.ymin; + render->xmax = cell->prim_buffer.xmax; + render->ymax = cell->prim_buffer.ymax; + render->vertex_data = &cell->prim_buffer.vertex; + ASSERT_ALIGN16(render->vertex_data); + send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_RENDER); + } + + cell->prim_buffer.num_verts = 0; + + cell->prim_buffer.xmin = 1e100; + cell->prim_buffer.ymin = 1e100; + cell->prim_buffer.xmax = -1e100; + cell->prim_buffer.ymax = -1e100; + + /* XXX temporary, need to double-buffer the prim buffer until we get + * a real command buffer/list system. 
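A scalar sketch, with invented names, of the buffering policy used by render_tri() and cell_flush_prim_buffer() above: vertices are appended to a fixed-size buffer while a screen-space bounding box is accumulated, the buffer is flushed once it is about to overflow, and the bounds are reset to sentinels so the first new vertex re-establishes them.

/* Editor's sketch only; the real flush fills a cell_command_render per
 * SPU and rings its mailbox, which is omitted here.
 */
#define DEMO_MAX_VERTS 240

struct demo_prim_buffer {
   float x[DEMO_MAX_VERTS], y[DEMO_MAX_VERTS];
   unsigned num_verts;
   float xmin, ymin, xmax, ymax;
};

static void
demo_reset(struct demo_prim_buffer *buf)
{
   buf->num_verts = 0;
   /* sentinels: any real vertex will shrink/grow past these */
   buf->xmin = buf->ymin = 1e30f;
   buf->xmax = buf->ymax = -1e30f;
}

static void
demo_flush(struct demo_prim_buffer *buf)
{
   /* send accumulated vertices + bounding box to the workers here */
   demo_reset(buf);
}

static void
demo_add_vertex(struct demo_prim_buffer *buf, float x, float y)
{
   if (buf->num_verts + 1 > DEMO_MAX_VERTS)
      demo_flush(buf);
   buf->x[buf->num_verts] = x;
   buf->y[buf->num_verts] = y;
   buf->num_verts++;
   if (x < buf->xmin) buf->xmin = x;
   if (x > buf->xmax) buf->xmax = x;
   if (y < buf->ymin) buf->ymin = y;
   if (y > buf->ymax) buf->ymax = y;
}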
+ */ + cell_flush(&cell->pipe, 0x0); +} + + + +static void render_destroy( struct draw_stage *stage ) +{ + FREE( stage ); +} + + +/** + * Create a new draw/render stage. This will be plugged into the + * draw module as the last pipeline stage. + */ +struct draw_stage *cell_draw_render_stage( struct cell_context *cell ) +{ + struct render_stage *render = CALLOC_STRUCT(render_stage); + + render->cell = cell; + render->stage.draw = cell->draw; + render->stage.begin = render_begin; + render->stage.point = render_point; + render->stage.line = render_line; + render->stage.tri = render_tri; + render->stage.end = render_end; + render->stage.reset_stipple_counter = reset_stipple_counter; + render->stage.destroy = render_destroy; + + /* + render->quad.coef = render->coef; + render->quad.posCoef = &render->posCoef; + */ + + return &render->stage; +} diff --git a/src/gallium/drivers/cell/ppu/cell_render.h b/src/gallium/drivers/cell/ppu/cell_render.h new file mode 100644 index 0000000000..826dcbafeb --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_render.h @@ -0,0 +1,39 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef CELL_RENDER_H +#define CELL_RENDER_H + +struct cell_context; +struct draw_stage; + +extern void +cell_flush_prim_buffer(struct cell_context *cell); + +extern struct draw_stage *cell_draw_render_stage( struct cell_context *cell ); + +#endif /* CELL_RENDER_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c new file mode 100644 index 0000000000..d223557950 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_screen.c @@ -0,0 +1,170 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "util/u_memory.h" +#include "pipe/p_winsys.h" +#include "pipe/p_defines.h" +#include "pipe/p_screen.h" + +#include "cell/common.h" +#include "cell_screen.h" +#include "cell_texture.h" +#include "cell_winsys.h" + + +static const char * +cell_get_vendor(struct pipe_screen *screen) +{ + return "Tungsten Graphics, Inc."; +} + + +static const char * +cell_get_name(struct pipe_screen *screen) +{ + return "Cell"; +} + + +static int +cell_get_param(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return CELL_MAX_SAMPLERS; + case PIPE_CAP_NPOT_TEXTURES: + return 1; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 1; + case PIPE_CAP_GLSL: + return 1; + case PIPE_CAP_S3TC: + return 0; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 0; + case PIPE_CAP_POINT_SPRITE: + return 1; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 1; + case PIPE_CAP_OCCLUSION_QUERY: + return 1; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 10; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return CELL_MAX_TEXTURE_LEVELS; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 8; /* max 128x128x128 */ + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return CELL_MAX_TEXTURE_LEVELS; + default: + return 10; + } +} + + +static float +cell_get_paramf(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + /* fall-through */ + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 255.0; /* arbitrary */ + + case PIPE_CAP_MAX_POINT_WIDTH: + /* fall-through */ + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 255.0; /* arbitrary */ + + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 0.0; + + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 16.0; /* arbitrary */ + + default: + return 10; + } +} + + +static boolean +cell_is_format_supported( struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, + unsigned geom_flags ) +{ + /* cell supports most formats, XXX for now anyway */ + if (format == PIPE_FORMAT_DXT5_RGBA || + format == PIPE_FORMAT_R8G8B8A8_SRGB) + return FALSE; + else + return TRUE; +} + + +static void +cell_destroy_screen( struct pipe_screen *screen ) +{ + struct pipe_winsys *winsys = screen->winsys; + + if(winsys->destroy) + winsys->destroy(winsys); + + FREE(screen); +} + + +/** + * Create a new pipe_screen object + * 
Note: we're not presently subclassing pipe_screen (no cell_screen) but + * that would be the place to put SPU thread/context info... + */ +struct pipe_screen * +cell_create_screen(struct pipe_winsys *winsys) +{ + struct pipe_screen *screen = CALLOC_STRUCT(pipe_screen); + + if (!screen) + return NULL; + + screen->winsys = winsys; + + screen->destroy = cell_destroy_screen; + + screen->get_name = cell_get_name; + screen->get_vendor = cell_get_vendor; + screen->get_param = cell_get_param; + screen->get_paramf = cell_get_paramf; + screen->is_format_supported = cell_is_format_supported; + + cell_init_screen_texture_funcs(screen); + + return screen; +} diff --git a/src/gallium/drivers/cell/ppu/cell_screen.h b/src/gallium/drivers/cell/ppu/cell_screen.h new file mode 100644 index 0000000000..c7e15889d6 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_screen.h @@ -0,0 +1,41 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_SCREEN_H +#define CELL_SCREEN_H + + +struct pipe_screen; +struct pipe_winsys; + + +extern struct pipe_screen * +cell_create_screen(struct pipe_winsys *winsys); + + +#endif /* CELL_SCREEN_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c new file mode 100644 index 0000000000..28e5e6d706 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_spu.c @@ -0,0 +1,219 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * Utility/wrappers for communicating with the SPUs. + */ + + +#include <pthread.h> + +#include "cell_spu.h" +#include "pipe/p_format.h" +#include "pipe/p_state.h" +#include "util/u_memory.h" +#include "cell/common.h" + + +/* +helpful headers: +/opt/ibm/cell-sdk/prototype/src/include/ppu/cbe_mfc.h +*/ + + +/** + * Cell/SPU info that's not per-context. + */ +struct cell_global_info cell_global; + + +/** + * Scan /proc/cpuinfo to determine the timebase for the system. + * This is used by the SPUs to convert 'decrementer' ticks to seconds. + * There may be a better way to get this value... + */ +static unsigned +get_timebase(void) +{ + FILE *f = fopen("/proc/cpuinfo", "r"); + unsigned timebase; + + assert(f); + while (!feof(f)) { + char line[80]; + fgets(line, sizeof(line), f); + if (strncmp(line, "timebase", 8) == 0) { + char *colon = strchr(line, ':'); + if (colon) { + timebase = atoi(colon + 2); + break; + } + } + } + fclose(f); + + return timebase; +} + + +/** + * Write a 1-word message to the given SPE mailbox. + */ +void +send_mbox_message(spe_context_ptr_t ctx, unsigned int msg) +{ + spe_in_mbox_write(ctx, &msg, 1, SPE_MBOX_ALL_BLOCKING); +} + + +/** + * Wait for a 1-word message to arrive in given mailbox. + */ +uint +wait_mbox_message(spe_context_ptr_t ctx) +{ + do { + unsigned data; + int count = spe_out_mbox_read(ctx, &data, 1); + + if (count == 1) { + return data; + } + + if (count < 0) { + /* error */ ; + } + } while (1); +} + + +/** + * Called by pthread_create() to spawn an SPU thread. + */ +static void * +cell_thread_function(void *arg) +{ + struct cell_init_info *init = (struct cell_init_info *) arg; + unsigned entry = SPE_DEFAULT_ENTRY; + + ASSERT_ALIGN16(init); + + if (spe_context_run(cell_global.spe_contexts[init->id], &entry, 0, + init, NULL, NULL) < 0) { + fprintf(stderr, "spe_context_run() failed\n"); + exit(1); + } + + pthread_exit(NULL); +} + + +/** + * Create the SPU threads. This is done once during driver initialization. + * This involves setting the the "init" message which is sent to each SPU. + * The init message specifies an SPU id, total number of SPUs, location + * and number of batch buffers, etc. + */ +void +cell_start_spus(struct cell_context *cell) +{ + static boolean one_time_init = FALSE; + uint i, j; + uint timebase = get_timebase(); + + if (one_time_init) { + fprintf(stderr, "PPU: Multiple rendering contexts not yet supported " + "on Cell.\n"); + abort(); + } + + one_time_init = TRUE; + + assert(cell->num_spus <= CELL_MAX_SPUS); + + ASSERT_ALIGN16(&cell_global.inits[0]); + ASSERT_ALIGN16(&cell_global.inits[1]); + + /* + * Initialize the global 'inits' structure for each SPU. + * A pointer to the init struct will be passed to each SPU. + * The SPUs will then each grab their init info with mfc_get(). 
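One caveat on get_timebase() above: if /proc/cpuinfo cannot be opened or contains no "timebase" line, the function returns an uninitialized value, and the fgets() result is never checked. A slightly more defensive variant is sketched here, assuming the same "timebase : <decimal>" line format; the _checked name is invented:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Editor's sketch: defensive version of the /proc/cpuinfo timebase scan.
 * Returns 0 when the file can't be read or no "timebase" line is found,
 * so the caller can detect failure instead of using garbage.
 */
static unsigned long
get_timebase_checked(void)
{
   FILE *f = fopen("/proc/cpuinfo", "r");
   char line[128];
   unsigned long timebase = 0;

   if (!f)
      return 0;

   while (fgets(line, sizeof(line), f)) {
      if (strncmp(line, "timebase", 8) == 0) {
         const char *colon = strchr(line, ':');
         if (colon) {
            timebase = strtoul(colon + 1, NULL, 10);
            break;
         }
      }
   }
   fclose(f);
   return timebase;
}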
+ */ + for (i = 0; i < cell->num_spus; i++) { + cell_global.inits[i].id = i; + cell_global.inits[i].num_spus = cell->num_spus; + cell_global.inits[i].debug_flags = cell->debug_flags; + cell_global.inits[i].inv_timebase = 1000.0f / timebase; + + for (j = 0; j < CELL_NUM_BUFFERS; j++) { + cell_global.inits[i].buffers[j] = cell->buffer[j]; + } + cell_global.inits[i].buffer_status = &cell->buffer_status[0][0][0]; + + cell_global.inits[i].spu_functions = &cell->spu_functions; + + cell_global.spe_contexts[i] = spe_context_create(0, NULL); + if (!cell_global.spe_contexts[i]) { + fprintf(stderr, "spe_context_create() failed\n"); + exit(1); + } + + if (spe_program_load(cell_global.spe_contexts[i], &g3d_spu)) { + fprintf(stderr, "spe_program_load() failed\n"); + exit(1); + } + + pthread_create(&cell_global.spe_threads[i], /* returned thread handle */ + NULL, /* pthread attribs */ + &cell_thread_function, /* start routine */ + &cell_global.inits[i]); /* thread argument */ + } +} + + +/** + * Tell all the SPUs to stop/exit. + * This is done when the driver's exiting / cleaning up. + */ +void +cell_spu_exit(struct cell_context *cell) +{ + uint i; + + for (i = 0; i < cell->num_spus; i++) { + send_mbox_message(cell_global.spe_contexts[i], CELL_CMD_EXIT); + } + + /* wait for threads to exit */ + for (i = 0; i < cell->num_spus; i++) { + void *value; + pthread_join(cell_global.spe_threads[i], &value); + cell_global.spe_threads[i] = 0; + cell_global.spe_contexts[i] = 0; + } +} diff --git a/src/gallium/drivers/cell/ppu/cell_spu.h b/src/gallium/drivers/cell/ppu/cell_spu.h new file mode 100644 index 0000000000..c93958a9ed --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_spu.h @@ -0,0 +1,79 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef CELL_SPU +#define CELL_SPU + + +#include <libspe2.h> +#include <pthread.h> +#include "cell/common.h" + +#include "cell_context.h" + + +/** + * Global vars, for now anyway. 
+ */ +struct cell_global_info +{ + /** + * SPU/SPE handles, etc + */ + spe_context_ptr_t spe_contexts[CELL_MAX_SPUS]; + pthread_t spe_threads[CELL_MAX_SPUS]; + + /** + * Data sent to SPUs at start-up + */ + struct cell_init_info inits[CELL_MAX_SPUS]; +}; + + +extern struct cell_global_info cell_global; + + +/** This is the handle for the actual SPE code */ +extern spe_program_handle_t g3d_spu; + + +extern void +send_mbox_message(spe_context_ptr_t ctx, unsigned int msg); + +extern uint +wait_mbox_message(spe_context_ptr_t ctx); + + +extern void +cell_start_spus(struct cell_context *cell); + + +extern void +cell_spu_exit(struct cell_context *cell); + + +#endif /* CELL_SPU */ diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h new file mode 100644 index 0000000000..b193170f9c --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state.h @@ -0,0 +1,65 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_STATE_H +#define CELL_STATE_H + + +#define CELL_NEW_VIEWPORT 0x1 +#define CELL_NEW_RASTERIZER 0x2 +#define CELL_NEW_FS 0x4 +#define CELL_NEW_BLEND 0x8 +#define CELL_NEW_CLIP 0x10 +#define CELL_NEW_SCISSOR 0x20 +#define CELL_NEW_STIPPLE 0x40 +#define CELL_NEW_FRAMEBUFFER 0x80 +#define CELL_NEW_ALPHA_TEST 0x100 +#define CELL_NEW_DEPTH_STENCIL 0x200 +#define CELL_NEW_SAMPLER 0x400 +#define CELL_NEW_TEXTURE 0x800 +#define CELL_NEW_VERTEX 0x1000 +#define CELL_NEW_VS 0x2000 +#define CELL_NEW_VS_CONSTANTS 0x4000 +#define CELL_NEW_FS_CONSTANTS 0x8000 +#define CELL_NEW_VERTEX_INFO 0x10000 + + +extern void +cell_update_derived( struct cell_context *softpipe ); + + +extern void +cell_init_shader_functions(struct cell_context *cell); + + +extern void +cell_init_vertex_functions(struct cell_context *cell); + + +#endif /* CELL_STATE_H */ + diff --git a/src/gallium/drivers/cell/ppu/cell_state_derived.c b/src/gallium/drivers/cell/ppu/cell_state_derived.c new file mode 100644 index 0000000000..efc4f78364 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_derived.c @@ -0,0 +1,170 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. 
+ * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "draw/draw_context.h" +#include "draw/draw_vertex.h" +#include "cell_context.h" +#include "cell_batch.h" +#include "cell_state.h" +#include "cell_state_emit.h" + + +/** + * Determine how to map vertex program outputs to fragment program inputs. + * Basically, this will be used when computing the triangle interpolation + * coefficients from the post-transform vertex attributes. + */ +static void +calculate_vertex_layout( struct cell_context *cell ) +{ + const struct cell_fragment_shader_state *fs = cell->fs; + const enum interp_mode colorInterp + = cell->rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR; + struct vertex_info *vinfo = &cell->vertex_info; + uint i; + int src; + +#if 0 + if (cell->vbuf) { + /* if using the post-transform vertex buffer, tell draw_vbuf to + * simply emit the whole post-xform vertex as-is: + */ + struct vertex_info *vinfo_vbuf = &cell->vertex_info_vbuf; + vinfo_vbuf->num_attribs = 0; + draw_emit_vertex_attr(vinfo_vbuf, EMIT_ALL, INTERP_NONE, 0); + vinfo_vbuf->size = 4 * vs->num_outputs + sizeof(struct vertex_header)/4; + } +#endif + + /* reset vinfo */ + vinfo->num_attribs = 0; + + /* we always want to emit vertex pos */ + src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_POSITION, 0); + assert(src >= 0); + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_POS, src); + + + /* + * Loop over fragment shader inputs, searching for the matching output + * from the vertex shader. 
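A stripped-down sketch of the matching step the loop below performs: each fragment-shader input is looked up among the vertex-shader outputs by its (semantic name, semantic index) pair, which is what draw_find_vs_output() does for the real driver. Types and names here are invented:

/* Editor's illustration of semantic matching between shader stages. */
struct demo_semantic {
   unsigned name;    /* e.g. POSITION, COLOR, FOG, GENERIC */
   unsigned index;   /* e.g. COLOR0 vs. COLOR1 */
};

/* Return the slot of the VS output matching (name, index), or -1. */
static int
demo_find_vs_output(const struct demo_semantic *vs_outputs,
                    unsigned num_outputs, unsigned name, unsigned index)
{
   unsigned i;
   for (i = 0; i < num_outputs; i++) {
      if (vs_outputs[i].name == name && vs_outputs[i].index == index)
         return (int) i;
   }
   return -1;
}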
+ */ + for (i = 0; i < fs->info.num_inputs; i++) { + switch (fs->info.input_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + /* already done above */ + break; + + case TGSI_SEMANTIC_COLOR: + src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_COLOR, + fs->info.input_semantic_index[i]); + assert(src >= 0); + draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src); + break; + + case TGSI_SEMANTIC_FOG: + src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_FOG, 0); +#if 1 + if (src < 0) /* XXX temp hack, try demos/fogcoord.c with this */ + src = 0; +#endif + assert(src >= 0); + draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, src); + break; + + case TGSI_SEMANTIC_GENERIC: + /* this includes texcoords and varying vars */ + src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_GENERIC, + fs->info.input_semantic_index[i]); + assert(src >= 0); + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); + break; + + default: + assert(0); + } + } + + draw_compute_vertex_size(vinfo); + + /* XXX only signal this if format really changes */ + cell->dirty |= CELL_NEW_VERTEX_INFO; +} + + +#if 0 +/** + * Recompute cliprect from scissor bounds, scissor enable and surface size. + */ +static void +compute_cliprect(struct cell_context *sp) +{ + uint surfWidth = sp->framebuffer.width; + uint surfHeight = sp->framebuffer.height; + + if (sp->rasterizer->scissor) { + /* clip to scissor rect */ + sp->cliprect.minx = MAX2(sp->scissor.minx, 0); + sp->cliprect.miny = MAX2(sp->scissor.miny, 0); + sp->cliprect.maxx = MIN2(sp->scissor.maxx, surfWidth); + sp->cliprect.maxy = MIN2(sp->scissor.maxy, surfHeight); + } + else { + /* clip to surface bounds */ + sp->cliprect.minx = 0; + sp->cliprect.miny = 0; + sp->cliprect.maxx = surfWidth; + sp->cliprect.maxy = surfHeight; + } +} +#endif + + + +/** + * Update derived state, send current state to SPUs prior to rendering. + */ +void cell_update_derived( struct cell_context *cell ) +{ + if (cell->dirty & (CELL_NEW_RASTERIZER | + CELL_NEW_FS | + CELL_NEW_VS)) + calculate_vertex_layout( cell ); + +#if 0 + if (cell->dirty & (CELL_NEW_SCISSOR | + CELL_NEW_DEPTH_STENCIL_ALPHA | + CELL_NEW_FRAMEBUFFER)) + compute_cliprect(cell); +#endif + + cell_emit_state(cell); + + cell->dirty = 0; +} diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c new file mode 100644 index 0000000000..0a0af81f53 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c @@ -0,0 +1,335 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_inlines.h" +#include "util/u_memory.h" +#include "cell_context.h" +#include "cell_gen_fragment.h" +#include "cell_state.h" +#include "cell_state_emit.h" +#include "cell_batch.h" +#include "cell_texture.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" + + +/** + * Find/create a cell_command_fragment_ops object corresponding to the + * current blend/stencil/z/colormask/etc. state. + */ +static struct cell_command_fragment_ops * +lookup_fragment_ops(struct cell_context *cell) +{ + struct cell_fragment_ops_key key; + struct cell_command_fragment_ops *ops; + + /* + * Build key + */ + memset(&key, 0, sizeof(key)); + key.blend = *cell->blend; + key.blend_color = cell->blend_color; + key.dsa = *cell->depth_stencil; + + if (cell->framebuffer.cbufs[0]) + key.color_format = cell->framebuffer.cbufs[0]->format; + else + key.color_format = PIPE_FORMAT_NONE; + + if (cell->framebuffer.zsbuf) + key.zs_format = cell->framebuffer.zsbuf->format; + else + key.zs_format = PIPE_FORMAT_NONE; + + /* + * Look up key in cache. + */ + ops = (struct cell_command_fragment_ops *) + util_keymap_lookup(cell->fragment_ops_cache, &key); + + /* + * If not found, create/save new fragment ops command. + */ + if (!ops) { + struct spe_function spe_code_front, spe_code_back; + unsigned int facing_dependent, total_code_size; + + if (0) + debug_printf("**** Create New Fragment Ops\n"); + + /* Prepare the buffer that will hold the generated code. The + * "0" passed in for the size means that the SPE code will + * use a default size. + */ + spe_init_func(&spe_code_front, 0); + spe_init_func(&spe_code_back, 0); + + /* Generate new code. Always generate new code for both front-facing + * and back-facing fragments, even if it's the same code in both + * cases. + */ + cell_gen_fragment_function(cell, CELL_FACING_FRONT, &spe_code_front); + cell_gen_fragment_function(cell, CELL_FACING_BACK, &spe_code_back); + + /* Make sure the code is a multiple of 8 bytes long; this is + * required to ensure that the dual pipe instruction alignment + * is correct. It's also important for the SPU unpacking, + * which assumes 8-byte boundaries. + */ + unsigned int front_code_size = spe_code_size(&spe_code_front); + while (front_code_size % 8 != 0) { + spe_lnop(&spe_code_front); + front_code_size = spe_code_size(&spe_code_front); + } + unsigned int back_code_size = spe_code_size(&spe_code_back); + while (back_code_size % 8 != 0) { + spe_lnop(&spe_code_back); + back_code_size = spe_code_size(&spe_code_back); + } + + /* Determine whether the code we generated is facing-dependent, by + * determining whether the generated code is different for the front- + * and back-facing fragments. + */ + if (front_code_size == back_code_size && memcmp(spe_code_front.store, spe_code_back.store, front_code_size) == 0) { + /* Code is identical; only need one copy. */ + facing_dependent = 0; + total_code_size = front_code_size; + } + else { + /* Code is different for front-facing and back-facing fragments. + * Need to send both copies. + */ + facing_dependent = 1; + total_code_size = front_code_size + back_code_size; + } + + /* alloc new fragment ops command. 
Note that this structure + * has variant length based on the total code size required. + */ + ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size); + /* populate the new cell_command_fragment_ops object */ + ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS; + ops->total_code_size = total_code_size; + ops->front_code_index = 0; + memcpy(ops->code, spe_code_front.store, front_code_size); + if (facing_dependent) { + /* We have separate front- and back-facing code. Append the + * back-facing code to the buffer. Be careful because the code + * size is in bytes, but the buffer is of unsigned elements. + */ + ops->back_code_index = front_code_size / sizeof(spe_code_front.store[0]); + memcpy(ops->code + ops->back_code_index, spe_code_back.store, back_code_size); + } + else { + /* Use the same code for front- and back-facing fragments */ + ops->back_code_index = ops->front_code_index; + } + + /* Set the fields for the fallback case. Note that these fields + * (and the whole fallback case) will eventually go away. + */ + ops->dsa = *cell->depth_stencil; + ops->blend = *cell->blend; + ops->blend_color = cell->blend_color; + + /* insert cell_command_fragment_ops object into keymap/cache */ + util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL); + + /* release rtasm buffer */ + spe_release_func(&spe_code_front); + spe_release_func(&spe_code_back); + } + else { + if (0) + debug_printf("**** Re-use Fragment Ops\n"); + } + + return ops; +} + + + +static void +emit_state_cmd(struct cell_context *cell, uint cmd, + const void *state, uint state_size) +{ + uint64_t *dst = (uint64_t *) + cell_batch_alloc(cell, ROUNDUP8(sizeof(uint64_t) + state_size)); + *dst = cmd; + memcpy(dst + 1, state, state_size); +} + + +/** + * For state marked as 'dirty', construct a state-update command block + * and insert it into the current batch buffer. + */ +void +cell_emit_state(struct cell_context *cell) +{ + if (cell->dirty & CELL_NEW_FRAMEBUFFER) { + struct pipe_surface *cbuf = cell->framebuffer.cbufs[0]; + struct pipe_surface *zbuf = cell->framebuffer.zsbuf; + struct cell_command_framebuffer *fb + = cell_batch_alloc(cell, sizeof(*fb)); + fb->opcode = CELL_CMD_STATE_FRAMEBUFFER; + fb->color_start = cell->cbuf_map[0]; + fb->color_format = cbuf->format; + fb->depth_start = cell->zsbuf_map; + fb->depth_format = zbuf ? 
zbuf->format : PIPE_FORMAT_NONE; + fb->width = cell->framebuffer.width; + fb->height = cell->framebuffer.height; +#if 0 + printf("EMIT color format %s\n", pf_name(fb->color_format)); + printf("EMIT depth format %s\n", pf_name(fb->depth_format)); +#endif + } + + if (cell->dirty & (CELL_NEW_RASTERIZER)) { + struct cell_command_rasterizer *rast = + cell_batch_alloc(cell, sizeof(*rast)); + rast->opcode = CELL_CMD_STATE_RASTERIZER; + rast->rasterizer = *cell->rasterizer; + } + + if (cell->dirty & (CELL_NEW_FS)) { + /* Send new fragment program to SPUs */ + struct cell_command_fragment_program *fp + = cell_batch_alloc(cell, sizeof(*fp)); + fp->opcode = CELL_CMD_STATE_FRAGMENT_PROGRAM; + fp->num_inst = cell->fs->code.num_inst; + memcpy(&fp->code, cell->fs->code.store, + SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE); + if (0) { + int i; + printf("PPU Emit CELL_CMD_STATE_FRAGMENT_PROGRAM:\n"); + for (i = 0; i < fp->num_inst; i++) { + printf(" %3d: 0x%08x\n", i, fp->code[i]); + } + } + } + + if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) { + const uint shader = PIPE_SHADER_FRAGMENT; + const uint num_const = cell->constants[shader].size / sizeof(float); + uint i, j; + float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float)); + uint64_t *ibuf = (uint64_t *) buf; + const float *constants = pipe_buffer_map(cell->pipe.screen, + cell->constants[shader].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS; + ibuf[1] = num_const; + j = 4; + for (i = 0; i < num_const; i++) { + buf[j++] = constants[i]; + } + pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer); + } + + if (cell->dirty & (CELL_NEW_FRAMEBUFFER | + CELL_NEW_DEPTH_STENCIL | + CELL_NEW_BLEND)) { + struct cell_command_fragment_ops *fops, *fops_cmd; + /* Note that cell_command_fragment_ops is a variant-sized record */ + fops = lookup_fragment_ops(cell); + fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd) + fops->total_code_size); + memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size); + } + + if (cell->dirty & CELL_NEW_SAMPLER) { + uint i; + for (i = 0; i < CELL_MAX_SAMPLERS; i++) { + if (cell->dirty_samplers & (1 << i)) { + if (cell->sampler[i]) { + struct cell_command_sampler *sampler + = cell_batch_alloc(cell, sizeof(*sampler)); + sampler->opcode = CELL_CMD_STATE_SAMPLER; + sampler->unit = i; + sampler->state = *cell->sampler[i]; + } + } + } + cell->dirty_samplers = 0x0; + } + + if (cell->dirty & CELL_NEW_TEXTURE) { + uint i; + for (i = 0;i < CELL_MAX_SAMPLERS; i++) { + if (cell->dirty_textures & (1 << i)) { + struct cell_command_texture *texture + = cell_batch_alloc(cell, sizeof(*texture)); + texture->opcode = CELL_CMD_STATE_TEXTURE; + texture->unit = i; + if (cell->texture[i]) { + uint level; + for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { + texture->start[level] = cell->texture[i]->tiled_mapped[level]; + texture->width[level] = cell->texture[i]->base.width[level]; + texture->height[level] = cell->texture[i]->base.height[level]; + texture->depth[level] = cell->texture[i]->base.depth[level]; + } + texture->target = cell->texture[i]->base.target; + } + else { + uint level; + for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) { + texture->start[level] = NULL; + texture->width[level] = 0; + texture->height[level] = 0; + texture->depth[level] = 0; + } + texture->target = 0; + } + } + } + cell->dirty_textures = 0x0; + } + + if (cell->dirty & CELL_NEW_VERTEX_INFO) { + emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO, + &cell->vertex_info, sizeof(struct vertex_info)); + } 
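A sketch of the record layout emit_state_cmd() produces above: an 8-byte opcode word followed by the raw state struct, with the record length rounded up to a multiple of 8 so the next opcode stays aligned for the SPU-side parser. The demo_* names and the flat destination pointer are invented; the real code writes into space returned by cell_batch_alloc():

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Editor's illustration of the batch record layout. */
#define DEMO_ROUNDUP8(x) (((x) + 7) & ~(size_t) 7)

static size_t
demo_pack_state_cmd(uint8_t *dst, uint64_t opcode,
                    const void *state, size_t state_size)
{
   size_t total = DEMO_ROUNDUP8(sizeof(uint64_t) + state_size);

   memcpy(dst, &opcode, sizeof(opcode));            /* 8-byte command word */
   memcpy(dst + sizeof(opcode), state, state_size); /* payload follows */
   return total;   /* caller advances its write pointer by this much */
}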
+ +#if 0 + if (cell->dirty & CELL_NEW_VS) { + const struct draw_context *const draw = cell->draw; + struct cell_shader_info info; + + info.num_outputs = draw_num_vs_outputs(draw); + info.declarations = (uintptr_t) draw->vs.machine.Declarations; + info.num_declarations = draw->vs.machine.NumDeclarations; + info.instructions = (uintptr_t) draw->vs.machine.Instructions; + info.num_instructions = draw->vs.machine.NumInstructions; + info.immediates = (uintptr_t) draw->vs.machine.Imms; + info.num_immediates = draw->vs.machine.ImmLimit / 4; + + emit_state_cmd(cell, CELL_CMD_STATE_BIND_VS, &info, sizeof(info)); + } +#endif +} diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.h b/src/gallium/drivers/cell/ppu/cell_state_emit.h new file mode 100644 index 0000000000..59f8affe8d --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_emit.h @@ -0,0 +1,36 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef CELL_STATE_EMIT_H +#define CELL_STATE_EMIT_H + + +extern void +cell_emit_state(struct cell_context *cell); + + +#endif /* CELL_STATE_EMIT_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c new file mode 100644 index 0000000000..78cb446c14 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c @@ -0,0 +1,1430 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file + * Generate code to perform all per-fragment operations. + * + * Code generated by these functions perform both alpha, depth, and stencil + * testing as well as alpha blending. + * + * \note + * Occlusion query is not supported, but this is the right place to add that + * support. + * + * \author Ian Romanick <idr@us.ibm.com> + */ + +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "cell_context.h" + +#include "rtasm/rtasm_ppc_spe.h" + + +/** + * Generate code to perform alpha testing. + * + * The code generated by this function uses the register specificed by + * \c mask as both an input and an output. + * + * \param dsa Current alpha-test state + * \param f Function to which code should be appended + * \param mask Index of register containing active fragment mask + * \param alphas Index of register containing per-fragment alpha values + * + * \note Emits a maximum of 6 instructions. + */ +static void +emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa, + struct spe_function *f, int mask, int alphas) +{ + /* If the alpha function is either NEVER or ALWAYS, there is no need to + * load the reference value into a register. ALWAYS is a fairly common + * case, and this optimization saves 2 instructions. + */ + if (dsa->alpha.enabled + && (dsa->alpha.func != PIPE_FUNC_NEVER) + && (dsa->alpha.func != PIPE_FUNC_ALWAYS)) { + int ref = spe_allocate_available_register(f); + int tmp_a = spe_allocate_available_register(f); + int tmp_b = spe_allocate_available_register(f); + union { + float f; + unsigned u; + } ref_val; + boolean complement = FALSE; + + ref_val.f = dsa->alpha.ref; + + spe_il(f, ref, ref_val.u & 0x0000ffff); + spe_ilh(f, ref, ref_val.u >> 16); + + switch (dsa->alpha.func) { + case PIPE_FUNC_NOTEQUAL: + complement = TRUE; + /* FALLTHROUGH */ + + case PIPE_FUNC_EQUAL: + spe_fceq(f, tmp_a, ref, alphas); + break; + + case PIPE_FUNC_LEQUAL: + complement = TRUE; + /* FALLTHROUGH */ + + case PIPE_FUNC_GREATER: + spe_fcgt(f, tmp_a, ref, alphas); + break; + + case PIPE_FUNC_LESS: + complement = TRUE; + /* FALLTHROUGH */ + + case PIPE_FUNC_GEQUAL: + spe_fcgt(f, tmp_a, ref, alphas); + spe_fceq(f, tmp_b, ref, alphas); + spe_or(f, tmp_a, tmp_b, tmp_a); + break; + + case PIPE_FUNC_ALWAYS: + case PIPE_FUNC_NEVER: + default: + assert(0); + break; + } + + if (complement) { + spe_andc(f, mask, mask, tmp_a); + } else { + spe_and(f, mask, mask, tmp_a); + } + + spe_release_register(f, ref); + spe_release_register(f, tmp_a); + spe_release_register(f, tmp_b); + } else if (dsa->alpha.enabled && (dsa->alpha.func == PIPE_FUNC_NEVER)) { + spe_il(f, mask, 0); + } +} + + +/** + * Generate code to perform Z testing. Four Z values are tested at once. 
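+ *
+ * Only greater-than and equality compares are emitted: LESS, LEQUAL and
+ * NOTEQUAL are generated as the complements of GEQUAL, GREATER and EQUAL
+ * respectively, which is why the caller must honor the return value.
+ *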
+ * \param dsa Current depth-test state + * \param f Function to which code should be appended + * \param mask Index of register to contain depth-pass mask + * \param stored Index of register containing values from depth buffer + * \param calculated Index of register containing per-fragment depth values + * + * \return + * If the calculated depth comparison mask is the actual mask, \c FALSE is + * returned. If the calculated depth comparison mask is the compliment of + * the actual mask, \c TRUE is returned. + * + * \note Emits a maximum of 3 instructions. + */ +static boolean +emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa, + struct spe_function *f, int mask, int stored, int calculated) +{ + unsigned func = (dsa->depth.enabled) + ? dsa->depth.func : PIPE_FUNC_ALWAYS; + int tmp = spe_allocate_available_register(f); + boolean compliment = FALSE; + + switch (func) { + case PIPE_FUNC_NEVER: + spe_il(f, mask, 0); + break; + + case PIPE_FUNC_NOTEQUAL: + compliment = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_EQUAL: + spe_ceq(f, mask, calculated, stored); + break; + + case PIPE_FUNC_LEQUAL: + compliment = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_GREATER: + spe_clgt(f, mask, calculated, stored); + break; + + case PIPE_FUNC_LESS: + compliment = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_GEQUAL: + spe_clgt(f, mask, calculated, stored); + spe_ceq(f, tmp, calculated, stored); + spe_or(f, mask, mask, tmp); + break; + + case PIPE_FUNC_ALWAYS: + spe_il(f, mask, ~0); + break; + + default: + assert(0); + break; + } + + spe_release_register(f, tmp); + return compliment; +} + + +/** + * Generate code to apply the stencil operation (after testing). + * \note Emits a maximum of 5 instructions. + * + * \warning + * Since \c out and \c in might be the same register, this routine cannot + * generate code that uses \c out as a temporary. + */ +static void +emit_stencil_op(struct spe_function *f, + int out, int in, int mask, unsigned op, unsigned ref) +{ + const int clamp = spe_allocate_available_register(f); + const int clamp_mask = spe_allocate_available_register(f); + const int result = spe_allocate_available_register(f); + + switch(op) { + case PIPE_STENCIL_OP_KEEP: + assert(0); + case PIPE_STENCIL_OP_ZERO: + spe_il(f, result, 0); + break; + case PIPE_STENCIL_OP_REPLACE: + spe_il(f, result, ref); + break; + case PIPE_STENCIL_OP_INCR: + /* clamp = [0xff, 0xff, 0xff, 0xff] */ + spe_il(f, clamp, 0x0ff); + /* result[i] = in[i] + 1 */ + spe_ai(f, result, in, 1); + /* clamp_mask[i] = (result[i] > 0xff) */ + spe_clgti(f, clamp_mask, result, 0x0ff); + /* result[i] = clamp_mask[i] ? clamp[i] : result[i] */ + spe_selb(f, result, result, clamp, clamp_mask); + break; + case PIPE_STENCIL_OP_DECR: + spe_il(f, clamp, 0); + spe_ai(f, result, in, -1); + + /* If "(s-1) < 0" in signed arithemtic, then "(s-1) > MAX" in unsigned + * arithmetic. + */ + spe_clgti(f, clamp_mask, result, 0x0ff); + spe_selb(f, result, result, clamp, clamp_mask); + break; + case PIPE_STENCIL_OP_INCR_WRAP: + spe_ai(f, result, in, 1); + break; + case PIPE_STENCIL_OP_DECR_WRAP: + spe_ai(f, result, in, -1); + break; + case PIPE_STENCIL_OP_INVERT: + spe_nor(f, result, in, in); + break; + default: + assert(0); + } + + spe_selb(f, out, in, result, mask); + + spe_release_register(f, result); + spe_release_register(f, clamp_mask); + spe_release_register(f, clamp); +} + + +/** + * Generate code to do stencil test. Four pixels are tested at once. 
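+ *
+ * The reference value and the stored stencil values are masked with the
+ * per-face value_mask before the comparison is generated (the AND of the
+ * stored values is skipped when the mask is 0xff, where it would be a
+ * no-op).
+ *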
+ * \param dsa Depth / stencil test state + * \param face 0 for front face, 1 for back face + * \param f Function to append instructions to + * \param mask Register containing mask of fragments passing the + * alpha test + * \param depth_mask Register containing mask of fragments passing the + * depth test + * \param depth_compliment Is \c depth_mask the compliment of the actual mask? + * \param stencil Register containing values from stencil buffer + * \param depth_pass Register to store mask of fragments passing stencil test + * and depth test + * + * \note + * Emits a maximum of 10 + (3 * 5) = 25 instructions. + */ +static int +emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa, + unsigned face, + struct spe_function *f, + int mask, + int depth_mask, + boolean depth_complement, + int stencil, + int depth_pass) +{ + int stencil_fail = spe_allocate_available_register(f); + int depth_fail = spe_allocate_available_register(f); + int stencil_mask = spe_allocate_available_register(f); + int stencil_pass = spe_allocate_available_register(f); + int face_stencil = spe_allocate_available_register(f); + int stencil_src = stencil; + const unsigned ref = (dsa->stencil[face].ref_value + & dsa->stencil[face].value_mask); + boolean complement = FALSE; + int stored; + int tmp = spe_allocate_available_register(f); + + + if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) + && (dsa->stencil[face].func != PIPE_FUNC_ALWAYS) + && (dsa->stencil[face].value_mask != 0x0ff)) { + stored = spe_allocate_available_register(f); + spe_andi(f, stored, stencil, dsa->stencil[face].value_mask); + } else { + stored = stencil; + } + + + switch (dsa->stencil[face].func) { + case PIPE_FUNC_NEVER: + spe_il(f, stencil_mask, 0); /* stencil_mask[0..3] = [0,0,0,0] */ + break; + + case PIPE_FUNC_NOTEQUAL: + complement = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_EQUAL: + /* stencil_mask[i] = (stored[i] == ref) */ + spe_ceqi(f, stencil_mask, stored, ref); + break; + + case PIPE_FUNC_LEQUAL: + complement = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_GREATER: + complement = TRUE; + /* stencil_mask[i] = (stored[i] > ref) */ + spe_clgti(f, stencil_mask, stored, ref); + break; + + case PIPE_FUNC_LESS: + complement = TRUE; + /* FALLTHROUGH */ + case PIPE_FUNC_GEQUAL: + /* stencil_mask[i] = (stored[i] > ref) */ + spe_clgti(f, stencil_mask, stored, ref); + /* tmp[i] = (stored[i] == ref) */ + spe_ceqi(f, tmp, stored, ref); + /* stencil_mask[i] = stencil_mask[i] | tmp[i] */ + spe_or(f, stencil_mask, stencil_mask, tmp); + break; + + case PIPE_FUNC_ALWAYS: + /* See comment below. */ + break; + + default: + assert(0); + break; + } + + if (stored != stencil) { + spe_release_register(f, stored); + } + spe_release_register(f, tmp); + + + /* ALWAYS is a very common stencil-test, so some effort is applied to + * optimize that case. The stencil-pass mask is the same as the input + * fragment mask. This makes the stencil-test (above) a no-op, and the + * input fragment mask can be "renamed" the stencil-pass mask. + */ + if (dsa->stencil[face].func == PIPE_FUNC_ALWAYS) { + spe_release_register(f, stencil_pass); + stencil_pass = mask; + } else { + if (complement) { + spe_andc(f, stencil_pass, mask, stencil_mask); + } else { + spe_and(f, stencil_pass, mask, stencil_mask); + } + } + + if (depth_complement) { + spe_andc(f, depth_pass, stencil_pass, depth_mask); + } else { + spe_and(f, depth_pass, stencil_pass, depth_mask); + } + + + /* Conditionally emit code to update the stencil value under various + * condititons. 
Note that there is no need to generate code under the + * following circumstances: + * + * - Stencil write mask is zero. + * - For stencil-fail if the stencil test is ALWAYS + * - For depth-fail if the stencil test is NEVER + * - For depth-pass if the stencil test is NEVER + * - Any of the 3 conditions if the operation is KEEP + */ + if (dsa->stencil[face].write_mask != 0) { + if ((dsa->stencil[face].func != PIPE_FUNC_ALWAYS) + && (dsa->stencil[face].fail_op != PIPE_STENCIL_OP_KEEP)) { + if (complement) { + spe_and(f, stencil_fail, mask, stencil_mask); + } else { + spe_andc(f, stencil_fail, mask, stencil_mask); + } + + emit_stencil_op(f, face_stencil, stencil_src, stencil_fail, + dsa->stencil[face].fail_op, + dsa->stencil[face].ref_value); + + stencil_src = face_stencil; + } + + if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) + && (dsa->stencil[face].zfail_op != PIPE_STENCIL_OP_KEEP)) { + if (depth_complement) { + spe_and(f, depth_fail, stencil_pass, depth_mask); + } else { + spe_andc(f, depth_fail, stencil_pass, depth_mask); + } + + emit_stencil_op(f, face_stencil, stencil_src, depth_fail, + dsa->stencil[face].zfail_op, + dsa->stencil[face].ref_value); + stencil_src = face_stencil; + } + + if ((dsa->stencil[face].func != PIPE_FUNC_NEVER) + && (dsa->stencil[face].zpass_op != PIPE_STENCIL_OP_KEEP)) { + emit_stencil_op(f, face_stencil, stencil_src, depth_pass, + dsa->stencil[face].zpass_op, + dsa->stencil[face].ref_value); + stencil_src = face_stencil; + } + } + + spe_release_register(f, stencil_fail); + spe_release_register(f, depth_fail); + spe_release_register(f, stencil_mask); + if (stencil_pass != mask) { + spe_release_register(f, stencil_pass); + } + + /* If all of the stencil operations were KEEP or the stencil write mask was + * zero, "stencil_src" will still be set to "stencil". In this case + * release the "face_stencil" register. Otherwise apply the stencil write + * mask to select bits from the calculated stencil value and the previous + * stencil value. + */ + if (stencil_src == stencil) { + spe_release_register(f, face_stencil); + } else if (dsa->stencil[face].write_mask != 0x0ff) { + int tmp = spe_allocate_available_register(f); + + spe_il(f, tmp, dsa->stencil[face].write_mask); + spe_selb(f, stencil_src, stencil, stencil_src, tmp); + + spe_release_register(f, tmp); + } + + return stencil_src; +} + + +void +cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa) +{ + struct pipe_depth_stencil_alpha_state *const dsa = &cdsa->base; + struct spe_function *const f = &cdsa->code; + + /* This code generates a maximum of 6 (alpha test) + 3 (depth test) + * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions. Round + * up to 64 to make it a happy power-of-two. + */ + spe_init_func(f, SPE_INST_SIZE * 64); + + + /* Allocate registers for the function's input parameters. Cleverly (and + * clever code is usually dangerous, but I couldn't resist) the generated + * function returns a structure. Returned structures start with register + * 3, and the structure fields are ordered to match up exactly with the + * input parameters. 
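+    * (On the SPU ABI the first function argument is passed in register 3
+    * and return values also start at register 3, so results written back
+    * into the argument registers are returned to the caller.)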
+ */ + int mask = spe_allocate_register(f, 3); + int depth = spe_allocate_register(f, 4); + int stencil = spe_allocate_register(f, 5); + int zvals = spe_allocate_register(f, 6); + int frag_a = spe_allocate_register(f, 7); + int facing = spe_allocate_register(f, 8); + + int depth_mask = spe_allocate_available_register(f); + + boolean depth_complement; + + + emit_alpha_test(dsa, f, mask, frag_a); + + depth_complement = emit_depth_test(dsa, f, depth_mask, depth, zvals); + + if (dsa->stencil[0].enabled) { + const int front_depth_pass = spe_allocate_available_register(f); + int front_stencil = emit_stencil_test(dsa, 0, f, mask, + depth_mask, depth_complement, + stencil, front_depth_pass); + + if (dsa->stencil[1].enabled) { + const int back_depth_pass = spe_allocate_available_register(f); + int back_stencil = emit_stencil_test(dsa, 1, f, mask, + depth_mask, depth_complement, + stencil, back_depth_pass); + + /* If the front facing stencil value and the back facing stencil + * value are stored in the same register, there is no need to select + * a value based on the facing. This can happen if the stencil value + * was not modified due to the write masks being zero, the stencil + * operations being KEEP, etc. + */ + if (front_stencil != back_stencil) { + spe_selb(f, stencil, back_stencil, front_stencil, facing); + } + + if (back_stencil != stencil) { + spe_release_register(f, back_stencil); + } + + if (front_stencil != stencil) { + spe_release_register(f, front_stencil); + } + + spe_selb(f, mask, back_depth_pass, front_depth_pass, facing); + + spe_release_register(f, back_depth_pass); + } else { + if (front_stencil != stencil) { + spe_or(f, stencil, front_stencil, front_stencil); + spe_release_register(f, front_stencil); + } + spe_or(f, mask, front_depth_pass, front_depth_pass); + } + + spe_release_register(f, front_depth_pass); + } else if (dsa->depth.enabled) { + if (depth_complement) { + spe_andc(f, mask, mask, depth_mask); + } else { + spe_and(f, mask, mask, depth_mask); + } + } + + if (dsa->depth.writemask) { + spe_selb(f, depth, depth, zvals, mask); + } + + spe_bi(f, 0, 0, 0); /* return from function call */ + + +#if 0 + { + const uint32_t *p = f->store; + unsigned i; + + printf("# alpha (%sabled)\n", + (dsa->alpha.enabled) ? "en" : "dis"); + printf("# func: %u\n", dsa->alpha.func); + printf("# ref: %.2f\n", dsa->alpha.ref); + + printf("# depth (%sabled)\n", + (dsa->depth.enabled) ? "en" : "dis"); + printf("# func: %u\n", dsa->depth.func); + + for (i = 0; i < 2; i++) { + printf("# %s stencil (%sabled)\n", + (i == 0) ? "front" : "back", + (dsa->stencil[i].enabled) ? 
"en" : "dis"); + + printf("# func: %u\n", dsa->stencil[i].func); + printf("# op (sf, zf, zp): %u %u %u\n", + dsa->stencil[i].fail_op, + dsa->stencil[i].zfail_op, + dsa->stencil[i].zpass_op); + printf("# ref value / value mask / write mask: %02x %02x %02x\n", + dsa->stencil[i].ref_value, + dsa->stencil[i].value_mask, + dsa->stencil[i].write_mask); + } + + printf("\t.text\n"); + for (/* empty */; p < f->csr; p++) { + printf("\t.long\t0x%04x\n", *p); + } + fflush(stdout); + } +#endif +} + + +/** + * \note Emits a maximum of 3 instructions + */ +static int +emit_alpha_factor_calculation(struct spe_function *f, + unsigned factor, + int src_alpha, int dst_alpha, int const_alpha) +{ + int factor_reg; + int tmp; + + + switch (factor) { + case PIPE_BLENDFACTOR_ONE: + factor_reg = -1; + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA: + factor_reg = spe_allocate_available_register(f); + + spe_or(f, factor_reg, src_alpha, src_alpha); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: + factor_reg = dst_alpha; + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + factor_reg = -1; + break; + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + factor_reg = spe_allocate_available_register(f); + + tmp = spe_allocate_available_register(f); + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor_reg, tmp, const_alpha); + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_CONST_ALPHA: + factor_reg = const_alpha; + break; + + case PIPE_BLENDFACTOR_ZERO: + factor_reg = -1; + break; + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + tmp = spe_allocate_available_register(f); + factor_reg = spe_allocate_available_register(f); + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor_reg, tmp, src_alpha); + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + tmp = spe_allocate_available_register(f); + factor_reg = spe_allocate_available_register(f); + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor_reg, tmp, dst_alpha); + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: + assert(0); + factor_reg = -1; + break; + } + + return factor_reg; +} + + +/** + * \note Emits a maximum of 6 instructions + */ +static void +emit_color_factor_calculation(struct spe_function *f, + unsigned sF, unsigned mask, + const int *src, + const int *dst, + const int *const_color, + int *factor) +{ + int tmp; + unsigned i; + + + factor[0] = -1; + factor[1] = -1; + factor[2] = -1; + factor[3] = -1; + + switch (sF) { + case PIPE_BLENDFACTOR_ONE: + break; + + case PIPE_BLENDFACTOR_SRC_COLOR: + for (i = 0; i < 3; ++i) { + if ((mask & (1U << i)) != 0) { + factor[i] = spe_allocate_available_register(f); + spe_or(f, factor[i], src[i], src[i]); + } + } + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA: + factor[0] = spe_allocate_available_register(f); + factor[1] = factor[0]; + factor[2] = factor[0]; + + spe_or(f, factor[0], src[3], src[3]); + break; + + case PIPE_BLENDFACTOR_DST_ALPHA: + factor[0] = dst[3]; + factor[1] = dst[3]; + factor[2] = dst[3]; + break; + + case PIPE_BLENDFACTOR_DST_COLOR: + factor[0] = dst[0]; + factor[1] = dst[1]; + factor[2] = dst[2]; + break; + + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + tmp = spe_allocate_available_register(f); + factor[0] = spe_allocate_available_register(f); + factor[1] = factor[0]; + factor[2] = factor[0]; + + /* Alpha saturate means min(As, 1-Ad). 
+ */ + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, tmp, tmp, dst[3]); + spe_fcgt(f, factor[0], tmp, src[3]); + spe_selb(f, factor[0], src[3], tmp, factor[0]); + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + tmp = spe_allocate_available_register(f); + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + + for (i = 0; i < 3; i++) { + factor[i] = spe_allocate_available_register(f); + + spe_fs(f, factor[i], tmp, const_color[i]); + } + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_CONST_COLOR: + for (i = 0; i < 3; i++) { + factor[i] = const_color[i]; + } + break; + + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + factor[0] = spe_allocate_available_register(f); + factor[1] = factor[0]; + factor[2] = factor[0]; + + tmp = spe_allocate_available_register(f); + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor[0], tmp, const_color[3]); + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_CONST_ALPHA: + factor[0] = const_color[3]; + factor[1] = factor[0]; + factor[2] = factor[0]; + break; + + case PIPE_BLENDFACTOR_ZERO: + break; + + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + tmp = spe_allocate_available_register(f); + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + + for (i = 0; i < 3; ++i) { + if ((mask & (1U << i)) != 0) { + factor[i] = spe_allocate_available_register(f); + spe_fs(f, factor[i], tmp, src[i]); + } + } + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + tmp = spe_allocate_available_register(f); + factor[0] = spe_allocate_available_register(f); + factor[1] = factor[0]; + factor[2] = factor[0]; + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor[0], tmp, src[3]); + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + tmp = spe_allocate_available_register(f); + factor[0] = spe_allocate_available_register(f); + factor[1] = factor[0]; + factor[2] = factor[0]; + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + spe_fs(f, factor[0], tmp, dst[3]); + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_INV_DST_COLOR: + tmp = spe_allocate_available_register(f); + + spe_il(f, tmp, 1); + spe_cuflt(f, tmp, tmp, 0); + + for (i = 0; i < 3; ++i) { + if ((mask & (1U << i)) != 0) { + factor[i] = spe_allocate_available_register(f); + spe_fs(f, factor[i], tmp, dst[i]); + } + } + + spe_release_register(f, tmp); + break; + + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + default: + assert(0); + } +} + + +static void +emit_blend_calculation(struct spe_function *f, + unsigned func, unsigned sF, unsigned dF, + int src, int src_factor, int dst, int dst_factor) +{ + int tmp = spe_allocate_available_register(f); + + switch (func) { + case PIPE_BLEND_ADD: + if (sF == PIPE_BLENDFACTOR_ONE) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + /* Do nothing. 
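+          * The result is src * ONE + dst * ZERO, which leaves the incoming
+          * fragment color in the src register unchanged.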
*/ + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_fa(f, src, src, dst); + } + } else if (sF == PIPE_BLENDFACTOR_ZERO) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_il(f, src, 0); + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_or(f, src, dst, dst); + } else { + spe_fm(f, src, dst, dst_factor); + } + } else if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_fm(f, src, src, src_factor); + } else { + spe_fm(f, tmp, dst, dst_factor); + spe_fma(f, src, src, src_factor, tmp); + } + break; + + case PIPE_BLEND_SUBTRACT: + if (sF == PIPE_BLENDFACTOR_ONE) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + /* Do nothing. */ + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_fs(f, src, src, dst); + } + } else if (sF == PIPE_BLENDFACTOR_ZERO) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_il(f, src, 0); + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_il(f, tmp, 0); + spe_fs(f, src, tmp, dst); + } else { + spe_fm(f, src, dst, dst_factor); + } + } else if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_fm(f, src, src, src_factor); + } else { + spe_fm(f, tmp, dst, dst_factor); + spe_fms(f, src, src, src_factor, tmp); + } + break; + + case PIPE_BLEND_REVERSE_SUBTRACT: + if (sF == PIPE_BLENDFACTOR_ONE) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_il(f, tmp, 0); + spe_fs(f, src, tmp, src); + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_fs(f, src, dst, src); + } + } else if (sF == PIPE_BLENDFACTOR_ZERO) { + if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_il(f, src, 0); + } else if (dF == PIPE_BLENDFACTOR_ONE) { + spe_or(f, src, dst, dst); + } else { + spe_fm(f, src, dst, dst_factor); + } + } else if (dF == PIPE_BLENDFACTOR_ZERO) { + spe_fm(f, src, src, src_factor); + } else { + spe_fm(f, tmp, src, src_factor); + spe_fms(f, src, src, dst_factor, tmp); + } + break; + + case PIPE_BLEND_MIN: + spe_cgt(f, tmp, src, dst); + spe_selb(f, src, src, dst, tmp); + break; + + case PIPE_BLEND_MAX: + spe_cgt(f, tmp, src, dst); + spe_selb(f, src, dst, src, tmp); + break; + + default: + assert(0); + } + + spe_release_register(f, tmp); +} + + +/** + * Generate code to perform alpha blending on the SPE + */ +void +cell_generate_alpha_blend(struct cell_blend_state *cb) +{ + struct pipe_blend_state *const b = &cb->base; + struct spe_function *const f = &cb->code; + + /* This code generates a maximum of 3 (source alpha factor) + * + 3 (destination alpha factor) + (3 * 6) (source color factor) + * + (3 * 6) (destination color factor) + (4 * 2) (blend equation) + * + 4 (fragment mask) + 1 (return) = 55 instlructions. Round up to 64 to + * make it a happy power-of-two. + */ + spe_init_func(f, SPE_INST_SIZE * 64); + + + const int frag[4] = { + spe_allocate_register(f, 3), + spe_allocate_register(f, 4), + spe_allocate_register(f, 5), + spe_allocate_register(f, 6), + }; + const int pixel[4] = { + spe_allocate_register(f, 7), + spe_allocate_register(f, 8), + spe_allocate_register(f, 9), + spe_allocate_register(f, 10), + }; + const int const_color[4] = { + spe_allocate_register(f, 11), + spe_allocate_register(f, 12), + spe_allocate_register(f, 13), + spe_allocate_register(f, 14), + }; + unsigned func[4]; + unsigned sF[4]; + unsigned dF[4]; + unsigned i; + int src_factor[4]; + int dst_factor[4]; + + + /* Does the selected blend mode make use of the source / destination + * color (RGB) blend factors? + */ + boolean need_color_factor = b->blend_enable + && (b->rgb_func != PIPE_BLEND_MIN) + && (b->rgb_func != PIPE_BLEND_MAX); + + /* Does the selected blend mode make use of the source / destination + * alpha blend factors? 
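+    * MIN and MAX ignore the blend factors entirely, so no factor code is
+    * generated for them.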
+ */ + boolean need_alpha_factor = b->blend_enable + && (b->alpha_func != PIPE_BLEND_MIN) + && (b->alpha_func != PIPE_BLEND_MAX); + + + if (b->blend_enable) { + sF[0] = b->rgb_src_factor; + sF[1] = sF[0]; + sF[2] = sF[0]; + switch (b->alpha_src_factor & 0x0f) { + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + sF[3] = PIPE_BLENDFACTOR_ONE; + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + case PIPE_BLENDFACTOR_DST_COLOR: + case PIPE_BLENDFACTOR_CONST_COLOR: + case PIPE_BLENDFACTOR_SRC1_COLOR: + sF[3] = b->alpha_src_factor + 1; + break; + default: + sF[3] = b->alpha_src_factor; + } + + dF[0] = b->rgb_dst_factor; + dF[1] = dF[0]; + dF[2] = dF[0]; + switch (b->alpha_dst_factor & 0x0f) { + case PIPE_BLENDFACTOR_SRC_COLOR: + case PIPE_BLENDFACTOR_DST_COLOR: + case PIPE_BLENDFACTOR_CONST_COLOR: + case PIPE_BLENDFACTOR_SRC1_COLOR: + dF[3] = b->alpha_dst_factor + 1; + break; + default: + dF[3] = b->alpha_dst_factor; + } + + func[0] = b->rgb_func; + func[1] = func[0]; + func[2] = func[0]; + func[3] = b->alpha_func; + } else { + sF[0] = PIPE_BLENDFACTOR_ONE; + sF[1] = PIPE_BLENDFACTOR_ONE; + sF[2] = PIPE_BLENDFACTOR_ONE; + sF[3] = PIPE_BLENDFACTOR_ONE; + dF[0] = PIPE_BLENDFACTOR_ZERO; + dF[1] = PIPE_BLENDFACTOR_ZERO; + dF[2] = PIPE_BLENDFACTOR_ZERO; + dF[3] = PIPE_BLENDFACTOR_ZERO; + + func[0] = PIPE_BLEND_ADD; + func[1] = PIPE_BLEND_ADD; + func[2] = PIPE_BLEND_ADD; + func[3] = PIPE_BLEND_ADD; + } + + + /* If alpha writing is enabled and the alpha blend mode requires use of + * the alpha factor, calculate the alpha factor. + */ + if (((b->colormask & 8) != 0) && need_alpha_factor) { + src_factor[3] = emit_alpha_factor_calculation(f, sF[3], const_color[3], + frag[3], pixel[3]); + + /* If the alpha destination blend factor is the same as the alpha source + * blend factor, re-use the previously calculated value. + */ + dst_factor[3] = (dF[3] == sF[3]) + ? src_factor[3] + : emit_alpha_factor_calculation(f, dF[3], const_color[3], + frag[3], pixel[3]); + } + + + if (sF[0] == sF[3]) { + src_factor[0] = src_factor[3]; + src_factor[1] = src_factor[3]; + src_factor[2] = src_factor[3]; + } else if (sF[0] == dF[3]) { + src_factor[0] = dst_factor[3]; + src_factor[1] = dst_factor[3]; + src_factor[2] = dst_factor[3]; + } else if (need_color_factor) { + emit_color_factor_calculation(f, + b->rgb_src_factor, + b->colormask, + frag, pixel, const_color, src_factor); + } + + + if (dF[0] == sF[3]) { + dst_factor[0] = src_factor[3]; + dst_factor[1] = src_factor[3]; + dst_factor[2] = src_factor[3]; + } else if (dF[0] == dF[3]) { + dst_factor[0] = dst_factor[3]; + dst_factor[1] = dst_factor[3]; + dst_factor[2] = dst_factor[3]; + } else if (dF[0] == sF[0]) { + dst_factor[0] = src_factor[0]; + dst_factor[1] = src_factor[1]; + dst_factor[2] = src_factor[2]; + } else if (need_color_factor) { + emit_color_factor_calculation(f, + b->rgb_dst_factor, + b->colormask, + frag, pixel, const_color, dst_factor); + } + + + + for (i = 0; i < 4; ++i) { + if ((b->colormask & (1U << i)) != 0) { + emit_blend_calculation(f, + func[i], sF[i], dF[i], + frag[i], src_factor[i], + pixel[i], dst_factor[i]); + } + } + + spe_bi(f, 0, 0, 0); + +#if 0 + { + const uint32_t *p = f->store; + + printf("# %u instructions\n", f->csr - f->store); + printf("# blend (%sabled)\n", + (cb->base.blend_enable) ? 
"en" : "dis"); + printf("# RGB func / sf / df: %u %u %u\n", + cb->base.rgb_func, + cb->base.rgb_src_factor, + cb->base.rgb_dst_factor); + printf("# ALP func / sf / df: %u %u %u\n", + cb->base.alpha_func, + cb->base.alpha_src_factor, + cb->base.alpha_dst_factor); + + printf("\t.text\n"); + for (/* empty */; p < f->csr; p++) { + printf("\t.long\t0x%04x\n", *p); + } + fflush(stdout); + } +#endif +} + + +static int +PC_OFFSET(const struct spe_function *f, const void *d) +{ + const intptr_t pc = (intptr_t) &f->store[f->num_inst]; + const intptr_t ea = ~0x0f & (intptr_t) d; + + return (ea - pc) >> 2; +} + + +/** + * Generate code to perform color conversion and logic op + * + * \bug + * The code generated by this function should also perform dithering. + * + * \bug + * The code generated by this function should also perform color-write + * masking. + * + * \bug + * Only two framebuffer formats are supported at this time. + */ +void +cell_generate_logic_op(struct spe_function *f, + const struct pipe_blend_state *blend, + struct pipe_surface *surf) +{ + const unsigned logic_op = (blend->logicop_enable) + ? blend->logicop_func : PIPE_LOGICOP_COPY; + + /* This code generates a maximum of 37 instructions. An additional 32 + * bytes (equiv. to 8 instructions) are needed for data storage. Round up + * to 64 to make it a happy power-of-two. + */ + spe_init_func(f, SPE_INST_SIZE * 64); + + + /* Pixel colors in framebuffer format in AoS layout. + */ + const int pixel[4] = { + spe_allocate_register(f, 3), + spe_allocate_register(f, 4), + spe_allocate_register(f, 5), + spe_allocate_register(f, 6), + }; + + /* Fragment colors stored as floats in SoA layout. + */ + const int frag[4] = { + spe_allocate_register(f, 7), + spe_allocate_register(f, 8), + spe_allocate_register(f, 9), + spe_allocate_register(f, 10), + }; + + const int mask = spe_allocate_register(f, 11); + + + /* Short-circuit the noop and invert cases. + */ + if ((logic_op == PIPE_LOGICOP_NOOP) || (blend->colormask == 0)) { + spe_bi(f, 0, 0, 0); + return; + } else if (logic_op == PIPE_LOGICOP_INVERT) { + spe_nor(f, pixel[0], pixel[0], pixel[0]); + spe_nor(f, pixel[1], pixel[1], pixel[1]); + spe_nor(f, pixel[2], pixel[2], pixel[2]); + spe_nor(f, pixel[3], pixel[3], pixel[3]); + spe_bi(f, 0, 0, 0); + return; + } + + + const int tmp[4] = { + spe_allocate_available_register(f), + spe_allocate_available_register(f), + spe_allocate_available_register(f), + spe_allocate_available_register(f), + }; + + const int shuf_xpose_hi = spe_allocate_available_register(f); + const int shuf_xpose_lo = spe_allocate_available_register(f); + const int shuf_color = spe_allocate_available_register(f); + + + /* Pointer to the begining of the function's private data area. + */ + uint32_t *const data = ((uint32_t *) f->store) + (64 - 8); + + + /* Convert fragment colors to framebuffer format in AoS layout. 
+ */ + switch (surf->format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + data[0] = 0x00010203; + data[1] = 0x10111213; + data[2] = 0x04050607; + data[3] = 0x14151617; + data[4] = 0x0c000408; + data[5] = 0x80808080; + data[6] = 0x80808080; + data[7] = 0x80808080; + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + data[0] = 0x03020100; + data[1] = 0x13121110; + data[2] = 0x07060504; + data[3] = 0x17161514; + data[4] = 0x0804000c; + data[5] = 0x80808080; + data[6] = 0x80808080; + data[7] = 0x80808080; + break; + default: + fprintf(stderr, "CELL: Bad pixel format in cell_generate_logic_op()"); + ASSERT(0); + } + + spe_ilh(f, tmp[0], 0x0808); + spe_lqr(f, shuf_xpose_hi, PC_OFFSET(f, data+0)); + spe_lqr(f, shuf_color, PC_OFFSET(f, data+4)); + spe_a(f, shuf_xpose_lo, shuf_xpose_hi, tmp[0]); + + spe_shufb(f, tmp[0], frag[0], frag[2], shuf_xpose_hi); + spe_shufb(f, tmp[1], frag[0], frag[2], shuf_xpose_lo); + spe_shufb(f, tmp[2], frag[1], frag[3], shuf_xpose_hi); + spe_shufb(f, tmp[3], frag[1], frag[3], shuf_xpose_lo); + + spe_shufb(f, frag[0], tmp[0], tmp[2], shuf_xpose_hi); + spe_shufb(f, frag[1], tmp[0], tmp[2], shuf_xpose_lo); + spe_shufb(f, frag[2], tmp[1], tmp[3], shuf_xpose_hi); + spe_shufb(f, frag[3], tmp[1], tmp[3], shuf_xpose_lo); + + spe_cfltu(f, frag[0], frag[0], 32); + spe_cfltu(f, frag[1], frag[1], 32); + spe_cfltu(f, frag[2], frag[2], 32); + spe_cfltu(f, frag[3], frag[3], 32); + + spe_shufb(f, frag[0], frag[0], pixel[0], shuf_color); + spe_shufb(f, frag[1], frag[1], pixel[1], shuf_color); + spe_shufb(f, frag[2], frag[2], pixel[2], shuf_color); + spe_shufb(f, frag[3], frag[3], pixel[3], shuf_color); + + + /* If logic op is enabled, perform the requested logical operation on the + * converted fragment colors and the pixel colors. + */ + switch (logic_op) { + case PIPE_LOGICOP_CLEAR: + spe_il(f, frag[0], 0); + spe_il(f, frag[1], 0); + spe_il(f, frag[2], 0); + spe_il(f, frag[3], 0); + break; + case PIPE_LOGICOP_NOR: + spe_nor(f, frag[0], frag[0], pixel[0]); + spe_nor(f, frag[1], frag[1], pixel[1]); + spe_nor(f, frag[2], frag[2], pixel[2]); + spe_nor(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_AND_INVERTED: + spe_andc(f, frag[0], pixel[0], frag[0]); + spe_andc(f, frag[1], pixel[1], frag[1]); + spe_andc(f, frag[2], pixel[2], frag[2]); + spe_andc(f, frag[3], pixel[3], frag[3]); + break; + case PIPE_LOGICOP_COPY_INVERTED: + spe_nor(f, frag[0], frag[0], frag[0]); + spe_nor(f, frag[1], frag[1], frag[1]); + spe_nor(f, frag[2], frag[2], frag[2]); + spe_nor(f, frag[3], frag[3], frag[3]); + break; + case PIPE_LOGICOP_AND_REVERSE: + spe_andc(f, frag[0], frag[0], pixel[0]); + spe_andc(f, frag[1], frag[1], pixel[1]); + spe_andc(f, frag[2], frag[2], pixel[2]); + spe_andc(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_XOR: + spe_xor(f, frag[0], frag[0], pixel[0]); + spe_xor(f, frag[1], frag[1], pixel[1]); + spe_xor(f, frag[2], frag[2], pixel[2]); + spe_xor(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_NAND: + spe_nand(f, frag[0], frag[0], pixel[0]); + spe_nand(f, frag[1], frag[1], pixel[1]); + spe_nand(f, frag[2], frag[2], pixel[2]); + spe_nand(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_AND: + spe_and(f, frag[0], frag[0], pixel[0]); + spe_and(f, frag[1], frag[1], pixel[1]); + spe_and(f, frag[2], frag[2], pixel[2]); + spe_and(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_EQUIV: + spe_eqv(f, frag[0], frag[0], pixel[0]); + spe_eqv(f, frag[1], frag[1], pixel[1]); + spe_eqv(f, frag[2], frag[2], pixel[2]); + spe_eqv(f, frag[3], frag[3], 
pixel[3]); + break; + case PIPE_LOGICOP_OR_INVERTED: + spe_orc(f, frag[0], pixel[0], frag[0]); + spe_orc(f, frag[1], pixel[1], frag[1]); + spe_orc(f, frag[2], pixel[2], frag[2]); + spe_orc(f, frag[3], pixel[3], frag[3]); + break; + case PIPE_LOGICOP_COPY: + break; + case PIPE_LOGICOP_OR_REVERSE: + spe_orc(f, frag[0], frag[0], pixel[0]); + spe_orc(f, frag[1], frag[1], pixel[1]); + spe_orc(f, frag[2], frag[2], pixel[2]); + spe_orc(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_OR: + spe_or(f, frag[0], frag[0], pixel[0]); + spe_or(f, frag[1], frag[1], pixel[1]); + spe_or(f, frag[2], frag[2], pixel[2]); + spe_or(f, frag[3], frag[3], pixel[3]); + break; + case PIPE_LOGICOP_SET: + spe_il(f, frag[0], ~0); + spe_il(f, frag[1], ~0); + spe_il(f, frag[2], ~0); + spe_il(f, frag[3], ~0); + break; + + /* These two cases are short-circuited above. + */ + case PIPE_LOGICOP_INVERT: + case PIPE_LOGICOP_NOOP: + default: + assert(0); + } + + + /* Apply fragment mask. + */ + spe_ilh(f, tmp[0], 0x0000); + spe_ilh(f, tmp[1], 0x0404); + spe_ilh(f, tmp[2], 0x0808); + spe_ilh(f, tmp[3], 0x0c0c); + + spe_shufb(f, tmp[0], mask, mask, tmp[0]); + spe_shufb(f, tmp[1], mask, mask, tmp[1]); + spe_shufb(f, tmp[2], mask, mask, tmp[2]); + spe_shufb(f, tmp[3], mask, mask, tmp[3]); + + spe_selb(f, pixel[0], pixel[0], frag[0], tmp[0]); + spe_selb(f, pixel[1], pixel[1], frag[1], tmp[1]); + spe_selb(f, pixel[2], pixel[2], frag[2], tmp[2]); + spe_selb(f, pixel[3], pixel[3], frag[3], tmp[3]); + + spe_bi(f, 0, 0, 0); + +#if 0 + { + const uint32_t *p = f->store; + unsigned i; + + printf("# %u instructions\n", f->csr - f->store); + + printf("\t.text\n"); + for (i = 0; i < 64; i++) { + printf("\t.long\t0x%04x\n", p[i]); + } + fflush(stdout); + } +#endif +} diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h new file mode 100644 index 0000000000..a8267a5133 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.h @@ -0,0 +1,39 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef CELL_STATE_PER_FRAGMENT_H +#define CELL_STATE_PER_FRAGMENT_H + +extern void +cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa); + +extern void +cell_generate_alpha_blend(struct cell_blend_state *cb); + +extern void +cell_generate_logic_op(struct spe_function *f, + const struct pipe_blend_state *blend, + struct pipe_surface *surf); + +#endif /* CELL_STATE_PER_FRAGMENT_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c new file mode 100644 index 0000000000..cda39f8d59 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c @@ -0,0 +1,221 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" +#include "draw/draw_context.h" +#include "tgsi/tgsi_parse.h" + +#include "cell_context.h" +#include "cell_state.h" +#include "cell_gen_fp.h" + + +/** cast wrapper */ +static INLINE struct cell_fragment_shader_state * +cell_fragment_shader_state(void *shader) +{ + return (struct cell_fragment_shader_state *) shader; +} + + +/** cast wrapper */ +static INLINE struct cell_vertex_shader_state * +cell_vertex_shader_state(void *shader) +{ + return (struct cell_vertex_shader_state *) shader; +} + + +/** + * Create fragment shader state. 
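+ * The TGSI tokens are duplicated and scanned, and SPE code for the shader
+ * is generated with cell_gen_fragment_program().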
+ * Called via pipe->create_fs_state() + */ +static void * +cell_create_fs_state(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct cell_context *cell = cell_context(pipe); + struct cell_fragment_shader_state *cfs; + + cfs = CALLOC_STRUCT(cell_fragment_shader_state); + if (!cfs) + return NULL; + + cfs->shader.tokens = tgsi_dup_tokens(templ->tokens); + if (!cfs->shader.tokens) { + FREE(cfs); + return NULL; + } + + tgsi_scan_shader(templ->tokens, &cfs->info); + + cell_gen_fragment_program(cell, cfs->shader.tokens, &cfs->code); + + return cfs; +} + + +/** + * Called via pipe->bind_fs_state() + */ +static void +cell_bind_fs_state(struct pipe_context *pipe, void *fs) +{ + struct cell_context *cell = cell_context(pipe); + + cell->fs = cell_fragment_shader_state(fs); + + cell->dirty |= CELL_NEW_FS; +} + + +/** + * Called via pipe->delete_fs_state() + */ +static void +cell_delete_fs_state(struct pipe_context *pipe, void *fs) +{ + struct cell_fragment_shader_state *cfs = cell_fragment_shader_state(fs); + + spe_release_func(&cfs->code); + + FREE((void *) cfs->shader.tokens); + FREE(cfs); +} + + +/** + * Create vertex shader state. + * Called via pipe->create_vs_state() + */ +static void * +cell_create_vs_state(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct cell_context *cell = cell_context(pipe); + struct cell_vertex_shader_state *cvs; + + cvs = CALLOC_STRUCT(cell_vertex_shader_state); + if (!cvs) + return NULL; + + cvs->shader.tokens = tgsi_dup_tokens(templ->tokens); + if (!cvs->shader.tokens) { + FREE(cvs); + return NULL; + } + + tgsi_scan_shader(templ->tokens, &cvs->info); + + cvs->draw_data = draw_create_vertex_shader(cell->draw, &cvs->shader); + if (cvs->draw_data == NULL) { + FREE( (void *) cvs->shader.tokens ); + FREE( cvs ); + return NULL; + } + + return cvs; +} + + +/** + * Called via pipe->bind_vs_state() + */ +static void +cell_bind_vs_state(struct pipe_context *pipe, void *vs) +{ + struct cell_context *cell = cell_context(pipe); + + cell->vs = cell_vertex_shader_state(vs); + + draw_bind_vertex_shader(cell->draw, + (cell->vs ? 
cell->vs->draw_data : NULL)); + + cell->dirty |= CELL_NEW_VS; +} + + +/** + * Called via pipe->delete_vs_state() + */ +static void +cell_delete_vs_state(struct pipe_context *pipe, void *vs) +{ + struct cell_context *cell = cell_context(pipe); + struct cell_vertex_shader_state *cvs = cell_vertex_shader_state(vs); + + draw_delete_vertex_shader(cell->draw, cvs->draw_data); + FREE( (void *) cvs->shader.tokens ); + FREE( cvs ); +} + + +/** + * Called via pipe->set_constant_buffer() + */ +static void +cell_set_constant_buffer(struct pipe_context *pipe, + uint shader, uint index, + const struct pipe_constant_buffer *buf) +{ + struct cell_context *cell = cell_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + + assert(shader < PIPE_SHADER_TYPES); + assert(index == 0); + + draw_flush(cell->draw); + + /* note: reference counting */ + winsys_buffer_reference(ws, + &cell->constants[shader].buffer, + buf->buffer); + cell->constants[shader].size = buf->size; + + if (shader == PIPE_SHADER_VERTEX) + cell->dirty |= CELL_NEW_VS_CONSTANTS; + else if (shader == PIPE_SHADER_FRAGMENT) + cell->dirty |= CELL_NEW_FS_CONSTANTS; +} + + +void +cell_init_shader_functions(struct cell_context *cell) +{ + cell->pipe.create_fs_state = cell_create_fs_state; + cell->pipe.bind_fs_state = cell_bind_fs_state; + cell->pipe.delete_fs_state = cell_delete_fs_state; + + cell->pipe.create_vs_state = cell_create_vs_state; + cell->pipe.bind_vs_state = cell_bind_vs_state; + cell->pipe.delete_vs_state = cell_delete_vs_state; + + cell->pipe.set_constant_buffer = cell_set_constant_buffer; +} diff --git a/src/gallium/drivers/cell/ppu/cell_state_vertex.c b/src/gallium/drivers/cell/ppu/cell_state_vertex.c new file mode 100644 index 0000000000..fbe55c8472 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_state_vertex.c @@ -0,0 +1,79 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "cell_context.h" +#include "cell_state.h" + +#include "draw/draw_context.h" + + +static void +cell_set_vertex_elements(struct pipe_context *pipe, + unsigned count, + const struct pipe_vertex_element *elements) +{ + struct cell_context *cell = cell_context(pipe); + + assert(count <= PIPE_MAX_ATTRIBS); + + memcpy(cell->vertex_element, elements, count * sizeof(elements[0])); + cell->num_vertex_elements = count; + + cell->dirty |= CELL_NEW_VERTEX; + + draw_set_vertex_elements(cell->draw, count, elements); +} + + +static void +cell_set_vertex_buffers(struct pipe_context *pipe, + unsigned count, + const struct pipe_vertex_buffer *buffers) +{ + struct cell_context *cell = cell_context(pipe); + + assert(count <= PIPE_MAX_ATTRIBS); + + memcpy(cell->vertex_buffer, buffers, count * sizeof(buffers[0])); + cell->num_vertex_buffers = count; + + cell->dirty |= CELL_NEW_VERTEX; + + draw_set_vertex_buffers(cell->draw, count, buffers); +} + + +void +cell_init_vertex_functions(struct cell_context *cell) +{ + cell->pipe.set_vertex_buffers = cell_set_vertex_buffers; + cell->pipe.set_vertex_elements = cell_set_vertex_elements; +} diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c new file mode 100644 index 0000000000..c9203fee08 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_surface.c @@ -0,0 +1,38 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_rect.h" +#include "cell_context.h" +#include "cell_surface.h" + + +void +cell_init_surface_functions(struct cell_context *cell) +{ + cell->pipe.surface_copy = util_surface_copy; + cell->pipe.surface_fill = util_surface_fill; +} diff --git a/src/gallium/drivers/cell/ppu/cell_surface.h b/src/gallium/drivers/cell/ppu/cell_surface.h new file mode 100644 index 0000000000..9e58f32944 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_surface.h @@ -0,0 +1,42 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. 
+ * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef CELL_SURFACE_H +#define CELL_SURFACE_H + + +struct cell_context; + + +extern void +cell_init_surface_functions(struct cell_context *cell); + + +#endif /* SP_SURFACE_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c new file mode 100644 index 0000000000..9f83ab8fa4 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_texture.c @@ -0,0 +1,528 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + * Michel Dänzer <michel@tungstengraphics.com> + * Brian Paul + */ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "cell_context.h" +#include "cell_state.h" +#include "cell_texture.h" + + + +static unsigned +minify(unsigned d) +{ + return MAX2(1, d>>1); +} + + +static void +cell_texture_layout(struct cell_texture *ct) +{ + struct pipe_texture *pt = &ct->base; + unsigned level; + unsigned width = pt->width[0]; + unsigned height = pt->height[0]; + unsigned depth = pt->depth[0]; + + ct->buffer_size = 0; + + for ( level = 0 ; level <= pt->last_level ; level++ ) { + unsigned size; + unsigned w_tile, h_tile; + + assert(level < CELL_MAX_TEXTURE_LEVELS); + + /* width, height, rounded up to tile size */ + w_tile = align(width, TILE_SIZE); + h_tile = align(height, TILE_SIZE); + + pt->width[level] = width; + pt->height[level] = height; + pt->depth[level] = depth; + pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w_tile); + pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h_tile); + + ct->stride[level] = pt->nblocksx[level] * pt->block.size; + + ct->level_offset[level] = ct->buffer_size; + + size = pt->nblocksx[level] * pt->nblocksy[level] * pt->block.size; + if (pt->target == PIPE_TEXTURE_CUBE) + size *= 6; + else + size *= depth; + + ct->buffer_size += size; + + width = minify(width); + height = minify(height); + depth = minify(depth); + } +} + + +static struct pipe_texture * +cell_texture_create(struct pipe_screen *screen, + const struct pipe_texture *templat) +{ + struct pipe_winsys *ws = screen->winsys; + struct cell_texture *ct = CALLOC_STRUCT(cell_texture); + if (!ct) + return NULL; + + ct->base = *templat; + ct->base.refcount = 1; + ct->base.screen = screen; + + cell_texture_layout(ct); + + ct->buffer = ws->buffer_create(ws, 32, PIPE_BUFFER_USAGE_PIXEL, + ct->buffer_size); + + if (!ct->buffer) { + FREE(ct); + return NULL; + } + + return &ct->base; +} + + +static void +cell_texture_release(struct pipe_screen *screen, + struct pipe_texture **pt) +{ + if (!*pt) + return; + + /* + DBG("%s %p refcount will be %d\n", + __FUNCTION__, (void *) *pt, (*pt)->refcount - 1); + */ + if (--(*pt)->refcount <= 0) { + /* Delete this texture now. + * But note that the underlying pipe_buffer may linger... + */ + struct cell_texture *ct = cell_texture(*pt); + uint i; + + /* + DBG("%s deleting %p\n", __FUNCTION__, (void *) ct); + */ + + pipe_buffer_reference(screen, &ct->buffer, NULL); + + for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { + /* Unreference the tiled image buffer. + * It may not actually be deleted until a fence is hit. + */ + if (ct->tiled_buffer[i]) { + ct->tiled_mapped[i] = NULL; + winsys_buffer_reference(screen->winsys, &ct->tiled_buffer[i], NULL); + } + } + + FREE(ct); + } + *pt = NULL; +} + + + +/** + * Convert image from linear layout to tiled layout. 4-byte pixels. 
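+ *
+ * Tiles are tile_size x tile_size pixels stored contiguously in row-major
+ * tile order, so the source pixel at (x, y) lands at
+ *   dst[((y / tile_size) * w_t + x / tile_size) * tile_size * tile_size
+ *       + (y % tile_size) * tile_size + x % tile_size]
+ * where w_t is the image width in tiles.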
+ */ +static void +twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, + uint src_stride, const uint *src) +{ + const uint tile_size2 = tile_size * tile_size; + const uint h_t = (h + tile_size - 1) / tile_size; + const uint w_t = (w + tile_size - 1) / tile_size; + + uint it, jt; /* tile counters */ + uint i, j; /* intra-tile counters */ + + src_stride /= 4; /* convert from bytes to pixels */ + + /* loop over dest tiles */ + for (it = 0; it < h_t; it++) { + for (jt = 0; jt < w_t; jt++) { + /* start of dest tile: */ + uint *tdst = dst + (it * w_t + jt) * tile_size2; + + /* compute size of this tile (may be smaller than tile_size) */ + /* XXX note: a compiler bug was found here. That's why the code + * looks as it does. + */ + uint tile_width = w - jt * tile_size; + tile_width = MIN2(tile_width, tile_size); + uint tile_height = h - it * tile_size; + tile_height = MIN2(tile_height, tile_size); + + /* loop over texels in the tile */ + for (i = 0; i < tile_height; i++) { + for (j = 0; j < tile_width; j++) { + const uint srci = it * tile_size + i; + const uint srcj = jt * tile_size + j; + ASSERT(srci < h); + ASSERT(srcj < w); + tdst[i * tile_size + j] = src[srci * src_stride + srcj]; + } + } + } + } +} + + +/** + * For Cell. Basically, rearrange the pixels/quads from this layout: + * +--+--+--+--+ + * |p0|p1|p2|p3|.... + * +--+--+--+--+ + * + * to this layout: + * +--+--+ + * |p0|p1|.... + * +--+--+ + * |p2|p3| + * +--+--+ + */ +static void +twiddle_tile(const uint *tileIn, uint *tileOut) +{ + int y, x; + + for (y = 0; y < TILE_SIZE; y+=2) { + for (x = 0; x < TILE_SIZE; x+=2) { + int k = 4 * (y/2 * TILE_SIZE/2 + x/2); + tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k]; + tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1]; + tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2]; + tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3]; + } + } +} + + +/** + * Convert image from tiled layout to linear layout. 4-byte pixels. + */ +static void +untwiddle_image_uint(uint w, uint h, uint tile_size, uint *dst, + uint dst_stride, const uint *src) +{ + const uint tile_size2 = tile_size * tile_size; + const uint h_t = (h + tile_size - 1) / tile_size; + const uint w_t = (w + tile_size - 1) / tile_size; + uint *tile_buf; + uint it, jt; /* tile counters */ + uint i, j; /* intra-tile counters */ + + dst_stride /= 4; /* convert from bytes to pixels */ + + tile_buf = align_malloc(tile_size * tile_size * 4, 16); + + /* loop over src tiles */ + for (it = 0; it < h_t; it++) { + for (jt = 0; jt < w_t; jt++) { + /* start of src tile: */ + const uint *tsrc = src + (it * w_t + jt) * tile_size2; + + twiddle_tile(tsrc, tile_buf); + tsrc = tile_buf; + + /* compute size of this tile (may be smaller than tile_size) */ + /* XXX note: a compiler bug was found here. That's why the code + * looks as it does. + */ + uint tile_width = w - jt * tile_size; + tile_width = MIN2(tile_width, tile_size); + uint tile_height = h - it * tile_size; + tile_height = MIN2(tile_height, tile_size); + + /* loop over texels in the tile */ + for (i = 0; i < tile_height; i++) { + for (j = 0; j < tile_width; j++) { + uint dsti = it * tile_size + i; + uint dstj = jt * tile_size + j; + ASSERT(dsti < h); + ASSERT(dstj < w); + dst[dsti * dst_stride + dstj] = tsrc[i * tile_size + j]; + } + } + } + } + + align_free(tile_buf); +} + + +/** + * Convert linear texture image data to tiled format for SPU usage. 
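+ *
+ * The tiled copy is kept in a separate winsys buffer (ct->tiled_buffer);
+ * it is allocated and mapped the first time a level is twiddled.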
+ */ +static void +cell_twiddle_texture(struct pipe_screen *screen, + struct pipe_surface *surface) +{ + struct cell_texture *ct = cell_texture(surface->texture); + const uint level = surface->level; + const uint texWidth = ct->base.width[level]; + const uint texHeight = ct->base.height[level]; + const uint bufWidth = align(texWidth, TILE_SIZE); + const uint bufHeight = align(texHeight, TILE_SIZE); + const void *map = pipe_buffer_map(screen, surface->buffer, + PIPE_BUFFER_USAGE_CPU_READ); + const uint *src = (const uint *) ((const ubyte *) map + surface->offset); + + switch (ct->base.format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_B8G8R8A8_UNORM: + case PIPE_FORMAT_S8Z24_UNORM: + { + int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1; + int offset = bufWidth * bufHeight * 4 * surface->face; + uint *dst; + + if (!ct->tiled_buffer[level]) { + /* allocate buffer for tiled data now */ + struct pipe_winsys *ws = screen->winsys; + uint bytes = bufWidth * bufHeight * 4 * numFaces; + ct->tiled_buffer[level] = ws->buffer_create(ws, 16, + PIPE_BUFFER_USAGE_PIXEL, + bytes); + /* and map it */ + ct->tiled_mapped[level] = ws->buffer_map(ws, ct->tiled_buffer[level], + PIPE_BUFFER_USAGE_GPU_READ); + } + dst = (uint *) ((ubyte *) ct->tiled_mapped[level] + offset); + + twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst, + surface->stride, src); + } + break; + default: + printf("Cell: twiddle unsupported texture format %s\n", pf_name(ct->base.format)); + ; + } + + pipe_buffer_unmap(screen, surface->buffer); +} + + +/** + * Convert SPU tiled texture image data to linear format for app usage. + */ +static void +cell_untwiddle_texture(struct pipe_screen *screen, + struct pipe_surface *surface) +{ + struct cell_texture *ct = cell_texture(surface->texture); + const uint level = surface->level; + const uint texWidth = ct->base.width[level]; + const uint texHeight = ct->base.height[level]; + const void *map = pipe_buffer_map(screen, surface->buffer, + PIPE_BUFFER_USAGE_CPU_READ); + const uint *src = (const uint *) ((const ubyte *) map + surface->offset); + + switch (ct->base.format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_B8G8R8A8_UNORM: + case PIPE_FORMAT_S8Z24_UNORM: + { + int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 
6 : 1; + int offset = surface->stride * texHeight * 4 * surface->face; + uint *dst; + + if (!ct->untiled_data[level]) { + ct->untiled_data[level] = + align_malloc(surface->stride * texHeight * 4 * numFaces, 16); + } + + dst = (uint *) ((ubyte *) ct->untiled_data[level] + offset); + + untwiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst, + surface->stride, src); + } + break; + default: + { + ct->untiled_data[level] = NULL; + printf("Cell: untwiddle unsupported texture format %s\n", pf_name(ct->base.format)); + } + } + + pipe_buffer_unmap(screen, surface->buffer); +} + + +static struct pipe_surface * +cell_get_tex_surface(struct pipe_screen *screen, + struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned usage) +{ + struct pipe_winsys *ws = screen->winsys; + struct cell_texture *ct = cell_texture(pt); + struct pipe_surface *ps; + + ps = ws->surface_alloc(ws); + if (ps) { + assert(ps->refcount); + assert(ps->winsys); + winsys_buffer_reference(ws, &ps->buffer, ct->buffer); + ps->format = pt->format; + ps->block = pt->block; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = ct->stride[level]; + ps->offset = ct->level_offset[level]; + ps->usage = usage; + + /* XXX may need to override usage flags (see sp_texture.c) */ + + pipe_texture_reference(&ps->texture, pt); + ps->face = face; + ps->level = level; + ps->zslice = zslice; + + if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) { + ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) * + ps->nblocksy * + ps->stride; + } + else { + assert(face == 0); + assert(zslice == 0); + } + + if (ps->usage & PIPE_BUFFER_USAGE_CPU_READ) { + /* convert from tiled to linear layout */ + cell_untwiddle_texture(screen, ps); + } + } + return ps; +} + + +static void +cell_tex_surface_release(struct pipe_screen *screen, + struct pipe_surface **s) +{ + struct cell_texture *ct = cell_texture((*s)->texture); + const uint level = (*s)->level; + + if (((*s)->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) + { + align_free(ct->untiled_data[level]); + ct->untiled_data[level] = NULL; + } + + /* XXX if done rendering to teximage, re-tile */ + + pipe_texture_reference(&(*s)->texture, NULL); + + screen->winsys->surface_release(screen->winsys, s); +} + + +static void * +cell_surface_map(struct pipe_screen *screen, + struct pipe_surface *surface, + unsigned flags) +{ + ubyte *map; + struct cell_texture *ct = cell_texture(surface->texture); + const uint level = surface->level; + + assert(ct); + + if (flags & ~surface->usage) { + assert(0); + return NULL; + } + + map = pipe_buffer_map( screen, surface->buffer, flags ); + if (map == NULL) + return NULL; + else + { + if ((surface->usage & PIPE_BUFFER_USAGE_CPU_READ) && (ct->untiled_data[level])) { + return (void *) ((ubyte *) ct->untiled_data[level] + surface->offset); + } + else { + return (void *) (map + surface->offset); + } + } +} + + +static void +cell_surface_unmap(struct pipe_screen *screen, + struct pipe_surface *surface) +{ + struct cell_texture *ct = cell_texture(surface->texture); + + assert(ct); + + if ((ct->base.tex_usage & PIPE_TEXTURE_USAGE_SAMPLER) && + (surface->usage & PIPE_BUFFER_USAGE_CPU_WRITE)) { + /* convert from linear to tiled layout */ + cell_twiddle_texture(screen, surface); + } + + pipe_buffer_unmap( screen, surface->buffer ); +} + + + +void +cell_init_screen_texture_funcs(struct pipe_screen *screen) +{ + 
screen->texture_create = cell_texture_create; + screen->texture_release = cell_texture_release; + + screen->get_tex_surface = cell_get_tex_surface; + screen->tex_surface_release = cell_tex_surface_release; + + screen->surface_map = cell_surface_map; + screen->surface_unmap = cell_surface_unmap; +} diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h new file mode 100644 index 0000000000..7018b0c9bf --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_texture.h @@ -0,0 +1,72 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef CELL_TEXTURE_H +#define CELL_TEXTURE_H + + +struct cell_context; +struct pipe_texture; + + +/** + * Subclass of pipe_texture + */ +struct cell_texture +{ + struct pipe_texture base; + + unsigned long level_offset[CELL_MAX_TEXTURE_LEVELS]; + unsigned long stride[CELL_MAX_TEXTURE_LEVELS]; + + /* The data is held here: + */ + struct pipe_buffer *buffer; + unsigned long buffer_size; + + /** Texture data in tiled layout is held here */ + struct pipe_buffer *tiled_buffer[CELL_MAX_TEXTURE_LEVELS]; + /** Mapped, tiled texture data */ + void *tiled_mapped[CELL_MAX_TEXTURE_LEVELS]; + void *untiled_data[CELL_MAX_TEXTURE_LEVELS]; +}; + + +/** cast wrapper */ +static INLINE struct cell_texture * +cell_texture(struct pipe_texture *pt) +{ + return (struct cell_texture *) pt; +} + + + +extern void +cell_init_screen_texture_funcs(struct pipe_screen *screen); + + +#endif /* CELL_TEXTURE_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c new file mode 100644 index 0000000000..65ba51b6bb --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c @@ -0,0 +1,307 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Vertex buffer code. The draw module transforms vertices to window + * coords, etc. and emits the vertices into buffer supplied by this module. + * When a vertex buffer is full, or we flush, we'll send the vertex data + * to the SPUs. + * + * Authors + * Brian Paul + */ + + +#include "cell_batch.h" +#include "cell_context.h" +#include "cell_fence.h" +#include "cell_flush.h" +#include "cell_spu.h" +#include "cell_vbuf.h" +#include "draw/draw_vbuf.h" +#include "util/u_memory.h" + + +/** Allow vertex data to be inlined after RENDER command */ +#define ALLOW_INLINE_VERTS 1 + + +/** + * Subclass of vbuf_render because we need a cell_context pointer in + * a few places. + */ +struct cell_vbuf_render +{ + struct vbuf_render base; + struct cell_context *cell; + uint prim; /**< PIPE_PRIM_x */ + uint vertex_size; /**< in bytes */ + void *vertex_buffer; /**< just for debug, really */ + uint vertex_buf; /**< in [0, CELL_NUM_BUFFERS-1] */ +}; + + +/** cast wrapper */ +static struct cell_vbuf_render * +cell_vbuf_render(struct vbuf_render *vbr) +{ + return (struct cell_vbuf_render *) vbr; +} + + + +static const struct vertex_info * +cell_vbuf_get_vertex_info(struct vbuf_render *vbr) +{ + struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + return &cvbr->cell->vertex_info; +} + + +static void * +cell_vbuf_allocate_vertices(struct vbuf_render *vbr, + ushort vertex_size, ushort nr_vertices) +{ + struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + /*printf("Alloc verts %u * %u\n", vertex_size, nr_vertices);*/ + + assert(cvbr->vertex_buf == ~0); + cvbr->vertex_buf = cell_get_empty_buffer(cvbr->cell); + cvbr->vertex_buffer = cvbr->cell->buffer[cvbr->vertex_buf]; + cvbr->vertex_size = vertex_size; + return cvbr->vertex_buffer; +} + + +static void +cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, + unsigned vertex_size, unsigned vertices_used) +{ + struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + struct cell_context *cell = cvbr->cell; + + /* + printf("%s vertex_buf = %u count = %u\n", + __FUNCTION__, cvbr->vertex_buf, vertices_used); + */ + + /* Make sure texture buffers aren't released until we're done rendering + * with them. 
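+    * cell_add_fenced_textures() ties the currently referenced textures to
+    * the batch's fence, so they should stay alive at least until the SPUs
+    * signal that fence.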
+ */ + cell_add_fenced_textures(cell); + + /* Tell SPUs they can release the vert buf */ + if (cvbr->vertex_buf != ~0U) { + struct cell_command_release_verts *release + = (struct cell_command_release_verts *) + cell_batch_alloc(cell, sizeof(struct cell_command_release_verts)); + release->opcode = CELL_CMD_RELEASE_VERTS; + release->vertex_buf = cvbr->vertex_buf; + } + + cvbr->vertex_buf = ~0; + cell_flush_int(cell, 0x0); + + assert(vertices == cvbr->vertex_buffer); + cvbr->vertex_buffer = NULL; +} + + + +static boolean +cell_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim) +{ + struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + cvbr->prim = prim; + /*printf("cell_set_prim %u\n", prim);*/ + return TRUE; +} + + +static void +cell_vbuf_draw(struct vbuf_render *vbr, + const ushort *indices, + uint nr_indices) +{ + struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + struct cell_context *cell = cvbr->cell; + float xmin, ymin, xmax, ymax; + uint i; + uint nr_vertices = 0, min_index = ~0; + const void *vertices = cvbr->vertex_buffer; + const uint vertex_size = cvbr->vertex_size; + + for (i = 0; i < nr_indices; i++) { + if (indices[i] > nr_vertices) + nr_vertices = indices[i]; + if (indices[i] < min_index) + min_index = indices[i]; + } + nr_vertices++; + +#if 0 + /*if (min_index > 0)*/ + printf("%s min_index = %u\n", __FUNCTION__, min_index); +#endif + +#if 0 + printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u\n", + nr_indices, nr_vertices); + printf(" "); + for (i = 0; i < nr_indices; i += 3) { + printf("%u %u %u, ", indices[i+0], indices[i+1], indices[i+2]); + } + printf("\n"); +#elif 0 + printf("cell_vbuf_draw() nr_indices = %u nr_verts = %u indexes = [%u %u %u ...]\n", + nr_indices, nr_vertices, + indices[0], indices[1], indices[2]); + printf("ind space = %u, vert space = %u, space = %u\n", + nr_indices * 2, + nr_vertices * 4 * cell->vertex_info.size, + cell_batch_free_space(cell)); +#endif + + /* compute x/y bounding box */ + xmin = ymin = 1e50; + xmax = ymax = -1e50; + for (i = min_index; i < nr_vertices; i++) { + const float *v = (float *) ((ubyte *) vertices + i * vertex_size); + if (v[0] < xmin) + xmin = v[0]; + if (v[0] > xmax) + xmax = v[0]; + if (v[1] < ymin) + ymin = v[1]; + if (v[1] > ymax) + ymax = v[1]; + } +#if 0 + printf("PPU Bounds %g, %g .. %g, %g\n", xmin, ymin, xmax, ymax); + fflush(stdout); +#endif + + if (cvbr->prim != PIPE_PRIM_TRIANGLES) + return; /* only render tris for now */ + + /* build/insert batch RENDER command */ + { + const uint index_bytes = ROUNDUP8(nr_indices * 2); + const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size; + const uint batch_size = sizeof(struct cell_command_render) + index_bytes; + + struct cell_command_render *render + = (struct cell_command_render *) + cell_batch_alloc(cell, batch_size); + + render->opcode = CELL_CMD_RENDER; + render->prim_type = cvbr->prim; + + render->num_indexes = nr_indices; + render->min_index = min_index; + + /* append indices after render command */ + memcpy(render + 1, indices, nr_indices * 2); + + /* if there's room, append vertices after the indices, else leave + * vertices in the original/separate buffer. 
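+       * When inlined, the batch buffer roughly looks like
+       *   [cell_command_render][indices, rounded up to 8 bytes][verts, 16-aligned]
+       * otherwise the SPUs are pointed at the separate buffer via
+       * render->vertex_buf.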
+ */ + render->vertex_size = 4 * cell->vertex_info.size; + render->num_verts = nr_vertices; + if (ALLOW_INLINE_VERTS && + min_index == 0 && + vertex_bytes + 16 <= cell_batch_free_space(cell)) { + /* vertex data inlined, after indices, at 16-byte boundary */ + void *dst = cell_batch_alloc_aligned(cell, vertex_bytes, 16); + memcpy(dst, vertices, vertex_bytes); + render->inline_verts = TRUE; + render->vertex_buf = ~0; + } + else { + /* vertex data in separate buffer */ + render->inline_verts = FALSE; + ASSERT(cvbr->vertex_buf >= 0); + render->vertex_buf = cvbr->vertex_buf; + } + + render->xmin = xmin; + render->ymin = ymin; + render->xmax = xmax; + render->ymax = ymax; + } + +#if 0 + /* helpful for debug */ + cell_flush_int(cell, CELL_FLUSH_WAIT); +#endif +} + + +static void +cell_vbuf_destroy(struct vbuf_render *vbr) +{ + struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr); + cvbr->cell->vbuf_render = NULL; + FREE(cvbr); +} + + +/** + * Initialize the post-transform vertex buffer information for the given + * context. + */ +void +cell_init_vbuf(struct cell_context *cell) +{ + assert(cell->draw); + + cell->vbuf_render = CALLOC_STRUCT(cell_vbuf_render); + + /* The max number of indexes is what can fix into a batch buffer, + * minus the render and release-verts commands. + */ + cell->vbuf_render->base.max_indices + = (CELL_BUFFER_SIZE + - sizeof(struct cell_command_render) + - sizeof(struct cell_command_release_verts)) + / sizeof(ushort); + cell->vbuf_render->base.max_vertex_buffer_bytes = CELL_BUFFER_SIZE; + + cell->vbuf_render->base.get_vertex_info = cell_vbuf_get_vertex_info; + cell->vbuf_render->base.allocate_vertices = cell_vbuf_allocate_vertices; + cell->vbuf_render->base.set_primitive = cell_vbuf_set_primitive; + cell->vbuf_render->base.draw = cell_vbuf_draw; + cell->vbuf_render->base.release_vertices = cell_vbuf_release_vertices; + cell->vbuf_render->base.destroy = cell_vbuf_destroy; + + cell->vbuf_render->cell = cell; +#if 1 + cell->vbuf_render->vertex_buf = ~0; +#endif + + cell->vbuf = draw_vbuf_stage(cell->draw, &cell->vbuf_render->base); +} diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.h b/src/gallium/drivers/cell/ppu/cell_vbuf.h new file mode 100644 index 0000000000..d265cbf770 --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_vbuf.h @@ -0,0 +1,38 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef CELL_VBUF_H +#define CELL_VBUF_H + + +struct cell_context; + +extern void +cell_init_vbuf(struct cell_context *cell); + + +#endif /* CELL_VBUF_H */ diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c new file mode 100644 index 0000000000..9cba537d9e --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c @@ -0,0 +1,346 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <inttypes.h> +#include "pipe/p_defines.h" +#include "pipe/p_context.h" +#include "pipe/p_format.h" + +#include "../auxiliary/draw/draw_context.h" +#include "../auxiliary/draw/draw_private.h" + +#include "cell_context.h" +#include "rtasm/rtasm_ppc_spe.h" + + +/** + * Emit a 4x4 matrix transpose operation + * + * \param p Function that the transpose operation is to be appended to + * \param row0 Register containing row 0 of the source matrix + * \param row1 Register containing row 1 of the source matrix + * \param row2 Register containing row 2 of the source matrix + * \param row3 Register containing row 3 of the source matrix + * \param dest_ptr Register containing the address of the destination matrix + * \param shuf_ptr Register containing the address of the shuffled data + * \param count Number of colums to actually be written to the destination + * + * \note + * This function assumes that the registers named by \c row0, \c row1, + * \c row2, and \c row3 are scratch and can be modified by the generated code. + * Furthermore, these registers will be released, via calls to + * \c release_register, by this function. + * + * \note + * This function requires that four temporary are available on entry. 
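+ *
+ * The transpose itself is two rounds of shufb: the rows are first
+ * interleaved pairwise (row0/row2 and row1/row3, using the shuf_hi and
+ * shuf_lo patterns loaded from \c shuf_ptr), then the intermediates are
+ * interleaved again to produce the columns.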
+ */ +static void +emit_matrix_transpose(struct spe_function *p, + unsigned row0, unsigned row1, unsigned row2, + unsigned row3, unsigned dest_ptr, + unsigned shuf_ptr, unsigned count) +{ + int shuf_hi = spe_allocate_available_register(p); + int shuf_lo = spe_allocate_available_register(p); + int t1 = spe_allocate_available_register(p); + int t2 = spe_allocate_available_register(p); + int t3; + int t4; + int col0; + int col1; + int col2; + int col3; + + + spe_lqd(p, shuf_hi, shuf_ptr, 3*16); + spe_lqd(p, shuf_lo, shuf_ptr, 4*16); + spe_shufb(p, t1, row0, row2, shuf_hi); + spe_shufb(p, t2, row0, row2, shuf_lo); + + + /* row0 and row2 are now no longer needed. Re-use those registers as + * temporaries. + */ + t3 = row0; + t4 = row2; + + spe_shufb(p, t3, row1, row3, shuf_hi); + spe_shufb(p, t4, row1, row3, shuf_lo); + + + /* row1 and row3 are now no longer needed. Re-use those registers as + * temporaries. + */ + col0 = row1; + col1 = row3; + + spe_shufb(p, col0, t1, t3, shuf_hi); + if (count > 1) { + spe_shufb(p, col1, t1, t3, shuf_lo); + } + + /* t1 and t3 are now no longer needed. Re-use those registers as + * temporaries. + */ + col2 = t1; + col3 = t3; + + if (count > 2) { + spe_shufb(p, col2, t2, t4, shuf_hi); + } + + if (count > 3) { + spe_shufb(p, col3, t2, t4, shuf_lo); + } + + + /* Store the results. Remember that the stqd instruction is encoded using + * the qword offset (stand-alone assemblers to the byte-offset to + * qword-offset conversion for you), so the byte-offset needs be divided by + * 16. + */ + switch (count) { + case 4: + spe_stqd(p, col3, dest_ptr, 3 * 16); + case 3: + spe_stqd(p, col2, dest_ptr, 2 * 16); + case 2: + spe_stqd(p, col1, dest_ptr, 1 * 16); + case 1: + spe_stqd(p, col0, dest_ptr, 0 * 16); + } + + + /* Release all of the temporary registers used. 
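+    * Note that col0..col3 alias row1, row3, t1 and row0 (via t3), and t4
+    * aliases row2, so this also returns the caller's row registers as
+    * promised in the function comment above.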
+ */ + spe_release_register(p, col0); + spe_release_register(p, col1); + spe_release_register(p, col2); + spe_release_register(p, col3); + spe_release_register(p, shuf_hi); + spe_release_register(p, shuf_lo); + spe_release_register(p, t2); + spe_release_register(p, t4); +} + + +#if 0 +/* This appears to not be used currently */ +static void +emit_fetch(struct spe_function *p, + unsigned in_ptr, unsigned *offset, + unsigned out_ptr, unsigned shuf_ptr, + enum pipe_format format) +{ + const unsigned count = (pf_size_x(format) != 0) + (pf_size_y(format) != 0) + + (pf_size_z(format) != 0) + (pf_size_w(format) != 0); + const unsigned type = pf_type(format); + const unsigned bytes = pf_size_x(format); + + int v0 = spe_allocate_available_register(p); + int v1 = spe_allocate_available_register(p); + int v2 = spe_allocate_available_register(p); + int v3 = spe_allocate_available_register(p); + int tmp = spe_allocate_available_register(p); + int float_zero = -1; + int float_one = -1; + float scale_signed = 0.0; + float scale_unsigned = 0.0; + + spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16); + spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16); + spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16); + spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16); + offset[0] += 4; + + switch (bytes) { + case 1: + scale_signed = 1.0f / 127.0f; + scale_unsigned = 1.0f / 255.0f; + spe_lqd(p, tmp, shuf_ptr, 1 * 16); + spe_shufb(p, v0, v0, v0, tmp); + spe_shufb(p, v1, v1, v1, tmp); + spe_shufb(p, v2, v2, v2, tmp); + spe_shufb(p, v3, v3, v3, tmp); + break; + case 2: + scale_signed = 1.0f / 32767.0f; + scale_unsigned = 1.0f / 65535.0f; + spe_lqd(p, tmp, shuf_ptr, 2 * 16); + spe_shufb(p, v0, v0, v0, tmp); + spe_shufb(p, v1, v1, v1, tmp); + spe_shufb(p, v2, v2, v2, tmp); + spe_shufb(p, v3, v3, v3, tmp); + break; + case 4: + scale_signed = 1.0f / 2147483647.0f; + scale_unsigned = 1.0f / 4294967295.0f; + break; + default: + assert(0); + break; + } + + switch (type) { + case PIPE_FORMAT_TYPE_FLOAT: + break; + case PIPE_FORMAT_TYPE_UNORM: + spe_ilhu(p, tmp, ((unsigned) scale_unsigned) >> 16); + spe_iohl(p, tmp, ((unsigned) scale_unsigned) & 0x0ffff); + spe_cuflt(p, v0, v0, 0); + spe_fm(p, v0, v0, tmp); + break; + case PIPE_FORMAT_TYPE_SNORM: + spe_ilhu(p, tmp, ((unsigned) scale_signed) >> 16); + spe_iohl(p, tmp, ((unsigned) scale_signed) & 0x0ffff); + spe_csflt(p, v0, v0, 0); + spe_fm(p, v0, v0, tmp); + break; + case PIPE_FORMAT_TYPE_USCALED: + spe_cuflt(p, v0, v0, 0); + break; + case PIPE_FORMAT_TYPE_SSCALED: + spe_csflt(p, v0, v0, 0); + break; + } + + + if (count < 4) { + float_one = spe_allocate_available_register(p); + spe_il(p, float_one, 1); + spe_cuflt(p, float_one, float_one, 0); + + if (count < 3) { + float_zero = spe_allocate_available_register(p); + spe_il(p, float_zero, 0); + } + } + + spe_release_register(p, tmp); + + emit_matrix_transpose(p, v0, v1, v2, v3, out_ptr, shuf_ptr, count); + + switch (count) { + case 1: + spe_stqd(p, float_zero, out_ptr, 1 * 16); + case 2: + spe_stqd(p, float_zero, out_ptr, 2 * 16); + case 3: + spe_stqd(p, float_one, out_ptr, 3 * 16); + } + + if (float_zero != -1) { + spe_release_register(p, float_zero); + } + + if (float_one != -1) { + spe_release_register(p, float_one); + } +} +#endif + + +void cell_update_vertex_fetch(struct draw_context *draw) +{ +#if 0 + struct cell_context *const cell = + (struct cell_context *) draw->driver_private; + struct spe_function *p = &cell->attrib_fetch; + unsigned function_index[PIPE_MAX_ATTRIBS]; + unsigned unique_attr_formats; + int out_ptr; + int in_ptr; + int 
shuf_ptr; + unsigned i; + unsigned j; + + + /* Determine how many unique input attribute formats there are. At the + * same time, store the index of the lowest numbered attribute that has + * the same format as any non-unique format. + */ + unique_attr_formats = 1; + function_index[0] = 0; + for (i = 1; i < draw->vertex_fetch.nr_attrs; i++) { + const enum pipe_format curr_fmt = draw->vertex_element[i].src_format; + + for (j = 0; j < i; j++) { + if (curr_fmt == draw->vertex_element[j].src_format) { + break; + } + } + + if (j == i) { + unique_attr_formats++; + } + + function_index[i] = j; + } + + + /* Each fetch function can be a maximum of 34 instructions (note: this is + * actually a slight over-estimate). + */ + spe_init_func(p, 34 * SPE_INST_SIZE * unique_attr_formats); + + + /* Allocate registers for the function's input parameters. + */ + out_ptr = spe_allocate_register(p, 3); + in_ptr = spe_allocate_register(p, 4); + shuf_ptr = spe_allocate_register(p, 5); + + + /* Generate code for the individual attribute fetch functions. + */ + for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) { + unsigned offset; + + if (function_index[i] == i) { + cell->attrib_fetch_offsets[i] = (unsigned) ((void *) p->csr + - (void *) p->store); + + offset = 0; + emit_fetch(p, in_ptr, &offset, out_ptr, shuf_ptr, + draw->vertex_element[i].src_format); + spe_bi(p, 0, 0, 0); + + /* Round up to the next 16-byte boundary. + */ + if ((((unsigned) p->store) & 0x0f) != 0) { + const unsigned align = ((unsigned) p->store) & 0x0f; + p->store = (uint32_t *) (((void *) p->store) + align); + } + } else { + /* Use the same function entry-point as a previously seen attribute + * with the same format. + */ + cell->attrib_fetch_offsets[i] = + cell->attrib_fetch_offsets[function_index[i]]; + } + } +#else + assert(0); +#endif +} diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c new file mode 100644 index 0000000000..2b10c116fa --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c @@ -0,0 +1,146 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file cell_vertex_shader.c + * Vertex shader interface routines for Cell. 
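+ * (The body of cell_vertex_shader_queue_flush() below is currently
+ * compiled out; for now vertex transformation is handled on the PPU by
+ * the draw module, as noted in cell_vbuf.c.)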
+ * + * \author Ian Romanick <idr@us.ibm.com> + */ + +#include "pipe/p_defines.h" +#include "pipe/p_context.h" +#include "pipe/p_winsys.h" +#include "util/u_math.h" + +#include "cell_context.h" +#include "cell_draw_arrays.h" +#include "cell_flush.h" +#include "cell_spu.h" +#include "cell_batch.h" + +#include "cell/common.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" + +/** + * Run the vertex shader on all vertices in the vertex queue. + * Called by the draw module when the vertx cache needs to be flushed. + */ +void +cell_vertex_shader_queue_flush(struct draw_context *draw) +{ +#if 0 + struct cell_context *const cell = + (struct cell_context *) draw->driver_private; + struct cell_command_vs *const vs = &cell_global.command[0].vs; + uint64_t *batch; + struct cell_array_info *array_info; + unsigned i, j; + struct cell_attribute_fetch_code *cf; + + assert(draw->vs.queue_nr != 0); + + /* XXX: do this on statechange: + */ + draw_update_vertex_fetch(draw); + cell_update_vertex_fetch(draw); + + + batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*cf)); + batch[0] = CELL_CMD_STATE_ATTRIB_FETCH; + cf = (struct cell_attribute_fetch_code *) (&batch[1]); + cf->base = (uint64_t) cell->attrib_fetch.store; + cf->size = ROUNDUP16((unsigned)((void *) cell->attrib_fetch.csr + - (void *) cell->attrib_fetch.store)); + + + for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) { + const enum pipe_format format = draw->vertex_element[i].src_format; + const unsigned count = ((pf_size_x(format) != 0) + + (pf_size_y(format) != 0) + + (pf_size_z(format) != 0) + + (pf_size_w(format) != 0)); + const unsigned size = pf_size_x(format) * count; + + batch = cell_batch_alloc(cell, sizeof(batch[0]) + sizeof(*array_info)); + + batch[0] = CELL_CMD_STATE_VS_ARRAY_INFO; + + array_info = (struct cell_array_info *) &batch[1]; + assert(draw->vertex_fetch.src_ptr[i] != NULL); + array_info->base = (uintptr_t) draw->vertex_fetch.src_ptr[i]; + array_info->attr = i; + array_info->pitch = draw->vertex_fetch.pitch[i]; + array_info->size = size; + array_info->function_offset = cell->attrib_fetch_offsets[i]; + } + + batch = cell_batch_alloc(cell, sizeof(batch[0]) + + sizeof(struct pipe_viewport_state)); + batch[0] = CELL_CMD_STATE_VIEWPORT; + (void) memcpy(&batch[1], &draw->viewport, + sizeof(struct pipe_viewport_state)); + + { + uint64_t uniforms = (uintptr_t) draw->user.constants; + + batch = cell_batch_alloc(cell, 2 *sizeof(batch[0])); + batch[0] = CELL_CMD_STATE_UNIFORMS; + batch[1] = uniforms; + } + + cell_batch_flush(cell); + + vs->opcode = CELL_CMD_VS_EXECUTE; + vs->nr_attrs = draw->vertex_fetch.nr_attrs; + + (void) memcpy(vs->plane, draw->plane, sizeof(draw->plane)); + vs->nr_planes = draw->nr_planes; + + for (i = 0; i < draw->vs.queue_nr; i += SPU_VERTS_PER_BATCH) { + const unsigned n = MIN2(SPU_VERTS_PER_BATCH, draw->vs.queue_nr - i); + + for (j = 0; j < n; j++) { + vs->elts[j] = draw->vs.queue[i + j].elt; + vs->vOut[j] = (uintptr_t) draw->vs.queue[i + j].vertex; + } + + for (/* empty */; j < SPU_VERTS_PER_BATCH; j++) { + vs->elts[j] = vs->elts[0]; + vs->vOut[j] = (uintptr_t) draw->vs.queue[i + j].vertex; + } + + vs->num_elts = n; + send_mbox_message(cell_global.spe_contexts[0], CELL_CMD_VS_EXECUTE); + + cell_flush_int(cell, CELL_FLUSH_WAIT); + } + + draw->vs.post_nr = draw->vs.queue_nr; + draw->vs.queue_nr = 0; +#else + assert(0); +#endif +} diff --git a/src/gallium/drivers/cell/ppu/cell_winsys.c b/src/gallium/drivers/cell/ppu/cell_winsys.c new file mode 100644 index 0000000000..d570bbd2f9 --- /dev/null 
+++ b/src/gallium/drivers/cell/ppu/cell_winsys.c @@ -0,0 +1,40 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "util/u_memory.h" +#include "cell_winsys.h" + + +struct cell_winsys * +cell_get_winsys(uint format) +{ + struct cell_winsys *cws = CALLOC_STRUCT(cell_winsys); + if (cws) + cws->preferredFormat = format; + return cws; +} diff --git a/src/gallium/drivers/cell/ppu/cell_winsys.h b/src/gallium/drivers/cell/ppu/cell_winsys.h new file mode 100644 index 0000000000..ae2af5696b --- /dev/null +++ b/src/gallium/drivers/cell/ppu/cell_winsys.h @@ -0,0 +1,50 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef CELL_WINSYS_H +#define CELL_WINSYS_H + +#include "pipe/p_compiler.h" + + +/** + * Very simple winsys at this time. + * Will probably eventually add SPU control info. 
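+ * For now it only records the color format preferred by the window system,
+ * as filled in by cell_get_winsys().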
+ */ +struct cell_winsys +{ + uint preferredFormat; +}; + + +extern struct cell_winsys * +cell_get_winsys(uint format); + + + +#endif diff --git a/src/gallium/drivers/cell/spu/.gitignore b/src/gallium/drivers/cell/spu/.gitignore new file mode 100644 index 0000000000..2be9a2d324 --- /dev/null +++ b/src/gallium/drivers/cell/spu/.gitignore @@ -0,0 +1 @@ +g3d_spu diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile new file mode 100644 index 0000000000..116453b79c --- /dev/null +++ b/src/gallium/drivers/cell/spu/Makefile @@ -0,0 +1,82 @@ +# Gallium3D Cell driver: SPU code + +# This makefile builds the g3d_spu.a file that's linked into the +# PPU code/library. + + +TOP = ../../../../.. +include $(TOP)/configs/current + + +PROG = g3d + +PROG_SPU = $(PROG)_spu +PROG_SPU_A = $(PROG)_spu.a +PROG_SPU_EMBED_O = $(PROG)_spu-embed.o + + +SOURCES = \ + spu_command.c \ + spu_dcache.c \ + spu_funcs.c \ + spu_main.c \ + spu_per_fragment_op.c \ + spu_render.c \ + spu_texture.c \ + spu_tile.c \ + spu_tri.c + +OLD_SOURCES = \ + spu_exec.c \ + spu_util.c \ + spu_vertex_fetch.c \ + spu_vertex_shader.c + + +SPU_OBJECTS = $(SOURCES:.c=.o) \ + +SPU_ASM_OUT = $(SOURCES:.c=.s) \ + +INCLUDE_DIRS = \ + -I$(TOP)/src/mesa \ + -I$(TOP)/src/gallium/include \ + -I$(TOP)/src/gallium/auxiliary \ + -I$(TOP)/src/gallium/drivers + + +.c.o: + $(SPU_CC) $(SPU_CFLAGS) -c $< + +.c.s: + $(SPU_CC) $(SPU_CFLAGS) -O3 -S $< + + +# The .a file will be linked into the main/PPU executable +default: $(PROG_SPU_A) + +$(PROG_SPU_A): $(PROG_SPU_EMBED_O) + $(SPU_AR) $(SPU_AR_FLAGS) $(PROG_SPU_A) $(PROG_SPU_EMBED_O) + +$(PROG_SPU_EMBED_O): $(PROG_SPU) + $(SPU_EMBED) $(SPU_EMBED_FLAGS) $(PROG_SPU) $(PROG_SPU) $(PROG_SPU_EMBED_O) + +$(PROG_SPU): $(SPU_OBJECTS) + $(SPU_CC) -o $(PROG_SPU) $(SPU_OBJECTS) $(SPU_LFLAGS) + + + +asmfiles: $(SPU_ASM_OUT) + + +clean: + rm -f *~ *.o *.a *.d *.s $(PROG_SPU) + + + +depend: $(SOURCES) + rm -f depend + touch depend + $(MKDEP) $(MKDEP_OPTIONS) $(INCLUDE_DIRS) $(SOURCES) 2> /dev/null + +include depend + diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h new file mode 100644 index 0000000000..d7ce005524 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_colorpack.h @@ -0,0 +1,145 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +#ifndef SPU_COLORPACK_H +#define SPU_COLORPACK_H + + +#include <transpose_matrix4x4.h> +#include <spu_intrinsics.h> + + +static INLINE unsigned int +spu_pack_R8G8B8A8(vector float rgba) +{ + vector unsigned int out = spu_convtu(rgba, 32); + + out = spu_shuffle(out, out, ((vector unsigned char) { + 0, 4, 8, 12, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }) ); + + return spu_extract(out, 0); +} + + +static INLINE unsigned int +spu_pack_A8R8G8B8(vector float rgba) +{ + vector unsigned int out = spu_convtu(rgba, 32); + out = spu_shuffle(out, out, ((vector unsigned char) { + 12, 0, 4, 8, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}) ); + return spu_extract(out, 0); +} + + +static INLINE unsigned int +spu_pack_B8G8R8A8(vector float rgba) +{ + vector unsigned int out = spu_convtu(rgba, 32); + out = spu_shuffle(out, out, ((vector unsigned char) { + 8, 4, 0, 12, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}) ); + return spu_extract(out, 0); +} + + +static INLINE unsigned int +spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle) +{ + vector unsigned int out = spu_convtu(rgba, 32); + out = spu_shuffle(out, out, shuffle); + return spu_extract(out, 0); +} + + +static INLINE vector float +spu_unpack_B8G8R8A8(uint color) +{ + vector unsigned int color_u4 = spu_splats(color); + color_u4 = spu_shuffle(color_u4, color_u4, + ((vector unsigned char) { + 2, 2, 2, 2, + 1, 1, 1, 1, + 0, 0, 0, 0, + 3, 3, 3, 3}) ); + return spu_convtf(color_u4, 32); +} + + +static INLINE vector float +spu_unpack_A8R8G8B8(uint color) +{ + vector unsigned int color_u4 = spu_splats(color); + color_u4 = spu_shuffle(color_u4, color_u4, + ((vector unsigned char) { + 1, 1, 1, 1, + 2, 2, 2, 2, + 3, 3, 3, 3, + 0, 0, 0, 0}) ); + return spu_convtf(color_u4, 32); +} + + +/** + * \param color_in - array of 32-bit packed ARGB colors + * \param color_out - returns float colors in RRRR, GGGG, BBBB, AAAA order + */ +static INLINE void +spu_unpack_A8R8G8B8_transpose4(const vector unsigned int color_in[4], + vector float color_out[4]) +{ + vector unsigned int c0; + + c0 = spu_shuffle(color_in[0], color_in[0], + ((vector unsigned char) { + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) ); + color_out[0] = spu_convtf(c0, 32); + + c0 = spu_shuffle(color_in[1], color_in[1], + ((vector unsigned char) { + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) ); + color_out[1] = spu_convtf(c0, 32); + + c0 = spu_shuffle(color_in[2], color_in[2], + ((vector unsigned char) { + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) ); + color_out[2] = spu_convtf(c0, 32); + + c0 = spu_shuffle(color_in[3], color_in[3], + ((vector unsigned char) { + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 0, 0, 0, 0}) ); + color_out[3] = spu_convtf(c0, 32); + + _transpose_matrix4x4(color_out, color_out); +} + + + +#endif /* SPU_COLORPACK_H */ diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c new file mode 100644 index 0000000000..8500d19754 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_command.c @@ -0,0 +1,815 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. 
+ * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * SPU command processing code + */ + + +#include <stdio.h> +#include <libmisc.h> + +#include "pipe/p_defines.h" + +#include "spu_command.h" +#include "spu_main.h" +#include "spu_render.h" +#include "spu_per_fragment_op.h" +#include "spu_texture.h" +#include "spu_tile.h" +#include "spu_vertex_shader.h" +#include "spu_dcache.h" +#include "cell/common.h" + + +struct spu_vs_context draw; + + +/** + * Buffers containing dynamically generated SPU code: + */ +static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS] + ALIGN16_ATTRIB; + + + +static INLINE int +align(int value, int alignment) +{ + return (value + alignment - 1) & ~(alignment - 1); +} + + + +/** + * Tell the PPU that this SPU has finished copying a buffer to + * local store and that it may be reused by the PPU. + * This is done by writting a 16-byte batch-buffer-status block back into + * main memory (in cell_context->buffer_status[]). + */ +static void +release_buffer(uint buffer) +{ + /* Evidently, using less than a 16-byte status doesn't work reliably */ + static const vector unsigned int status = {CELL_BUFFER_STATUS_FREE, + CELL_BUFFER_STATUS_FREE, + CELL_BUFFER_STATUS_FREE, + CELL_BUFFER_STATUS_FREE}; + const uint index = 4 * (spu.init.id * CELL_NUM_BUFFERS + buffer); + uint *dst = spu.init.buffer_status + index; + + ASSERT(buffer < CELL_NUM_BUFFERS); + + mfc_put((void *) &status, /* src in local memory */ + (unsigned int) dst, /* dst in main memory */ + sizeof(status), /* size */ + TAG_MISC, /* tag is unimportant */ + 0, /* tid */ + 0 /* rid */); +} + + +/** + * Write CELL_FENCE_SIGNALLED back to the fence status qword in main memory. + * There's a qword of status per SPU. 
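+ * Each SPU writes its own 16-byte slot (4 uints, at word offset
+ * 4 * spu.init.id from the fence address), so the PPU side can check
+ * every SPU's status independently.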
+ */ +static void +cmd_fence(struct cell_command_fence *fence_cmd) +{ + static const vector unsigned int status = {CELL_FENCE_SIGNALLED, + CELL_FENCE_SIGNALLED, + CELL_FENCE_SIGNALLED, + CELL_FENCE_SIGNALLED}; + uint *dst = (uint *) fence_cmd->fence; + dst += 4 * spu.init.id; /* main store/memory address, not local store */ + ASSERT_ALIGN16(dst); + mfc_put((void *) &status, /* src in local memory */ + (unsigned int) dst, /* dst in main memory */ + sizeof(status), /* size */ + TAG_FENCE, /* tag */ + 0, /* tid */ + 0 /* rid */); +} + + +static void +cmd_clear_surface(const struct cell_command_clear_surface *clear) +{ + D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF %u to 0x%08x\n", clear->surface, clear->value); + + if (clear->surface == 0) { + spu.fb.color_clear_value = clear->value; + if (spu.init.debug_flags & CELL_DEBUG_CHECKER) { + uint x = (spu.init.id << 4) | (spu.init.id << 12) | + (spu.init.id << 20) | (spu.init.id << 28); + spu.fb.color_clear_value ^= x; + } + } + else { + spu.fb.depth_clear_value = clear->value; + } + +#define CLEAR_OPT 1 +#if CLEAR_OPT + + /* Simply set all tiles' status to CLEAR. + * When we actually begin rendering into a tile, we'll initialize it to + * the clear value. If any tiles go untouched during the frame, + * really_clear_tiles() will set them to the clear value. + */ + if (clear->surface == 0) { + memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status)); + } + else { + memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status)); + } + +#else + + /* + * This path clears the whole framebuffer to the clear color right now. + */ + + /* + printf("SPU: %s num=%d w=%d h=%d\n", + __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles); + */ + + /* init a single tile to the clear value */ + if (clear->surface == 0) { + clear_c_tile(&spu.ctile); + } + else { + clear_z_tile(&spu.ztile); + } + + /* walk over my tiles, writing the 'clear' tile's data */ + { + const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; + uint i; + for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { + uint tx = i % spu.fb.width_tiles; + uint ty = i / spu.fb.width_tiles; + if (clear->surface == 0) + put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); + else + put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1); + } + } + + if (spu.init.debug_flags & CELL_DEBUG_SYNC) { + wait_on_mask(1 << TAG_SURFACE_CLEAR); + } + +#endif /* CLEAR_OPT */ + + D_PRINTF(CELL_DEBUG_CMD, "CLEAR SURF done\n"); +} + + +static void +cmd_release_verts(const struct cell_command_release_verts *release) +{ + D_PRINTF(CELL_DEBUG_CMD, "RELEASE VERTS %u\n", release->vertex_buf); + ASSERT(release->vertex_buf != ~0U); + release_buffer(release->vertex_buf); +} + + +/** + * Process a CELL_CMD_STATE_FRAGMENT_OPS command. + * This involves installing new fragment ops SPU code. + * If this function is never called, we'll use a regular C fallback function + * for fragment processing. + */ +static void +cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops) +{ + D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_OPS\n"); + + /* Copy state info (for fallback case only - this will eventually + * go away when the fallback case goes away) + */ + memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa)); + memcpy(&spu.blend, &fops->blend, sizeof(fops->blend)); + memcpy(&spu.blend_color, &fops->blend_color, sizeof(fops->blend_color)); + + /* Make sure the SPU knows which buffers it's expected to read when + * it's told to pull tiles. 
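+    * If neither the depth test nor the stencil test is enabled, the
+    * Z/stencil tile doesn't need to be DMA'd in at all.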
+ */ + spu.read_depth_stencil = (spu.depth_stencil_alpha.depth.enabled || spu.depth_stencil_alpha.stencil[0].enabled); + + /* If we're forcing the fallback code to be used (for debug purposes), + * install that. Otherwise install the incoming SPU code. + */ + if ((spu.init.debug_flags & CELL_DEBUG_FRAGMENT_OP_FALLBACK) != 0) { + static unsigned int warned = 0; + if (!warned) { + fprintf(stderr, "Cell Warning: using fallback per-fragment code\n"); + warned = 1; + } + /* The following two lines aren't really necessary if you + * know the debug flags won't change during a run, and if you + * know that the function pointers are initialized correctly. + * We set them here to allow a person to change the debug + * flags during a run (from inside a debugger). + */ + spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; + spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; + return; + } + + /* Make sure the SPU code buffer is large enough to hold the incoming code. + * Note that we *don't* use align_malloc() and align_free(), because + * those utility functions are *not* available in SPU code. + * */ + if (spu.fragment_ops_code_size < fops->total_code_size) { + if (spu.fragment_ops_code != NULL) { + free(spu.fragment_ops_code); + } + spu.fragment_ops_code_size = fops->total_code_size; + spu.fragment_ops_code = malloc(fops->total_code_size); + if (spu.fragment_ops_code == NULL) { + /* Whoops. */ + fprintf(stderr, "CELL Warning: failed to allocate fragment ops code (%d bytes) - using fallback\n", fops->total_code_size); + spu.fragment_ops_code = NULL; + spu.fragment_ops_code_size = 0; + spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; + spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; + return; + } + } + + /* Copy the SPU code from the command buffer to the spu buffer */ + memcpy(spu.fragment_ops_code, fops->code, fops->total_code_size); + + /* Set the pointers for the front-facing and back-facing fragments + * to the specified offsets within the code. Note that if the + * front-facing and back-facing code are the same, they'll have + * the same offset. 
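+    * The indices are offsets into the code block just copied, so the casts
+    * below turn them directly into callable entry points.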
+ */ + spu.fragment_ops[CELL_FACING_FRONT] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->front_code_index]; + spu.fragment_ops[CELL_FACING_BACK] = (spu_fragment_ops_func) &spu.fragment_ops_code[fops->back_code_index]; +} + +static void +cmd_state_fragment_program(const struct cell_command_fragment_program *fp) +{ + D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FRAGMENT_PROGRAM\n"); + /* Copy SPU code from batch buffer to spu buffer */ + memcpy(spu.fragment_program_code, fp->code, + SPU_MAX_FRAGMENT_PROGRAM_INSTS * 4); +#if 01 + /* Point function pointer at new code */ + spu.fragment_program = (spu_fragment_program_func)spu.fragment_program_code; +#endif +} + + +static uint +cmd_state_fs_constants(const uint64_t *buffer, uint pos) +{ + const uint num_const = buffer[pos + 1]; + const float *constants = (const float *) &buffer[pos + 2]; + uint i; + + D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const); + + /* Expand each float to float[4] for SOA execution */ + for (i = 0; i < num_const; i++) { + D_PRINTF(CELL_DEBUG_CMD, " const[%u] = %f\n", i, constants[i]); + spu.constants[i] = spu_splats(constants[i]); + } + + /* return new buffer pos (in 8-byte words) */ + return pos + 2 + num_const / 2; +} + + +static void +cmd_state_framebuffer(const struct cell_command_framebuffer *cmd) +{ + D_PRINTF(CELL_DEBUG_CMD, "FRAMEBUFFER: %d x %d at %p, cformat 0x%x zformat 0x%x\n", + cmd->width, + cmd->height, + cmd->color_start, + cmd->color_format, + cmd->depth_format); + + ASSERT_ALIGN16(cmd->color_start); + ASSERT_ALIGN16(cmd->depth_start); + + spu.fb.color_start = cmd->color_start; + spu.fb.depth_start = cmd->depth_start; + spu.fb.color_format = cmd->color_format; + spu.fb.depth_format = cmd->depth_format; + spu.fb.width = cmd->width; + spu.fb.height = cmd->height; + spu.fb.width_tiles = (spu.fb.width + TILE_SIZE - 1) / TILE_SIZE; + spu.fb.height_tiles = (spu.fb.height + TILE_SIZE - 1) / TILE_SIZE; + + switch (spu.fb.depth_format) { + case PIPE_FORMAT_Z32_UNORM: + spu.fb.zsize = 4; + spu.fb.zscale = (float) 0xffffffffu; + break; + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_S8Z24_UNORM: + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_X8Z24_UNORM: + spu.fb.zsize = 4; + spu.fb.zscale = (float) 0x00ffffffu; + break; + case PIPE_FORMAT_Z16_UNORM: + spu.fb.zsize = 2; + spu.fb.zscale = (float) 0xffffu; + break; + default: + spu.fb.zsize = 0; + break; + } +} + + +/** + * Tex texture mask_s/t and scale_s/t fields depend on the texture size and + * sampler wrap modes. 
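+ *
+ * Roughly, the sampling code is expected to compute something like
+ * t_texel = (int) (t * scale_t) & mask_t per coordinate: with REPEAT
+ * wrapping the mask is size-1, so wrapping is a single AND (which
+ * presumably relies on power-of-two level sizes), while scale_s/t undo
+ * normalized coordinates (and are simply 1.0 otherwise).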
+ */ +static void +update_tex_masks(struct spu_texture *texture, + const struct pipe_sampler_state *sampler) +{ + uint i; + + for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { + int width = texture->level[i].width; + int height = texture->level[i].height; + + if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT) + texture->level[i].mask_s = spu_splats(width - 1); + else + texture->level[i].mask_s = spu_splats(~0); + + if (sampler->wrap_t == PIPE_TEX_WRAP_REPEAT) + texture->level[i].mask_t = spu_splats(height - 1); + else + texture->level[i].mask_t = spu_splats(~0); + + if (sampler->normalized_coords) { + texture->level[i].scale_s = spu_splats((float) width); + texture->level[i].scale_t = spu_splats((float) height); + } + else { + texture->level[i].scale_s = spu_splats(1.0f); + texture->level[i].scale_t = spu_splats(1.0f); + } + } +} + + +static void +cmd_state_sampler(const struct cell_command_sampler *sampler) +{ + uint unit = sampler->unit; + + D_PRINTF(CELL_DEBUG_CMD, "SAMPLER [%u]\n", unit); + + spu.sampler[unit] = sampler->state; + + switch (spu.sampler[unit].min_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + spu.min_sample_texture_2d[unit] = sample_texture_2d_bilinear; + break; + case PIPE_TEX_FILTER_ANISO: + /* fall-through, for now */ + case PIPE_TEX_FILTER_NEAREST: + spu.min_sample_texture_2d[unit] = sample_texture_2d_nearest; + break; + default: + ASSERT(0); + } + + switch (spu.sampler[sampler->unit].mag_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + spu.mag_sample_texture_2d[unit] = sample_texture_2d_bilinear; + break; + case PIPE_TEX_FILTER_ANISO: + /* fall-through, for now */ + case PIPE_TEX_FILTER_NEAREST: + spu.mag_sample_texture_2d[unit] = sample_texture_2d_nearest; + break; + default: + ASSERT(0); + } + + switch (spu.sampler[sampler->unit].min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + case PIPE_TEX_MIPFILTER_LINEAR: + spu.sample_texture_2d[unit] = sample_texture_2d_lod; + break; + case PIPE_TEX_MIPFILTER_NONE: + spu.sample_texture_2d[unit] = spu.mag_sample_texture_2d[unit]; + break; + default: + ASSERT(0); + } + + update_tex_masks(&spu.texture[unit], &spu.sampler[unit]); +} + + +static void +cmd_state_texture(const struct cell_command_texture *texture) +{ + const uint unit = texture->unit; + uint i; + + D_PRINTF(CELL_DEBUG_CMD, "TEXTURE [%u]\n", texture->unit); + + spu.texture[unit].max_level = 0; + spu.texture[unit].target = texture->target; + + for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) { + uint width = texture->width[i]; + uint height = texture->height[i]; + uint depth = texture->depth[i]; + + D_PRINTF(CELL_DEBUG_CMD, " LEVEL %u: at %p size[0] %u x %u\n", i, + texture->start[i], texture->width[i], texture->height[i]); + + spu.texture[unit].level[i].start = texture->start[i]; + spu.texture[unit].level[i].width = width; + spu.texture[unit].level[i].height = height; + spu.texture[unit].level[i].depth = depth; + + spu.texture[unit].level[i].tiles_per_row = + (width + TILE_SIZE - 1) / TILE_SIZE; + + spu.texture[unit].level[i].bytes_per_image = + 4 * align(width, TILE_SIZE) * align(height, TILE_SIZE) * depth; + + spu.texture[unit].level[i].max_s = spu_splats((int) width - 1); + spu.texture[unit].level[i].max_t = spu_splats((int) height - 1); + + if (texture->start[i]) + spu.texture[unit].max_level = i; + } + + update_tex_masks(&spu.texture[unit], &spu.sampler[unit]); +} + + +static void +cmd_state_vertex_info(const struct vertex_info *vinfo) +{ + D_PRINTF(CELL_DEBUG_CMD, "VERTEX_INFO num_attribs=%u\n", vinfo->num_attribs); + ASSERT(vinfo->num_attribs >= 1); + 
ASSERT(vinfo->num_attribs <= 8); + memcpy(&spu.vertex_info, vinfo, sizeof(*vinfo)); +} + + +static void +cmd_state_vs_array_info(const struct cell_array_info *vs_info) +{ + const unsigned attr = vs_info->attr; + + ASSERT(attr < PIPE_MAX_ATTRIBS); + draw.vertex_fetch.src_ptr[attr] = vs_info->base; + draw.vertex_fetch.pitch[attr] = vs_info->pitch; + draw.vertex_fetch.size[attr] = vs_info->size; + draw.vertex_fetch.code_offset[attr] = vs_info->function_offset; + draw.vertex_fetch.dirty = 1; +} + + +static void +cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code) +{ + mfc_get(attribute_fetch_code_buffer, + (unsigned int) code->base, /* src */ + code->size, + TAG_BATCH_BUFFER, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << TAG_BATCH_BUFFER); + + draw.vertex_fetch.code = attribute_fetch_code_buffer; +} + + +static void +cmd_finish(void) +{ + D_PRINTF(CELL_DEBUG_CMD, "FINISH\n"); + really_clear_tiles(0); + /* wait for all outstanding DMAs to finish */ + mfc_write_tag_mask(~0); + mfc_read_tag_status_all(); + /* send mbox message to PPU */ + spu_write_out_mbox(CELL_CMD_FINISH); +} + + +/** + * Execute a batch of commands which was sent to us by the PPU. + * See the cell_emit_state.c code to see where the commands come from. + * + * The opcode param encodes the location of the buffer and its size. + */ +static void +cmd_batch(uint opcode) +{ + const uint buf = (opcode >> 8) & 0xff; + uint size = (opcode >> 16); + uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB; + const unsigned usize = size / sizeof(buffer[0]); + uint pos; + + D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n", + buf, size, spu.init.buffers[buf]); + + ASSERT((opcode & CELL_CMD_OPCODE_MASK) == CELL_CMD_BATCH); + + ASSERT_ALIGN16(spu.init.buffers[buf]); + + size = ROUNDUP16(size); + + ASSERT_ALIGN16(spu.init.buffers[buf]); + + mfc_get(buffer, /* dest */ + (unsigned int) spu.init.buffers[buf], /* src */ + size, + TAG_BATCH_BUFFER, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << TAG_BATCH_BUFFER); + + /* Tell PPU we're done copying the buffer to local store */ + D_PRINTF(CELL_DEBUG_CMD, "release batch buf %u\n", buf); + release_buffer(buf); + + /* + * Loop over commands in the batch buffer + */ + for (pos = 0; pos < usize; /* no incr */) { + switch (buffer[pos]) { + /* + * rendering commands + */ + case CELL_CMD_CLEAR_SURFACE: + { + struct cell_command_clear_surface *clr + = (struct cell_command_clear_surface *) &buffer[pos]; + cmd_clear_surface(clr); + pos += sizeof(*clr) / 8; + } + break; + case CELL_CMD_RENDER: + { + struct cell_command_render *render + = (struct cell_command_render *) &buffer[pos]; + uint pos_incr; + cmd_render(render, &pos_incr); + pos += pos_incr; + } + break; + /* + * state-update commands + */ + case CELL_CMD_STATE_FRAMEBUFFER: + { + struct cell_command_framebuffer *fb + = (struct cell_command_framebuffer *) &buffer[pos]; + cmd_state_framebuffer(fb); + pos += sizeof(*fb) / 8; + } + break; + case CELL_CMD_STATE_FRAGMENT_OPS: + { + struct cell_command_fragment_ops *fops + = (struct cell_command_fragment_ops *) &buffer[pos]; + cmd_state_fragment_ops(fops); + /* This is a variant-sized command */ + pos += (sizeof(*fops) + fops->total_code_size)/ 8; + } + break; + case CELL_CMD_STATE_FRAGMENT_PROGRAM: + { + struct cell_command_fragment_program *fp + = (struct cell_command_fragment_program *) &buffer[pos]; + cmd_state_fragment_program(fp); + pos += sizeof(*fp) / 8; + } + break; + case CELL_CMD_STATE_FS_CONSTANTS: + pos = cmd_state_fs_constants(buffer, pos); + break; + case 
CELL_CMD_STATE_RASTERIZER: + { + struct cell_command_rasterizer *rast = + (struct cell_command_rasterizer *) &buffer[pos]; + spu.rasterizer = rast->rasterizer; + pos += sizeof(*rast) / 8; + } + break; + case CELL_CMD_STATE_SAMPLER: + { + struct cell_command_sampler *sampler + = (struct cell_command_sampler *) &buffer[pos]; + cmd_state_sampler(sampler); + pos += sizeof(*sampler) / 8; + } + break; + case CELL_CMD_STATE_TEXTURE: + { + struct cell_command_texture *texture + = (struct cell_command_texture *) &buffer[pos]; + cmd_state_texture(texture); + pos += sizeof(*texture) / 8; + } + break; + case CELL_CMD_STATE_VERTEX_INFO: + cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]); + pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8); + break; + case CELL_CMD_STATE_VIEWPORT: + (void) memcpy(& draw.viewport, &buffer[pos+1], + sizeof(struct pipe_viewport_state)); + pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8); + break; + case CELL_CMD_STATE_UNIFORMS: + draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1]; + pos += 2; + break; + case CELL_CMD_STATE_VS_ARRAY_INFO: + cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]); + pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8); + break; + case CELL_CMD_STATE_BIND_VS: +#if 0 + spu_bind_vertex_shader(&draw, + (struct cell_shader_info *) &buffer[pos+1]); +#endif + pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8); + break; + case CELL_CMD_STATE_ATTRIB_FETCH: + cmd_state_attrib_fetch((struct cell_attribute_fetch_code *) + &buffer[pos+1]); + pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8); + break; + /* + * misc commands + */ + case CELL_CMD_FINISH: + cmd_finish(); + pos += 1; + break; + case CELL_CMD_FENCE: + { + struct cell_command_fence *fence_cmd = + (struct cell_command_fence *) &buffer[pos]; + cmd_fence(fence_cmd); + pos += sizeof(*fence_cmd) / 8; + } + break; + case CELL_CMD_RELEASE_VERTS: + { + struct cell_command_release_verts *release + = (struct cell_command_release_verts *) &buffer[pos]; + cmd_release_verts(release); + pos += sizeof(*release) / 8; + } + break; + case CELL_CMD_FLUSH_BUFFER_RANGE: { + struct cell_buffer_range *br = (struct cell_buffer_range *) + &buffer[pos+1]; + + spu_dcache_mark_dirty((unsigned) br->base, br->size); + pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8); + break; + } + default: + printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]); + ASSERT(0); + break; + } + } + + D_PRINTF(CELL_DEBUG_CMD, "BATCH complete\n"); +} + + +#define PERF 0 + + +/** + * Main loop for SPEs: Get a command, execute it, repeat. 
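 *
 * Editor's note (inferred from the decode in cmd_batch() above, not
 * spelled out in the original patch): a CELL_CMD_BATCH opcode word read
 * from the mailbox appears to be packed by the PPU roughly as
 *
 *    opcode = CELL_CMD_BATCH          // bits  0..7  : command
 *           | (buffer_index << 8)     // bits  8..15 : spu.init.buffers[] slot
 *           | (batch_size << 16);     // bits 16..31 : payload size in bytes
 *
 * which matches the (opcode >> 8) & 0xff and (opcode >> 16) extraction
 * done before the DMA transfer of the batch buffer.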
+ */ +void +command_loop(void) +{ + int exitFlag = 0; + uint t0, t1; + + D_PRINTF(CELL_DEBUG_CMD, "Enter command loop\n"); + + while (!exitFlag) { + unsigned opcode; + + D_PRINTF(CELL_DEBUG_CMD, "Wait for cmd...\n"); + + if (PERF) + spu_write_decrementer(~0); + + /* read/wait from mailbox */ + opcode = (unsigned int) spu_read_in_mbox(); + D_PRINTF(CELL_DEBUG_CMD, "got cmd 0x%x\n", opcode); + + if (PERF) + t0 = spu_read_decrementer(); + + switch (opcode & CELL_CMD_OPCODE_MASK) { + case CELL_CMD_EXIT: + D_PRINTF(CELL_DEBUG_CMD, "EXIT\n"); + exitFlag = 1; + break; + case CELL_CMD_VS_EXECUTE: +#if 0 + spu_execute_vertex_shader(&draw, &cmd.vs); +#endif + break; + case CELL_CMD_BATCH: + cmd_batch(opcode); + break; + default: + printf("Bad opcode 0x%x!\n", opcode & CELL_CMD_OPCODE_MASK); + } + + if (PERF) { + t1 = spu_read_decrementer(); + printf("wait mbox time: %gms batch time: %gms\n", + (~0u - t0) * spu.init.inv_timebase, + (t0 - t1) * spu.init.inv_timebase); + } + } + + D_PRINTF(CELL_DEBUG_CMD, "Exit command loop\n"); + + if (spu.init.debug_flags & CELL_DEBUG_CACHE) + spu_dcache_report(); +} + +/* Initialize this module; we manage the fragment ops buffer here. */ +void +spu_command_init(void) +{ + /* Install default/fallback fragment processing function. + * This will normally be overriden by a code-gen'd function + * unless CELL_FORCE_FRAGMENT_OPS_FALLBACK is set. + */ + spu.fragment_ops[CELL_FACING_FRONT] = spu_fallback_fragment_ops; + spu.fragment_ops[CELL_FACING_BACK] = spu_fallback_fragment_ops; + + /* Set up the basic empty buffer for code-gen'ed fragment ops */ + spu.fragment_ops_code = NULL; + spu.fragment_ops_code_size = 0; +} + +void +spu_command_close(void) +{ + /* Deallocate the code-gen buffer for fragment ops, and reset the + * fragment ops functions to their initial setting (just to leave + * things in a good state). + */ + if (spu.fragment_ops_code != NULL) { + free(spu.fragment_ops_code); + } + spu_command_init(); +} diff --git a/src/gallium/drivers/cell/spu/spu_command.h b/src/gallium/drivers/cell/spu/spu_command.h new file mode 100644 index 0000000000..83dcdade28 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_command.h @@ -0,0 +1,35 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +extern void +command_loop(void); + +extern void +spu_command_init(void); + +extern void +spu_command_close(void); diff --git a/src/gallium/drivers/cell/spu/spu_dcache.c b/src/gallium/drivers/cell/spu/spu_dcache.c new file mode 100644 index 0000000000..a6d67634fd --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_dcache.c @@ -0,0 +1,145 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "cell/common.h" +#include "spu_main.h" +#include "spu_dcache.h" + +#define CACHELINE_LOG2SIZE 7 +#define LINE_SIZE (1U << 7) +#define ALIGN_MASK (~(LINE_SIZE - 1)) + +#define CACHE_NAME data +#define CACHED_TYPE qword +#define CACHE_TYPE CACHE_TYPE_RO +#define CACHE_SET_TAGID(set) (((set) & 0x03) + TAG_DCACHE0) +#define CACHE_LOG2NNWAY 2 +#define CACHE_LOG2NSETS 6 +#ifdef DEBUG +#define CACHE_STATS 1 +#endif +#include <cache-api.h> + +/* Yes folks, this is ugly. + */ +#undef CACHE_NWAY +#undef CACHE_NSETS +#define CACHE_NAME data +#define CACHE_NWAY 4 +#define CACHE_NSETS (1U << 6) + + +/** + * Fetch between arbitrary number of bytes from an unaligned address + * + * \param dst Destination data buffer + * \param ea Main memory effective address of source data + * \param size Number of bytes to read + * + * \warning + * As is hinted by the type of the \c dst pointer, this function writes + * multiples of 16-bytes. + */ +void +spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size) +{ + const int shift = ea & 0x0f; + const unsigned read_size = ROUNDUP16(size + shift); + const unsigned last_read = ROUNDUP16(ea + size); + const qword *const last_write = dst + (ROUNDUP16(size) / 16); + unsigned i; + + + if (shift == 0) { + /* Data is already aligned. Fetch directly into the destination buffer. + */ + for (i = 0; i < size; i += 16) { + *(dst++) = cache_rd(data, ea + i); + } + } else { + qword hi; + + + /* Please exercise extreme caution when modifying this code. This code + * must not read past the end of the page containing the source data, + * and it must not write more than ((size + 15) / 16) qwords to the + * destination buffer. 
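       *
       * Editor's note, a worked example with illustrative numbers only:
       * for ea = 0x1006 and size = 20, shift = 6 and
       * read_size = ROUNDUP16(20 + 6) = 32, so two aligned qwords are
       * fetched.  Each output qword is conceptually
       *
       *    out = (hi << 8*shift) | (lo >> 8*(16 - shift))
       *
       * which is what the spu_slqwbyte()/spu_rlmaskqwbyte() pair below
       * computes byte-wise on the 128-bit registers.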
+ */ + ea &= ~0x0f; + hi = cache_rd(data, ea); + for (i = 16; i < read_size; i += 16) { + qword lo = cache_rd(data, ea + i); + + *(dst++) = si_or((qword) spu_slqwbyte(hi, shift), + (qword) spu_rlmaskqwbyte(lo, shift - 16)); + hi = lo; + } + + if (dst != last_write) { + *(dst++) = si_or((qword) spu_slqwbyte(hi, shift), si_il(0)); + } + } + + ASSERT((ea + i) == last_read); + ASSERT(dst == last_write); +} + + +/** + * Notify the cache that a range of main memory may have been modified + */ +void +spu_dcache_mark_dirty(unsigned ea, unsigned size) +{ + unsigned i; + const unsigned aligned_start = (ea & ALIGN_MASK); + const unsigned aligned_end = (ea + size + (LINE_SIZE - 1)) + & ALIGN_MASK; + + + for (i = 0; i < (CACHE_NWAY * CACHE_NSETS); i++) { + const unsigned entry = __cache_dir[i]; + const unsigned addr = entry & ~0x0f; + + __cache_dir[i] = ((addr >= aligned_start) && (addr < aligned_end)) + ? (entry & ~CACHELINE_VALID) : entry; + } +} + + +/** + * Print cache utilization report + */ +void +spu_dcache_report(void) +{ +#ifdef CACHE_STATS + if (spu.init.id == 0) { + printf("SPU 0: Texture cache report:\n"); + cache_pr_stats(data); + } +#endif +} + + diff --git a/src/gallium/drivers/cell/spu/spu_dcache.h b/src/gallium/drivers/cell/spu/spu_dcache.h new file mode 100644 index 0000000000..39a19eb31b --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_dcache.h @@ -0,0 +1,37 @@ +/* + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef SPU_DCACHE_H +#define SPU_DCACHE_H + +extern void +spu_dcache_fetch_unaligned(qword *dst, unsigned ea, unsigned size); + +extern void +spu_dcache_mark_dirty(unsigned ea, unsigned size); + +extern void +spu_dcache_report(void); + +#endif /* SPU_DCACHE_H */ diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c new file mode 100644 index 0000000000..e27df2dfb3 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_exec.c @@ -0,0 +1,1933 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * TGSI interpretor/executor. + * + * Flow control information: + * + * Since we operate on 'quads' (4 pixels or 4 vertices in parallel) + * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special + * care since a condition may be true for some quad components but false + * for other components. + * + * We basically execute all statements (even if they're in the part of + * an IF/ELSE clause that's "not taken") and use a special mask to + * control writing to destination registers. This is the ExecMask. + * See store_dest(). + * + * The ExecMask is computed from three other masks (CondMask, LoopMask and + * ContMask) which are controlled by the flow control instructions (namely: + * (IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT). 
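 *
 * Editor's note (sketch of what that means in practice): a destination
 * write gated by ExecMask is just a per-channel conditional copy, as in
 * the unsaturated path of store_dest() further down in this file:
 *
 *    if (mach->ExecMask & 0x1)  dst->i[0] = chan->i[0];
 *    if (mach->ExecMask & 0x2)  dst->i[1] = chan->i[1];
 *    if (mach->ExecMask & 0x4)  dst->i[2] = chan->i[2];
 *    if (mach->ExecMask & 0x8)  dst->i[3] = chan->i[3];
 *
 * so quad components whose IF/LOOP condition failed simply keep their
 * previous register values even though the instruction still executes.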
+ * + * + * Authors: + * Michal Krol + * Brian Paul + */ + +#include <transpose_matrix4x4.h> +#include <simdmath/ceilf4.h> +#include <simdmath/cosf4.h> +#include <simdmath/divf4.h> +#include <simdmath/floorf4.h> +#include <simdmath/log2f4.h> +#include <simdmath/powf4.h> +#include <simdmath/sinf4.h> +#include <simdmath/sqrtf4.h> +#include <simdmath/truncf4.h> + +#include "pipe/p_compiler.h" +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" +#include "spu_exec.h" +#include "spu_main.h" +#include "spu_vertex_shader.h" +#include "spu_dcache.h" +#include "cell/common.h" + +#define TILE_TOP_LEFT 0 +#define TILE_TOP_RIGHT 1 +#define TILE_BOTTOM_LEFT 2 +#define TILE_BOTTOM_RIGHT 3 + +/* + * Shorthand locations of various utility registers (_I = Index, _C = Channel) + */ +#define TEMP_0_I TGSI_EXEC_TEMP_00000000_I +#define TEMP_0_C TGSI_EXEC_TEMP_00000000_C +#define TEMP_7F_I TGSI_EXEC_TEMP_7FFFFFFF_I +#define TEMP_7F_C TGSI_EXEC_TEMP_7FFFFFFF_C +#define TEMP_80_I TGSI_EXEC_TEMP_80000000_I +#define TEMP_80_C TGSI_EXEC_TEMP_80000000_C +#define TEMP_FF_I TGSI_EXEC_TEMP_FFFFFFFF_I +#define TEMP_FF_C TGSI_EXEC_TEMP_FFFFFFFF_C +#define TEMP_1_I TGSI_EXEC_TEMP_ONE_I +#define TEMP_1_C TGSI_EXEC_TEMP_ONE_C +#define TEMP_2_I TGSI_EXEC_TEMP_TWO_I +#define TEMP_2_C TGSI_EXEC_TEMP_TWO_C +#define TEMP_128_I TGSI_EXEC_TEMP_128_I +#define TEMP_128_C TGSI_EXEC_TEMP_128_C +#define TEMP_M128_I TGSI_EXEC_TEMP_MINUS_128_I +#define TEMP_M128_C TGSI_EXEC_TEMP_MINUS_128_C +#define TEMP_KILMASK_I TGSI_EXEC_TEMP_KILMASK_I +#define TEMP_KILMASK_C TGSI_EXEC_TEMP_KILMASK_C +#define TEMP_OUTPUT_I TGSI_EXEC_TEMP_OUTPUT_I +#define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C +#define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I +#define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C +#define TEMP_R0 TGSI_EXEC_TEMP_R0 + +#define FOR_EACH_CHANNEL(CHAN)\ + for (CHAN = 0; CHAN < 4; CHAN++) + +#define IS_CHANNEL_ENABLED(INST, CHAN)\ + ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN))) + +#define IS_CHANNEL_ENABLED2(INST, CHAN)\ + ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN))) + +#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\ + FOR_EACH_CHANNEL( CHAN )\ + if (IS_CHANNEL_ENABLED( INST, CHAN )) + +#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\ + FOR_EACH_CHANNEL( CHAN )\ + if (IS_CHANNEL_ENABLED2( INST, CHAN )) + + +/** The execution mask depends on the conditional mask and the loop mask */ +#define UPDATE_EXEC_MASK(MACH) \ + MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask + + +#define CHAN_X 0 +#define CHAN_Y 1 +#define CHAN_Z 2 +#define CHAN_W 3 + + + +/** + * Initialize machine state by expanding tokens to full instructions, + * allocating temporary storage, setting up constants, etc. + * After this, we can call spu_exec_machine_run() many times. + */ +void +spu_exec_machine_init(struct spu_exec_machine *mach, + uint numSamplers, + struct spu_sampler *samplers, + unsigned processor) +{ + const qword zero = si_il(0); + const qword not_zero = si_il(~0); + + (void) numSamplers; + mach->Samplers = samplers; + mach->Processor = processor; + mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS]; + + /* Setup constants. 
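    * (Editor's note) Per the TGSI_EXEC_TEMP_* names these are the bit
    * patterns 0x00000000, 0xffffffff, 0x7fffffff and 0x80000000 plus
    * the floats 1.0, 2.0, 128.0 and -128.0, each splatted across all
    * four SOA slots so they can be used directly as qword operands.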
*/ + mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero; + mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero; + mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].q = si_shli(not_zero, -1); + mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].q = si_shli(not_zero, 31); + + mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q = (qword) spu_splats(1.0f); + mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q = (qword) spu_splats(2.0f); + mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q = (qword) spu_splats(128.0f); + mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q = (qword) spu_splats(-128.0f); +} + + +static INLINE qword +micro_abs(qword src) +{ + return si_rotmi(si_shli(src, 1), -1); +} + +static INLINE qword +micro_ceil(qword src) +{ + return (qword) _ceilf4((vec_float4) src); +} + +static INLINE qword +micro_cos(qword src) +{ + return (qword) _cosf4((vec_float4) src); +} + +static const qword br_shuf = { + TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, + TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, + TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, + TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, + TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, + TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, + TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1, + TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3, +}; + +static const qword bl_shuf = { + TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, + TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, + TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, + TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, + TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, + TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, + TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1, + TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3, +}; + +static const qword tl_shuf = { + TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, + TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, + TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, + TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, + TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, + TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, + TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1, + TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3, +}; + +static qword +micro_ddx(qword src) +{ + qword bottom_right = si_shufb(src, src, br_shuf); + qword bottom_left = si_shufb(src, src, bl_shuf); + + return si_fs(bottom_right, bottom_left); +} + +static qword +micro_ddy(qword src) +{ + qword top_left = si_shufb(src, src, tl_shuf); + qword bottom_left = si_shufb(src, src, bl_shuf); + + return si_fs(top_left, bottom_left); +} + +static INLINE qword +micro_div(qword src0, qword src1) +{ + return (qword) _divf4((vec_float4) src0, (vec_float4) src1); +} + +static qword +micro_flr(qword src) +{ + return (qword) _floorf4((vec_float4) src); +} + +static qword +micro_frc(qword src) +{ + return si_fs(src, (qword) _floorf4((vec_float4) src)); +} + +static INLINE qword +micro_ge(qword src0, qword src1) +{ + return si_or(si_fceq(src0, src1), si_fcgt(src0, src1)); +} + +static qword +micro_lg2(qword src) +{ + return (qword) _log2f4((vec_float4) src); +} + +static INLINE qword +micro_lt(qword src0, qword src1) +{ + const qword tmp = si_or(si_fceq(src0, src1), si_fcgt(src0, src1)); + + return si_xori(tmp, 0xff); +} + +static INLINE qword +micro_max(qword src0, qword src1) +{ + return si_selb(src1, src0, si_fcgt(src0, src1)); +} + +static INLINE qword +micro_min(qword src0, qword src1) +{ + return si_selb(src0, src1, si_fcgt(src0, src1)); +} + +static qword +micro_neg(qword src) +{ + return si_xor(src, (qword) spu_splats(0x80000000)); +} + +static qword +micro_set_sign(qword src) +{ + return si_or(src, (qword) spu_splats(0x80000000)); +} + +static qword +micro_pow(qword src0, qword src1) 
+{ + return (qword) _powf4((vec_float4) src0, (vec_float4) src1); +} + +static qword +micro_rnd(qword src) +{ + const qword half = (qword) spu_splats(0.5f); + + /* May be able to use _roundf4. There may be some difference, though. + */ + return (qword) _floorf4((vec_float4) si_fa(src, half)); +} + +static INLINE qword +micro_ishr(qword src0, qword src1) +{ + return si_rotma(src0, si_sfi(src1, 0)); +} + +static qword +micro_trunc(qword src) +{ + return (qword) _truncf4((vec_float4) src); +} + +static qword +micro_sin(qword src) +{ + return (qword) _sinf4((vec_float4) src); +} + +static INLINE qword +micro_sqrt(qword src) +{ + return (qword) _sqrtf4((vec_float4) src); +} + +static void +fetch_src_file_channel( + const struct spu_exec_machine *mach, + const uint file, + const uint swizzle, + const union spu_exec_channel *index, + union spu_exec_channel *chan ) +{ + switch( swizzle ) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + switch( file ) { + case TGSI_FILE_CONSTANT: { + unsigned i; + + for (i = 0; i < 4; i++) { + const float *ptr = mach->Consts[index->i[i]]; + float tmp[4]; + + spu_dcache_fetch_unaligned((qword *) tmp, + (uintptr_t)(ptr + swizzle), + sizeof(float)); + + chan->f[i] = tmp[0]; + } + break; + } + + case TGSI_FILE_INPUT: + chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_TEMPORARY: + chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_IMMEDIATE: + ASSERT( index->i[0] < (int) mach->ImmLimit ); + ASSERT( index->i[1] < (int) mach->ImmLimit ); + ASSERT( index->i[2] < (int) mach->ImmLimit ); + ASSERT( index->i[3] < (int) mach->ImmLimit ); + + chan->f[0] = mach->Imms[index->i[0]][swizzle]; + chan->f[1] = mach->Imms[index->i[1]][swizzle]; + chan->f[2] = mach->Imms[index->i[2]][swizzle]; + chan->f[3] = mach->Imms[index->i[3]][swizzle]; + break; + + case TGSI_FILE_ADDRESS: + chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3]; + break; + + case TGSI_FILE_OUTPUT: + /* vertex/fragment output vars can be read too */ + chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0]; + chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1]; + chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2]; + chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3]; + break; + + default: + ASSERT( 0 ); + } + break; + + case TGSI_EXTSWIZZLE_ZERO: + *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]; + break; + + case TGSI_EXTSWIZZLE_ONE: + *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]; + break; + + default: + ASSERT( 0 ); + } +} + +static void +fetch_source( + const struct spu_exec_machine *mach, + union spu_exec_channel *chan, + const struct tgsi_full_src_register *reg, + const uint chan_index ) +{ + union spu_exec_channel index; + uint swizzle; + + index.i[0] = + index.i[1] = + index.i[2] = + index.i[3] = reg->SrcRegister.Index; + + if (reg->SrcRegister.Indirect) { + union spu_exec_channel index2; + union spu_exec_channel 
indir_index; + + index2.i[0] = + index2.i[1] = + index2.i[2] = + index2.i[3] = reg->SrcRegisterInd.Index; + + swizzle = tgsi_util_get_src_register_swizzle(®->SrcRegisterInd, + CHAN_X); + fetch_src_file_channel( + mach, + reg->SrcRegisterInd.File, + swizzle, + &index2, + &indir_index ); + + index.q = si_a(index.q, indir_index.q); + } + + if( reg->SrcRegister.Dimension ) { + switch( reg->SrcRegister.File ) { + case TGSI_FILE_INPUT: + index.q = si_mpyi(index.q, 17); + break; + case TGSI_FILE_CONSTANT: + index.q = si_shli(index.q, 12); + break; + default: + ASSERT( 0 ); + } + + index.i[0] += reg->SrcRegisterDim.Index; + index.i[1] += reg->SrcRegisterDim.Index; + index.i[2] += reg->SrcRegisterDim.Index; + index.i[3] += reg->SrcRegisterDim.Index; + + if (reg->SrcRegisterDim.Indirect) { + union spu_exec_channel index2; + union spu_exec_channel indir_index; + + index2.i[0] = + index2.i[1] = + index2.i[2] = + index2.i[3] = reg->SrcRegisterDimInd.Index; + + swizzle = tgsi_util_get_src_register_swizzle( ®->SrcRegisterDimInd, CHAN_X ); + fetch_src_file_channel( + mach, + reg->SrcRegisterDimInd.File, + swizzle, + &index2, + &indir_index ); + + index.q = si_a(index.q, indir_index.q); + } + } + + swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index ); + fetch_src_file_channel( + mach, + reg->SrcRegister.File, + swizzle, + &index, + chan ); + + switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) { + case TGSI_UTIL_SIGN_CLEAR: + chan->q = micro_abs(chan->q); + break; + + case TGSI_UTIL_SIGN_SET: + chan->q = micro_set_sign(chan->q); + break; + + case TGSI_UTIL_SIGN_TOGGLE: + chan->q = micro_neg(chan->q); + break; + + case TGSI_UTIL_SIGN_KEEP: + break; + } + + if (reg->SrcRegisterExtMod.Complement) { + chan->q = si_fs(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, chan->q); + } +} + +static void +store_dest( + struct spu_exec_machine *mach, + const union spu_exec_channel *chan, + const struct tgsi_full_dst_register *reg, + const struct tgsi_full_instruction *inst, + uint chan_index ) +{ + union spu_exec_channel *dst; + + switch( reg->DstRegister.File ) { + case TGSI_FILE_NULL: + return; + + case TGSI_FILE_OUTPUT: + dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] + + reg->DstRegister.Index].xyzw[chan_index]; + break; + + case TGSI_FILE_TEMPORARY: + dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index]; + break; + + case TGSI_FILE_ADDRESS: + dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index]; + break; + + default: + ASSERT( 0 ); + return; + } + + switch (inst->Instruction.Saturate) + { + case TGSI_SAT_NONE: + if (mach->ExecMask & 0x1) + dst->i[0] = chan->i[0]; + if (mach->ExecMask & 0x2) + dst->i[1] = chan->i[1]; + if (mach->ExecMask & 0x4) + dst->i[2] = chan->i[2]; + if (mach->ExecMask & 0x8) + dst->i[3] = chan->i[3]; + break; + + case TGSI_SAT_ZERO_ONE: + /* XXX need to obey ExecMask here */ + dst->q = micro_max(chan->q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); + dst->q = micro_min(dst->q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q); + break; + + case TGSI_SAT_MINUS_PLUS_ONE: + ASSERT( 0 ); + break; + + default: + ASSERT( 0 ); + } +} + +#define FETCH(VAL,INDEX,CHAN)\ + fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN) + +#define STORE(VAL,INDEX,CHAN)\ + store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN ) + + +/** + * Execute ARB-style KIL which is predicated by a src register. + * Kill fragment if any of the four values is less than zero. 
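 *
 * Editor's note (illustrative; value[] stands for the fetched source
 * channel r[0].f[]): the result is accumulated as a 4-bit mask, one bit
 * per pixel of the quad:
 *
 *    for (i = 0; i < 4; i++)
 *       if (value[i] < 0.0f)
 *          kilmask |= 1 << i;     // bit i kills pixel i
 *
 * and spu_exec_machine_run() later returns ~kilmask as the set of
 * still-live quad components.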
+ */ +static void +exec_kil(struct spu_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + uint uniquemask; + uint chan_index; + uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */ + union spu_exec_channel r[1]; + + /* This mask stores component bits that were already tested. Note that + * we test if the value is less than zero, so 1.0 and 0.0 need not to be + * tested. */ + uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE); + + for (chan_index = 0; chan_index < 4; chan_index++) + { + uint swizzle; + uint i; + + /* unswizzle channel */ + swizzle = tgsi_util_get_full_src_register_extswizzle ( + &inst->FullSrcRegisters[0], + chan_index); + + /* check if the component has not been already tested */ + if (uniquemask & (1 << swizzle)) + continue; + uniquemask |= 1 << swizzle; + + FETCH(&r[0], 0, chan_index); + for (i = 0; i < 4; i++) + if (r[0].f[i] < 0.0f) + kilmask |= 1 << i; + } + + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask; +} + +/** + * Execute NVIDIA-style KIL which is predicated by a condition code. + * Kill fragment if the condition code is TRUE. + */ +static void +exec_kilp(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */ + + /* TODO: build kilmask from CC mask */ + + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask; +} + +/* + * Fetch a texel using STR texture coordinates. + */ +static void +fetch_texel( struct spu_sampler *sampler, + const union spu_exec_channel *s, + const union spu_exec_channel *t, + const union spu_exec_channel *p, + float lodbias, /* XXX should be float[4] */ + union spu_exec_channel *r, + union spu_exec_channel *g, + union spu_exec_channel *b, + union spu_exec_channel *a ) +{ + qword rgba[4]; + qword out[4]; + + sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, + (float (*)[4]) rgba); + + _transpose_matrix4x4((vec_float4 *) out, (vec_float4 *) rgba); + r->q = out[0]; + g->q = out[1]; + b->q = out[2]; + a->q = out[3]; +} + + +static void +exec_tex(struct spu_exec_machine *mach, + const struct tgsi_full_instruction *inst, + boolean biasLod, boolean projected) +{ + const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index; + union spu_exec_channel r[8]; + uint chan_index; + float lodBias; + + /* printf("Sampler %u unit %u\n", sampler, unit); */ + + switch (inst->InstructionExtTexture.Texture) { + case TGSI_TEXTURE_1D: + + FETCH(&r[0], 0, CHAN_X); + + if (projected) { + FETCH(&r[1], 0, CHAN_W); + r[0].q = micro_div(r[0].q, r[1].q); + } + + if (biasLod) { + FETCH(&r[1], 0, CHAN_W); + lodBias = r[2].f[0]; + } + else + lodBias = 0.0; + + fetch_texel(&mach->Samplers[unit], + &r[0], NULL, NULL, lodBias, /* S, T, P, BIAS */ + &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */ + break; + + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 0, CHAN_Z); + + if (projected) { + FETCH(&r[3], 0, CHAN_W); + r[0].q = micro_div(r[0].q, r[3].q); + r[1].q = micro_div(r[1].q, r[3].q); + r[2].q = micro_div(r[2].q, r[3].q); + } + + if (biasLod) { + FETCH(&r[3], 0, CHAN_W); + lodBias = r[3].f[0]; + } + else + lodBias = 0.0; + + fetch_texel(&mach->Samplers[unit], + &r[0], &r[1], &r[2], lodBias, /* inputs */ + &r[0], &r[1], &r[2], &r[3]); /* outputs */ + break; + + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 0, CHAN_Z); + + if (projected) { + FETCH(&r[3], 0, CHAN_W); 
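         /* (editor's note) projective case: the S, T and P coordinates
          * fetched above are divided by W before sampling
          */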
+ r[0].q = micro_div(r[0].q, r[3].q); + r[1].q = micro_div(r[1].q, r[3].q); + r[2].q = micro_div(r[2].q, r[3].q); + } + + if (biasLod) { + FETCH(&r[3], 0, CHAN_W); + lodBias = r[3].f[0]; + } + else + lodBias = 0.0; + + fetch_texel(&mach->Samplers[unit], + &r[0], &r[1], &r[2], lodBias, + &r[0], &r[1], &r[2], &r[3]); + break; + + default: + ASSERT (0); + } + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[chan_index], 0, chan_index ); + } +} + + + +static void +constant_interpolation( + struct spu_exec_machine *mach, + unsigned attrib, + unsigned chan ) +{ + unsigned i; + + for( i = 0; i < QUAD_SIZE; i++ ) { + mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan]; + } +} + +static void +linear_interpolation( + struct spu_exec_machine *mach, + unsigned attrib, + unsigned chan ) +{ + const float x = mach->QuadPos.xyzw[0].f[0]; + const float y = mach->QuadPos.xyzw[1].f[0]; + const float dadx = mach->InterpCoefs[attrib].dadx[chan]; + const float dady = mach->InterpCoefs[attrib].dady[chan]; + const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; + mach->Inputs[attrib].xyzw[chan].f[0] = a0; + mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx; + mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady; + mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady; +} + +static void +perspective_interpolation( + struct spu_exec_machine *mach, + unsigned attrib, + unsigned chan ) +{ + const float x = mach->QuadPos.xyzw[0].f[0]; + const float y = mach->QuadPos.xyzw[1].f[0]; + const float dadx = mach->InterpCoefs[attrib].dadx[chan]; + const float dady = mach->InterpCoefs[attrib].dady[chan]; + const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y; + const float *w = mach->QuadPos.xyzw[3].f; + /* divide by W here */ + mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0]; + mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1]; + mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2]; + mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3]; +} + + +typedef void (* interpolation_func)( + struct spu_exec_machine *mach, + unsigned attrib, + unsigned chan ); + +static void +exec_declaration(struct spu_exec_machine *mach, + const struct tgsi_full_declaration *decl) +{ + if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) { + if( decl->Declaration.File == TGSI_FILE_INPUT ) { + unsigned first, last, mask; + interpolation_func interp; + + first = decl->DeclarationRange.First; + last = decl->DeclarationRange.Last; + mask = decl->Declaration.UsageMask; + + switch( decl->Declaration.Interpolate ) { + case TGSI_INTERPOLATE_CONSTANT: + interp = constant_interpolation; + break; + + case TGSI_INTERPOLATE_LINEAR: + interp = linear_interpolation; + break; + + case TGSI_INTERPOLATE_PERSPECTIVE: + interp = perspective_interpolation; + break; + + default: + ASSERT( 0 ); + } + + if( mask == TGSI_WRITEMASK_XYZW ) { + unsigned i, j; + + for( i = first; i <= last; i++ ) { + for( j = 0; j < NUM_CHANNELS; j++ ) { + interp( mach, i, j ); + } + } + } + else { + unsigned i, j; + + for( j = 0; j < NUM_CHANNELS; j++ ) { + if( mask & (1 << j) ) { + for( i = first; i <= last; i++ ) { + interp( mach, i, j ); + } + } + } + } + } + } +} + +static void +exec_instruction( + struct spu_exec_machine *mach, + const struct tgsi_full_instruction *inst, + int *pc ) +{ + uint chan_index; + union spu_exec_channel r[8]; + + (*pc)++; + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ARL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + 
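         /* (editor's note) ARL: the float source is converted to a
          * signed integer address value (si_cflts with a scale of 0)
          */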
r[0].q = si_cflts(r[0].q, 0); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_SWZ: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_LIT: + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[0], 0, CHAN_X ); + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + r[0].q = micro_max(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); + STORE( &r[0], 0, CHAN_Y ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[1], 0, CHAN_Y ); + r[1].q = micro_max(r[1].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); + + FETCH( &r[2], 0, CHAN_W ); + r[2].q = micro_min(r[2].q, mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q); + r[2].q = micro_max(r[2].q, mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q); + r[1].q = micro_pow(r[1].q, r[2].q); + + /* r0 = (r0 > 0.0) ? r1 : 0.0 + */ + r[0].q = si_fcgt(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q); + r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, r[1].q, + r[0].q); + STORE( &r[0], 0, CHAN_Z ); + } + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_RCP: + /* TGSI_OPCODE_RECIP */ + FETCH( &r[0], 0, CHAN_X ); + r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_RSQ: + /* TGSI_OPCODE_RECIPSQRT */ + FETCH( &r[0], 0, CHAN_X ); + r[0].q = micro_sqrt(r[0].q); + r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q); + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXP: + ASSERT (0); + break; + + case TGSI_OPCODE_LOG: + ASSERT (0); + break; + + case TGSI_OPCODE_MUL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) + { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + r[0].q = si_fm(r[0].q, r[1].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_ADD: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = si_fa(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP3: + /* TGSI_OPCODE_DOT3 */ + FETCH( &r[0], 0, CHAN_X ); + FETCH( &r[1], 1, CHAN_X ); + r[0].q = si_fm(r[0].q, r[1].q); + + FETCH( &r[1], 0, CHAN_Y ); + FETCH( &r[2], 1, CHAN_Y ); + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + + FETCH( &r[1], 0, CHAN_Z ); + FETCH( &r[2], 1, CHAN_Z ); + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DP4: + /* TGSI_OPCODE_DOT4 */ + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + r[0].q = si_fm(r[0].q, r[1].q); + + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 1, CHAN_Y); + + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FETCH(&r[1], 0, CHAN_Z); + FETCH(&r[2], 1, CHAN_Z); + + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FETCH(&r[1], 0, CHAN_W); + FETCH(&r[2], 1, CHAN_W); + + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DST: + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X 
); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + FETCH( &r[0], 0, CHAN_Y ); + FETCH( &r[1], 1, CHAN_Y); + r[0].q = si_fm(r[0].q, r[1].q); + STORE( &r[0], 0, CHAN_Y ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + FETCH( &r[0], 0, CHAN_Z ); + STORE( &r[0], 0, CHAN_Z ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + FETCH( &r[0], 1, CHAN_W ); + STORE( &r[0], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MIN: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + r[0].q = micro_min(r[0].q, r[1].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_MAX: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + r[0].q = micro_max(r[0].q, r[1].q); + + STORE(&r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLT: + /* TGSI_OPCODE_SETLT */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + + r[0].q = micro_ge(r[0].q, r[1].q); + r[0].q = si_xori(r[0].q, 0xff); + + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SGE: + /* TGSI_OPCODE_SETGE */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = micro_ge(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MAD: + /* TGSI_OPCODE_MADD */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + FETCH( &r[2], 2, chan_index ); + r[0].q = si_fma(r[0].q, r[1].q, r[2].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SUB: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + + r[0].q = si_fs(r[0].q, r[1].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_LERP: + /* TGSI_OPCODE_LRP */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + FETCH(&r[2], 2, chan_index); + + r[1].q = si_fs(r[1].q, r[2].q); + r[0].q = si_fma(r[0].q, r[1].q, r[2].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_CND: + ASSERT (0); + break; + + case TGSI_OPCODE_CND0: + ASSERT (0); + break; + + case TGSI_OPCODE_DOT2ADD: + /* TGSI_OPCODE_DP2A */ + ASSERT (0); + break; + + case TGSI_OPCODE_INDEX: + ASSERT (0); + break; + + case TGSI_OPCODE_NEGATE: + ASSERT (0); + break; + + case TGSI_OPCODE_FRAC: + /* TGSI_OPCODE_FRC */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_frc(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_CLAMP: + ASSERT (0); + break; + + case TGSI_OPCODE_FLOOR: + /* TGSI_OPCODE_FLR */ + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_flr(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_ROUND: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_rnd(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_EXPBASE2: + /* TGSI_OPCODE_EX2 */ + FETCH(&r[0], 0, CHAN_X); + + r[0].q = micro_pow(mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q, r[0].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_LOGBASE2: + /* TGSI_OPCODE_LG2 */ + FETCH( &r[0], 0, CHAN_X ); + r[0].q = micro_lg2(r[0].q); + FOR_EACH_ENABLED_CHANNEL( 
*inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_POWER: + /* TGSI_OPCODE_POW */ + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + r[0].q = micro_pow(r[0].q, r[1].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_CROSSPRODUCT: + /* TGSI_OPCODE_XPD */ + FETCH(&r[0], 0, CHAN_Y); + FETCH(&r[1], 1, CHAN_Z); + FETCH(&r[3], 0, CHAN_Z); + FETCH(&r[4], 1, CHAN_Y); + + /* r2 = (r0 * r1) - (r3 * r5) + */ + r[2].q = si_fm(r[3].q, r[5].q); + r[2].q = si_fms(r[0].q, r[1].q, r[2].q); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) { + STORE( &r[2], 0, CHAN_X ); + } + + FETCH(&r[2], 1, CHAN_X); + FETCH(&r[5], 0, CHAN_X); + + /* r3 = (r3 * r2) - (r1 * r5) + */ + r[1].q = si_fm(r[1].q, r[5].q); + r[3].q = si_fms(r[3].q, r[2].q, r[1].q); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) { + STORE( &r[3], 0, CHAN_Y ); + } + + /* r5 = (r5 * r4) - (r0 * r2) + */ + r[0].q = si_fm(r[0].q, r[2].q); + r[5].q = si_fms(r[5].q, r[4].q, r[0].q); + + if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) { + STORE( &r[5], 0, CHAN_Z ); + } + + if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_MULTIPLYMATRIX: + ASSERT (0); + break; + + case TGSI_OPCODE_ABS: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + + r[0].q = micro_abs(r[0].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_RCC: + ASSERT (0); + break; + + case TGSI_OPCODE_DPH: + FETCH(&r[0], 0, CHAN_X); + FETCH(&r[1], 1, CHAN_X); + + r[0].q = si_fm(r[0].q, r[1].q); + + FETCH(&r[1], 0, CHAN_Y); + FETCH(&r[2], 1, CHAN_Y); + + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FETCH(&r[1], 0, CHAN_Z); + FETCH(&r[2], 1, CHAN_Z); + + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FETCH(&r[1], 1, CHAN_W); + + r[0].q = si_fa(r[0].q, r[1].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_COS: + FETCH(&r[0], 0, CHAN_X); + + r[0].q = micro_cos(r[0].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DDX: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_ddx(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_DDY: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_ddy(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_KILP: + exec_kilp (mach, inst); + break; + + case TGSI_OPCODE_KIL: + exec_kil (mach, inst); + break; + + case TGSI_OPCODE_PK2H: + ASSERT (0); + break; + + case TGSI_OPCODE_PK2US: + ASSERT (0); + break; + + case TGSI_OPCODE_PK4B: + ASSERT (0); + break; + + case TGSI_OPCODE_PK4UB: + ASSERT (0); + break; + + case TGSI_OPCODE_RFL: + ASSERT (0); + break; + + case TGSI_OPCODE_SEQ: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + + r[0].q = si_fceq(r[0].q, r[1].q); + + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SFL: + ASSERT (0); + break; + + case TGSI_OPCODE_SGT: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = si_fcgt(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SIN: + FETCH( &r[0], 0, CHAN_X ); + r[0].q = micro_sin(r[0].q); + 
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SLE: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + + r[0].q = si_fcgt(r[0].q, r[1].q); + r[0].q = si_xori(r[0].q, 0xff); + + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SNE: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + + r[0].q = si_fceq(r[0].q, r[1].q); + r[0].q = si_xori(r[0].q, 0xff); + + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_STR: + ASSERT (0); + break; + + case TGSI_OPCODE_TEX: + /* simple texture lookup */ + /* src[0] = texcoord */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, FALSE, FALSE); + break; + + case TGSI_OPCODE_TXB: + /* Texture lookup with lod bias */ + /* src[0] = texcoord (src[0].w = load bias) */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, TRUE, FALSE); + break; + + case TGSI_OPCODE_TXD: + /* Texture lookup with explict partial derivatives */ + /* src[0] = texcoord */ + /* src[1] = d[strq]/dx */ + /* src[2] = d[strq]/dy */ + /* src[3] = sampler unit */ + ASSERT (0); + break; + + case TGSI_OPCODE_TXL: + /* Texture lookup with explit LOD */ + /* src[0] = texcoord (src[0].w = load bias) */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, TRUE, FALSE); + break; + + case TGSI_OPCODE_TXP: + /* Texture lookup with projection */ + /* src[0] = texcoord (src[0].w = projection) */ + /* src[1] = sampler unit */ + exec_tex(mach, inst, TRUE, TRUE); + break; + + case TGSI_OPCODE_UP2H: + ASSERT (0); + break; + + case TGSI_OPCODE_UP2US: + ASSERT (0); + break; + + case TGSI_OPCODE_UP4B: + ASSERT (0); + break; + + case TGSI_OPCODE_UP4UB: + ASSERT (0); + break; + + case TGSI_OPCODE_X2D: + ASSERT (0); + break; + + case TGSI_OPCODE_ARA: + ASSERT (0); + break; + + case TGSI_OPCODE_ARR: + ASSERT (0); + break; + + case TGSI_OPCODE_BRA: + ASSERT (0); + break; + + case TGSI_OPCODE_CAL: + /* skip the call if no execution channels are enabled */ + if (mach->ExecMask) { + /* do the call */ + + /* push the Cond, Loop, Cont stacks */ + ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING); + mach->CondStack[mach->CondStackTop++] = mach->CondMask; + ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask; + ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->ContStack[mach->ContStackTop++] = mach->ContMask; + + ASSERT(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING); + mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask; + + /* note that PC was already incremented above */ + mach->CallStack[mach->CallStackTop++] = *pc; + *pc = inst->InstructionExtLabel.Label; + } + break; + + case TGSI_OPCODE_RET: + mach->FuncMask &= ~mach->ExecMask; + UPDATE_EXEC_MASK(mach); + + if (mach->ExecMask == 0x0) { + /* really return now (otherwise, keep executing */ + + if (mach->CallStackTop == 0) { + /* returning from main() */ + *pc = -1; + return; + } + *pc = mach->CallStack[--mach->CallStackTop]; + + /* pop the Cond, Loop, Cont stacks */ + ASSERT(mach->CondStackTop > 0); + mach->CondMask = mach->CondStack[--mach->CondStackTop]; + ASSERT(mach->LoopStackTop > 0); + mach->LoopMask = mach->LoopStack[--mach->LoopStackTop]; + ASSERT(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[--mach->ContStackTop]; + ASSERT(mach->FuncStackTop > 0); + mach->FuncMask = mach->FuncStack[--mach->FuncStackTop]; + + 
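         /* (editor's note) all four masks now hold the caller's values;
          * recompute ExecMask before resuming execution in the caller
          */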
UPDATE_EXEC_MASK(mach); + } + break; + + case TGSI_OPCODE_SSG: + ASSERT (0); + break; + + case TGSI_OPCODE_CMP: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH(&r[0], 0, chan_index); + FETCH(&r[1], 1, chan_index); + FETCH(&r[2], 2, chan_index); + + /* r0 = (r0 < 0.0) ? r1 : r2 + */ + r[3].q = si_xor(r[3].q, r[3].q); + r[0].q = micro_lt(r[0].q, r[3].q); + r[0].q = si_selb(r[1].q, r[2].q, r[0].q); + + STORE(&r[0], 0, chan_index); + } + break; + + case TGSI_OPCODE_SCS: + if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + FETCH( &r[0], 0, CHAN_X ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) { + r[1].q = micro_cos(r[0].q); + STORE( &r[1], 0, CHAN_X ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) { + r[1].q = micro_sin(r[0].q); + STORE( &r[1], 0, CHAN_Y ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) { + STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z ); + } + if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) { + STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W ); + } + break; + + case TGSI_OPCODE_NRM: + ASSERT (0); + break; + + case TGSI_OPCODE_DIV: + ASSERT( 0 ); + break; + + case TGSI_OPCODE_DP2: + FETCH( &r[0], 0, CHAN_X ); + FETCH( &r[1], 1, CHAN_X ); + r[0].q = si_fm(r[0].q, r[1].q); + + FETCH( &r[1], 0, CHAN_Y ); + FETCH( &r[2], 1, CHAN_Y ); + r[0].q = si_fma(r[1].q, r[2].q, r[0].q); + + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_IF: + /* push CondMask */ + ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING); + mach->CondStack[mach->CondStackTop++] = mach->CondMask; + FETCH( &r[0], 0, CHAN_X ); + /* update CondMask */ + if( ! r[0].u[0] ) { + mach->CondMask &= ~0x1; + } + if( ! r[0].u[1] ) { + mach->CondMask &= ~0x2; + } + if( ! r[0].u[2] ) { + mach->CondMask &= ~0x4; + } + if( ! 
r[0].u[3] ) { + mach->CondMask &= ~0x8; + } + UPDATE_EXEC_MASK(mach); + /* Todo: If CondMask==0, jump to ELSE */ + break; + + case TGSI_OPCODE_ELSE: + /* invert CondMask wrt previous mask */ + { + uint prevMask; + ASSERT(mach->CondStackTop > 0); + prevMask = mach->CondStack[mach->CondStackTop - 1]; + mach->CondMask = ~mach->CondMask & prevMask; + UPDATE_EXEC_MASK(mach); + /* Todo: If CondMask==0, jump to ENDIF */ + } + break; + + case TGSI_OPCODE_ENDIF: + /* pop CondMask */ + ASSERT(mach->CondStackTop > 0); + mach->CondMask = mach->CondStack[--mach->CondStackTop]; + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_END: + /* halt execution */ + *pc = -1; + break; + + case TGSI_OPCODE_REP: + ASSERT (0); + break; + + case TGSI_OPCODE_ENDREP: + ASSERT (0); + break; + + case TGSI_OPCODE_PUSHA: + ASSERT (0); + break; + + case TGSI_OPCODE_POPA: + ASSERT (0); + break; + + case TGSI_OPCODE_CEIL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_ceil(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_I2F: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = si_csflt(r[0].q, 0); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_NOT: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = si_xorbi(r[0].q, 0xff); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_TRUNC: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + r[0].q = micro_trunc(r[0].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SHL: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + + r[0].q = si_shl(r[0].q, r[1].q); + + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SHR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = micro_ishr(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_AND: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = si_and(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_OR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = si_or(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_MOD: + ASSERT (0); + break; + + case TGSI_OPCODE_XOR: + FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) { + FETCH( &r[0], 0, chan_index ); + FETCH( &r[1], 1, chan_index ); + r[0].q = si_xor(r[0].q, r[1].q); + STORE( &r[0], 0, chan_index ); + } + break; + + case TGSI_OPCODE_SAD: + ASSERT (0); + break; + + case TGSI_OPCODE_TXF: + ASSERT (0); + break; + + case TGSI_OPCODE_TXQ: + ASSERT (0); + break; + + case TGSI_OPCODE_EMIT: + mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16; + mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++; + break; + + case TGSI_OPCODE_ENDPRIM: + mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++; + mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0; + break; + + case TGSI_OPCODE_LOOP: + /* fall-through (for now) */ + case TGSI_OPCODE_BGNLOOP2: + /* push LoopMask and ContMasks */ + ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask; + 
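      /* (editor's note) ContMask is pushed alongside LoopMask so that a
       * CONT taken inside this loop can be undone when the loop exits
       */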
ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING); + mach->ContStack[mach->ContStackTop++] = mach->ContMask; + break; + + case TGSI_OPCODE_ENDLOOP: + /* fall-through (for now at least) */ + case TGSI_OPCODE_ENDLOOP2: + /* Restore ContMask, but don't pop */ + ASSERT(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[mach->ContStackTop - 1]; + if (mach->LoopMask) { + /* repeat loop: jump to instruction just past BGNLOOP */ + *pc = inst->InstructionExtLabel.Label + 1; + } + else { + /* exit loop: pop LoopMask */ + ASSERT(mach->LoopStackTop > 0); + mach->LoopMask = mach->LoopStack[--mach->LoopStackTop]; + /* pop ContMask */ + ASSERT(mach->ContStackTop > 0); + mach->ContMask = mach->ContStack[--mach->ContStackTop]; + } + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_BRK: + /* turn off loop channels for each enabled exec channel */ + mach->LoopMask &= ~mach->ExecMask; + /* Todo: if mach->LoopMask == 0, jump to end of loop */ + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_CONT: + /* turn off cont channels for each enabled exec channel */ + mach->ContMask &= ~mach->ExecMask; + /* Todo: if mach->LoopMask == 0, jump to end of loop */ + UPDATE_EXEC_MASK(mach); + break; + + case TGSI_OPCODE_BGNSUB: + /* no-op */ + break; + + case TGSI_OPCODE_ENDSUB: + /* no-op */ + break; + + case TGSI_OPCODE_NOISE1: + ASSERT( 0 ); + break; + + case TGSI_OPCODE_NOISE2: + ASSERT( 0 ); + break; + + case TGSI_OPCODE_NOISE3: + ASSERT( 0 ); + break; + + case TGSI_OPCODE_NOISE4: + ASSERT( 0 ); + break; + + case TGSI_OPCODE_NOP: + break; + + default: + ASSERT( 0 ); + } +} + + +/** + * Run TGSI interpreter. + * \return bitmask of "alive" quad components + */ +uint +spu_exec_machine_run( struct spu_exec_machine *mach ) +{ + uint i; + int pc = 0; + + mach->CondMask = 0xf; + mach->LoopMask = 0xf; + mach->ContMask = 0xf; + mach->FuncMask = 0xf; + mach->ExecMask = 0xf; + + mach->CondStackTop = 0; /* temporarily subvert this ASSERTion */ + ASSERT(mach->CondStackTop == 0); + ASSERT(mach->LoopStackTop == 0); + ASSERT(mach->ContStackTop == 0); + ASSERT(mach->CallStackTop == 0); + + mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0; + mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0; + + if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) { + mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0; + mach->Primitives[0] = 0; + } + + + /* execute declarations (interpolants) */ + if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) { + for (i = 0; i < mach->NumDeclarations; i++) { + union { + struct tgsi_full_declaration decl; + qword buffer[ROUNDUP16(sizeof(struct tgsi_full_declaration)) / 16]; + } d ALIGN16_ATTRIB; + unsigned ea = (unsigned) (mach->Declarations + pc); + + spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl)); + + exec_declaration( mach, &d.decl ); + } + } + + /* execute instructions, until pc is set to -1 */ + while (pc != -1) { + union { + struct tgsi_full_instruction inst; + qword buffer[ROUNDUP16(sizeof(struct tgsi_full_instruction)) / 16]; + } i ALIGN16_ATTRIB; + unsigned ea = (unsigned) (mach->Instructions + pc); + + spu_dcache_fetch_unaligned(i.buffer, ea, sizeof(i.inst)); + exec_instruction( mach, & i.inst, &pc ); + } + +#if 0 + /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */ + if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) { + /* + * Scale back depth component. 
+ */ + for (i = 0; i < 4; i++) + mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF; + } +#endif + + return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; +} + + diff --git a/src/gallium/drivers/cell/spu/spu_exec.h b/src/gallium/drivers/cell/spu/spu_exec.h new file mode 100644 index 0000000000..8605679940 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_exec.h @@ -0,0 +1,172 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#if !defined SPU_EXEC_H +#define SPU_EXEC_H + +#include "pipe/p_compiler.h" +#include "tgsi/tgsi_exec.h" + +#if defined __cplusplus +extern "C" { +#endif + +/** + * Registers may be treated as float, signed int or unsigned int. + */ +union spu_exec_channel +{ + float f[QUAD_SIZE]; + int i[QUAD_SIZE]; + unsigned u[QUAD_SIZE]; + qword q; +}; + +/** + * A vector[RGBA] of channels[4 pixels] + */ +struct spu_exec_vector +{ + union spu_exec_channel xyzw[NUM_CHANNELS]; +}; + +/** + * For fragment programs, information for computing fragment input + * values from plane equation of the triangle/line. + */ +struct spu_interp_coef +{ + float a0[NUM_CHANNELS]; /* in an xyzw layout */ + float dadx[NUM_CHANNELS]; + float dady[NUM_CHANNELS]; +}; + + +struct softpipe_tile_cache; /**< Opaque to TGSI */ + +/** + * Information for sampling textures, which must be implemented + * by code outside the TGSI executor. + */ +struct spu_sampler +{ + const struct pipe_sampler_state *state; + struct pipe_texture *texture; + /** Get samples for four fragments in a quad */ + void (*get_samples)(struct spu_sampler *sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]); + void *pipe; /*XXX temporary*/ + struct softpipe_tile_cache *cache; +}; + + +/** + * Run-time virtual machine state for executing TGSI shader. 
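 + * Closely parallels struct tgsi_exec_machine in tgsi_exec.h; each register + * channel can also be accessed as an SPU qword (see union spu_exec_channel).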
+ */ +struct spu_exec_machine +{ + /* + * 32 program temporaries + * 4 internal temporaries + * 1 address + */ + struct spu_exec_vector Temps[TGSI_EXEC_NUM_TEMPS + + TGSI_EXEC_NUM_TEMP_EXTRAS + 1] + ALIGN16_ATTRIB; + + struct spu_exec_vector *Addrs; + + struct spu_sampler *Samplers; + + float Imms[TGSI_EXEC_NUM_IMMEDIATES][4]; + unsigned ImmLimit; + float (*Consts)[4]; + struct spu_exec_vector *Inputs; + struct spu_exec_vector *Outputs; + unsigned Processor; + + /* GEOMETRY processor only. */ + unsigned *Primitives; + + /* FRAGMENT processor only. */ + const struct spu_interp_coef *InterpCoefs; + struct spu_exec_vector QuadPos; + + /* Conditional execution masks */ + uint CondMask; /**< For IF/ELSE/ENDIF */ + uint LoopMask; /**< For BGNLOOP/ENDLOOP */ + uint ContMask; /**< For loop CONT statements */ + uint FuncMask; /**< For function calls */ + uint ExecMask; /**< = CondMask & LoopMask */ + + /** Condition mask stack (for nested conditionals) */ + uint CondStack[TGSI_EXEC_MAX_COND_NESTING]; + int CondStackTop; + + /** Loop mask stack (for nested loops) */ + uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING]; + int LoopStackTop; + + /** Loop continue mask stack (see comments in tgsi_exec.c) */ + uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING]; + int ContStackTop; + + /** Function execution mask stack (for executing subroutine code) */ + uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING]; + int FuncStackTop; + + /** Function call stack for saving/restoring the program counter */ + uint CallStack[TGSI_EXEC_MAX_CALL_NESTING]; + int CallStackTop; + + struct tgsi_full_instruction *Instructions; + uint NumInstructions; + + struct tgsi_full_declaration *Declarations; + uint NumDeclarations; +}; + + +extern void +spu_exec_machine_init(struct spu_exec_machine *mach, + uint numSamplers, + struct spu_sampler *samplers, + unsigned processor); + +extern uint +spu_exec_machine_run( struct spu_exec_machine *mach ); + + +#if defined __cplusplus +} /* extern "C" */ +#endif + +#endif /* SPU_EXEC_H */ diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c new file mode 100644 index 0000000000..ff3d609d25 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_funcs.c @@ -0,0 +1,173 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +/** + * SPU functions accessed by shaders. + * + * Authors: Brian Paul + */ + + +#include <string.h> +#include <libmisc.h> +#include <math.h> +#include <cos14_v.h> +#include <sin14_v.h> +#include <simdmath/exp2f4.h> +#include <simdmath/log2f4.h> +#include <simdmath/powf4.h> + +#include "cell/common.h" +#include "spu_main.h" +#include "spu_funcs.h" +#include "spu_texture.h" + + +/** For "return"-ing four vectors */ +struct vec_4x4 +{ + vector float v[4]; +}; + + +static vector float +spu_cos(vector float x) +{ + return _cos14_v(x); +} + +static vector float +spu_sin(vector float x) +{ + return _sin14_v(x); +} + +static vector float +spu_pow(vector float x, vector float y) +{ + return _powf4(x, y); +} + +static vector float +spu_exp2(vector float x) +{ + return _exp2f4(x); +} + +static vector float +spu_log2(vector float x) +{ + return _log2f4(x); +} + + +static struct vec_4x4 +spu_tex_2d(vector float s, vector float t, vector float r, vector float q, + unsigned unit) +{ + struct vec_4x4 colors; + (void) r; + (void) q; + spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v); + return colors; +} + +static struct vec_4x4 +spu_tex_3d(vector float s, vector float t, vector float r, vector float q, + unsigned unit) +{ + struct vec_4x4 colors; + (void) r; + (void) q; + spu.sample_texture_2d[unit](s, t, unit, 0, 0, colors.v); + return colors; +} + +static struct vec_4x4 +spu_tex_cube(vector float s, vector float t, vector float r, vector float q, + unsigned unit) +{ + struct vec_4x4 colors; + (void) q; + sample_texture_cube(s, t, r, unit, colors.v); + return colors; +} + + +/** + * Add named function to list of "exported" functions that will be + * made available to the PPU-hosted code generator. + */ +static void +export_func(struct cell_spu_function_info *spu_functions, + const char *name, void *addr) +{ + uint n = spu_functions->num; + ASSERT(strlen(name) < 16); + strcpy(spu_functions->names[n], name); + spu_functions->addrs[n] = (uint) addr; + spu_functions->num++; + ASSERT(spu_functions->num <= 16); +} + + +/** + * Return info about the SPU's function to the PPU / main memory. + * The PPU needs to know the address of some SPU-side functions so + * that we can generate shader code with function calls. 
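 + * The exported addresses are SPU local-store addresses; the table is DMA'd + * back to main memory with mfc_put() below.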
+ */ +void +return_function_info(void) +{ + struct cell_spu_function_info funcs ALIGN16_ATTRIB; + int tag = TAG_MISC; + + ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */ + + funcs.num = 0; + export_func(&funcs, "spu_cos", &spu_cos); + export_func(&funcs, "spu_sin", &spu_sin); + export_func(&funcs, "spu_pow", &spu_pow); + export_func(&funcs, "spu_exp2", &spu_exp2); + export_func(&funcs, "spu_log2", &spu_log2); + export_func(&funcs, "spu_tex_2d", &spu_tex_2d); + export_func(&funcs, "spu_tex_3d", &spu_tex_3d); + export_func(&funcs, "spu_tex_cube", &spu_tex_cube); + + /* Send the function info back to the PPU / main memory */ + mfc_put((void *) &funcs, /* src in local store */ + (unsigned int) spu.init.spu_functions, /* dst in main memory */ + sizeof(funcs), /* bytes */ + tag, + 0, /* tid */ + 0 /* rid */); + wait_on_mask(1 << tag); +} + + + diff --git a/src/gallium/drivers/cell/spu/spu_funcs.h b/src/gallium/drivers/cell/spu/spu_funcs.h new file mode 100644 index 0000000000..3adb6ae99f --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_funcs.h @@ -0,0 +1,35 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef SPU_FUNCS_H +#define SPU_FUNCS_H + +extern void +return_function_info(void); + +#endif + diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c new file mode 100644 index 0000000000..97c86d194d --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_main.c @@ -0,0 +1,117 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/* main() for Cell SPU code */ + + +#include <stdio.h> +#include <libmisc.h> + +#include "pipe/p_defines.h" + +#include "spu_funcs.h" +#include "spu_command.h" +#include "spu_main.h" +#include "spu_per_fragment_op.h" +#include "spu_texture.h" +//#include "spu_test.h" +#include "cell/common.h" + + +/* +helpful headers: +/usr/lib/gcc/spu/4.1.1/include/spu_mfcio.h +/opt/cell/sdk/usr/include/libmisc.h +*/ + +struct spu_global spu; + + +static void +one_time_init(void) +{ + memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status)); + memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status)); + invalidate_tex_cache(); +} + +/* In some versions of the SDK the SPE main takes 'unsigned long' as a + * parameter. In others it takes 'unsigned long long'. Use a define to + * select between the two. + */ +#ifdef SPU_MAIN_PARAM_LONG_LONG +typedef unsigned long long main_param_t; +#else +typedef unsigned long main_param_t; +#endif + +/** + * SPE entrypoint. + */ +int +main(main_param_t speid, main_param_t argp) +{ + int tag = 0; + + (void) speid; + + ASSERT(sizeof(tile_t) == TILE_SIZE * TILE_SIZE * 4); + ASSERT(sizeof(struct cell_command_render) % 8 == 0); + ASSERT(sizeof(struct cell_command_fragment_ops) % 8 == 0); + ASSERT(((unsigned long) &spu.fragment_program_code) % 8 == 0); + + one_time_init(); + spu_command_init(); + + D_PRINTF(CELL_DEBUG_CMD, "main() speid=%lu\n", (unsigned long) speid); + D_PRINTF(CELL_DEBUG_FRAGMENT_OP_FALLBACK, "using fragment op fallback\n"); + + /* get initialization data */ + mfc_get(&spu.init, /* dest */ + (unsigned int) argp, /* src */ + sizeof(struct cell_init_info), /* bytes */ + tag, + 0, /* tid */ + 0 /* rid */); + wait_on_mask( 1 << tag ); + + if (spu.init.id == 0) { + return_function_info(); + } + +#if 0 + if (spu.init.id==0) + spu_test_misc(spu.init.id); +#endif + + command_loop(); + + spu_command_close(); + + return 0; +} diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h new file mode 100644 index 0000000000..33767e7c51 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_main.h @@ -0,0 +1,254 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef SPU_MAIN_H +#define SPU_MAIN_H + + +#include <spu_mfcio.h> + +#include "cell/common.h" +#include "draw/draw_vertex.h" +#include "pipe/p_state.h" + + +#if DEBUG +/* These debug macros use the unusual construction ", ##__VA_ARGS__" + * which expands to the expected comma + args if variadic arguments + * are supplied, but swallows the comma if there are no variadic + * arguments (which avoids syntax errors that would otherwise occur). + */ +#define D_PRINTF(flag, format,...) \ + if (spu.init.debug_flags & (flag)) \ + printf("SPU %u: " format, spu.init.id, ##__VA_ARGS__) +#else +#define D_PRINTF(...) +#endif + + +/** + * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels. + * The data may be addressed through several different types. 
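 + * The vector views pack several pixels per SPU register (eight ushorts or + * four uints) to allow SIMD access.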
+ */ +typedef union { + ushort us[TILE_SIZE][TILE_SIZE]; + uint ui[TILE_SIZE][TILE_SIZE]; + vector unsigned short us8[TILE_SIZE/2][TILE_SIZE/4]; + vector unsigned int ui4[TILE_SIZE/2][TILE_SIZE/2]; +} tile_t; + + +#define TILE_STATUS_CLEAR 1 +#define TILE_STATUS_DEFINED 2 /**< defined in FB, but not in local store */ +#define TILE_STATUS_CLEAN 3 /**< in local store, but not changed */ +#define TILE_STATUS_DIRTY 4 /**< modified locally, but not put back yet */ +#define TILE_STATUS_GETTING 5 /**< mfc_get() called but not yet arrived */ + + +/** Function for sampling textures */ +typedef void (*spu_sample_texture_2d_func)(vector float s, + vector float t, + uint unit, uint level, uint face, + vector float colors[4]); + + +/** Function for performing per-fragment ops */ +typedef void (*spu_fragment_ops_func)(uint x, uint y, + tile_t *colorTile, + tile_t *depthStencilTile, + vector float fragZ, + vector float fragRed, + vector float fragGreen, + vector float fragBlue, + vector float fragAlpha, + vector unsigned int mask); + +/** Function for running fragment program */ +typedef vector unsigned int (*spu_fragment_program_func)(vector float *inputs, + vector float *outputs, + vector float *constants); + + +struct spu_framebuffer +{ + void *color_start; /**< addr of color surface in main memory */ + void *depth_start; /**< addr of depth surface in main memory */ + enum pipe_format color_format; + enum pipe_format depth_format; + uint width, height; /**< size in pixels */ + uint width_tiles, height_tiles; /**< width and height in tiles */ + + uint color_clear_value; + uint depth_clear_value; + + uint zsize; /**< 0, 2 or 4 bytes per Z */ + float zscale; /**< 65535.0, 2^24-1 or 2^32-1 */ +} ALIGN16_ATTRIB; + + +/** per-texture level info */ +struct spu_texture_level +{ + void *start; + ushort width, height, depth; + ushort tiles_per_row; + uint bytes_per_image; + /** texcoord scale factors */ + vector float scale_s, scale_t, scale_r; + /** texcoord masks (if REPEAT then size-1, else ~0) */ + vector signed int mask_s, mask_t, mask_r; + /** texcoord clamp limits */ + vector signed int max_s, max_t, max_r; +} ALIGN16_ATTRIB; + + +struct spu_texture +{ + struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS]; + uint max_level; + uint target; /**< PIPE_TEXTURE_x */ +} ALIGN16_ATTRIB; + + +/** + * All SPU global/context state will be in a singleton object of this type: + */ +struct spu_global +{ + /** One-time init/constant info */ + struct cell_init_info init; + + /* + * Current state + */ + struct spu_framebuffer fb; + struct pipe_depth_stencil_alpha_state depth_stencil_alpha; + struct pipe_blend_state blend; + struct pipe_blend_color blend_color; + struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS]; + struct pipe_rasterizer_state rasterizer; + struct spu_texture texture[PIPE_MAX_SAMPLERS]; + struct vertex_info vertex_info; + + /** Current color and Z tiles */ + tile_t ctile ALIGN16_ATTRIB; + tile_t ztile ALIGN16_ATTRIB; + + /** Read depth/stencil tiles? 
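When set, the Z/stencil tile is DMA'd in and out alongside the color tile (see get_cz_tiles / put_cz_tiles in spu_render.c). 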
*/ + boolean read_depth_stencil; + + /** Current tiles' status */ + ubyte cur_ctile_status, cur_ztile_status; + + /** Status of all tiles in framebuffer */ + ubyte ctile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; + ubyte ztile_status[CELL_MAX_HEIGHT/TILE_SIZE][CELL_MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB; + + /** Current fragment ops machine code, at 8-byte boundary */ + uint *fragment_ops_code; + uint fragment_ops_code_size; + /** Current fragment ops functions, 0 = frontfacing, 1 = backfacing */ + spu_fragment_ops_func fragment_ops[2]; + + /** Current fragment program machine code, at 8-byte boundary */ + uint fragment_program_code[SPU_MAX_FRAGMENT_PROGRAM_INSTS] ALIGN8_ATTRIB; + /** Current fragment ops function */ + spu_fragment_program_func fragment_program; + + /** Current texture sampler function */ + spu_sample_texture_2d_func sample_texture_2d[CELL_MAX_SAMPLERS]; + spu_sample_texture_2d_func min_sample_texture_2d[CELL_MAX_SAMPLERS]; + spu_sample_texture_2d_func mag_sample_texture_2d[CELL_MAX_SAMPLERS]; + + /** Fragment program constants */ + vector float constants[4 * CELL_MAX_CONSTANTS]; + +} ALIGN16_ATTRIB; + + +extern struct spu_global spu; + + + +/* DMA TAGS */ + +#define TAG_SURFACE_CLEAR 10 +#define TAG_VERTEX_BUFFER 11 +#define TAG_READ_TILE_COLOR 12 +#define TAG_READ_TILE_Z 13 +#define TAG_WRITE_TILE_COLOR 14 +#define TAG_WRITE_TILE_Z 15 +#define TAG_INDEX_BUFFER 16 +#define TAG_BATCH_BUFFER 17 +#define TAG_MISC 18 +#define TAG_DCACHE0 20 +#define TAG_DCACHE1 21 +#define TAG_DCACHE2 22 +#define TAG_DCACHE3 23 +#define TAG_FENCE 24 + + +static INLINE void +wait_on_mask(unsigned tagMask) +{ + mfc_write_tag_mask( tagMask ); + /* wait for completion of _any_ DMAs specified by tagMask */ + mfc_read_tag_status_any(); +} + + +static INLINE void +wait_on_mask_all(unsigned tagMask) +{ + mfc_write_tag_mask( tagMask ); + /* wait for completion of _any_ DMAs specified by tagMask */ + mfc_read_tag_status_all(); +} + + + + + +static INLINE void +memset16(ushort *d, ushort value, uint count) +{ + uint i; + for (i = 0; i < count; i++) + d[i] = value; +} + + +static INLINE void +memset32(uint *d, uint value, uint count) +{ + uint i; + for (i = 0; i < count; i++) + d[i] = value; +} + + +#endif /* SPU_MAIN_H */ diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c new file mode 100644 index 0000000000..683664e8a4 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c @@ -0,0 +1,631 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \author Brian Paul + */ + + +#include <transpose_matrix4x4.h> +#include "pipe/p_format.h" +#include "spu_main.h" +#include "spu_colorpack.h" +#include "spu_per_fragment_op.h" + + +#define LINEAR_QUAD_LAYOUT 1 + + +static INLINE vector float +spu_min(vector float a, vector float b) +{ + vector unsigned int m; + m = spu_cmpgt(a, b); /* m = a > b ? ~0 : 0 */ + return spu_sel(a, b, m); +} + + +static INLINE vector float +spu_max(vector float a, vector float b) +{ + vector unsigned int m; + m = spu_cmpgt(a, b); /* m = a > b ? ~0 : 0 */ + return spu_sel(b, a, m); +} + + +/** + * Called by rasterizer for each quad after the shader has run. Do + * all the per-fragment operations including alpha test, z test, + * stencil test, blend, colormask and logicops. This is a + * fallback/debug function. In reality we'll use a generated function + * produced by the PPU. But this function is useful for + * debug/validation. + */ +void +spu_fallback_fragment_ops(uint x, uint y, + tile_t *colorTile, + tile_t *depthStencilTile, + vector float fragZ, + vector float fragR, + vector float fragG, + vector float fragB, + vector float fragA, + vector unsigned int mask) +{ + vector float frag_aos[4]; + unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */ + unsigned int fragc0, fragc1, fragc2, fragc3; /* fragment colors */ + + /* + * Do alpha test + */ + if (spu.depth_stencil_alpha.alpha.enabled) { + vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref); + vector unsigned int amask; + + switch (spu.depth_stencil_alpha.alpha.func) { + case PIPE_FUNC_LESS: + amask = spu_cmpgt(ref, fragA); /* mask = (fragA < ref) */ + break; + case PIPE_FUNC_GREATER: + amask = spu_cmpgt(fragA, ref); /* mask = (fragA > ref) */ + break; + case PIPE_FUNC_GEQUAL: + amask = spu_cmpgt(ref, fragA); + amask = spu_nor(amask, amask); + break; + case PIPE_FUNC_LEQUAL: + amask = spu_cmpgt(fragA, ref); + amask = spu_nor(amask, amask); + break; + case PIPE_FUNC_EQUAL: + amask = spu_cmpeq(ref, fragA); + break; + case PIPE_FUNC_NOTEQUAL: + amask = spu_cmpeq(ref, fragA); + amask = spu_nor(amask, amask); + break; + case PIPE_FUNC_ALWAYS: + amask = spu_splats(0xffffffffU); + break; + case PIPE_FUNC_NEVER: + amask = spu_splats( 0x0U); + break; + default: + ; + } + + mask = spu_and(mask, amask); + } + + + /* + * Z and/or stencil testing... 
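 + * Only the depth test is actually performed below; the stencil branch + * currently just asserts the expected S8Z24 surface format.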
+ */ + if (spu.depth_stencil_alpha.depth.enabled || + spu.depth_stencil_alpha.stencil[0].enabled) { + + /* get four Z/Stencil values from tile */ + vector unsigned int mask24 = spu_splats((unsigned int)0x00ffffffU); + vector unsigned int ifbZS = depthStencilTile->ui4[y/2][x/2]; + vector unsigned int ifbZ = spu_and(ifbZS, mask24); + vector unsigned int ifbS = spu_andc(ifbZS, mask24); + + if (spu.depth_stencil_alpha.stencil[0].enabled) { + /* do stencil test */ + ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM); + + } + else if (spu.depth_stencil_alpha.depth.enabled) { + /* do depth test */ + + ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM || + spu.fb.depth_format == PIPE_FORMAT_X8Z24_UNORM); + + vector unsigned int ifragZ; + vector unsigned int zmask; + + /* convert four fragZ from float to uint */ + fragZ = spu_mul(fragZ, spu_splats((float) 0xffffff)); + ifragZ = spu_convtu(fragZ, 0); + + /* do depth comparison, setting zmask with results */ + switch (spu.depth_stencil_alpha.depth.func) { + case PIPE_FUNC_LESS: + zmask = spu_cmpgt(ifbZ, ifragZ); /* mask = (ifragZ < ifbZ) */ + break; + case PIPE_FUNC_GREATER: + zmask = spu_cmpgt(ifragZ, ifbZ); /* mask = (ifbZ > ifragZ) */ + break; + case PIPE_FUNC_GEQUAL: + zmask = spu_cmpgt(ifbZ, ifragZ); + zmask = spu_nor(zmask, zmask); + break; + case PIPE_FUNC_LEQUAL: + zmask = spu_cmpgt(ifragZ, ifbZ); + zmask = spu_nor(zmask, zmask); + break; + case PIPE_FUNC_EQUAL: + zmask = spu_cmpeq(ifbZ, ifragZ); + break; + case PIPE_FUNC_NOTEQUAL: + zmask = spu_cmpeq(ifbZ, ifragZ); + zmask = spu_nor(zmask, zmask); + break; + case PIPE_FUNC_ALWAYS: + zmask = spu_splats(0xffffffffU); + break; + case PIPE_FUNC_NEVER: + zmask = spu_splats( 0x0U); + break; + default: + ; + } + + mask = spu_and(mask, zmask); + + /* merge framebuffer Z and fragment Z according to the mask */ + ifbZ = spu_or(spu_and(ifragZ, mask), + spu_andc(ifbZ, mask)); + } + + if (spu_extract(spu_orx(mask), 0)) { + /* put new fragment Z/Stencil values back into Z/Stencil tile */ + depthStencilTile->ui4[y/2][x/2] = spu_or(ifbZ, ifbS); + + spu.cur_ztile_status = TILE_STATUS_DIRTY; + } + } + + + /* + * If we'll need the current framebuffer/tile colors for blending + * or logicop or colormask, fetch them now. 
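 + * With LINEAR_QUAD_LAYOUT the quad's four pixels lie side by side in one + * tile row, so they are read from four consecutive words.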
+ */ + if (spu.blend.blend_enable || + spu.blend.logicop_enable || + spu.blend.colormask != 0xf) { + +#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */ + fbc0 = colorTile->ui[y][x*2+0]; + fbc1 = colorTile->ui[y][x*2+1]; + fbc2 = colorTile->ui[y][x*2+2]; + fbc3 = colorTile->ui[y][x*2+3]; +#else + fbc0 = colorTile->ui[y+0][x+0]; + fbc1 = colorTile->ui[y+0][x+1]; + fbc2 = colorTile->ui[y+1][x+0]; + fbc3 = colorTile->ui[y+1][x+1]; +#endif + } + + + /* + * Do blending + */ + if (spu.blend.blend_enable) { + /* blending terms, misc regs */ + vector float term1r, term1g, term1b, term1a; + vector float term2r, term2g, term2b, term2a; + vector float one, tmp; + + vector float fbRGBA[4]; /* current framebuffer colors */ + + /* convert framebuffer colors from packed int to vector float */ + { + vector float temp[4]; /* float colors in AOS form */ + switch (spu.fb.color_format) { + case PIPE_FORMAT_B8G8R8A8_UNORM: + temp[0] = spu_unpack_B8G8R8A8(fbc0); + temp[1] = spu_unpack_B8G8R8A8(fbc1); + temp[2] = spu_unpack_B8G8R8A8(fbc2); + temp[3] = spu_unpack_B8G8R8A8(fbc3); + break; + case PIPE_FORMAT_A8R8G8B8_UNORM: + temp[0] = spu_unpack_A8R8G8B8(fbc0); + temp[1] = spu_unpack_A8R8G8B8(fbc1); + temp[2] = spu_unpack_A8R8G8B8(fbc2); + temp[3] = spu_unpack_A8R8G8B8(fbc3); + break; + default: + ASSERT(0); + } + _transpose_matrix4x4(fbRGBA, temp); /* fbRGBA = transpose(temp) */ + } + + /* + * Compute Src RGB terms (fragment color * factor) + */ + switch (spu.blend.rgb_src_factor) { + case PIPE_BLENDFACTOR_ONE: + term1r = fragR; + term1g = fragG; + term1b = fragB; + break; + case PIPE_BLENDFACTOR_ZERO: + term1r = + term1g = + term1b = spu_splats(0.0f); + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + term1r = spu_mul(fragR, fragR); + term1g = spu_mul(fragG, fragG); + term1b = spu_mul(fragB, fragB); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + term1r = spu_mul(fragR, fragA); + term1g = spu_mul(fragG, fragA); + term1b = spu_mul(fragB, fragA); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + term1r = spu_mul(fragR, fbRGBA[0]); + term1g = spu_mul(fragG, fbRGBA[1]); + term1b = spu_mul(fragB, fbRGBA[1]); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + term1r = spu_mul(fragR, fbRGBA[3]); + term1g = spu_mul(fragG, fbRGBA[3]); + term1b = spu_mul(fragB, fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[0])); + term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[1])); + term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[2])); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + term1r = spu_mul(fragR, spu_splats(spu.blend_color.color[3])); + term1g = spu_mul(fragG, spu_splats(spu.blend_color.color[3])); + term1b = spu_mul(fragB, spu_splats(spu.blend_color.color[3])); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Compute Src Alpha term (fragment alpha * factor) + */ + switch (spu.blend.alpha_src_factor) { + case PIPE_BLENDFACTOR_ONE: + term1a = fragA; + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + term1a = spu_splats(0.0f); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + term1a = spu_mul(fragA, fragA); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_DST_ALPHA: + term1a = spu_mul(fragA, fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_CONST_ALPHA: + term1a = spu_mul(fragR, spu_splats(spu.blend_color.color[3])); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Compute Dest RGB terms (framebuffer color * factor) + */ + 
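/* Mirrors the src-factor switch above, but scales the framebuffer colors (fbRGBA) rather than the incoming fragment colors. */ + 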
switch (spu.blend.rgb_dst_factor) { + case PIPE_BLENDFACTOR_ONE: + term2r = fbRGBA[0]; + term2g = fbRGBA[1]; + term2b = fbRGBA[2]; + break; + case PIPE_BLENDFACTOR_ZERO: + term2r = + term2g = + term2b = spu_splats(0.0f); + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + term2r = spu_mul(fbRGBA[0], fragR); + term2g = spu_mul(fbRGBA[1], fragG); + term2b = spu_mul(fbRGBA[2], fragB); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + term2r = spu_mul(fbRGBA[0], fragA); + term2g = spu_mul(fbRGBA[1], fragA); + term2b = spu_mul(fbRGBA[2], fragA); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + one = spu_splats(1.0f); + tmp = spu_sub(one, fragA); + term2r = spu_mul(fbRGBA[0], tmp); + term2g = spu_mul(fbRGBA[1], tmp); + term2b = spu_mul(fbRGBA[2], tmp); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + term2r = spu_mul(fbRGBA[0], fbRGBA[0]); + term2g = spu_mul(fbRGBA[1], fbRGBA[1]); + term2b = spu_mul(fbRGBA[2], fbRGBA[2]); + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + term2r = spu_mul(fbRGBA[0], fbRGBA[3]); + term2g = spu_mul(fbRGBA[1], fbRGBA[3]); + term2b = spu_mul(fbRGBA[2], fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[0])); + term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[1])); + term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[2])); + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + term2r = spu_mul(fbRGBA[0], spu_splats(spu.blend_color.color[3])); + term2g = spu_mul(fbRGBA[1], spu_splats(spu.blend_color.color[3])); + term2b = spu_mul(fbRGBA[2], spu_splats(spu.blend_color.color[3])); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Compute Dest Alpha term (framebuffer alpha * factor) + */ + switch (spu.blend.alpha_dst_factor) { + case PIPE_BLENDFACTOR_ONE: + term2a = fbRGBA[3]; + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + term2a = spu_splats(0.0f); + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + term2a = spu_mul(fbRGBA[3], fragA); + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + one = spu_splats(1.0f); + tmp = spu_sub(one, fragA); + term2a = spu_mul(fbRGBA[3], tmp); + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_DST_ALPHA: + term2a = spu_mul(fbRGBA[3], fbRGBA[3]); + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_CONST_ALPHA: + term2a = spu_mul(fbRGBA[3], spu_splats(spu.blend_color.color[3])); + break; + /* XXX more cases */ + default: + ASSERT(0); + } + + /* + * Combine Src/Dest RGB terms + */ + switch (spu.blend.rgb_func) { + case PIPE_BLEND_ADD: + fragR = spu_add(term1r, term2r); + fragG = spu_add(term1g, term2g); + fragB = spu_add(term1b, term2b); + break; + case PIPE_BLEND_SUBTRACT: + fragR = spu_sub(term1r, term2r); + fragG = spu_sub(term1g, term2g); + fragB = spu_sub(term1b, term2b); + break; + case PIPE_BLEND_REVERSE_SUBTRACT: + fragR = spu_sub(term2r, term1r); + fragG = spu_sub(term2g, term1g); + fragB = spu_sub(term2b, term1b); + break; + case PIPE_BLEND_MIN: + fragR = spu_min(term1r, term2r); + fragG = spu_min(term1g, term2g); + fragB = spu_min(term1b, term2b); + break; + case PIPE_BLEND_MAX: + fragR = spu_max(term1r, term2r); + fragG = spu_max(term1g, term2g); + fragB = spu_max(term1b, term2b); + break; + default: + ASSERT(0); + } + + /* + * Combine Src/Dest A term + */ + switch (spu.blend.alpha_func) { + case PIPE_BLEND_ADD: + fragA = spu_add(term1a, term2a); + break; + case PIPE_BLEND_SUBTRACT: + fragA = spu_sub(term1a, term2a); + break; + case PIPE_BLEND_REVERSE_SUBTRACT: + fragA = 
spu_sub(term2a, term1a); + break; + case PIPE_BLEND_MIN: + fragA = spu_min(term1a, term2a); + break; + case PIPE_BLEND_MAX: + fragA = spu_max(term1a, term2a); + break; + default: + ASSERT(0); + } + } + + + /* + * Convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA. + */ +#if 0 + /* original code */ + { + vector float frag_soa[4]; + frag_soa[0] = fragR; + frag_soa[1] = fragG; + frag_soa[2] = fragB; + frag_soa[3] = fragA; + _transpose_matrix4x4(frag_aos, frag_soa); + } +#else + /* short-cut relying on function parameter layout: */ + _transpose_matrix4x4(frag_aos, &fragR); + (void) fragG; + (void) fragB; +#endif + + /* + * Pack fragment float colors into 32-bit RGBA words. + */ + switch (spu.fb.color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + fragc0 = spu_pack_A8R8G8B8(frag_aos[0]); + fragc1 = spu_pack_A8R8G8B8(frag_aos[1]); + fragc2 = spu_pack_A8R8G8B8(frag_aos[2]); + fragc3 = spu_pack_A8R8G8B8(frag_aos[3]); + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + fragc0 = spu_pack_B8G8R8A8(frag_aos[0]); + fragc1 = spu_pack_B8G8R8A8(frag_aos[1]); + fragc2 = spu_pack_B8G8R8A8(frag_aos[2]); + fragc3 = spu_pack_B8G8R8A8(frag_aos[3]); + break; + default: + fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n"); + ASSERT(0); + } + + + /* + * Do color masking + */ + if (spu.blend.colormask != 0xf) { + uint cmask = 0x0; /* each byte corresponds to a color channel */ + + /* Form bitmask depending on color buffer format and colormask bits */ + switch (spu.fb.color_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + if (spu.blend.colormask & PIPE_MASK_R) + cmask |= 0x00ff0000; /* red */ + if (spu.blend.colormask & PIPE_MASK_G) + cmask |= 0x0000ff00; /* green */ + if (spu.blend.colormask & PIPE_MASK_B) + cmask |= 0x000000ff; /* blue */ + if (spu.blend.colormask & PIPE_MASK_A) + cmask |= 0xff000000; /* alpha */ + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + if (spu.blend.colormask & PIPE_MASK_R) + cmask |= 0x0000ff00; /* red */ + if (spu.blend.colormask & PIPE_MASK_G) + cmask |= 0x00ff0000; /* green */ + if (spu.blend.colormask & PIPE_MASK_B) + cmask |= 0xff000000; /* blue */ + if (spu.blend.colormask & PIPE_MASK_A) + cmask |= 0x000000ff; /* alpha */ + break; + default: + ASSERT(0); + } + + /* + * Apply color mask to the 32-bit packed colors. + * if (cmask[i]) + * frag color[i] = frag color[i]; + * else + * frag color[i] = framebuffer color[i]; + */ + fragc0 = (fragc0 & cmask) | (fbc0 & ~cmask); + fragc1 = (fragc1 & cmask) | (fbc1 & ~cmask); + fragc2 = (fragc2 & cmask) | (fbc2 & ~cmask); + fragc3 = (fragc3 & cmask) | (fbc3 & ~cmask); + } + + + /* + * Do logic ops + */ + if (spu.blend.logicop_enable) { + /* XXX to do */ + /* apply logicop to 32-bit packed colors (fragcx and fbcx) */ + } + + + /* + * If mask is non-zero, mark tile as dirty. + */ + if (spu_extract(spu_orx(mask), 0)) { + spu.cur_ctile_status = TILE_STATUS_DIRTY; + } + else { + /* write no fragments */ + return; + } + + + /* + * Write new fragment/quad colors to the framebuffer/tile. + * Only write pixels where the corresponding mask word is set. + */ +#if LINEAR_QUAD_LAYOUT + /* + * Quad layout: + * +--+--+--+--+ + * |p0|p1|p2|p3|... + * +--+--+--+--+ + */ + if (spu_extract(mask, 0)) + colorTile->ui[y][x*2] = fragc0; + if (spu_extract(mask, 1)) + colorTile->ui[y][x*2+1] = fragc1; + if (spu_extract(mask, 2)) + colorTile->ui[y][x*2+2] = fragc2; + if (spu_extract(mask, 3)) + colorTile->ui[y][x*2+3] = fragc3; +#else + /* + * Quad layout: + * +--+--+ + * |p0|p1|... + * +--+--+ + * |p2|p3|... 
+ * +--+--+ + */ + if (spu_extract(mask, 0)) + colorTile->ui[y+0][x+0] = fragc0; + if (spu_extract(mask, 1)) + colorTile->ui[y+0][x+1] = fragc1; + if (spu_extract(mask, 2)) + colorTile->ui[y+1][x+0] = fragc2; + if (spu_extract(mask, 3)) + colorTile->ui[y+1][x+1] = fragc3; +#endif +} diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h new file mode 100644 index 0000000000..f817abf046 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h @@ -0,0 +1,44 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef SPU_PER_FRAGMENT_OP +#define SPU_PER_FRAGMENT_OP + + +extern void +spu_fallback_fragment_ops(uint x, uint y, + tile_t *colorTile, + tile_t *depthStencilTile, + vector float fragZ, + vector float fragRed, + vector float fragGreen, + vector float fragBlue, + vector float fragAlpha, + vector unsigned int mask); + + +#endif /* SPU_PER_FRAGMENT_OP */ diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c new file mode 100644 index 0000000000..7c225e2f27 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_render.c @@ -0,0 +1,295 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include <stdio.h> +#include <libmisc.h> +#include <spu_mfcio.h> + +#include "spu_main.h" +#include "spu_render.h" +#include "spu_tri.h" +#include "spu_tile.h" +#include "cell/common.h" +#include "util/u_memory.h" + + +/** + * Given a rendering command's bounding box (in pixels) compute the + * location of the corresponding screen tile bounding box. + */ +static INLINE void +tile_bounding_box(const struct cell_command_render *render, + uint *txmin, uint *tymin, + uint *box_num_tiles, uint *box_width_tiles) +{ +#if 0 + /* Debug: full-window bounding box */ + uint txmax = spu.fb.width_tiles - 1; + uint tymax = spu.fb.height_tiles - 1; + *txmin = 0; + *tymin = 0; + *box_num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; + *box_width_tiles = spu.fb.width_tiles; + (void) render; + (void) txmax; + (void) tymax; +#else + uint txmax, tymax, box_height_tiles; + + *txmin = (uint) render->xmin / TILE_SIZE; + *tymin = (uint) render->ymin / TILE_SIZE; + txmax = (uint) render->xmax / TILE_SIZE; + tymax = (uint) render->ymax / TILE_SIZE; + if (txmax >= spu.fb.width_tiles) + txmax = spu.fb.width_tiles-1; + if (tymax >= spu.fb.height_tiles) + tymax = spu.fb.height_tiles-1; + *box_width_tiles = txmax - *txmin + 1; + box_height_tiles = tymax - *tymin + 1; + *box_num_tiles = *box_width_tiles * box_height_tiles; +#endif +#if 0 + printf("SPU %u: bounds: %g, %g ... %g, %g\n", spu.init.id, + render->xmin, render->ymin, render->xmax, render->ymax); + printf("SPU %u: tiles: %u, %u .. 
%u, %u\n", + spu.init.id, *txmin, *tymin, txmax, tymax); + ASSERT(render->xmin <= render->xmax); + ASSERT(render->ymin <= render->ymax); +#endif +} + + +/** Check if the tile at (tx,ty) belongs to this SPU */ +static INLINE boolean +my_tile(uint tx, uint ty) +{ + return (spu.fb.width_tiles * ty + tx) % spu.init.num_spus == spu.init.id; +} + + +/** + * Start fetching non-clear color/Z tiles from main memory + */ +static INLINE void +get_cz_tiles(uint tx, uint ty) +{ + if (spu.read_depth_stencil) { + if (spu.cur_ztile_status != TILE_STATUS_CLEAR) { + //printf("SPU %u: getting Z tile %u, %u\n", spu.init.id, tx, ty); + get_tile(tx, ty, &spu.ztile, TAG_READ_TILE_Z, 1); + spu.cur_ztile_status = TILE_STATUS_GETTING; + } + } + + if (spu.cur_ctile_status != TILE_STATUS_CLEAR) { + //printf("SPU %u: getting C tile %u, %u\n", spu.init.id, tx, ty); + get_tile(tx, ty, &spu.ctile, TAG_READ_TILE_COLOR, 0); + spu.cur_ctile_status = TILE_STATUS_GETTING; + } +} + + +/** + * Start putting dirty color/Z tiles back to main memory + */ +static INLINE void +put_cz_tiles(uint tx, uint ty) +{ + if (spu.cur_ztile_status == TILE_STATUS_DIRTY) { + /* tile was modified and needs to be written back */ + //printf("SPU %u: put dirty Z tile %u, %u\n", spu.init.id, tx, ty); + put_tile(tx, ty, &spu.ztile, TAG_WRITE_TILE_Z, 1); + spu.cur_ztile_status = TILE_STATUS_DEFINED; + } + else if (spu.cur_ztile_status == TILE_STATUS_GETTING) { + /* tile was never used */ + spu.cur_ztile_status = TILE_STATUS_DEFINED; + //printf("SPU %u: put getting Z tile %u, %u\n", spu.init.id, tx, ty); + } + + if (spu.cur_ctile_status == TILE_STATUS_DIRTY) { + /* tile was modified and needs to be written back */ + //printf("SPU %u: put dirty C tile %u, %u\n", spu.init.id, tx, ty); + put_tile(tx, ty, &spu.ctile, TAG_WRITE_TILE_COLOR, 0); + spu.cur_ctile_status = TILE_STATUS_DEFINED; + } + else if (spu.cur_ctile_status == TILE_STATUS_GETTING) { + /* tile was never used */ + spu.cur_ctile_status = TILE_STATUS_DEFINED; + //printf("SPU %u: put getting C tile %u, %u\n", spu.init.id, tx, ty); + } +} + + +/** + * Wait for 'put' of color/z tiles to complete. 
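 + * Blocks until the DMA writes tagged TAG_WRITE_TILE_COLOR (and, when + * depth/stencil is in use, TAG_WRITE_TILE_Z) have completed.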
+ */ +static INLINE void +wait_put_cz_tiles(void) +{ + wait_on_mask(1 << TAG_WRITE_TILE_COLOR); + if (spu.read_depth_stencil) { + wait_on_mask(1 << TAG_WRITE_TILE_Z); + } +} + + +/** + * Render primitives + * \param pos_incr returns value indicating how may words to skip after + * this command in the batch buffer + */ +void +cmd_render(const struct cell_command_render *render, uint *pos_incr) +{ + /* we'll DMA into these buffers */ + ubyte vertex_data[CELL_BUFFER_SIZE] ALIGN16_ATTRIB; + const uint vertex_size = render->vertex_size; /* in bytes */ + /*const*/ uint total_vertex_bytes = render->num_verts * vertex_size; + uint index_bytes; + const ubyte *vertices; + const ushort *indexes; + uint i, j; + uint num_tiles; + + D_PRINTF(CELL_DEBUG_CMD, + "RENDER prim=%u num_vert=%u num_ind=%u inline_vert=%u\n", + render->prim_type, + render->num_verts, + render->num_indexes, + render->inline_verts); + + ASSERT(sizeof(*render) % 4 == 0); + ASSERT(total_vertex_bytes % 16 == 0); + ASSERT(render->prim_type == PIPE_PRIM_TRIANGLES); + ASSERT(render->num_indexes % 3 == 0); + + + /* indexes are right after the render command in the batch buffer */ + indexes = (const ushort *) (render + 1); + index_bytes = ROUNDUP8(render->num_indexes * 2); + *pos_incr = index_bytes / 8 + sizeof(*render) / 8; + + + if (render->inline_verts) { + /* Vertices are after indexes in batch buffer at next 16-byte addr */ + vertices = (const ubyte *) render + (*pos_incr * 8); + vertices = (const ubyte *) align_pointer((void *) vertices, 16); + ASSERT_ALIGN16(vertices); + *pos_incr = ((vertices + total_vertex_bytes) - (ubyte *) render) / 8; + } + else { + /* Begin DMA fetch of vertex buffer */ + ubyte *src = spu.init.buffers[render->vertex_buf]; + ubyte *dest = vertex_data; + + /* skip vertex data we won't use */ +#if 01 + src += render->min_index * vertex_size; + dest += render->min_index * vertex_size; + total_vertex_bytes -= render->min_index * vertex_size; +#endif + ASSERT(total_vertex_bytes % 16 == 0); + ASSERT_ALIGN16(dest); + ASSERT_ALIGN16(src); + + mfc_get(dest, /* in vertex_data[] array */ + (unsigned int) src, /* src in main memory */ + total_vertex_bytes, /* size */ + TAG_VERTEX_BUFFER, + 0, /* tid */ + 0 /* rid */); + + vertices = vertex_data; + + wait_on_mask(1 << TAG_VERTEX_BUFFER); + } + + + /** + ** find tiles which intersect the prim bounding box + **/ + uint txmin, tymin, box_width_tiles, box_num_tiles; + tile_bounding_box(render, &txmin, &tymin, + &box_num_tiles, &box_width_tiles); + + + /* make sure any pending clears have completed */ + wait_on_mask(1 << TAG_SURFACE_CLEAR); /* XXX temporary */ + + + num_tiles = 0; + + /** + ** loop over tiles, rendering tris + **/ + for (i = 0; i < box_num_tiles; i++) { + const uint tx = txmin + i % box_width_tiles; + const uint ty = tymin + i / box_width_tiles; + + ASSERT(tx < spu.fb.width_tiles); + ASSERT(ty < spu.fb.height_tiles); + + if (!my_tile(tx, ty)) + continue; + + num_tiles++; + + spu.cur_ctile_status = spu.ctile_status[ty][tx]; + spu.cur_ztile_status = spu.ztile_status[ty][tx]; + + get_cz_tiles(tx, ty); + + uint drawn = 0; + + /* loop over tris */ + for (j = 0; j < render->num_indexes; j += 3) { + const float *v0, *v1, *v2; + + v0 = (const float *) (vertices + indexes[j+0] * vertex_size); + v1 = (const float *) (vertices + indexes[j+1] * vertex_size); + v2 = (const float *) (vertices + indexes[j+2] * vertex_size); + + drawn += tri_draw(v0, v1, v2, tx, ty); + } + + //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3); + + /* write color/z 
tiles back to main framebuffer, if dirtied */ + put_cz_tiles(tx, ty); + + wait_put_cz_tiles(); /* XXX seems unnecessary... */ + + spu.ctile_status[ty][tx] = spu.cur_ctile_status; + spu.ztile_status[ty][tx] = spu.cur_ztile_status; + } + + D_PRINTF(CELL_DEBUG_CMD, + "RENDER done (%u tiles hit)\n", + num_tiles); +} diff --git a/src/gallium/drivers/cell/spu/spu_render.h b/src/gallium/drivers/cell/spu/spu_render.h new file mode 100644 index 0000000000..493434f087 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_render.h @@ -0,0 +1,38 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef SPU_RENDER_H +#define SPU_RENDER_H + +#include "cell/common.h" + +extern void +cmd_render(const struct cell_command_render *render, uint *pos_incr); + +#endif /* SPU_RENDER_H */ + diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c new file mode 100644 index 0000000000..69784c8978 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_texture.c @@ -0,0 +1,641 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include <math.h> + +#include "pipe/p_compiler.h" +#include "spu_main.h" +#include "spu_texture.h" +#include "spu_tile.h" +#include "spu_colorpack.h" +#include "spu_dcache.h" + + +/** + * Mark all tex cache entries as invalid. + */ +void +invalidate_tex_cache(void) +{ + uint lvl; + for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) { + uint unit = 0; + uint bytes = 4 * spu.texture[unit].level[lvl].width + * spu.texture[unit].level[lvl].height; + + if (spu.texture[unit].target == PIPE_TEXTURE_CUBE) + bytes *= 6; + else if (spu.texture[unit].target == PIPE_TEXTURE_3D) + bytes *= spu.texture[unit].level[lvl].depth; + + spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes); + } +} + + +/** + * Get four texels from locations (x[0], y[0]), (x[1], y[1]) ... + * + * NOTE: in the typical case of bilinear filtering, the four texels + * are in a 2x2 group so we could get by with just two dcache fetches + * (two side-by-side texels per fetch). But when bilinear filtering + * wraps around a texture edge, we'll probably need code like we have + * now. + * FURTHERMORE: since we're rasterizing a quad of 2x2 pixels at a time, + * it's quite likely that the four pixels in a quad will need some of the + * same texels. So look into doing texture fetches for four pixels at + * a time. + */ +static void +get_four_texels(const struct spu_texture_level *tlevel, uint face, + vec_int4 x, vec_int4 y, + vec_uint4 *texels) +{ + unsigned texture_ea = (uintptr_t) tlevel->start; + const vec_int4 tile_x = spu_rlmask(x, -5); /* tile_x = x / 32 */ + const vec_int4 tile_y = spu_rlmask(y, -5); /* tile_y = y / 32 */ + const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */ + const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */ + + const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row); + const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t)); + + qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x); + tile_offset = si_mpy((qword) tile_offset, tile_size); + + qword texel_offset = si_a(si_mpyui(offset_y, 32), offset_x); + texel_offset = si_mpyui(texel_offset, 4); + + vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset); + + texture_ea = texture_ea + face * tlevel->bytes_per_image; + + spu_dcache_fetch_unaligned((qword *) & texels[0], + texture_ea + spu_extract(offset, 0), 4); + spu_dcache_fetch_unaligned((qword *) & texels[1], + texture_ea + spu_extract(offset, 1), 4); + spu_dcache_fetch_unaligned((qword *) & texels[2], + texture_ea + spu_extract(offset, 2), 4); + spu_dcache_fetch_unaligned((qword *) & texels[3], + texture_ea + spu_extract(offset, 3), 4); +} + + +/** clamp vec to [0, max] */ +static INLINE vector signed int +spu_clamp(vector signed int vec, vector signed int max) +{ + static const vector signed int zero = {0,0,0,0}; + vector unsigned int c; + c = spu_cmpgt(vec, zero); /* c = vec > zero ? ~0 : 0 */ + vec = spu_sel(zero, vec, c); + c = spu_cmpgt(vec, max); /* c = vec > max ? ~0 : 0 */ + vec = spu_sel(vec, max, c); + return vec; +} + + + +/** + * Do nearest texture sampling for four pixels. 
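+ * \param s, t texcoords for the four pixels of a 2x2 quad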
+ * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa). + */ +void +sample_texture_2d_nearest(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]) +{ + const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; + vector float ss = spu_mul(s, tlevel->scale_s); + vector float tt = spu_mul(t, tlevel->scale_t); + vector signed int is = spu_convts(ss, 0); + vector signed int it = spu_convts(tt, 0); + vec_uint4 texels[4]; + + /* PIPE_TEX_WRAP_REPEAT */ + is = spu_and(is, tlevel->mask_s); + it = spu_and(it, tlevel->mask_t); + + /* PIPE_TEX_WRAP_CLAMP */ + is = spu_clamp(is, tlevel->max_s); + it = spu_clamp(it, tlevel->max_t); + + get_four_texels(tlevel, face, is, it, texels); + + /* convert four packed ARGBA pixels to float RRRR,GGGG,BBBB,AAAA */ + spu_unpack_A8R8G8B8_transpose4(texels, colors); +} + + +/** + * Do bilinear texture sampling for four pixels. + * \param colors returned colors in SOA format (rrrr, gggg, bbbb, aaaa). + */ +void +sample_texture_2d_bilinear(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]) +{ + const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; + static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f}; + + vector float ss = spu_madd(s, tlevel->scale_s, half); + vector float tt = spu_madd(t, tlevel->scale_t, half); + + vector signed int is0 = spu_convts(ss, 0); + vector signed int it0 = spu_convts(tt, 0); + + /* is + 1, it + 1 */ + vector signed int is1 = spu_add(is0, 1); + vector signed int it1 = spu_add(it0, 1); + + /* PIPE_TEX_WRAP_REPEAT */ + is0 = spu_and(is0, tlevel->mask_s); + it0 = spu_and(it0, tlevel->mask_t); + is1 = spu_and(is1, tlevel->mask_s); + it1 = spu_and(it1, tlevel->mask_t); + + /* PIPE_TEX_WRAP_CLAMP */ + is0 = spu_clamp(is0, tlevel->max_s); + it0 = spu_clamp(it0, tlevel->max_t); + is1 = spu_clamp(is1, tlevel->max_s); + it1 = spu_clamp(it1, tlevel->max_t); + + /* get packed int texels */ + vector unsigned int texels[16]; + get_four_texels(tlevel, face, is0, it0, texels + 0); /* upper-left */ + get_four_texels(tlevel, face, is1, it0, texels + 4); /* upper-right */ + get_four_texels(tlevel, face, is0, it1, texels + 8); /* lower-left */ + get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */ + + /* convert packed int texels to float colors */ + vector float ftexels[16]; + spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0); + spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4); + spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8); + spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12); + + /* Compute weighting factors in [0,1] + * Multiply texcoord by 1024, AND with 1023, convert back to float. 
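+ * (i.e. take the fractional part of the unnormalized texcoord, with ~10 bits of precision). + * The four texels below are then blended as + * color = ul*(1-sw)*(1-tw) + ur*sw*(1-tw) + ll*(1-sw)*tw + lr*sw*tw, where sw/tw are those fractional weights.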
+ */ + vector float ss1024 = spu_mul(ss, spu_splats(1024.0f)); + vector signed int iss1024 = spu_convts(ss1024, 0); + iss1024 = spu_and(iss1024, 1023); + vector float sWeights0 = spu_convtf(iss1024, 10); + + vector float tt1024 = spu_mul(tt, spu_splats(1024.0f)); + vector signed int itt1024 = spu_convts(tt1024, 0); + itt1024 = spu_and(itt1024, 1023); + vector float tWeights0 = spu_convtf(itt1024, 10); + + /* 1 - sWeight and 1 - tWeight */ + vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0); + vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0); + + /* reds, for four pixels */ + ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/ + ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/ + ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/ + ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/ + colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]), + spu_add(ftexels[8], ftexels[12])); + + /* greens, for four pixels */ + ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/ + ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/ + ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/ + ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/ + colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]), + spu_add(ftexels[9], ftexels[13])); + + /* blues, for four pixels */ + ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/ + ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/ + ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/ + ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/ + colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]), + spu_add(ftexels[10], ftexels[14])); + + /* alphas, for four pixels */ + ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/ + ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/ + ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/ + ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/ + colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]), + spu_add(ftexels[11], ftexels[15])); +} + + + +/** + * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h + */ +static INLINE void +transpose(vector unsigned int *mOut0, + vector unsigned int *mOut1, + vector unsigned int *mOut2, + vector unsigned int *mOut3, + vector unsigned int *mIn) +{ + vector unsigned int abcd, efgh, ijkl, mnop; /* input vectors */ + vector unsigned int aeim, bfjn, cgko, dhlp; /* output vectors */ + vector unsigned int aibj, ckdl, emfn, gohp; /* intermediate vectors */ + + vector unsigned char shufflehi = ((vector unsigned char) { + 0x00, 0x01, 0x02, 0x03, + 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, + 0x14, 0x15, 0x16, 0x17}); + vector unsigned char shufflelo = ((vector unsigned char) { + 0x08, 0x09, 0x0A, 0x0B, + 0x18, 0x19, 0x1A, 0x1B, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x1C, 0x1D, 0x1E, 0x1F}); + abcd = *(mIn+0); + efgh = *(mIn+1); + ijkl = *(mIn+2); + mnop = *(mIn+3); + + aibj = spu_shuffle(abcd, ijkl, shufflehi); + ckdl = spu_shuffle(abcd, ijkl, shufflelo); + emfn = spu_shuffle(efgh, mnop, shufflehi); + gohp = spu_shuffle(efgh, mnop, shufflelo); + + aeim = spu_shuffle(aibj, emfn, shufflehi); + bfjn = spu_shuffle(aibj, emfn, shufflelo); + cgko = spu_shuffle(ckdl, gohp, shufflehi); + dhlp = spu_shuffle(ckdl, gohp, shufflelo); + + *mOut0 = 
aeim; + *mOut1 = bfjn; + *mOut2 = cgko; + *mOut3 = dhlp; +} + + +/** + * Bilinear filtering, using int instead of float arithmetic for computing + * sample weights. + */ +void +sample_texture_2d_bilinear_int(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]) +{ + const struct spu_texture_level *tlevel = &spu.texture[unit].level[level]; + static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f}; + + /* Scale texcoords by size of texture, and add half pixel bias */ + vector float ss = spu_madd(s, tlevel->scale_s, half); + vector float tt = spu_madd(t, tlevel->scale_t, half); + + /* convert float coords to fixed-pt coords with 7 fraction bits */ + vector signed int is = spu_convts(ss, 7); /* XXX really need floor() here */ + vector signed int it = spu_convts(tt, 7); /* XXX really need floor() here */ + + /* compute integer texel weights in [0, 127] */ + vector signed int sWeights0 = spu_and(is, 127); + vector signed int tWeights0 = spu_and(it, 127); + vector signed int sWeights1 = spu_sub(127, sWeights0); + vector signed int tWeights1 = spu_sub(127, tWeights0); + + /* texel coords: is0 = is / 128, it0 = is / 128 */ + vector signed int is0 = spu_rlmask(is, -7); + vector signed int it0 = spu_rlmask(it, -7); + + /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */ + vector signed int is1 = spu_add(is0, 1); + vector signed int it1 = spu_add(it0, 1); + + /* PIPE_TEX_WRAP_REPEAT */ + is0 = spu_and(is0, tlevel->mask_s); + it0 = spu_and(it0, tlevel->mask_t); + is1 = spu_and(is1, tlevel->mask_s); + it1 = spu_and(it1, tlevel->mask_t); + + /* PIPE_TEX_WRAP_CLAMP */ + is0 = spu_clamp(is0, tlevel->max_s); + it0 = spu_clamp(it0, tlevel->max_t); + is1 = spu_clamp(is1, tlevel->max_s); + it1 = spu_clamp(it1, tlevel->max_t); + + /* get packed int texels */ + vector unsigned int texels[16]; + get_four_texels(tlevel, face, is0, it0, texels + 0); /* upper-left */ + get_four_texels(tlevel, face, is1, it0, texels + 4); /* upper-right */ + get_four_texels(tlevel, face, is0, it1, texels + 8); /* lower-left */ + get_four_texels(tlevel, face, is1, it1, texels + 12); /* lower-right */ + + /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */ + { + static const unsigned char ZERO = 0x80; + int i; + for (i = 0; i < 16; i++) { + texels[i] = spu_shuffle(texels[i], texels[i], + ((vector unsigned char) { + ZERO, ZERO, ZERO, 1, + ZERO, ZERO, ZERO, 2, + ZERO, ZERO, ZERO, 3, + ZERO, ZERO, ZERO, 0})); + } + } + + /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */ + vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7, + texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15; + transpose(&texel0, &texel1, &texel2, &texel3, texels + 0); + transpose(&texel4, &texel5, &texel6, &texel7, texels + 4); + transpose(&texel8, &texel9, &texel10, &texel11, texels + 8); + transpose(&texel12, &texel13, &texel14, &texel15, texels + 12); + + /* computed weighted colors */ + vector unsigned int c0, c1, c2, c3, cSum; + + /* red */ + c0 = (vector unsigned int) si_mpy((qword) texel0, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) texel4, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel8, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel12, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ + cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); + colors[0] = spu_convtf(cSum, 22); + + 
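/* Note: each weight fits in 7 bits, so a product of two weights fits in 14 bits and the weighted sum of an 8-bit channel fits in 22 bits; spu_convtf(cSum, 22) rescales that fixed-point sum to roughly [0,1]. */ +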
/* green */ + c0 = (vector unsigned int) si_mpy((qword) texel1, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) texel5, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel9, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel13, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ + cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); + colors[1] = spu_convtf(cSum, 22); + + /* blue */ + c0 = (vector unsigned int) si_mpy((qword) texel2, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) texel6, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel10, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel14, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ + cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); + colors[2] = spu_convtf(cSum, 22); + + /* alpha */ + c0 = (vector unsigned int) si_mpy((qword) texel3, si_mpy((qword) sWeights1, (qword) tWeights1)); /*ul*/ + c1 = (vector unsigned int) si_mpy((qword) texel7, si_mpy((qword) sWeights0, (qword) tWeights1)); /*ur*/ + c2 = (vector unsigned int) si_mpy((qword) texel11, si_mpy((qword) sWeights1, (qword) tWeights0)); /*ll*/ + c3 = (vector unsigned int) si_mpy((qword) texel15, si_mpy((qword) sWeights0, (qword) tWeights0)); /*lr*/ + cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3)); + colors[3] = spu_convtf(cSum, 22); +} + + + +/** + * Compute level of detail factor from texcoords. + */ +static INLINE float +compute_lambda_2d(uint unit, vector float s, vector float t) +{ + uint baseLevel = 0; + float width = spu.texture[unit].level[baseLevel].width; + float height = spu.texture[unit].level[baseLevel].height; + float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0)); + float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0)); + float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0)); + float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0)); +#if 0 + /* ideal value */ + float x = dsdx * dsdx + dtdx * dtdx; + float y = dsdy * dsdy + dtdy * dtdy; + float rho = x > y ? x : y; + rho = sqrtf(rho); +#else + /* approximation */ + dsdx = fabsf(dsdx); + dsdy = fabsf(dsdy); + dtdx = fabsf(dtdx); + dtdy = fabsf(dtdy); + float rho = (dsdx + dsdy + dtdx + dtdy) * 0.5; +#endif + float lambda = logf(rho) * 1.442695f; /* compute logbase2(rho) */ + return lambda; +} + + +/** + * Blend two sets of colors according to weight. + */ +static void +blend_colors(vector float c0[4], const vector float c1[4], float weight) +{ + vector float t = spu_splats(weight); + vector float dc0 = spu_sub(c1[0], c0[0]); + vector float dc1 = spu_sub(c1[1], c0[1]); + vector float dc2 = spu_sub(c1[2], c0[2]); + vector float dc3 = spu_sub(c1[3], c0[3]); + c0[0] = spu_madd(dc0, t, c0[0]); + c0[1] = spu_madd(dc1, t, c0[1]); + c0[2] = spu_madd(dc2, t, c0[2]); + c0[3] = spu_madd(dc3, t, c0[3]); +} + + +/** + * Texture sampling with level of detail selection and possibly mipmap + * interpolation. + */ +void +sample_texture_2d_lod(vector float s, vector float t, + uint unit, uint level_ignored, uint face, + vector float colors[4]) +{ + /* + * Note that we're computing a lambda/lod here that's used for all + * four pixels in the quad. 
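+ * (The LOD is derived from the texcoord deltas between adjacent pixels of the quad.)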
+ */ + float lambda = compute_lambda_2d(unit, s, t); + + (void) face; + (void) level_ignored; + + /* apply lod bias */ + lambda += spu.sampler[unit].lod_bias; + + /* clamp */ + if (lambda < spu.sampler[unit].min_lod) + lambda = spu.sampler[unit].min_lod; + else if (lambda > spu.sampler[unit].max_lod) + lambda = spu.sampler[unit].max_lod; + + if (lambda <= 0.0f) { + /* magnify */ + spu.mag_sample_texture_2d[unit](s, t, unit, 0, face, colors); + } + else { + /* minify */ + if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) { + /* sample two mipmap levels and interpolate */ + int level = (int) lambda; + if (level > (int) spu.texture[unit].max_level) + level = spu.texture[unit].max_level; + spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors); + if (spu.sampler[unit].min_img_filter == PIPE_TEX_FILTER_LINEAR) { + /* sample second mipmap level */ + float weight = lambda - (float) level; + level++; + if (level <= (int) spu.texture[unit].max_level) { + vector float colors2[4]; + spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors2); + blend_colors(colors, colors2, weight); + } + } + } + else { + /* sample one mipmap level */ + int level = (int) (lambda + 0.5f); + if (level > (int) spu.texture[unit].max_level) + level = spu.texture[unit].max_level; + spu.min_sample_texture_2d[unit](s, t, unit, level, face, colors); + } + } +} + + +/** XXX need a SIMD version of this */ +static unsigned +choose_cube_face(float rx, float ry, float rz, float *newS, float *newT) +{ + /* + major axis + direction target sc tc ma + ---------- ------------------------------- --- --- --- + +rx TEXTURE_CUBE_MAP_POSITIVE_X_EXT -rz -ry rx + -rx TEXTURE_CUBE_MAP_NEGATIVE_X_EXT +rz -ry rx + +ry TEXTURE_CUBE_MAP_POSITIVE_Y_EXT +rx +rz ry + -ry TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT +rx -rz ry + +rz TEXTURE_CUBE_MAP_POSITIVE_Z_EXT +rx -ry rz + -rz TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT -rx -ry rz + */ + const float arx = fabsf(rx); + const float ary = fabsf(ry); + const float arz = fabsf(rz); + unsigned face; + float sc, tc, ma; + + if (arx > ary && arx > arz) { + if (rx >= 0.0F) { + face = PIPE_TEX_FACE_POS_X; + sc = -rz; + tc = -ry; + ma = arx; + } + else { + face = PIPE_TEX_FACE_NEG_X; + sc = rz; + tc = -ry; + ma = arx; + } + } + else if (ary > arx && ary > arz) { + if (ry >= 0.0F) { + face = PIPE_TEX_FACE_POS_Y; + sc = rx; + tc = rz; + ma = ary; + } + else { + face = PIPE_TEX_FACE_NEG_Y; + sc = rx; + tc = -rz; + ma = ary; + } + } + else { + if (rz > 0.0F) { + face = PIPE_TEX_FACE_POS_Z; + sc = rx; + tc = -ry; + ma = arz; + } + else { + face = PIPE_TEX_FACE_NEG_Z; + sc = -rx; + tc = -ry; + ma = arz; + } + } + + *newS = (sc / ma + 1.0F) * 0.5F; + *newT = (tc / ma + 1.0F) * 0.5F; + + return face; +} + + + +void +sample_texture_cube(vector float s, vector float t, vector float r, + uint unit, vector float colors[4]) +{ + uint p, faces[4], level = 0; + float newS[4], newT[4]; + + /* Compute cube faces referenced by the four sets of texcoords. + * XXX we should SIMD-ize this. + */ + for (p = 0; p < 4; p++) { + float rx = spu_extract(s, p); + float ry = spu_extract(t, p); + float rz = spu_extract(r, p); + faces[p] = choose_cube_face(rx, ry, rz, &newS[p], &newT[p]); + } + + if (faces[0] == faces[1] && + faces[0] == faces[2] && + faces[0] == faces[3]) { + /* GOOD! 
All four texcoords refer to the same cube face */ + s = (vector float) {newS[0], newS[1], newS[2], newS[3]}; + t = (vector float) {newT[0], newT[1], newT[2], newT[3]}; + spu.sample_texture_2d[unit](s, t, unit, level, faces[0], colors); + } + else { + /* BAD! The four texcoords refer to different faces */ + for (p = 0; p < 4; p++) { + vector float c[4]; + + spu.sample_texture_2d[unit](spu_splats(newS[p]), spu_splats(newT[p]), + unit, level, faces[p], c); + + float red = spu_extract(c[0], p); + float green = spu_extract(c[1], p); + float blue = spu_extract(c[2], p); + float alpha = spu_extract(c[3], p); + + colors[0] = spu_insert(red, colors[0], p); + colors[1] = spu_insert(green, colors[1], p); + colors[2] = spu_insert(blue, colors[2], p); + colors[3] = spu_insert(alpha, colors[3], p); + } + } +} diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h new file mode 100644 index 0000000000..7b75b007b5 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_texture.h @@ -0,0 +1,67 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef SPU_TEXTURE_H +#define SPU_TEXTURE_H + + +#include "pipe/p_compiler.h" + + +extern void +invalidate_tex_cache(void); + + +extern void +sample_texture_2d_nearest(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]); + + +extern void +sample_texture_2d_bilinear(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]); + +extern void +sample_texture_2d_bilinear_int(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]); + + +extern void +sample_texture_2d_lod(vector float s, vector float t, + uint unit, uint level, uint face, + vector float colors[4]); + + +extern void +sample_texture_cube(vector float s, vector float t, vector float r, + uint unit, vector float colors[4]); + + +#endif /* SPU_TEXTURE_H */ diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c new file mode 100644 index 0000000000..6905015a48 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_tile.c @@ -0,0 +1,126 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + + +#include "spu_tile.h" +#include "spu_main.h" + + +/** + * Get tile of color or Z values from main memory, put into SPU memory. + */ +void +get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf) +{ + const uint offset = ty * spu.fb.width_tiles + tx; + const uint bytesPerTile = TILE_SIZE * TILE_SIZE * (zBuf ? spu.fb.zsize : 4); + const ubyte *src = zBuf ? spu.fb.depth_start : spu.fb.color_start; + + src += offset * bytesPerTile; + + ASSERT(tx < spu.fb.width_tiles); + ASSERT(ty < spu.fb.height_tiles); + ASSERT_ALIGN16(tile); + /* + printf("get_tile: dest: %p src: 0x%x size: %d\n", + tile, (unsigned int) src, bytesPerTile); + */ + mfc_get(tile->ui, /* dest in local memory */ + (unsigned int) src, /* src in main memory */ + bytesPerTile, + tag, + 0, /* tid */ + 0 /* rid */); +} + + +/** + * Move tile of color or Z values from SPU memory to main memory. 
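+ * The mfc_put is tagged with 'tag'; the caller must wait on that tag before reusing the tile buffer.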
+ */ +void +put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf) +{ + const uint offset = ty * spu.fb.width_tiles + tx; + const uint bytesPerTile = TILE_SIZE * TILE_SIZE * (zBuf ? spu.fb.zsize : 4); + ubyte *dst = zBuf ? spu.fb.depth_start : spu.fb.color_start; + + dst += offset * bytesPerTile; + + ASSERT(tx < spu.fb.width_tiles); + ASSERT(ty < spu.fb.height_tiles); + ASSERT_ALIGN16(tile); + /* + printf("SPU %u: put_tile: src: %p dst: 0x%x size: %d\n", + spu.init.id, + tile, (unsigned int) dst, bytesPerTile); + */ + mfc_put((void *) tile->ui, /* src in local memory */ + (unsigned int) dst, /* dst in main memory */ + bytesPerTile, + tag, + 0, /* tid */ + 0 /* rid */); +} + + +/** + * For tiles whose status is TILE_STATUS_CLEAR, write solid-filled + * tiles back to the main framebuffer. + */ +void +really_clear_tiles(uint surfaceIndex) +{ + const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles; + uint i; + + if (surfaceIndex == 0) { + clear_c_tile(&spu.ctile); + + for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { + uint tx = i % spu.fb.width_tiles; + uint ty = i / spu.fb.width_tiles; + if (spu.ctile_status[ty][tx] == TILE_STATUS_CLEAR) { + put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0); + } + } + } + else { + clear_z_tile(&spu.ztile); + + for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) { + uint tx = i % spu.fb.width_tiles; + uint ty = i / spu.fb.width_tiles; + if (spu.ztile_status[ty][tx] == TILE_STATUS_CLEAR) + put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1); + } + } + +#if 0 + wait_on_mask(1 << TAG_SURFACE_CLEAR); +#endif +} diff --git a/src/gallium/drivers/cell/spu/spu_tile.h b/src/gallium/drivers/cell/spu/spu_tile.h new file mode 100644 index 0000000000..7bfb52be8f --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_tile.h @@ -0,0 +1,75 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef SPU_TILE_H +#define SPU_TILE_H + + +#include <libmisc.h> +#include <spu_mfcio.h> +#include "spu_main.h" +#include "cell/common.h" + + + +extern void +get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf); + +extern void +put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf); + +extern void +really_clear_tiles(uint surfaceIndex); + + +static INLINE void +clear_c_tile(tile_t *ctile) +{ + memset32((uint*) ctile->ui, + spu.fb.color_clear_value, + TILE_SIZE * TILE_SIZE); +} + + +static INLINE void +clear_z_tile(tile_t *ztile) +{ + if (spu.fb.zsize == 2) { + memset16((ushort*) ztile->us, + spu.fb.depth_clear_value, + TILE_SIZE * TILE_SIZE); + } + else { + ASSERT(spu.fb.zsize != 0); + memset32((uint*) ztile->ui, + spu.fb.depth_clear_value, + TILE_SIZE * TILE_SIZE); + } +} + + +#endif /* SPU_TILE_H */ diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c new file mode 100644 index 0000000000..22e51a86ae --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_tri.c @@ -0,0 +1,810 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Triangle rendering within a tile. 
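+ * Triangles are rasterized in 2x2 pixel quads, clipped to the bounds of the current framebuffer tile.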
+ */ + +#include <transpose_matrix4x4.h> +#include "pipe/p_compiler.h" +#include "pipe/p_format.h" +#include "util/u_math.h" +#include "spu_colorpack.h" +#include "spu_main.h" +#include "spu_texture.h" +#include "spu_tile.h" +#include "spu_tri.h" + + +/** Masks are uint[4] vectors with each element being 0 or 0xffffffff */ +typedef vector unsigned int mask_t; + + + +/** + * Simplified types taken from other parts of Gallium + */ +struct vertex_header { + vector float data[1]; +}; + + + +/* XXX fix this */ +#undef CEILF +#define CEILF(X) ((float) (int) ((X) + 0.99999)) + + +#define QUAD_TOP_LEFT 0 +#define QUAD_TOP_RIGHT 1 +#define QUAD_BOTTOM_LEFT 2 +#define QUAD_BOTTOM_RIGHT 3 +#define MASK_TOP_LEFT (1 << QUAD_TOP_LEFT) +#define MASK_TOP_RIGHT (1 << QUAD_TOP_RIGHT) +#define MASK_BOTTOM_LEFT (1 << QUAD_BOTTOM_LEFT) +#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT) +#define MASK_ALL 0xf + + +#define DEBUG_VERTS 0 + +/** + * Triangle edge info + */ +struct edge { + float dx; /**< X(v1) - X(v0), used only during setup */ + float dy; /**< Y(v1) - Y(v0), used only during setup */ + float dxdy; /**< dx/dy */ + float sx, sy; /**< first sample point coord */ + int lines; /**< number of lines on this edge */ +}; + + +struct interp_coef +{ + vector float a0; + vector float dadx; + vector float dady; +}; + + +/** + * Triangle setup info (derived from draw_stage). + * Also used for line drawing (taking some liberties). + */ +struct setup_stage { + + /* Vertices are just an array of floats making up each attribute in + * turn. Currently fixed at 4 floats, but should change in time. + * Codegen will help cope with this. + */ + const struct vertex_header *vmax; + const struct vertex_header *vmid; + const struct vertex_header *vmin; + const struct vertex_header *vprovoke; + + struct edge ebot; + struct edge etop; + struct edge emaj; + + float oneOverArea; /* XXX maybe make into vector? */ + + uint facing; + + uint tx, ty; /**< position of current tile (x, y) */ + + int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy; + + struct interp_coef coef[PIPE_MAX_SHADER_INPUTS]; + + struct { + int left[2]; /**< [0] = row0, [1] = row1 */ + int right[2]; + int y; + unsigned y_flags; + unsigned mask; /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */ + } span; +}; + + +static struct setup_stage setup; + + +/** + * Evaluate attribute coefficients (plane equations) to compute + * attribute values for the four fragments in a quad. + * Eg: four colors will be computed (in AoS format). 
+ */ +static INLINE void +eval_coeff(uint slot, float x, float y, vector float w, vector float result[4]) +{ + switch (spu.vertex_info.attrib[slot].interp_mode) { + case INTERP_CONSTANT: + result[QUAD_TOP_LEFT] = + result[QUAD_TOP_RIGHT] = + result[QUAD_BOTTOM_LEFT] = + result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0; + break; + case INTERP_LINEAR: + { + vector float dadx = setup.coef[slot].dadx; + vector float dady = setup.coef[slot].dady; + vector float topLeft = + spu_add(setup.coef[slot].a0, + spu_add(spu_mul(spu_splats(x), dadx), + spu_mul(spu_splats(y), dady))); + + result[QUAD_TOP_LEFT] = topLeft; + result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx); + result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady); + result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady); + } + break; + case INTERP_PERSPECTIVE: + { + vector float dadx = setup.coef[slot].dadx; + vector float dady = setup.coef[slot].dady; + vector float topLeft = + spu_add(setup.coef[slot].a0, + spu_add(spu_mul(spu_splats(x), dadx), + spu_mul(spu_splats(y), dady))); + + vector float wInv = spu_re(w); /* 1.0 / w */ + + result[QUAD_TOP_LEFT] = spu_mul(topLeft, wInv); + result[QUAD_TOP_RIGHT] = spu_mul(spu_add(topLeft, dadx), wInv); + result[QUAD_BOTTOM_LEFT] = spu_mul(spu_add(topLeft, dady), wInv); + result[QUAD_BOTTOM_RIGHT] = spu_mul(spu_add(spu_add(topLeft, dadx), dady), wInv); + } + break; + case INTERP_POS: + case INTERP_NONE: + break; + default: + ASSERT(0); + } +} + + +/** + * As above, but return 4 vectors in SOA format. + * XXX this will all be re-written someday. + */ +static INLINE void +eval_coeff_soa(uint slot, float x, float y, vector float w, vector float result[4]) +{ + eval_coeff(slot, x, y, w, result); + _transpose_matrix4x4(result, result); +} + + +/** Evalute coefficients to get Z for four pixels in a quad */ +static INLINE vector float +eval_z(float x, float y) +{ + const uint slot = 0; + const float dzdx = spu_extract(setup.coef[slot].dadx, 2); + const float dzdy = spu_extract(setup.coef[slot].dady, 2); + const float topLeft = spu_extract(setup.coef[slot].a0, 2) + x * dzdx + y * dzdy; + const vector float topLeftv = spu_splats(topLeft); + const vector float derivs = (vector float) { 0.0, dzdx, dzdy, dzdx + dzdy }; + return spu_add(topLeftv, derivs); +} + + +/** Evalute coefficients to get W for four pixels in a quad */ +static INLINE vector float +eval_w(float x, float y) +{ + const uint slot = 0; + const float dwdx = spu_extract(setup.coef[slot].dadx, 3); + const float dwdy = spu_extract(setup.coef[slot].dady, 3); + const float topLeft = spu_extract(setup.coef[slot].a0, 3) + x * dwdx + y * dwdy; + const vector float topLeftv = spu_splats(topLeft); + const vector float derivs = (vector float) { 0.0, dwdx, dwdy, dwdx + dwdy }; + return spu_add(topLeftv, derivs); +} + + +/** + * Emit a quad (pass to next stage). No clipping is done. + * Note: about 1/5 to 1/7 of the time, mask is zero and this function + * should be skipped. But adding the test for that slows things down + * overall. + */ +static INLINE void +emit_quad( int x, int y, mask_t mask) +{ + /* If any bits in mask are set... */ + if (spu_extract(spu_orx(mask), 0)) { + const int ix = x - setup.cliprect_minx; + const int iy = y - setup.cliprect_miny; + + spu.cur_ctile_status = TILE_STATUS_DIRTY; + spu.cur_ztile_status = TILE_STATUS_DIRTY; + + { + /* + * Run fragment shader, execute per-fragment ops, update fb/tile. 
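+ * Fragments killed by the shader are removed from the coverage mask before the per-fragment ops run.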
+ */ + vector float inputs[4*4], outputs[2*4]; + vector float fragZ = eval_z((float) x, (float) y); + vector float fragW = eval_w((float) x, (float) y); + vector unsigned int kill_mask; + + /* setup inputs */ +#if 0 + eval_coeff_soa(1, (float) x, (float) y, fragW, inputs); +#else + uint i; + for (i = 0; i < spu.vertex_info.num_attribs; i++) { + eval_coeff_soa(i+1, (float) x, (float) y, fragW, inputs + i * 4); + } +#endif + ASSERT(spu.fragment_program); + ASSERT(spu.fragment_ops); + + /* Execute the current fragment program */ + kill_mask = spu.fragment_program(inputs, outputs, spu.constants); + + mask = spu_andc(mask, kill_mask); + + /* Execute per-fragment/quad operations, including: + * alpha test, z test, stencil test, blend and framebuffer writing. + * Note that there are two different fragment operations functions + * that can be called, one for front-facing fragments, and one + * for back-facing fragments. (Often the two are the same; + * but in some cases, like two-sided stenciling, they can be + * very different.) So choose the correct function depending + * on the calculated facing. + */ + spu.fragment_ops[setup.facing](ix, iy, &spu.ctile, &spu.ztile, + fragZ, + outputs[0*4+0], + outputs[0*4+1], + outputs[0*4+2], + outputs[0*4+3], + mask); + } + } +} + + +/** + * Given an X or Y coordinate, return the block/quad coordinate that it + * belongs to. + */ +static INLINE int +block(int x) +{ + return x & ~1; +} + + +/** + * Compute mask which indicates which pixels in the 2x2 quad are actually inside + * the triangle's bounds. + * The mask is a uint4 vector and each element will be 0 or 0xffffffff. + */ +static INLINE mask_t +calculate_mask(int x) +{ + /* This is a little tricky. + * Use & instead of && to avoid branches. + * Use negation to convert true/false to ~0/0 values. + */ + mask_t mask; + mask = spu_insert(-((x >= setup.span.left[0]) & (x < setup.span.right[0])), mask, 0); + mask = spu_insert(-((x+1 >= setup.span.left[0]) & (x+1 < setup.span.right[0])), mask, 1); + mask = spu_insert(-((x >= setup.span.left[1]) & (x < setup.span.right[1])), mask, 2); + mask = spu_insert(-((x+1 >= setup.span.left[1]) & (x+1 < setup.span.right[1])), mask, 3); + return mask; +} + + +/** + * Render a horizontal span of quads + */ +static void +flush_spans(void) +{ + int minleft, maxright; + int x; + + switch (setup.span.y_flags) { + case 0x3: + /* both odd and even lines written (both quad rows) */ + minleft = MIN2(setup.span.left[0], setup.span.left[1]); + maxright = MAX2(setup.span.right[0], setup.span.right[1]); + break; + + case 0x1: + /* only even line written (quad top row) */ + minleft = setup.span.left[0]; + maxright = setup.span.right[0]; + break; + + case 0x2: + /* only odd line written (quad bottom row) */ + minleft = setup.span.left[1]; + maxright = setup.span.right[1]; + break; + + default: + return; + } + + /* OK, we're very likely to need the tile data now. + * clear or finish waiting if needed. 
+ */ + if (spu.cur_ctile_status == TILE_STATUS_GETTING) { + /* wait for mfc_get() to complete */ + //printf("SPU: %u: waiting for ctile\n", spu.init.id); + wait_on_mask(1 << TAG_READ_TILE_COLOR); + spu.cur_ctile_status = TILE_STATUS_CLEAN; + } + else if (spu.cur_ctile_status == TILE_STATUS_CLEAR) { + //printf("SPU %u: clearing C tile %u, %u\n", spu.init.id, setup.tx, setup.ty); + clear_c_tile(&spu.ctile); + spu.cur_ctile_status = TILE_STATUS_DIRTY; + } + ASSERT(spu.cur_ctile_status != TILE_STATUS_DEFINED); + + if (spu.read_depth_stencil) { + if (spu.cur_ztile_status == TILE_STATUS_GETTING) { + /* wait for mfc_get() to complete */ + //printf("SPU: %u: waiting for ztile\n", spu.init.id); + wait_on_mask(1 << TAG_READ_TILE_Z); + spu.cur_ztile_status = TILE_STATUS_CLEAN; + } + else if (spu.cur_ztile_status == TILE_STATUS_CLEAR) { + //printf("SPU %u: clearing Z tile %u, %u\n", spu.init.id, setup.tx, setup.ty); + clear_z_tile(&spu.ztile); + spu.cur_ztile_status = TILE_STATUS_DIRTY; + } + ASSERT(spu.cur_ztile_status != TILE_STATUS_DEFINED); + } + + /* XXX this loop could be moved into the above switch cases and + * calculate_mask() could be simplified a bit... + */ + for (x = block(minleft); x <= block(maxright); x += 2) { + emit_quad( x, setup.span.y, calculate_mask( x )); + } + + setup.span.y = 0; + setup.span.y_flags = 0; + setup.span.right[0] = 0; + setup.span.right[1] = 0; +} + + +#if DEBUG_VERTS +static void +print_vertex(const struct vertex_header *v) +{ + uint i; + fprintf(stderr, " Vertex: (%p)\n", v); + for (i = 0; i < spu.vertex_info.num_attribs; i++) { + fprintf(stderr, " %d: %f %f %f %f\n", i, + spu_extract(v->data[i], 0), + spu_extract(v->data[i], 1), + spu_extract(v->data[i], 2), + spu_extract(v->data[i], 3)); + } +} +#endif + + +/** + * Sort vertices from top to bottom. + * Compute area and determine front vs. back facing. 
+ * Do coarse clip test against tile bounds + * \return FALSE if tri is totally outside tile, TRUE otherwise + */ +static boolean +setup_sort_vertices(const struct vertex_header *v0, + const struct vertex_header *v1, + const struct vertex_header *v2) +{ + float area, sign; + +#if DEBUG_VERTS + if (spu.init.id==0) { + fprintf(stderr, "SPU %u: Triangle:\n", spu.init.id); + print_vertex(v0); + print_vertex(v1); + print_vertex(v2); + } +#endif + + /* determine bottom to top order of vertices */ + { + float y0 = spu_extract(v0->data[0], 1); + float y1 = spu_extract(v1->data[0], 1); + float y2 = spu_extract(v2->data[0], 1); + if (y0 <= y1) { + if (y1 <= y2) { + /* y0<=y1<=y2 */ + setup.vmin = v0; + setup.vmid = v1; + setup.vmax = v2; + sign = -1.0f; + } + else if (y2 <= y0) { + /* y2<=y0<=y1 */ + setup.vmin = v2; + setup.vmid = v0; + setup.vmax = v1; + sign = -1.0f; + } + else { + /* y0<=y2<=y1 */ + setup.vmin = v0; + setup.vmid = v2; + setup.vmax = v1; + sign = 1.0f; + } + } + else { + if (y0 <= y2) { + /* y1<=y0<=y2 */ + setup.vmin = v1; + setup.vmid = v0; + setup.vmax = v2; + sign = 1.0f; + } + else if (y2 <= y1) { + /* y2<=y1<=y0 */ + setup.vmin = v2; + setup.vmid = v1; + setup.vmax = v0; + sign = 1.0f; + } + else { + /* y1<=y2<=y0 */ + setup.vmin = v1; + setup.vmid = v2; + setup.vmax = v0; + sign = -1.0f; + } + } + } + + /* Check if triangle is completely outside the tile bounds */ + if (spu_extract(setup.vmin->data[0], 1) > setup.cliprect_maxy) + return FALSE; + if (spu_extract(setup.vmax->data[0], 1) < setup.cliprect_miny) + return FALSE; + if (spu_extract(setup.vmin->data[0], 0) < setup.cliprect_minx && + spu_extract(setup.vmid->data[0], 0) < setup.cliprect_minx && + spu_extract(setup.vmax->data[0], 0) < setup.cliprect_minx) + return FALSE; + if (spu_extract(setup.vmin->data[0], 0) > setup.cliprect_maxx && + spu_extract(setup.vmid->data[0], 0) > setup.cliprect_maxx && + spu_extract(setup.vmax->data[0], 0) > setup.cliprect_maxx) + return FALSE; + + setup.ebot.dx = spu_extract(setup.vmid->data[0], 0) - spu_extract(setup.vmin->data[0], 0); + setup.ebot.dy = spu_extract(setup.vmid->data[0], 1) - spu_extract(setup.vmin->data[0], 1); + setup.emaj.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmin->data[0], 0); + setup.emaj.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmin->data[0], 1); + setup.etop.dx = spu_extract(setup.vmax->data[0], 0) - spu_extract(setup.vmid->data[0], 0); + setup.etop.dy = spu_extract(setup.vmax->data[0], 1) - spu_extract(setup.vmid->data[0], 1); + + /* + * Compute triangle's area. Use 1/area to compute partial + * derivatives of attributes later. + */ + area = setup.emaj.dx * setup.ebot.dy - setup.ebot.dx * setup.emaj.dy; + + setup.oneOverArea = 1.0f / area; + + /* The product of area * sign indicates front/back orientation (0/1). + * Just in case someone gets the bright idea of switching the front + * and back constants without noticing that we're assuming their + * values in this operation, also assert that the values are + * what we think they are. + */ + ASSERT(CELL_FACING_FRONT == 0); + ASSERT(CELL_FACING_BACK == 1); + setup.facing = (area * sign > 0.0f) + ^ (spu.rasterizer.front_winding == PIPE_WINDING_CW); + + setup.vprovoke = v2; + + return TRUE; +} + + +/** + * Compute a0 for a constant-valued coefficient (GL_FLAT shading). + * The value value comes from vertex->data[slot]. + * The result will be put into setup.coef[slot].a0. 
+ * \param slot which attribute slot + */ +static INLINE void +const_coeff4(uint slot) +{ + setup.coef[slot].dadx = (vector float) {0.0, 0.0, 0.0, 0.0}; + setup.coef[slot].dady = (vector float) {0.0, 0.0, 0.0, 0.0}; + setup.coef[slot].a0 = setup.vprovoke->data[slot]; +} + + +/** + * As above, but interp setup all four vector components. + */ +static INLINE void +tri_linear_coeff4(uint slot) +{ + const vector float vmin_d = setup.vmin->data[slot]; + const vector float vmid_d = setup.vmid->data[slot]; + const vector float vmax_d = setup.vmax->data[slot]; + const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f); + const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f); + + vector float botda = vmid_d - vmin_d; + vector float majda = vmax_d - vmin_d; + + vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda), + spu_mul(botda, spu_splats(setup.emaj.dy))); + vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda), + spu_mul(majda, spu_splats(setup.ebot.dx))); + + setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea)); + setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea)); + + vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx); + vector float tempy = spu_mul(setup.coef[slot].dady, yyyy); + + setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy)); +} + + +/** + * Compute a0, dadx and dady for a perspective-corrected interpolant, + * for a triangle. + * We basically multiply the vertex value by 1/w before computing + * the plane coefficients (a0, dadx, dady). + * Later, when we compute the value at a particular fragment position we'll + * divide the interpolated value by the interpolated W at that fragment. + */ +static void +tri_persp_coeff4(uint slot) +{ + const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f); + const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f); + + const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3)); + const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3)); + const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3)); + + vector float vmin_d = setup.vmin->data[slot]; + vector float vmid_d = setup.vmid->data[slot]; + vector float vmax_d = setup.vmax->data[slot]; + + vmin_d = spu_mul(vmin_d, vmin_w); + vmid_d = spu_mul(vmid_d, vmid_w); + vmax_d = spu_mul(vmax_d, vmax_w); + + vector float botda = vmid_d - vmin_d; + vector float majda = vmax_d - vmin_d; + + vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda), + spu_mul(botda, spu_splats(setup.emaj.dy))); + vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda), + spu_mul(majda, spu_splats(setup.ebot.dx))); + + setup.coef[slot].dadx = spu_mul(a, spu_splats(setup.oneOverArea)); + setup.coef[slot].dady = spu_mul(b, spu_splats(setup.oneOverArea)); + + vector float tempx = spu_mul(setup.coef[slot].dadx, xxxx); + vector float tempy = spu_mul(setup.coef[slot].dady, yyyy); + + setup.coef[slot].a0 = spu_sub(vmin_d, spu_add(tempx, tempy)); +} + + + +/** + * Compute the setup.coef[] array dadx, dady, a0 values. + * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized. 
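+ * Each attribute's interp_mode selects constant, linear or perspective-corrected coefficients (INTERP_POS is handled as linear).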
+ */ +static void +setup_tri_coefficients(void) +{ + uint i; + + for (i = 0; i < spu.vertex_info.num_attribs; i++) { + switch (spu.vertex_info.attrib[i].interp_mode) { + case INTERP_NONE: + break; + case INTERP_CONSTANT: + const_coeff4(i); + break; + case INTERP_POS: + /* fall-through */ + case INTERP_LINEAR: + tri_linear_coeff4(i); + break; + case INTERP_PERSPECTIVE: + tri_persp_coeff4(i); + break; + default: + ASSERT(0); + } + } +} + + +static void +setup_tri_edges(void) +{ + float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f; + float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f; + + float vmin_y = spu_extract(setup.vmin->data[0], 1) - 0.5f; + float vmid_y = spu_extract(setup.vmid->data[0], 1) - 0.5f; + float vmax_y = spu_extract(setup.vmax->data[0], 1) - 0.5f; + + setup.emaj.sy = CEILF(vmin_y); + setup.emaj.lines = (int) CEILF(vmax_y - setup.emaj.sy); + setup.emaj.dxdy = setup.emaj.dx / setup.emaj.dy; + setup.emaj.sx = vmin_x + (setup.emaj.sy - vmin_y) * setup.emaj.dxdy; + + setup.etop.sy = CEILF(vmid_y); + setup.etop.lines = (int) CEILF(vmax_y - setup.etop.sy); + setup.etop.dxdy = setup.etop.dx / setup.etop.dy; + setup.etop.sx = vmid_x + (setup.etop.sy - vmid_y) * setup.etop.dxdy; + + setup.ebot.sy = CEILF(vmin_y); + setup.ebot.lines = (int) CEILF(vmid_y - setup.ebot.sy); + setup.ebot.dxdy = setup.ebot.dx / setup.ebot.dy; + setup.ebot.sx = vmin_x + (setup.ebot.sy - vmin_y) * setup.ebot.dxdy; +} + + +/** + * Render the upper or lower half of a triangle. + * Scissoring/cliprect is applied here too. + */ +static void +subtriangle(struct edge *eleft, struct edge *eright, unsigned lines) +{ + const int minx = setup.cliprect_minx; + const int maxx = setup.cliprect_maxx; + const int miny = setup.cliprect_miny; + const int maxy = setup.cliprect_maxy; + int y, start_y, finish_y; + int sy = (int)eleft->sy; + + ASSERT((int)eleft->sy == (int) eright->sy); + + /* clip top/bottom */ + start_y = sy; + finish_y = sy + lines; + + if (start_y < miny) + start_y = miny; + + if (finish_y > maxy) + finish_y = maxy; + + start_y -= sy; + finish_y -= sy; + + /* + _mesa_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y); + */ + + for (y = start_y; y < finish_y; y++) { + + /* avoid accumulating adds as floats don't have the precision to + * accurately iterate large triangle edges that way. luckily we + * can just multiply these days. + * + * this is all drowned out by the attribute interpolation anyway. + */ + int left = (int)(eleft->sx + y * eleft->dxdy); + int right = (int)(eright->sx + y * eright->dxdy); + + /* clip left/right */ + if (left < minx) + left = minx; + if (right > maxx) + right = maxx; + + if (left < right) { + int _y = sy + y; + if (block(_y) != setup.span.y) { + flush_spans(); + setup.span.y = block(_y); + } + + setup.span.left[_y&1] = left; + setup.span.right[_y&1] = right; + setup.span.y_flags |= 1<<(_y&1); + } + } + + + /* save the values so that emaj can be restarted: + */ + eleft->sx += lines * eleft->dxdy; + eright->sx += lines * eright->dxdy; + eleft->sy += lines; + eright->sy += lines; +} + + +/** + * Draw triangle into tile at (tx, ty) (tile coords) + * The tile data should have already been fetched. 
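+ * \return FALSE if the triangle is entirely outside the tile, TRUE otherwise.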
+ */ +boolean +tri_draw(const float *v0, const float *v1, const float *v2, + uint tx, uint ty) +{ + setup.tx = tx; + setup.ty = ty; + + /* set clipping bounds to tile bounds */ + setup.cliprect_minx = tx * TILE_SIZE; + setup.cliprect_miny = ty * TILE_SIZE; + setup.cliprect_maxx = (tx + 1) * TILE_SIZE; + setup.cliprect_maxy = (ty + 1) * TILE_SIZE; + + if (!setup_sort_vertices((struct vertex_header *) v0, + (struct vertex_header *) v1, + (struct vertex_header *) v2)) { + return FALSE; /* totally clipped */ + } + + setup_tri_coefficients(); + setup_tri_edges(); + + setup.span.y = 0; + setup.span.y_flags = 0; + setup.span.right[0] = 0; + setup.span.right[1] = 0; + + if (setup.oneOverArea < 0.0) { + /* emaj on left */ + subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines ); + subtriangle( &setup.emaj, &setup.etop, setup.etop.lines ); + } + else { + /* emaj on right */ + subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines ); + subtriangle( &setup.etop, &setup.emaj, setup.etop.lines ); + } + + flush_spans(); + + return TRUE; +} diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h new file mode 100644 index 0000000000..aa694dd7c9 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_tri.h @@ -0,0 +1,37 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#ifndef SPU_TRI_H +#define SPU_TRI_H + + +extern boolean +tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty); + + +#endif /* SPU_TRI_H */ diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c new file mode 100644 index 0000000000..b8a0d4a265 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_util.c @@ -0,0 +1,167 @@ + +#include "cell/common.h" +#include "pipe/p_shader_tokens.h" +#include "pipe/p_debug.h" +#include "tgsi/tgsi_parse.h" +//#include "tgsi_build.h" +#include "tgsi/tgsi_util.h" + +unsigned +tgsi_util_get_src_register_swizzle( + const struct tgsi_src_register *reg, + unsigned component ) +{ + switch( component ) { + case 0: + return reg->SwizzleX; + case 1: + return reg->SwizzleY; + case 2: + return reg->SwizzleZ; + case 3: + return reg->SwizzleW; + default: + ASSERT( 0 ); + } + return 0; +} + +unsigned +tgsi_util_get_src_register_extswizzle( + const struct tgsi_src_register_ext_swz *reg, + unsigned component ) +{ + switch( component ) { + case 0: + return reg->ExtSwizzleX; + case 1: + return reg->ExtSwizzleY; + case 2: + return reg->ExtSwizzleZ; + case 3: + return reg->ExtSwizzleW; + default: + ASSERT( 0 ); + } + return 0; +} + +unsigned +tgsi_util_get_full_src_register_extswizzle( + const struct tgsi_full_src_register *reg, + unsigned component ) +{ + unsigned swizzle; + + /* + * First, calculate the extended swizzle for a given channel. This will give + * us either a channel index into the simple swizzle or a constant 1 or 0. + */ + swizzle = tgsi_util_get_src_register_extswizzle( + ®->SrcRegisterExtSwz, + component ); + + ASSERT (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X); + ASSERT (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y); + ASSERT (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z); + ASSERT (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W); + ASSERT (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W); + ASSERT (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W); + + /* + * Second, calculate the simple swizzle for the unswizzled channel index. + * Leave the constants intact, they are not affected by the simple swizzle. + */ + if( swizzle <= TGSI_SWIZZLE_W ) { + swizzle = tgsi_util_get_src_register_swizzle( + ®->SrcRegister, + component ); + } + + return swizzle; +} + +unsigned +tgsi_util_get_src_register_extnegate( + const struct tgsi_src_register_ext_swz *reg, + unsigned component ) +{ + switch( component ) { + case 0: + return reg->NegateX; + case 1: + return reg->NegateY; + case 2: + return reg->NegateZ; + case 3: + return reg->NegateW; + default: + ASSERT( 0 ); + } + return 0; +} + +void +tgsi_util_set_src_register_extnegate( + struct tgsi_src_register_ext_swz *reg, + unsigned negate, + unsigned component ) +{ + switch( component ) { + case 0: + reg->NegateX = negate; + break; + case 1: + reg->NegateY = negate; + break; + case 2: + reg->NegateZ = negate; + break; + case 3: + reg->NegateW = negate; + break; + default: + ASSERT( 0 ); + } +} + +unsigned +tgsi_util_get_full_src_register_sign_mode( + const struct tgsi_full_src_register *reg, + unsigned component ) +{ + unsigned sign_mode; + + if( reg->SrcRegisterExtMod.Absolute ) { + /* Consider only the post-abs negation. */ + + if( reg->SrcRegisterExtMod.Negate ) { + sign_mode = TGSI_UTIL_SIGN_SET; + } + else { + sign_mode = TGSI_UTIL_SIGN_CLEAR; + } + } + else { + /* Accumulate the three negations. 
*/ + + unsigned negate; + + negate = reg->SrcRegister.Negate; + if( tgsi_util_get_src_register_extnegate( ®->SrcRegisterExtSwz, component ) ) { + negate = !negate; + } + if( reg->SrcRegisterExtMod.Negate ) { + negate = !negate; + } + + if( negate ) { + sign_mode = TGSI_UTIL_SIGN_TOGGLE; + } + else { + sign_mode = TGSI_UTIL_SIGN_KEEP; + } + } + + return sign_mode; +} diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c new file mode 100644 index 0000000000..03375d84a5 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c @@ -0,0 +1,145 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * (C) Copyright IBM Corporation 2008 + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + * Ian Romanick <idr@us.ibm.com> + */ + +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "spu_exec.h" +#include "spu_vertex_shader.h" +#include "spu_main.h" +#include "spu_dcache.h" + +typedef void (*spu_fetch_func)(qword *out, const qword *in, + const qword *shuffle_data); + + +static const qword fetch_shuffle_data[5] ALIGN16_ATTRIB = { + /* Shuffle used by CVT_64_FLOAT + */ + { + 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + }, + + /* Shuffle used by CVT_8_USCALED and CVT_8_SSCALED + */ + { + 0x00, 0x80, 0x80, 0x80, 0x01, 0x80, 0x80, 0x80, + 0x02, 0x80, 0x80, 0x80, 0x03, 0x80, 0x80, 0x80, + }, + + /* Shuffle used by CVT_16_USCALED and CVT_16_SSCALED + */ + { + 0x00, 0x01, 0x80, 0x80, 0x02, 0x03, 0x80, 0x80, + 0x04, 0x05, 0x80, 0x80, 0x06, 0x07, 0x80, 0x80, + }, + + /* High value shuffle used by trans4x4. + */ + { + 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, + 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17 + }, + + /* Low value shuffle used by trans4x4. + */ + { + 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B, + 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F + } +}; + + +/** + * Fetch vertex attributes for 'count' vertices. 
+ */ +static void generic_vertex_fetch(struct spu_vs_context *draw, + struct spu_exec_machine *machine, + const unsigned *elts, + unsigned count) +{ + unsigned nr_attrs = draw->vertex_fetch.nr_attrs; + unsigned attr; + + ASSERT(count <= 4); + +#if DRAW_DBG + printf("SPU: %s count = %u, nr_attrs = %u\n", + __FUNCTION__, count, nr_attrs); +#endif + + /* loop over vertex attributes (vertex shader inputs) + */ + for (attr = 0; attr < nr_attrs; attr++) { + const unsigned pitch = draw->vertex_fetch.pitch[attr]; + const uint64_t src = draw->vertex_fetch.src_ptr[attr]; + const spu_fetch_func fetch = (spu_fetch_func) + (draw->vertex_fetch.code + draw->vertex_fetch.code_offset[attr]); + unsigned i; + unsigned idx; + const unsigned bytes_per_entry = draw->vertex_fetch.size[attr]; + const unsigned quads_per_entry = (bytes_per_entry + 15) / 16; + qword in[2 * 4] ALIGN16_ATTRIB; + + + /* Fetch four attributes for four vertices. + */ + idx = 0; + for (i = 0; i < count; i++) { + const uint64_t addr = src + (elts[i] * pitch); + +#if DRAW_DBG + printf("SPU: fetching = 0x%llx\n", addr); +#endif + + spu_dcache_fetch_unaligned(& in[idx], addr, bytes_per_entry); + idx += quads_per_entry; + } + + /* Be nice and zero out any missing vertices. + */ + (void) memset(& in[idx], 0, (8 - idx) * sizeof(qword)); + + + /* Convert all 4 vertices to vectors of float. + */ + (*fetch)(&machine->Inputs[attr].xyzw[0].q, in, fetch_shuffle_data); + } +} + + +void spu_update_vertex_fetch( struct spu_vs_context *draw ) +{ + draw->vertex_fetch.fetch_func = generic_vertex_fetch; +} diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c new file mode 100644 index 0000000000..fbe5b34d39 --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c @@ -0,0 +1,244 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + * Brian Paul + * Ian Romanick <idr@us.ibm.com> + */ + +#include <spu_mfcio.h> + +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "draw/draw_private.h" +#include "draw/draw_context.h" +#include "cell/common.h" +#include "spu_vertex_shader.h" +#include "spu_exec.h" +#include "spu_main.h" + + +#define MAX_VERTEX_SIZE ((2 + PIPE_MAX_SHADER_OUTPUTS) * 4 * sizeof(float)) + + +#define CLIP_RIGHT_BIT 0x01 +#define CLIP_LEFT_BIT 0x02 +#define CLIP_TOP_BIT 0x04 +#define CLIP_BOTTOM_BIT 0x08 +#define CLIP_FAR_BIT 0x10 +#define CLIP_NEAR_BIT 0x20 + + +static INLINE float +dot4(const float *a, const float *b) +{ + return (a[0]*b[0] + + a[1]*b[1] + + a[2]*b[2] + + a[3]*b[3]); +} + +static INLINE unsigned +compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr) +{ + unsigned mask = 0; + unsigned i; + + /* Do the hardwired planes first: + */ + if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT; + if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT; + if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT; + if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT; + if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT; + if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT; + + /* Followed by any remaining ones: + */ + for (i = 6; i < nr; i++) { + if (dot4(clip, plane[i]) < 0) + mask |= (1<<i); + } + + return mask; +} + + +/** + * Transform vertices with the current vertex program/shader + * Up to four vertices can be shaded at a time. + * \param vbuffer the input vertex data + * \param elts indexes of four input vertices + * \param count number of vertices to shade [1..4] + * \param vOut array of pointers to four output vertices + */ +static void +run_vertex_program(struct spu_vs_context *draw, + unsigned elts[4], unsigned count, + const uint64_t *vOut) +{ + struct spu_exec_machine *machine = &draw->machine; + unsigned int j; + + ALIGN16_DECL(struct spu_exec_vector, inputs, PIPE_MAX_ATTRIBS); + ALIGN16_DECL(struct spu_exec_vector, outputs, PIPE_MAX_ATTRIBS); + const float *scale = draw->viewport.scale; + const float *trans = draw->viewport.translate; + + ASSERT(count <= 4); + + machine->Processor = TGSI_PROCESSOR_VERTEX; + + ASSERT_ALIGN16(draw->constants); + machine->Consts = (float (*)[4]) draw->constants; + + machine->Inputs = ALIGN16_ASSIGN(inputs); + machine->Outputs = ALIGN16_ASSIGN(outputs); + + spu_vertex_fetch( draw, machine, elts, count ); + + /* run shader */ + spu_exec_machine_run( machine ); + + + /* store machine results */ + for (j = 0; j < count; j++) { + unsigned slot; + float x, y, z, w; + unsigned char buffer[sizeof(struct vertex_header) + + MAX_VERTEX_SIZE] ALIGN16_ATTRIB; + struct vertex_header *const tmpOut = + (struct vertex_header *) buffer; + const unsigned vert_size = ROUNDUP16(sizeof(struct vertex_header) + + (sizeof(float) * 4 + * draw->num_vs_outputs)); + + mfc_get(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0); + wait_on_mask(1 << TAG_VERTEX_BUFFER); + + + /* Handle attr[0] (position) specially: + * + * XXX: Computing the clipmask should be done in the vertex + * program as a set of DP4 instructions appended to the + * user-provided code. 
+ */ + x = tmpOut->clip[0] = machine->Outputs[0].xyzw[0].f[j]; + y = tmpOut->clip[1] = machine->Outputs[0].xyzw[1].f[j]; + z = tmpOut->clip[2] = machine->Outputs[0].xyzw[2].f[j]; + w = tmpOut->clip[3] = machine->Outputs[0].xyzw[3].f[j]; + + tmpOut->clipmask = compute_clipmask(tmpOut->clip, draw->plane, + draw->nr_planes); + tmpOut->edgeflag = 1; + + /* divide by w */ + w = 1.0f / w; + x *= w; + y *= w; + z *= w; + + /* Viewport mapping */ + tmpOut->data[0][0] = x * scale[0] + trans[0]; + tmpOut->data[0][1] = y * scale[1] + trans[1]; + tmpOut->data[0][2] = z * scale[2] + trans[2]; + tmpOut->data[0][3] = w; + + /* Remaining attributes are packed into sequential post-transform + * vertex attrib slots. + */ + for (slot = 1; slot < draw->num_vs_outputs; slot++) { + tmpOut->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j]; + tmpOut->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j]; + tmpOut->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j]; + tmpOut->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j]; + } + + mfc_put(tmpOut, vOut[j], vert_size, TAG_VERTEX_BUFFER, 0, 0); + } /* loop over vertices */ +} + + +unsigned char immediates[(sizeof(float) * 4 * TGSI_EXEC_NUM_IMMEDIATES) + 32] + ALIGN16_ATTRIB; + + +void +spu_bind_vertex_shader(struct spu_vs_context *draw, + struct cell_shader_info *vs) +{ + const unsigned immediate_addr = vs->immediates; + const unsigned immediate_size = + ROUNDUP16((sizeof(float) * 4 * vs->num_immediates) + + (immediate_addr & 0x0f)); + + + mfc_get(immediates, immediate_addr & ~0x0f, immediate_size, + TAG_VERTEX_BUFFER, 0, 0); + + draw->machine.Instructions = (struct tgsi_full_instruction *) + vs->instructions; + draw->machine.NumInstructions = vs->num_instructions; + + draw->machine.Declarations = (struct tgsi_full_declaration *) + vs->declarations; + draw->machine.NumDeclarations = vs->num_declarations; + + draw->num_vs_outputs = vs->num_outputs; + + /* specify the shader to interpret/execute */ + spu_exec_machine_init(&draw->machine, + PIPE_MAX_SAMPLERS, + NULL /*samplers*/, + PIPE_SHADER_VERTEX); + + wait_on_mask(1 << TAG_VERTEX_BUFFER); + + (void) memcpy(& draw->machine.Imms, &immediates[immediate_addr & 0x0f], + sizeof(float) * 4 * vs->num_immediates); +} + + +void +spu_execute_vertex_shader(struct spu_vs_context *draw, + const struct cell_command_vs *vs) +{ + unsigned i; + + (void) memcpy(draw->plane, vs->plane, sizeof(float) * 4 * vs->nr_planes); + draw->nr_planes = vs->nr_planes; + draw->vertex_fetch.nr_attrs = vs->nr_attrs; + + for (i = 0; i < vs->num_elts; i += 4) { + const unsigned batch_size = MIN2(vs->num_elts - i, 4); + + run_vertex_program(draw, & vs->elts[i], batch_size, &vs->vOut[i]); + } +} diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.h b/src/gallium/drivers/cell/spu/spu_vertex_shader.h new file mode 100644 index 0000000000..4c74f5e74d --- /dev/null +++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.h @@ -0,0 +1,66 @@ +#ifndef SPU_VERTEX_SHADER_H +#define SPU_VERTEX_SHADER_H + +#include "cell/common.h" +#include "pipe/p_format.h" +#include "spu_exec.h" + +struct spu_vs_context; + +typedef void (*spu_full_fetch_func)( struct spu_vs_context *draw, + struct spu_exec_machine *machine, + const unsigned *elts, + unsigned count ); + +struct spu_vs_context { + struct pipe_viewport_state viewport; + + struct { + uint64_t src_ptr[PIPE_MAX_ATTRIBS]; + unsigned pitch[PIPE_MAX_ATTRIBS]; + unsigned size[PIPE_MAX_ATTRIBS]; + unsigned code_offset[PIPE_MAX_ATTRIBS]; + unsigned nr_attrs; + boolean dirty; + + spu_full_fetch_func fetch_func; + 
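   /*
    * The 'code' pointer holds the SPU fetch/convert routines for the bound
    * vertex elements and 'code_offset[attr]' gives the offset of the routine
    * for one attribute.  generic_vertex_fetch() dispatches through them
    * roughly as follows (condensed from that function):
    *
    *    spu_fetch_func f = (spu_fetch_func)
    *       (draw->vertex_fetch.code + draw->vertex_fetch.code_offset[attr]);
    *    (*f)(&machine->Inputs[attr].xyzw[0].q, in, fetch_shuffle_data);
    *
    * Each routine takes the raw quadwords pulled in by
    * spu_dcache_fetch_unaligned() and converts all four vertices to vectors
    * of float, using the shuffle patterns in fetch_shuffle_data.
    */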
void *code; + } vertex_fetch; + + /* Clip derived state: + */ + float plane[12][4]; + unsigned nr_planes; + + struct spu_exec_machine machine; + const float (*constants)[4]; + + unsigned num_vs_outputs; +}; + +extern void spu_update_vertex_fetch(struct spu_vs_context *draw); + +static INLINE void spu_vertex_fetch(struct spu_vs_context *draw, + struct spu_exec_machine *machine, + const unsigned *elts, + unsigned count) +{ + if (draw->vertex_fetch.dirty) { + spu_update_vertex_fetch(draw); + draw->vertex_fetch.dirty = 0; + } + + (*draw->vertex_fetch.fetch_func)(draw, machine, elts, count); +} + +struct cell_command_vs; + +extern void +spu_bind_vertex_shader(struct spu_vs_context *draw, + struct cell_shader_info *vs); + +extern void +spu_execute_vertex_shader(struct spu_vs_context *draw, + const struct cell_command_vs *vs); + +#endif /* SPU_VERTEX_SHADER_H */ diff --git a/src/gallium/drivers/failover/Makefile b/src/gallium/drivers/failover/Makefile new file mode 100644 index 0000000000..f08b8df07a --- /dev/null +++ b/src/gallium/drivers/failover/Makefile @@ -0,0 +1,14 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = failover + +C_SOURCES = \ + fo_state.c \ + fo_state_emit.c \ + fo_context.c + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/failover/SConscript b/src/gallium/drivers/failover/SConscript new file mode 100644 index 0000000000..f8e9b1b491 --- /dev/null +++ b/src/gallium/drivers/failover/SConscript @@ -0,0 +1,13 @@ +Import('*') + +env = env.Clone() + +failover = env.ConvenienceLibrary( + target = 'failover', + source = [ + 'fo_state.c', + 'fo_state_emit.c', + 'fo_context.c', + ]) + +Export('failover') diff --git a/src/gallium/drivers/failover/fo_context.c b/src/gallium/drivers/failover/fo_context.c new file mode 100644 index 0000000000..10c4ffc209 --- /dev/null +++ b/src/gallium/drivers/failover/fo_context.c @@ -0,0 +1,159 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#include "pipe/p_defines.h" +#include "pipe/p_winsys.h" +#include "util/u_memory.h" +#include "pipe/p_context.h" + +#include "fo_context.h" +#include "fo_winsys.h" + + + +static void failover_destroy( struct pipe_context *pipe ) +{ + struct failover_context *failover = failover_context( pipe ); + + free( failover ); +} + + + +static boolean failover_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned prim, unsigned start, unsigned count) +{ + struct failover_context *failover = failover_context( pipe ); + + /* If there has been any statechange since last time, try hardware + * rendering again: + */ + if (failover->dirty) { + failover->mode = FO_HW; + } + + /* Try hardware: + */ + if (failover->mode == FO_HW) { + if (!failover->hw->draw_elements( failover->hw, + indexBuffer, + indexSize, + prim, + start, + count )) { + + failover->hw->flush( failover->hw, ~0, NULL ); + failover->mode = FO_SW; + } + } + + /* Possibly try software: + */ + if (failover->mode == FO_SW) { + + if (failover->dirty) + failover_state_emit( failover ); + + failover->sw->draw_elements( failover->sw, + indexBuffer, + indexSize, + prim, + start, + count ); + + /* Be ready to switch back to hardware rendering without an + * intervening flush. Unlikely to be much performance impact to + * this: + */ + failover->sw->flush( failover->sw, ~0, NULL ); + } + + return TRUE; +} + + +static boolean failover_draw_arrays( struct pipe_context *pipe, + unsigned prim, unsigned start, unsigned count) +{ + return failover_draw_elements(pipe, NULL, 0, prim, start, count); +} + + + +struct pipe_context *failover_create( struct pipe_context *hw, + struct pipe_context *sw ) +{ + struct failover_context *failover = CALLOC_STRUCT(failover_context); + if (failover == NULL) + return NULL; + + failover->hw = hw; + failover->sw = sw; + failover->pipe.winsys = hw->winsys; + failover->pipe.screen = hw->screen; + failover->pipe.destroy = failover_destroy; +#if 0 + failover->pipe.is_format_supported = hw->is_format_supported; + failover->pipe.get_name = hw->get_name; + failover->pipe.get_vendor = hw->get_vendor; + failover->pipe.get_param = hw->get_param; + failover->pipe.get_paramf = hw->get_paramf; +#endif + + failover->pipe.draw_arrays = failover_draw_arrays; + failover->pipe.draw_elements = failover_draw_elements; + failover->pipe.clear = hw->clear; + + /* No software occlusion fallback (or other optional functionality) + * at this point - if the hardware doesn't support it, don't + * advertise it to the application. 
+ */ + failover->pipe.begin_query = hw->begin_query; + failover->pipe.end_query = hw->end_query; + + failover_init_state_functions( failover ); + + failover->pipe.surface_copy = hw->surface_copy; + failover->pipe.surface_fill = hw->surface_fill; + +#if 0 + failover->pipe.texture_create = hw->texture_create; + failover->pipe.texture_release = hw->texture_release; + failover->pipe.get_tex_surface = hw->get_tex_surface; + failover->pipe.texture_update = hw->texture_update; +#endif + + failover->pipe.flush = hw->flush; + + failover->dirty = 0; + + return &failover->pipe; +} + diff --git a/src/gallium/drivers/failover/fo_context.h b/src/gallium/drivers/failover/fo_context.h new file mode 100644 index 0000000000..c6409fe1e1 --- /dev/null +++ b/src/gallium/drivers/failover/fo_context.h @@ -0,0 +1,118 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef FO_CONTEXT_H +#define FO_CONTEXT_H + +#include "pipe/p_state.h" +#include "pipe/p_context.h" + + + +#define FO_NEW_VIEWPORT 0x1 +#define FO_NEW_RASTERIZER 0x2 +#define FO_NEW_FRAGMENT_SHADER 0x4 +#define FO_NEW_BLEND 0x8 +#define FO_NEW_CLIP 0x10 +#define FO_NEW_SCISSOR 0x20 +#define FO_NEW_STIPPLE 0x40 +#define FO_NEW_FRAMEBUFFER 0x80 +#define FO_NEW_ALPHA_TEST 0x100 +#define FO_NEW_DEPTH_STENCIL 0x200 +#define FO_NEW_SAMPLER 0x400 +#define FO_NEW_TEXTURE 0x800 +#define FO_NEW_VERTEX 0x2000 +#define FO_NEW_VERTEX_SHADER 0x4000 +#define FO_NEW_BLEND_COLOR 0x8000 +#define FO_NEW_CLEAR_COLOR 0x10000 +#define FO_NEW_VERTEX_BUFFER 0x20000 +#define FO_NEW_VERTEX_ELEMENT 0x40000 + + + +#define FO_HW 0 +#define FO_SW 1 + +struct fo_state { + void *sw_state; + void *hw_state; +}; +struct failover_context { + struct pipe_context pipe; /**< base class */ + + + /* The most recent drawing state as set by the driver: + */ + const struct fo_state *blend; + const struct fo_state *sampler[PIPE_MAX_SAMPLERS]; + const struct fo_state *depth_stencil; + const struct fo_state *rasterizer; + const struct fo_state *fragment_shader; + const struct fo_state *vertex_shader; + + struct pipe_blend_color blend_color; + struct pipe_clip_state clip; + struct pipe_framebuffer_state framebuffer; + struct pipe_poly_stipple poly_stipple; + struct pipe_scissor_state scissor; + struct pipe_texture *texture[PIPE_MAX_SAMPLERS]; + struct pipe_viewport_state viewport; + struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS]; + struct pipe_vertex_element vertex_elements[PIPE_MAX_ATTRIBS]; + + uint num_vertex_buffers; + uint num_vertex_elements; + + void *sw_sampler_state[PIPE_MAX_SAMPLERS]; + void *hw_sampler_state[PIPE_MAX_SAMPLERS]; + + unsigned dirty; + + unsigned num_samplers; + unsigned num_textures; + + unsigned mode; + struct pipe_context *hw; + struct pipe_context *sw; +}; + + + +void failover_init_state_functions( struct failover_context *failover ); +void failover_state_emit( struct failover_context *failover ); + +static INLINE struct failover_context * +failover_context( struct pipe_context *pipe ) +{ + return (struct failover_context *)pipe; +} + + +#endif /* FO_CONTEXT_H */ diff --git a/src/gallium/drivers/failover/fo_state.c b/src/gallium/drivers/failover/fo_state.c new file mode 100644 index 0000000000..6a79706632 --- /dev/null +++ b/src/gallium/drivers/failover/fo_state.c @@ -0,0 +1,483 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "pipe/p_inlines.h" + +#include "fo_context.h" + + +/* This looks like a lot of work at the moment - we're keeping a + * duplicate copy of the state up-to-date. + * + * This can change in two ways: + * - With constant state objects we would only need to save a pointer, + * not the whole object. + * - By adding a callback in the state tracker to re-emit state. The + * state tracker knows the current state already and can re-emit it + * without additional complexity. + * + * This works as a proof-of-concept, but a final version will have + * lower overheads. + */ + + + +static void * +failover_create_blend_state( struct pipe_context *pipe, + const struct pipe_blend_state *blend ) +{ + struct fo_state *state = malloc(sizeof(struct fo_state)); + struct failover_context *failover = failover_context(pipe); + + state->sw_state = failover->sw->create_blend_state(failover->sw, blend); + state->hw_state = failover->hw->create_blend_state(failover->hw, blend); + + return state; +} + +static void +failover_bind_blend_state( struct pipe_context *pipe, + void *blend ) +{ + struct failover_context *failover = failover_context(pipe); + struct fo_state *state = (struct fo_state *)blend; + failover->blend = state; + failover->dirty |= FO_NEW_BLEND; + failover->sw->bind_blend_state( failover->sw, state->sw_state ); + failover->hw->bind_blend_state( failover->hw, state->hw_state ); +} + +static void +failover_delete_blend_state( struct pipe_context *pipe, + void *blend ) +{ + struct fo_state *state = (struct fo_state*)blend; + struct failover_context *failover = failover_context(pipe); + + failover->sw->delete_blend_state(failover->sw, state->sw_state); + failover->hw->delete_blend_state(failover->hw, state->hw_state); + state->sw_state = 0; + state->hw_state = 0; + free(state); +} + +static void +failover_set_blend_color( struct pipe_context *pipe, + const struct pipe_blend_color *blend_color ) +{ + struct failover_context *failover = failover_context(pipe); + + failover->blend_color = *blend_color; + failover->dirty |= FO_NEW_BLEND_COLOR; + failover->sw->set_blend_color( failover->sw, blend_color ); + failover->hw->set_blend_color( failover->hw, blend_color ); +} + +static void +failover_set_clip_state( struct pipe_context *pipe, + const struct pipe_clip_state *clip ) +{ + struct failover_context *failover = failover_context(pipe); + + failover->clip = *clip; + failover->dirty |= FO_NEW_CLIP; + failover->sw->set_clip_state( failover->sw, clip ); + failover->hw->set_clip_state( failover->hw, clip ); +} + + +static void * +failover_create_depth_stencil_state(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *templ) +{ + struct fo_state *state = malloc(sizeof(struct fo_state)); + struct failover_context *failover = failover_context(pipe); + + state->sw_state = failover->sw->create_depth_stencil_alpha_state(failover->sw, templ); + 
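   /*
    * Same pattern as every other constant-state object in this driver: one
    * fo_state wraps a (sw_state, hw_state) pair built from the same template
    * by the software and hardware pipes, so that the corresponding bind_*
    * hook can hand each underlying context its own handle.
    */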
state->hw_state = failover->hw->create_depth_stencil_alpha_state(failover->hw, templ); + + return state; +} + +static void +failover_bind_depth_stencil_state(struct pipe_context *pipe, + void *depth_stencil) +{ + struct failover_context *failover = failover_context(pipe); + struct fo_state *state = (struct fo_state *)depth_stencil; + failover->depth_stencil = state; + failover->dirty |= FO_NEW_DEPTH_STENCIL; + failover->sw->bind_depth_stencil_alpha_state(failover->sw, state->sw_state); + failover->hw->bind_depth_stencil_alpha_state(failover->hw, state->hw_state); +} + +static void +failover_delete_depth_stencil_state(struct pipe_context *pipe, + void *ds) +{ + struct fo_state *state = (struct fo_state*)ds; + struct failover_context *failover = failover_context(pipe); + + failover->sw->delete_depth_stencil_alpha_state(failover->sw, state->sw_state); + failover->hw->delete_depth_stencil_alpha_state(failover->hw, state->hw_state); + state->sw_state = 0; + state->hw_state = 0; + free(state); +} + +static void +failover_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *framebuffer) +{ + struct failover_context *failover = failover_context(pipe); + + failover->framebuffer = *framebuffer; + failover->dirty |= FO_NEW_FRAMEBUFFER; + failover->sw->set_framebuffer_state( failover->sw, framebuffer ); + failover->hw->set_framebuffer_state( failover->hw, framebuffer ); +} + + +static void * +failover_create_fs_state(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct fo_state *state = malloc(sizeof(struct fo_state)); + struct failover_context *failover = failover_context(pipe); + + state->sw_state = failover->sw->create_fs_state(failover->sw, templ); + state->hw_state = failover->hw->create_fs_state(failover->hw, templ); + + return state; +} + +static void +failover_bind_fs_state(struct pipe_context *pipe, void *fs) +{ + struct failover_context *failover = failover_context(pipe); + struct fo_state *state = (struct fo_state*)fs; + failover->fragment_shader = state; + failover->dirty |= FO_NEW_FRAGMENT_SHADER; + failover->sw->bind_fs_state(failover->sw, state->sw_state); + failover->hw->bind_fs_state(failover->hw, state->hw_state); +} + +static void +failover_delete_fs_state(struct pipe_context *pipe, + void *fs) +{ + struct fo_state *state = (struct fo_state*)fs; + struct failover_context *failover = failover_context(pipe); + + failover->sw->delete_fs_state(failover->sw, state->sw_state); + failover->hw->delete_fs_state(failover->hw, state->hw_state); + state->sw_state = 0; + state->hw_state = 0; + free(state); +} + +static void * +failover_create_vs_state(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct fo_state *state = malloc(sizeof(struct fo_state)); + struct failover_context *failover = failover_context(pipe); + + state->sw_state = failover->sw->create_vs_state(failover->sw, templ); + state->hw_state = failover->hw->create_vs_state(failover->hw, templ); + + return state; +} + +static void +failover_bind_vs_state(struct pipe_context *pipe, + void *vs) +{ + struct failover_context *failover = failover_context(pipe); + + struct fo_state *state = (struct fo_state*)vs; + failover->vertex_shader = state; + failover->dirty |= FO_NEW_VERTEX_SHADER; + failover->sw->bind_vs_state(failover->sw, state->sw_state); + failover->hw->bind_vs_state(failover->hw, state->hw_state); +} + +static void +failover_delete_vs_state(struct pipe_context *pipe, + void *vs) +{ + struct fo_state *state = (struct fo_state*)vs; + struct 
failover_context *failover = failover_context(pipe); + + failover->sw->delete_vs_state(failover->sw, state->sw_state); + failover->hw->delete_vs_state(failover->hw, state->hw_state); + state->sw_state = 0; + state->hw_state = 0; + free(state); +} + +static void +failover_set_polygon_stipple( struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple ) +{ + struct failover_context *failover = failover_context(pipe); + + failover->poly_stipple = *stipple; + failover->dirty |= FO_NEW_STIPPLE; + failover->sw->set_polygon_stipple( failover->sw, stipple ); + failover->hw->set_polygon_stipple( failover->hw, stipple ); +} + + +static void * +failover_create_rasterizer_state(struct pipe_context *pipe, + const struct pipe_rasterizer_state *templ) +{ + struct fo_state *state = malloc(sizeof(struct fo_state)); + struct failover_context *failover = failover_context(pipe); + + state->sw_state = failover->sw->create_rasterizer_state(failover->sw, templ); + state->hw_state = failover->hw->create_rasterizer_state(failover->hw, templ); + + return state; +} + +static void +failover_bind_rasterizer_state(struct pipe_context *pipe, + void *raster) +{ + struct failover_context *failover = failover_context(pipe); + + struct fo_state *state = (struct fo_state*)raster; + failover->rasterizer = state; + failover->dirty |= FO_NEW_RASTERIZER; + failover->sw->bind_rasterizer_state(failover->sw, state->sw_state); + failover->hw->bind_rasterizer_state(failover->hw, state->hw_state); +} + +static void +failover_delete_rasterizer_state(struct pipe_context *pipe, + void *raster) +{ + struct fo_state *state = (struct fo_state*)raster; + struct failover_context *failover = failover_context(pipe); + + failover->sw->delete_rasterizer_state(failover->sw, state->sw_state); + failover->hw->delete_rasterizer_state(failover->hw, state->hw_state); + state->sw_state = 0; + state->hw_state = 0; + free(state); +} + + +static void +failover_set_scissor_state( struct pipe_context *pipe, + const struct pipe_scissor_state *scissor ) +{ + struct failover_context *failover = failover_context(pipe); + + failover->scissor = *scissor; + failover->dirty |= FO_NEW_SCISSOR; + failover->sw->set_scissor_state( failover->sw, scissor ); + failover->hw->set_scissor_state( failover->hw, scissor ); +} + + +static void * +failover_create_sampler_state(struct pipe_context *pipe, + const struct pipe_sampler_state *templ) +{ + struct fo_state *state = malloc(sizeof(struct fo_state)); + struct failover_context *failover = failover_context(pipe); + + state->sw_state = failover->sw->create_sampler_state(failover->sw, templ); + state->hw_state = failover->hw->create_sampler_state(failover->hw, templ); + + return state; +} + +static void +failover_bind_sampler_states(struct pipe_context *pipe, + unsigned num, void **sampler) +{ + struct failover_context *failover = failover_context(pipe); + struct fo_state *state = (struct fo_state*)sampler; + uint i; + assert(num <= PIPE_MAX_SAMPLERS); + /* Check for no-op */ + if (num == failover->num_samplers && + !memcmp(failover->sampler, sampler, num * sizeof(void *))) + return; + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { + failover->sw_sampler_state[i] = i < num ? state[i].sw_state : NULL; + failover->hw_sampler_state[i] = i < num ? 
state[i].hw_state : NULL; + } + failover->dirty |= FO_NEW_SAMPLER; + failover->num_samplers = num; + failover->sw->bind_sampler_states(failover->sw, num, + failover->sw_sampler_state); + failover->hw->bind_sampler_states(failover->hw, num, + failover->hw_sampler_state); +} + +static void +failover_delete_sampler_state(struct pipe_context *pipe, void *sampler) +{ + struct fo_state *state = (struct fo_state*)sampler; + struct failover_context *failover = failover_context(pipe); + + failover->sw->delete_sampler_state(failover->sw, state->sw_state); + failover->hw->delete_sampler_state(failover->hw, state->hw_state); + state->sw_state = 0; + state->hw_state = 0; + free(state); +} + + +static void +failover_set_sampler_textures(struct pipe_context *pipe, + unsigned num, + struct pipe_texture **texture) +{ + struct failover_context *failover = failover_context(pipe); + uint i; + + assert(num <= PIPE_MAX_SAMPLERS); + + /* Check for no-op */ + if (num == failover->num_textures && + !memcmp(failover->texture, texture, num * sizeof(struct pipe_texture *))) + return; + for (i = 0; i < num; i++) + pipe_texture_reference((struct pipe_texture **) &failover->texture[i], + texture[i]); + for (i = num; i < failover->num_textures; i++) + pipe_texture_reference((struct pipe_texture **) &failover->texture[i], + NULL); + failover->dirty |= FO_NEW_TEXTURE; + failover->num_textures = num; + failover->sw->set_sampler_textures( failover->sw, num, texture ); + failover->hw->set_sampler_textures( failover->hw, num, texture ); +} + + +static void +failover_set_viewport_state( struct pipe_context *pipe, + const struct pipe_viewport_state *viewport ) +{ + struct failover_context *failover = failover_context(pipe); + + failover->viewport = *viewport; + failover->dirty |= FO_NEW_VIEWPORT; + failover->sw->set_viewport_state( failover->sw, viewport ); + failover->hw->set_viewport_state( failover->hw, viewport ); +} + + +static void +failover_set_vertex_buffers(struct pipe_context *pipe, + unsigned count, + const struct pipe_vertex_buffer *vertex_buffers) +{ + struct failover_context *failover = failover_context(pipe); + + memcpy(failover->vertex_buffers, vertex_buffers, + count * sizeof(vertex_buffers[0])); + failover->dirty |= FO_NEW_VERTEX_BUFFER; + failover->num_vertex_buffers = count; + failover->sw->set_vertex_buffers( failover->sw, count, vertex_buffers ); + failover->hw->set_vertex_buffers( failover->hw, count, vertex_buffers ); +} + + +static void +failover_set_vertex_elements(struct pipe_context *pipe, + unsigned count, + const struct pipe_vertex_element *vertex_elements) +{ + struct failover_context *failover = failover_context(pipe); + + memcpy(failover->vertex_elements, vertex_elements, + count * sizeof(vertex_elements[0])); + + failover->dirty |= FO_NEW_VERTEX_ELEMENT; + failover->num_vertex_elements = count; + failover->sw->set_vertex_elements( failover->sw, count, vertex_elements ); + failover->hw->set_vertex_elements( failover->hw, count, vertex_elements ); +} + +void +failover_set_constant_buffer(struct pipe_context *pipe, + uint shader, uint index, + const struct pipe_constant_buffer *buf) +{ + struct failover_context *failover = failover_context(pipe); + + assert(shader < PIPE_SHADER_TYPES); + assert(index == 0); + + failover->sw->set_constant_buffer(failover->sw, shader, index, buf); + failover->hw->set_constant_buffer(failover->hw, shader, index, buf); +} + + +void +failover_init_state_functions( struct failover_context *failover ) +{ + failover->pipe.create_blend_state = failover_create_blend_state; + 
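   /*
    * The hooks installed in this function are all failover_* wrappers: the
    * bind_* and set_* entry points record the new state in the
    * failover_context, set an FO_NEW_* dirty bit where one exists and
    * forward the call to both the hw and sw pipes, while create_*/delete_*
    * only build or free the (sw_state, hw_state) pairs.  The accumulated
    * dirty mask is what failover_state_emit() replays on the software pipe
    * after a fallback.  Hooks that need no fallback handling (clear, flush,
    * begin/end_query, surface_copy, surface_fill) were taken directly from
    * the hardware context in failover_create().
    */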
failover->pipe.bind_blend_state = failover_bind_blend_state; + failover->pipe.delete_blend_state = failover_delete_blend_state; + failover->pipe.create_sampler_state = failover_create_sampler_state; + failover->pipe.bind_sampler_states = failover_bind_sampler_states; + failover->pipe.delete_sampler_state = failover_delete_sampler_state; + failover->pipe.create_depth_stencil_alpha_state = failover_create_depth_stencil_state; + failover->pipe.bind_depth_stencil_alpha_state = failover_bind_depth_stencil_state; + failover->pipe.delete_depth_stencil_alpha_state = failover_delete_depth_stencil_state; + failover->pipe.create_rasterizer_state = failover_create_rasterizer_state; + failover->pipe.bind_rasterizer_state = failover_bind_rasterizer_state; + failover->pipe.delete_rasterizer_state = failover_delete_rasterizer_state; + failover->pipe.create_fs_state = failover_create_fs_state; + failover->pipe.bind_fs_state = failover_bind_fs_state; + failover->pipe.delete_fs_state = failover_delete_fs_state; + failover->pipe.create_vs_state = failover_create_vs_state; + failover->pipe.bind_vs_state = failover_bind_vs_state; + failover->pipe.delete_vs_state = failover_delete_vs_state; + + failover->pipe.set_blend_color = failover_set_blend_color; + failover->pipe.set_clip_state = failover_set_clip_state; + failover->pipe.set_framebuffer_state = failover_set_framebuffer_state; + failover->pipe.set_polygon_stipple = failover_set_polygon_stipple; + failover->pipe.set_scissor_state = failover_set_scissor_state; + failover->pipe.set_sampler_textures = failover_set_sampler_textures; + failover->pipe.set_viewport_state = failover_set_viewport_state; + failover->pipe.set_vertex_buffers = failover_set_vertex_buffers; + failover->pipe.set_vertex_elements = failover_set_vertex_elements; + failover->pipe.set_constant_buffer = failover_set_constant_buffer; +} diff --git a/src/gallium/drivers/failover/fo_state_emit.c b/src/gallium/drivers/failover/fo_state_emit.c new file mode 100644 index 0000000000..bd4fce9d20 --- /dev/null +++ b/src/gallium/drivers/failover/fo_state_emit.c @@ -0,0 +1,117 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "fo_context.h" + +/* This looks like a lot of work at the moment - we're keeping a + * duplicate copy of the state up-to-date. + * + * This can change in two ways: + * - With constant state objects we would only need to save a pointer, + * not the whole object. + * - By adding a callback in the state tracker to re-emit state. The + * state tracker knows the current state already and can re-emit it + * without additional complexity. + * + * This works as a proof-of-concept, but a final version will have + * lower overheads. + */ + + +/* Bring the software pipe uptodate with current state. + * + * With constant state objects we would probably just send all state + * to both rasterizers all the time??? + */ +void +failover_state_emit( struct failover_context *failover ) +{ + if (failover->dirty & FO_NEW_BLEND) + failover->sw->bind_blend_state( failover->sw, + failover->blend->sw_state ); + + if (failover->dirty & FO_NEW_BLEND_COLOR) + failover->sw->set_blend_color( failover->sw, &failover->blend_color ); + + if (failover->dirty & FO_NEW_CLIP) + failover->sw->set_clip_state( failover->sw, &failover->clip ); + + if (failover->dirty & FO_NEW_DEPTH_STENCIL) + failover->sw->bind_depth_stencil_alpha_state( failover->sw, + failover->depth_stencil->sw_state ); + + if (failover->dirty & FO_NEW_FRAMEBUFFER) + failover->sw->set_framebuffer_state( failover->sw, &failover->framebuffer ); + + if (failover->dirty & FO_NEW_FRAGMENT_SHADER) + failover->sw->bind_fs_state( failover->sw, + failover->fragment_shader->sw_state ); + + if (failover->dirty & FO_NEW_VERTEX_SHADER) + failover->sw->bind_vs_state( failover->sw, + failover->vertex_shader->sw_state ); + + if (failover->dirty & FO_NEW_STIPPLE) + failover->sw->set_polygon_stipple( failover->sw, &failover->poly_stipple ); + + if (failover->dirty & FO_NEW_RASTERIZER) + failover->sw->bind_rasterizer_state( failover->sw, + failover->rasterizer->sw_state ); + + if (failover->dirty & FO_NEW_SCISSOR) + failover->sw->set_scissor_state( failover->sw, &failover->scissor ); + + if (failover->dirty & FO_NEW_VIEWPORT) + failover->sw->set_viewport_state( failover->sw, &failover->viewport ); + + if (failover->dirty & FO_NEW_SAMPLER) { + failover->sw->bind_sampler_states( failover->sw, failover->num_samplers, + failover->sw_sampler_state ); + } + + if (failover->dirty & FO_NEW_TEXTURE) { + failover->sw->set_sampler_textures( failover->sw, failover->num_textures, + failover->texture ); + } + + if (failover->dirty & FO_NEW_VERTEX_BUFFER) { + failover->sw->set_vertex_buffers( failover->sw, + failover->num_vertex_buffers, + failover->vertex_buffers ); + } + + if (failover->dirty & FO_NEW_VERTEX_ELEMENT) { + failover->sw->set_vertex_elements( failover->sw, + failover->num_vertex_elements, + failover->vertex_elements ); + } + + failover->dirty = 0; +} diff --git a/src/gallium/drivers/failover/fo_winsys.h b/src/gallium/drivers/failover/fo_winsys.h new file mode 100644 index 0000000000..a8ce997a1f --- /dev/null +++ b/src/gallium/drivers/failover/fo_winsys.h @@ -0,0 +1,45 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef FO_WINSYS_H +#define FO_WINSYS_H + + +/* This is the interface that failover requires any window system + * hosting it to implement. This is the only include file in failover + * which is public. + */ + + +struct pipe_context; + + +struct pipe_context *failover_create( struct pipe_context *hw, + struct pipe_context *sw ); + + +#endif /* FO_WINSYS_H */ diff --git a/src/gallium/drivers/i915simple/Makefile b/src/gallium/drivers/i915simple/Makefile new file mode 100644 index 0000000000..41a61a0020 --- /dev/null +++ b/src/gallium/drivers/i915simple/Makefile @@ -0,0 +1,31 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = i915simple + +C_SOURCES = \ + i915_blit.c \ + i915_clear.c \ + i915_flush.c \ + i915_context.c \ + i915_context.c \ + i915_debug.c \ + i915_debug_fp.c \ + i915_state.c \ + i915_state_immediate.c \ + i915_state_dynamic.c \ + i915_state_derived.c \ + i915_state_emit.c \ + i915_state_sampler.c \ + i915_screen.c \ + i915_prim_emit.c \ + i915_prim_vbuf.c \ + i915_texture.c \ + i915_fpc_emit.c \ + i915_fpc_translate.c \ + i915_surface.c + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/i915simple/SConscript b/src/gallium/drivers/i915simple/SConscript new file mode 100644 index 0000000000..2366e1247f --- /dev/null +++ b/src/gallium/drivers/i915simple/SConscript @@ -0,0 +1,29 @@ +Import('*') + +env = env.Clone() + +i915simple = env.ConvenienceLibrary( + target = 'i915simple', + source = [ + 'i915_blit.c', + 'i915_clear.c', + 'i915_context.c', + 'i915_debug.c', + 'i915_debug_fp.c', + 'i915_flush.c', + 'i915_fpc_emit.c', + 'i915_fpc_translate.c', + 'i915_prim_emit.c', + 'i915_prim_vbuf.c', + 'i915_screen.c', + 'i915_state.c', + 'i915_state_derived.c', + 'i915_state_dynamic.c', + 'i915_state_emit.c', + 'i915_state_immediate.c', + 'i915_state_sampler.c', + 'i915_surface.c', + 'i915_texture.c', + ]) + +Export('i915simple') diff --git a/src/gallium/drivers/i915simple/i915_batch.h b/src/gallium/drivers/i915simple/i915_batch.h new file mode 100644 index 0000000000..45bf4f4028 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_batch.h @@ -0,0 +1,116 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef I915_BATCH_H +#define I915_BATCH_H + +#include "i915_winsys.h" + +struct i915_batchbuffer +{ + struct pipe_buffer *buffer; + struct i915_winsys *winsys; + + unsigned char *map; + unsigned char *ptr; + + size_t size; + size_t actual_size; + + size_t relocs; + size_t max_relocs; +}; + +static INLINE boolean +i915_batchbuffer_check( struct i915_batchbuffer *batch, + size_t dwords, + size_t relocs ) +{ + /** TODO JB: Check relocs */ + return dwords * 4 <= batch->size - (batch->ptr - batch->map); +} + +static INLINE size_t +i915_batchbuffer_space( struct i915_batchbuffer *batch ) +{ + return batch->size - (batch->ptr - batch->map); +} + +static INLINE void +i915_batchbuffer_dword( struct i915_batchbuffer *batch, + unsigned dword ) +{ + if (i915_batchbuffer_space(batch) < 4) + return; + + *(unsigned *)batch->ptr = dword; + batch->ptr += 4; +} + +static INLINE void +i915_batchbuffer_write( struct i915_batchbuffer *batch, + void *data, + size_t size ) +{ + if (i915_batchbuffer_space(batch) < size) + return; + + memcpy(data, batch->ptr, size); + batch->ptr += size; +} + +static INLINE void +i915_batchbuffer_reloc( struct i915_batchbuffer *batch, + struct pipe_buffer *buffer, + size_t flags, + size_t offset ) +{ + batch->winsys->batch_reloc( batch->winsys, buffer, flags, offset ); +} + +static INLINE void +i915_batchbuffer_flush( struct i915_batchbuffer *batch, + struct pipe_fence_handle **fence ) +{ + batch->winsys->batch_flush( batch->winsys, fence ); +} + +#define BEGIN_BATCH( dwords, relocs ) \ + (i915_batchbuffer_check( i915->batch, dwords, relocs )) + +#define OUT_BATCH( dword ) \ + i915_batchbuffer_dword( i915->batch, dword ) + +#define OUT_RELOC( buf, flags, delta ) \ + i915_batchbuffer_reloc( i915->batch, buf, flags, delta ) + +#define FLUSH_BATCH(fence) do { \ + i915->winsys->batch_flush( i915->winsys, fence ); \ + i915->hardware_dirty = ~0; \ +} while (0) + +#endif diff --git a/src/gallium/drivers/i915simple/i915_blit.c b/src/gallium/drivers/i915simple/i915_blit.c new file mode 100644 index 0000000000..45fae4c999 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_blit.c @@ -0,0 +1,157 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "i915_context.h" +#include "i915_winsys.h" +#include "i915_blit.h" +#include "i915_reg.h" +#include "i915_batch.h" +#include "i915_debug.h" + +#define FILE_DEBUG_FLAG DEBUG_BLIT + +void +i915_fill_blit(struct i915_context *i915, + unsigned cpp, + short dst_pitch, + struct pipe_buffer *dst_buffer, + unsigned dst_offset, + short x, short y, + short w, short h, + unsigned color) +{ + unsigned BR13, CMD; + + switch (cpp) { + case 1: + case 2: + case 3: + BR13 = dst_pitch | (0xF0 << 16) | (1 << 24); + CMD = XY_COLOR_BLT_CMD; + break; + case 4: + BR13 = dst_pitch | (0xF0 << 16) | (1 << 24) | (1 << 25); + CMD = (XY_COLOR_BLT_CMD | XY_COLOR_BLT_WRITE_ALPHA | + XY_COLOR_BLT_WRITE_RGB); + break; + default: + return; + } + +// DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n", +// __FUNCTION__, dst_buffer, dst_pitch, dst_offset, x, y, w, h); + + + if (!BEGIN_BATCH(6, 1)) { + FLUSH_BATCH(NULL); + assert(BEGIN_BATCH(6, 1)); + } + OUT_BATCH(CMD); + OUT_BATCH(BR13); + OUT_BATCH((y << 16) | x); + OUT_BATCH(((y + h) << 16) | (x + w)); + OUT_RELOC( dst_buffer, I915_BUFFER_ACCESS_WRITE, dst_offset); + OUT_BATCH(color); +} + + +void +i915_copy_blit( struct i915_context *i915, + unsigned do_flip, + unsigned cpp, + short src_pitch, + struct pipe_buffer *src_buffer, + unsigned src_offset, + short dst_pitch, + struct pipe_buffer *dst_buffer, + unsigned dst_offset, + short src_x, short src_y, + short dst_x, short dst_y, + short w, short h ) +{ + unsigned CMD, BR13; + int dst_y2 = dst_y + h; + int dst_x2 = dst_x + w; + + + I915_DBG(i915, + "%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n", + __FUNCTION__, + src_buffer, src_pitch, src_offset, src_x, src_y, + dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h); + + src_pitch *= (short) cpp; + dst_pitch *= (short) cpp; + + switch (cpp) { + case 1: + case 2: + case 3: + BR13 = (((int) dst_pitch) & 0xffff) | + (0xCC << 16) | (1 << 24); + CMD = XY_SRC_COPY_BLT_CMD; + break; + case 4: + BR13 = + (((int) dst_pitch) & 0xffff) | + (0xCC << 16) | (1 << 24) | (1 << 25); + CMD = + (XY_SRC_COPY_BLT_CMD | XY_SRC_COPY_BLT_WRITE_ALPHA | + XY_SRC_COPY_BLT_WRITE_RGB); + break; + default: + return; + } + + if (dst_y2 < dst_y || + dst_x2 < dst_x) { + return; + } + + /* Hardware can handle negative pitches but loses the ability to do + * proper overlapping blits 
in that case. We don't really have a + * need for either at this stage. + */ + assert (dst_pitch > 0 && src_pitch > 0); + + + if (!BEGIN_BATCH(8, 2)) { + FLUSH_BATCH(NULL); + assert(BEGIN_BATCH(8, 2)); + } + OUT_BATCH(CMD); + OUT_BATCH(BR13); + OUT_BATCH((dst_y << 16) | dst_x); + OUT_BATCH((dst_y2 << 16) | dst_x2); + OUT_RELOC(dst_buffer, I915_BUFFER_ACCESS_WRITE, dst_offset); + OUT_BATCH((src_y << 16) | src_x); + OUT_BATCH(((int) src_pitch & 0xffff)); + OUT_RELOC(src_buffer, I915_BUFFER_ACCESS_READ, src_offset); +} + + diff --git a/src/gallium/drivers/i915simple/i915_blit.h b/src/gallium/drivers/i915simple/i915_blit.h new file mode 100644 index 0000000000..6e5b44e124 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_blit.h @@ -0,0 +1,55 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef I915_BLIT_H +#define I915_BLIT_H + +#include "i915_context.h" + +extern void i915_copy_blit(struct i915_context *i915, + unsigned do_flip, + unsigned cpp, + short src_pitch, + struct pipe_buffer *src_buffer, + unsigned src_offset, + short dst_pitch, + struct pipe_buffer *dst_buffer, + unsigned dst_offset, + short srcx, short srcy, + short dstx, short dsty, + short w, short h ); + +extern void i915_fill_blit(struct i915_context *i915, + unsigned cpp, + short dst_pitch, + struct pipe_buffer *dst_buffer, + unsigned dst_offset, + short x, short y, + short w, short h, unsigned color); + + +#endif diff --git a/src/gallium/drivers/i915simple/i915_clear.c b/src/gallium/drivers/i915simple/i915_clear.c new file mode 100644 index 0000000000..8a2d3ca43f --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_clear.c @@ -0,0 +1,48 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Author: + * Brian Paul + */ + + +#include "pipe/p_defines.h" +#include "i915_context.h" +#include "i915_state.h" + + +/** + * Clear the given surface to the specified value. + * No masking, no scissor (clear entire buffer). + */ +void +i915_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); + ps->status = PIPE_SURFACE_STATUS_DEFINED; +} diff --git a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915simple/i915_context.c new file mode 100644 index 0000000000..6dd3eda85d --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_context.c @@ -0,0 +1,193 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "i915_context.h" +#include "i915_winsys.h" +#include "i915_state.h" +#include "i915_batch.h" +#include "i915_texture.h" +#include "i915_reg.h" + +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_winsys.h" +#include "pipe/p_inlines.h" +#include "util/u_memory.h" +#include "pipe/p_screen.h" + + +static void i915_destroy( struct pipe_context *pipe ) +{ + struct i915_context *i915 = i915_context( pipe ); + + draw_destroy( i915->draw ); + + if(i915->winsys->destroy) + i915->winsys->destroy(i915->winsys); + + FREE( i915 ); +} + + +static boolean +i915_draw_range_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned min_index, + unsigned max_index, + unsigned prim, unsigned start, unsigned count) +{ + struct i915_context *i915 = i915_context( pipe ); + struct draw_context *draw = i915->draw; + unsigned i; + + if (i915->dirty) + i915_update_derived( i915 ); + + /* + * Map vertex buffers + */ + for (i = 0; i < i915->num_vertex_buffers; i++) { + void *buf + = pipe_buffer_map(pipe->screen, + i915->vertex_buffer[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_vertex_buffer(draw, i, buf); + } + /* Map index buffer, if present */ + if (indexBuffer) { + void *mapped_indexes + = pipe_buffer_map(pipe->screen, indexBuffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_element_buffer_range(draw, indexSize, + min_index, + max_index, + mapped_indexes); + } + else { + /* no index/element buffer */ + draw_set_mapped_element_buffer(draw, 0, NULL); + } + + + draw_set_mapped_constant_buffer(draw, + i915->current.constants[PIPE_SHADER_VERTEX], + ( i915->current.num_user_constants[PIPE_SHADER_VERTEX] * + 4 * sizeof(float) )); + + /* draw! */ + draw_arrays(i915->draw, prim, start, count); + + /* + * unmap vertex/index buffers + */ + for (i = 0; i < i915->num_vertex_buffers; i++) { + pipe_buffer_unmap(pipe->screen, i915->vertex_buffer[i].buffer); + draw_set_mapped_vertex_buffer(draw, i, NULL); + } + if (indexBuffer) { + pipe_buffer_unmap(pipe->screen, indexBuffer); + draw_set_mapped_element_buffer_range(draw, 0, start, start + count - 1, NULL); + } + + return TRUE; +} + +static boolean +i915_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned prim, unsigned start, unsigned count) +{ + return i915_draw_range_elements( pipe, indexBuffer, + indexSize, + 0, 0xffffffff, + prim, start, count ); +} + +static boolean i915_draw_arrays( struct pipe_context *pipe, + unsigned prim, unsigned start, unsigned count) +{ + return i915_draw_elements(pipe, NULL, 0, prim, start, count); +} + + + +struct pipe_context *i915_create_context( struct pipe_screen *screen, + struct pipe_winsys *pipe_winsys, + struct i915_winsys *i915_winsys ) +{ + struct i915_context *i915; + + i915 = CALLOC_STRUCT(i915_context); + if (i915 == NULL) + return NULL; + + i915->winsys = i915_winsys; + i915->pipe.winsys = pipe_winsys; + i915->pipe.screen = screen; + + i915->pipe.destroy = i915_destroy; + + i915->pipe.clear = i915_clear; + + + i915->pipe.draw_arrays = i915_draw_arrays; + i915->pipe.draw_elements = i915_draw_elements; + i915->pipe.draw_range_elements = i915_draw_range_elements; + + /* + * Create drawing context and plug our rendering stage into it. 
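+    * The vbuf (vertex buffer) stage is used by default; setting the
+    * I915_NO_VBUF debug option checked below falls back to the simpler
+    * point/line/tri render stage instead.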
+ */ + i915->draw = draw_create(); + assert(i915->draw); + if (!debug_get_bool_option("I915_NO_VBUF", FALSE)) { + draw_set_rasterize_stage(i915->draw, i915_draw_vbuf_stage(i915)); + } + else { + draw_set_rasterize_stage(i915->draw, i915_draw_render_stage(i915)); + } + + i915_init_surface_functions(i915); + i915_init_state_functions(i915); + i915_init_flush_functions(i915); + i915_init_texture_functions(i915); + + draw_install_aaline_stage(i915->draw, &i915->pipe); + draw_install_aapoint_stage(i915->draw, &i915->pipe); + + i915->dirty = ~0; + i915->hardware_dirty = ~0; + + /* Batch stream debugging is a bit hacked up at the moment: + */ + i915->batch = i915_winsys->batch_get(i915_winsys); + i915->batch->winsys = i915_winsys; + + return &i915->pipe; +} + diff --git a/src/gallium/drivers/i915simple/i915_context.h b/src/gallium/drivers/i915simple/i915_context.h new file mode 100644 index 0000000000..3cdabe45f9 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_context.h @@ -0,0 +1,345 @@ + /************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef I915_CONTEXT_H +#define I915_CONTEXT_H + + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "draw/draw_vertex.h" + +#include "tgsi/tgsi_scan.h" + + +#define I915_TEX_UNITS 8 + +#define I915_DYNAMIC_MODES4 0 +#define I915_DYNAMIC_DEPTHSCALE_0 1 /* just the header */ +#define I915_DYNAMIC_DEPTHSCALE_1 2 +#define I915_DYNAMIC_IAB 3 +#define I915_DYNAMIC_BC_0 4 /* just the header */ +#define I915_DYNAMIC_BC_1 5 +#define I915_DYNAMIC_BFO_0 6 +#define I915_DYNAMIC_BFO_1 7 +#define I915_DYNAMIC_STP_0 8 +#define I915_DYNAMIC_STP_1 9 +#define I915_DYNAMIC_SC_ENA_0 10 +#define I915_DYNAMIC_SC_RECT_0 11 +#define I915_DYNAMIC_SC_RECT_1 12 +#define I915_DYNAMIC_SC_RECT_2 13 +#define I915_MAX_DYNAMIC 14 + + +#define I915_IMMEDIATE_S0 0 +#define I915_IMMEDIATE_S1 1 +#define I915_IMMEDIATE_S2 2 +#define I915_IMMEDIATE_S3 3 +#define I915_IMMEDIATE_S4 4 +#define I915_IMMEDIATE_S5 5 +#define I915_IMMEDIATE_S6 6 +#define I915_IMMEDIATE_S7 7 +#define I915_MAX_IMMEDIATE 8 + +/* These must mach the order of LI0_STATE_* bits, as they will be used + * to generate hardware packets: + */ +#define I915_CACHE_STATIC 0 +#define I915_CACHE_DYNAMIC 1 /* handled specially */ +#define I915_CACHE_SAMPLER 2 +#define I915_CACHE_MAP 3 +#define I915_CACHE_PROGRAM 4 +#define I915_CACHE_CONSTANTS 5 +#define I915_MAX_CACHE 6 + +#define I915_MAX_CONSTANT 32 + + +/** See constant_flags[] below */ +#define I915_CONSTFLAG_USER 0x1f + + +/** + * Subclass of pipe_shader_state + */ +struct i915_fragment_shader +{ + struct pipe_shader_state state; + + struct tgsi_shader_info info; + + uint *program; + uint program_len; + + /** + * constants introduced during translation. + * These are placed at the end of the constant buffer and grow toward + * the beginning (eg: slot 31, 30 29, ...) + * User-provided constants start at 0. + * This allows both types of constants to co-exist (until there's too many) + * and doesn't require regenerating/changing the fragment program to + * shuffle constants around. + */ + uint num_constants; + float constants[I915_MAX_CONSTANT][4]; + + /** + * Status of each constant + * if I915_CONSTFLAG_PARAM, the value must be taken from the corresponding + * slot of the user's constant buffer. (set by pipe->set_constant_buffer()) + * Else, the bitmask indicates which components are occupied by immediates. + */ + ubyte constant_flags[I915_MAX_CONSTANT]; +}; + + +struct i915_cache_context; + +/* Use to calculate differences between state emitted to hardware and + * current driver-calculated state. 
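+ * (i915_update_derived() recomputes this from the bound pipe state;
+ * i915_emit_hardware_state() is then expected to re-emit only the
+ * pieces that differ from what the hardware last received.)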
+ */ +struct i915_state +{ + unsigned immediate[I915_MAX_IMMEDIATE]; + unsigned dynamic[I915_MAX_DYNAMIC]; + + float constants[PIPE_SHADER_TYPES][I915_MAX_CONSTANT][4]; + /** number of constants passed in through a constant buffer */ + uint num_user_constants[PIPE_SHADER_TYPES]; + + /* texture sampler state */ + unsigned sampler[I915_TEX_UNITS][3]; + unsigned sampler_enable_flags; + unsigned sampler_enable_nr; + + /* texture image buffers */ + unsigned texbuffer[I915_TEX_UNITS][2]; + + /** Describes the current hardware vertex layout */ + struct vertex_info vertex_info; + + unsigned id; /* track lost context events */ +}; + +struct i915_blend_state { + unsigned iab; + unsigned modes4; + unsigned LIS5; + unsigned LIS6; +}; + +struct i915_depth_stencil_state { + unsigned stencil_modes4; + unsigned bfo[2]; + unsigned stencil_LIS5; + unsigned depth_LIS6; +}; + +struct i915_rasterizer_state { + int light_twoside : 1; + unsigned st; + enum interp_mode color_interp; + + unsigned LIS4; + unsigned LIS7; + unsigned sc[1]; + + const struct pipe_rasterizer_state *templ; + + union { float f; unsigned u; } ds[2]; +}; + +struct i915_sampler_state { + unsigned state[3]; + const struct pipe_sampler_state *templ; + unsigned minlod; + unsigned maxlod; +}; + + +struct i915_texture { + struct pipe_texture base; + + /* Derived from the above: + */ + unsigned stride; + unsigned depth_stride; /* per-image on i945? */ + unsigned total_nblocksy; + + unsigned tiled; + + unsigned nr_images[PIPE_MAX_TEXTURE_LEVELS]; + + /* Explicitly store the offset of each image for each cube face or + * depth value. Pretty much have to accept that hardware formats + * are going to be so diverse that there is no unified way to + * compute the offsets of depth/cube images within a mipmap level, + * so have to store them as a lookup table: + */ + unsigned *image_offset[PIPE_MAX_TEXTURE_LEVELS]; /**< array [depth] of offsets */ + + /* The data is held here: + */ + struct pipe_buffer *buffer; +}; + +struct i915_batchbuffer; + +struct i915_context +{ + struct pipe_context pipe; + struct i915_winsys *winsys; + struct draw_context *draw; + + /* The most recent drawing state as set by the driver: + */ + const struct i915_blend_state *blend; + const struct i915_sampler_state *sampler[PIPE_MAX_SAMPLERS]; + const struct i915_depth_stencil_state *depth_stencil; + const struct i915_rasterizer_state *rasterizer; + + struct i915_fragment_shader *fs; + + struct pipe_blend_color blend_color; + struct pipe_clip_state clip; + struct pipe_constant_buffer constants[PIPE_SHADER_TYPES]; + struct pipe_framebuffer_state framebuffer; + struct pipe_poly_stipple poly_stipple; + struct pipe_scissor_state scissor; + struct i915_texture *texture[PIPE_MAX_SAMPLERS]; + struct pipe_viewport_state viewport; + struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; + + unsigned dirty; + + unsigned num_samplers; + unsigned num_textures; + unsigned num_vertex_elements; + unsigned num_vertex_buffers; + + struct i915_batchbuffer *batch; + + /** Vertex buffer */ + struct pipe_buffer *vbo; + size_t vbo_offset; + unsigned vbo_flushed; + + struct i915_state current; + unsigned hardware_dirty; + + unsigned debug; +}; + +/* A flag for each state_tracker state object: + */ +#define I915_NEW_VIEWPORT 0x1 +#define I915_NEW_RASTERIZER 0x2 +#define I915_NEW_FS 0x4 +#define I915_NEW_BLEND 0x8 +#define I915_NEW_CLIP 0x10 +#define I915_NEW_SCISSOR 0x20 +#define I915_NEW_STIPPLE 0x40 +#define I915_NEW_FRAMEBUFFER 0x80 +#define I915_NEW_ALPHA_TEST 0x100 +#define I915_NEW_DEPTH_STENCIL 
0x200 +#define I915_NEW_SAMPLER 0x400 +#define I915_NEW_TEXTURE 0x800 +#define I915_NEW_CONSTANTS 0x1000 +#define I915_NEW_VBO 0x2000 +#define I915_NEW_VS 0x4000 + + +/* Driver's internally generated state flags: + */ +#define I915_NEW_VERTEX_FORMAT 0x10000 + + +/* Dirty flags for hardware emit + */ +#define I915_HW_STATIC (1<<I915_CACHE_STATIC) +#define I915_HW_DYNAMIC (1<<I915_CACHE_DYNAMIC) +#define I915_HW_SAMPLER (1<<I915_CACHE_SAMPLER) +#define I915_HW_MAP (1<<I915_CACHE_MAP) +#define I915_HW_PROGRAM (1<<I915_CACHE_PROGRAM) +#define I915_HW_CONSTANTS (1<<I915_CACHE_CONSTANTS) +#define I915_HW_IMMEDIATE (1<<(I915_MAX_CACHE+0)) +#define I915_HW_INVARIENT (1<<(I915_MAX_CACHE+1)) + + +/*********************************************************************** + * i915_prim_emit.c: + */ +struct draw_stage *i915_draw_render_stage( struct i915_context *i915 ); + + +/*********************************************************************** + * i915_prim_vbuf.c: + */ +struct draw_stage *i915_draw_vbuf_stage( struct i915_context *i915 ); + + +/*********************************************************************** + * i915_state_emit.c: + */ +void i915_emit_hardware_state(struct i915_context *i915 ); + + + +/*********************************************************************** + * i915_clear.c: + */ +void i915_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + + +/*********************************************************************** + * i915_surface.c: + */ +void i915_init_surface_functions( struct i915_context *i915 ); + +void i915_init_state_functions( struct i915_context *i915 ); +void i915_init_flush_functions( struct i915_context *i915 ); +void i915_init_string_functions( struct i915_context *i915 ); + + + + +/*********************************************************************** + * Inline conversion functions. These are better-typed than the + * macros used previously: + */ +static INLINE struct i915_context * +i915_context( struct pipe_context *pipe ) +{ + return (struct i915_context *)pipe; +} + + + +#endif diff --git a/src/gallium/drivers/i915simple/i915_debug.c b/src/gallium/drivers/i915simple/i915_debug.c new file mode 100644 index 0000000000..7a4e7051d2 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_debug.c @@ -0,0 +1,900 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "i915_reg.h" +#include "i915_context.h" +#include "i915_winsys.h" +#include "i915_debug.h" +#include "pipe/p_winsys.h" +#include "pipe/p_debug.h" + + +static void +PRINTF( + struct debug_stream *stream, + const char *fmt, + ... ) +{ + va_list args; + + va_start( args, fmt ); + debug_vprintf( fmt, args ); + va_end( args ); +} + + +static boolean debug( struct debug_stream *stream, const char *name, unsigned len ) +{ + unsigned i; + unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + + if (len == 0) { + PRINTF(stream, "Error - zero length packet (0x%08x)\n", stream->ptr[0]); + assert(0); + return FALSE; + } + + if (stream->print_addresses) + PRINTF(stream, "%08x: ", stream->offset); + + + PRINTF(stream, "%s (%d dwords):\n", name, len); + for (i = 0; i < len; i++) + PRINTF(stream, "\t0x%08x\n", ptr[i]); + PRINTF(stream, "\n"); + + stream->offset += len * sizeof(unsigned); + + return TRUE; +} + + +static const char *get_prim_name( unsigned val ) +{ + switch (val & PRIM3D_MASK) { + case PRIM3D_TRILIST: return "TRILIST"; break; + case PRIM3D_TRISTRIP: return "TRISTRIP"; break; + case PRIM3D_TRISTRIP_RVRSE: return "TRISTRIP_RVRSE"; break; + case PRIM3D_TRIFAN: return "TRIFAN"; break; + case PRIM3D_POLY: return "POLY"; break; + case PRIM3D_LINELIST: return "LINELIST"; break; + case PRIM3D_LINESTRIP: return "LINESTRIP"; break; + case PRIM3D_RECTLIST: return "RECTLIST"; break; + case PRIM3D_POINTLIST: return "POINTLIST"; break; + case PRIM3D_DIB: return "DIB"; break; + case PRIM3D_CLEAR_RECT: return "CLEAR_RECT"; break; + case PRIM3D_ZONE_INIT: return "ZONE_INIT"; break; + default: return "????"; break; + } +} + +static boolean debug_prim( struct debug_stream *stream, const char *name, + boolean dump_floats, + unsigned len ) +{ + unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + const char *prim = get_prim_name( ptr[0] ); + unsigned i; + + + + PRINTF(stream, "%s %s (%d dwords):\n", name, prim, len); + PRINTF(stream, "\t0x%08x\n", ptr[0]); + for (i = 1; i < len; i++) { + if (dump_floats) + PRINTF(stream, "\t0x%08x // %f\n", ptr[i], *(float *)&ptr[i]); + else + PRINTF(stream, "\t0x%08x\n", ptr[i]); + } + + + PRINTF(stream, "\n"); + + stream->offset += len * sizeof(unsigned); + + return TRUE; +} + + + + +static boolean debug_program( struct debug_stream *stream, const char *name, unsigned len ) +{ + unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + + if (len == 0) { + PRINTF(stream, "Error - zero length packet (0x%08x)\n", stream->ptr[0]); + assert(0); + return FALSE; + } + + if (stream->print_addresses) + PRINTF(stream, "%08x: ", stream->offset); + + PRINTF(stream, "%s (%d dwords):\n", name, len); + i915_disassemble_program( stream, ptr, len ); + + stream->offset += len * sizeof(unsigned); + return TRUE; +} + + +static boolean debug_chain( struct debug_stream *stream, const char *name, unsigned len ) +{ + unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + unsigned old_offset = stream->offset + len * sizeof(unsigned); + unsigned i; + + PRINTF(stream, "%s (%d dwords):\n", name, len); + for (i = 0; i < len; i++) + PRINTF(stream, "\t0x%08x\n", ptr[i]); + + stream->offset = ptr[1] & ~0x3; + + 
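+   /* MI_BATCH_BUFFER_START: dword 1 holds the offset of the chained batch
+    * (its low two bits are masked off above), so decoding simply resumes
+    * there.  Chained buffers need not be laid out in order, hence the
+    * forward/backward skip messages below.
+    */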
if (stream->offset < old_offset) + PRINTF(stream, "\n... skipping backwards from 0x%x --> 0x%x ...\n\n", + old_offset, stream->offset ); + else + PRINTF(stream, "\n... skipping from 0x%x --> 0x%x ...\n\n", + old_offset, stream->offset ); + + + return TRUE; +} + + +static boolean debug_variable_length_prim( struct debug_stream *stream ) +{ + unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + const char *prim = get_prim_name( ptr[0] ); + unsigned i, len; + + ushort *idx = (ushort *)(ptr+1); + for (i = 0; idx[i] != 0xffff; i++) + ; + + len = 1+(i+2)/2; + + PRINTF(stream, "3DPRIM, %s variable length %d indicies (%d dwords):\n", prim, i, len); + for (i = 0; i < len; i++) + PRINTF(stream, "\t0x%08x\n", ptr[i]); + PRINTF(stream, "\n"); + + stream->offset += len * sizeof(unsigned); + return TRUE; +} + + +static void +BITS( + struct debug_stream *stream, + unsigned dw, + unsigned hi, + unsigned lo, + const char *fmt, + ... ) +{ + va_list args; + unsigned himask = ~0UL >> (31 - (hi)); + + PRINTF(stream, "\t\t "); + + va_start( args, fmt ); + debug_vprintf( fmt, args ); + va_end( args ); + + PRINTF(stream, ": 0x%x\n", ((dw) & himask) >> (lo)); +} + +#ifdef DEBUG +#define MBZ( dw, hi, lo) do { \ + unsigned x = (dw) >> (lo); \ + unsigned lomask = (1 << (lo)) - 1; \ + unsigned himask; \ + himask = (1UL << (hi)) - 1; \ + assert ((x & himask & ~lomask) == 0); \ +} while (0) +#else +#define MBZ( dw, hi, lo) do { \ +} while (0) +#endif + +static void +FLAG( + struct debug_stream *stream, + unsigned dw, + unsigned bit, + const char *fmt, + ... ) +{ + if (((dw) >> (bit)) & 1) { + va_list args; + + PRINTF(stream, "\t\t "); + + va_start( args, fmt ); + debug_vprintf( fmt, args ); + va_end( args ); + + PRINTF(stream, "\n"); + } +} + +static boolean debug_load_immediate( struct debug_stream *stream, + const char *name, + unsigned len ) +{ + unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + unsigned bits = (ptr[0] >> 4) & 0xff; + unsigned j = 0; + + PRINTF(stream, "%s (%d dwords, flags: %x):\n", name, len, bits); + PRINTF(stream, "\t0x%08x\n", ptr[j++]); + + if (bits & (1<<0)) { + PRINTF(stream, "\t LIS0: 0x%08x\n", ptr[j]); + PRINTF(stream, "\t vb address: 0x%08x\n", (ptr[j] & ~0x3)); + BITS(stream, ptr[j], 0, 0, "vb invalidate disable"); + j++; + } + if (bits & (1<<1)) { + PRINTF(stream, "\t LIS1: 0x%08x\n", ptr[j]); + BITS(stream, ptr[j], 29, 24, "vb dword width"); + BITS(stream, ptr[j], 21, 16, "vb dword pitch"); + BITS(stream, ptr[j], 15, 0, "vb max index"); + j++; + } + if (bits & (1<<2)) { + int i; + PRINTF(stream, "\t LIS2: 0x%08x\n", ptr[j]); + for (i = 0; i < 8; i++) { + unsigned tc = (ptr[j] >> (i * 4)) & 0xf; + if (tc != 0xf) + BITS(stream, tc, 3, 0, "tex coord %d", i); + } + j++; + } + if (bits & (1<<3)) { + PRINTF(stream, "\t LIS3: 0x%08x\n", ptr[j]); + j++; + } + if (bits & (1<<4)) { + PRINTF(stream, "\t LIS4: 0x%08x\n", ptr[j]); + BITS(stream, ptr[j], 31, 23, "point width"); + BITS(stream, ptr[j], 22, 19, "line width"); + FLAG(stream, ptr[j], 18, "alpha flatshade"); + FLAG(stream, ptr[j], 17, "fog flatshade"); + FLAG(stream, ptr[j], 16, "spec flatshade"); + FLAG(stream, ptr[j], 15, "rgb flatshade"); + BITS(stream, ptr[j], 14, 13, "cull mode"); + FLAG(stream, ptr[j], 12, "vfmt: point width"); + FLAG(stream, ptr[j], 11, "vfmt: specular/fog"); + FLAG(stream, ptr[j], 10, "vfmt: rgba"); + FLAG(stream, ptr[j], 9, "vfmt: depth offset"); + BITS(stream, ptr[j], 8, 6, "vfmt: position (2==xyzw)"); + FLAG(stream, ptr[j], 5, "force dflt diffuse"); + FLAG(stream, ptr[j], 4, "force dflt 
specular"); + FLAG(stream, ptr[j], 3, "local depth offset enable"); + FLAG(stream, ptr[j], 2, "vfmt: fp32 fog coord"); + FLAG(stream, ptr[j], 1, "sprite point"); + FLAG(stream, ptr[j], 0, "antialiasing"); + j++; + } + if (bits & (1<<5)) { + PRINTF(stream, "\t LIS5: 0x%08x\n", ptr[j]); + BITS(stream, ptr[j], 31, 28, "rgba write disables"); + FLAG(stream, ptr[j], 27, "force dflt point width"); + FLAG(stream, ptr[j], 26, "last pixel enable"); + FLAG(stream, ptr[j], 25, "global z offset enable"); + FLAG(stream, ptr[j], 24, "fog enable"); + BITS(stream, ptr[j], 23, 16, "stencil ref"); + BITS(stream, ptr[j], 15, 13, "stencil test"); + BITS(stream, ptr[j], 12, 10, "stencil fail op"); + BITS(stream, ptr[j], 9, 7, "stencil pass z fail op"); + BITS(stream, ptr[j], 6, 4, "stencil pass z pass op"); + FLAG(stream, ptr[j], 3, "stencil write enable"); + FLAG(stream, ptr[j], 2, "stencil test enable"); + FLAG(stream, ptr[j], 1, "color dither enable"); + FLAG(stream, ptr[j], 0, "logiop enable"); + j++; + } + if (bits & (1<<6)) { + PRINTF(stream, "\t LIS6: 0x%08x\n", ptr[j]); + FLAG(stream, ptr[j], 31, "alpha test enable"); + BITS(stream, ptr[j], 30, 28, "alpha func"); + BITS(stream, ptr[j], 27, 20, "alpha ref"); + FLAG(stream, ptr[j], 19, "depth test enable"); + BITS(stream, ptr[j], 18, 16, "depth func"); + FLAG(stream, ptr[j], 15, "blend enable"); + BITS(stream, ptr[j], 14, 12, "blend func"); + BITS(stream, ptr[j], 11, 8, "blend src factor"); + BITS(stream, ptr[j], 7, 4, "blend dst factor"); + FLAG(stream, ptr[j], 3, "depth write enable"); + FLAG(stream, ptr[j], 2, "color write enable"); + BITS(stream, ptr[j], 1, 0, "provoking vertex"); + j++; + } + + + PRINTF(stream, "\n"); + + assert(j == len); + + stream->offset += len * sizeof(unsigned); + + return TRUE; +} + + + +static boolean debug_load_indirect( struct debug_stream *stream, + const char *name, + unsigned len ) +{ + unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + unsigned bits = (ptr[0] >> 8) & 0x3f; + unsigned i, j = 0; + + PRINTF(stream, "%s (%d dwords):\n", name, len); + PRINTF(stream, "\t0x%08x\n", ptr[j++]); + + for (i = 0; i < 6; i++) { + if (bits & (1<<i)) { + switch (1<<(8+i)) { + case LI0_STATE_STATIC_INDIRECT: + PRINTF(stream, " STATIC: 0x%08x | %x\n", ptr[j]&~3, ptr[j]&3); j++; + PRINTF(stream, " 0x%08x\n", ptr[j++]); + break; + case LI0_STATE_DYNAMIC_INDIRECT: + PRINTF(stream, " DYNAMIC: 0x%08x | %x\n", ptr[j]&~3, ptr[j]&3); j++; + break; + case LI0_STATE_SAMPLER: + PRINTF(stream, " SAMPLER: 0x%08x | %x\n", ptr[j]&~3, ptr[j]&3); j++; + PRINTF(stream, " 0x%08x\n", ptr[j++]); + break; + case LI0_STATE_MAP: + PRINTF(stream, " MAP: 0x%08x | %x\n", ptr[j]&~3, ptr[j]&3); j++; + PRINTF(stream, " 0x%08x\n", ptr[j++]); + break; + case LI0_STATE_PROGRAM: + PRINTF(stream, " PROGRAM: 0x%08x | %x\n", ptr[j]&~3, ptr[j]&3); j++; + PRINTF(stream, " 0x%08x\n", ptr[j++]); + break; + case LI0_STATE_CONSTANTS: + PRINTF(stream, " CONSTANTS: 0x%08x | %x\n", ptr[j]&~3, ptr[j]&3); j++; + PRINTF(stream, " 0x%08x\n", ptr[j++]); + break; + default: + assert(0); + break; + } + } + } + + if (bits == 0) { + PRINTF(stream, "\t DUMMY: 0x%08x\n", ptr[j++]); + } + + PRINTF(stream, "\n"); + + + assert(j == len); + + stream->offset += len * sizeof(unsigned); + + return TRUE; +} + +static void BR13( struct debug_stream *stream, + unsigned val ) +{ + PRINTF(stream, "\t0x%08x\n", val); + FLAG(stream, val, 30, "clipping enable"); + BITS(stream, val, 25, 24, "color depth (3==32bpp)"); + BITS(stream, val, 23, 16, "raster op"); + BITS(stream, val, 15, 0, "dest 
pitch"); +} + + +static void BR22( struct debug_stream *stream, + unsigned val ) +{ + PRINTF(stream, "\t0x%08x\n", val); + BITS(stream, val, 31, 16, "dest y1"); + BITS(stream, val, 15, 0, "dest x1"); +} + +static void BR23( struct debug_stream *stream, + unsigned val ) +{ + PRINTF(stream, "\t0x%08x\n", val); + BITS(stream, val, 31, 16, "dest y2"); + BITS(stream, val, 15, 0, "dest x2"); +} + +static void BR09( struct debug_stream *stream, + unsigned val ) +{ + PRINTF(stream, "\t0x%08x -- dest address\n", val); +} + +static void BR26( struct debug_stream *stream, + unsigned val ) +{ + PRINTF(stream, "\t0x%08x\n", val); + BITS(stream, val, 31, 16, "src y1"); + BITS(stream, val, 15, 0, "src x1"); +} + +static void BR11( struct debug_stream *stream, + unsigned val ) +{ + PRINTF(stream, "\t0x%08x\n", val); + BITS(stream, val, 15, 0, "src pitch"); +} + +static void BR12( struct debug_stream *stream, + unsigned val ) +{ + PRINTF(stream, "\t0x%08x -- src address\n", val); +} + +static void BR16( struct debug_stream *stream, + unsigned val ) +{ + PRINTF(stream, "\t0x%08x -- color\n", val); +} + +static boolean debug_copy_blit( struct debug_stream *stream, + const char *name, + unsigned len ) +{ + unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + int j = 0; + + PRINTF(stream, "%s (%d dwords):\n", name, len); + PRINTF(stream, "\t0x%08x\n", ptr[j++]); + + BR13(stream, ptr[j++]); + BR22(stream, ptr[j++]); + BR23(stream, ptr[j++]); + BR09(stream, ptr[j++]); + BR26(stream, ptr[j++]); + BR11(stream, ptr[j++]); + BR12(stream, ptr[j++]); + + stream->offset += len * sizeof(unsigned); + assert(j == len); + return TRUE; +} + +static boolean debug_color_blit( struct debug_stream *stream, + const char *name, + unsigned len ) +{ + unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + int j = 0; + + PRINTF(stream, "%s (%d dwords):\n", name, len); + PRINTF(stream, "\t0x%08x\n", ptr[j++]); + + BR13(stream, ptr[j++]); + BR22(stream, ptr[j++]); + BR23(stream, ptr[j++]); + BR09(stream, ptr[j++]); + BR16(stream, ptr[j++]); + + stream->offset += len * sizeof(unsigned); + assert(j == len); + return TRUE; +} + +static boolean debug_modes4( struct debug_stream *stream, + const char *name, + unsigned len ) +{ + unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + int j = 0; + + PRINTF(stream, "%s (%d dwords):\n", name, len); + PRINTF(stream, "\t0x%08x\n", ptr[j]); + BITS(stream, ptr[j], 21, 18, "logicop func"); + FLAG(stream, ptr[j], 17, "stencil test mask modify-enable"); + FLAG(stream, ptr[j], 16, "stencil write mask modify-enable"); + BITS(stream, ptr[j], 15, 8, "stencil test mask"); + BITS(stream, ptr[j], 7, 0, "stencil write mask"); + j++; + + stream->offset += len * sizeof(unsigned); + assert(j == len); + return TRUE; +} + +static boolean debug_map_state( struct debug_stream *stream, + const char *name, + unsigned len ) +{ + unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + unsigned j = 0; + + PRINTF(stream, "%s (%d dwords):\n", name, len); + PRINTF(stream, "\t0x%08x\n", ptr[j++]); + + { + PRINTF(stream, "\t0x%08x\n", ptr[j]); + BITS(stream, ptr[j], 15, 0, "map mask"); + j++; + } + + while (j < len) { + { + PRINTF(stream, "\t TMn.0: 0x%08x\n", ptr[j]); + PRINTF(stream, "\t map address: 0x%08x\n", (ptr[j] & ~0x3)); + FLAG(stream, ptr[j], 1, "vertical line stride"); + FLAG(stream, ptr[j], 0, "vertical line stride offset"); + j++; + } + + { + PRINTF(stream, "\t TMn.1: 0x%08x\n", ptr[j]); + BITS(stream, ptr[j], 31, 21, "height"); + BITS(stream, ptr[j], 20, 10, "width"); + 
BITS(stream, ptr[j], 9, 7, "surface format"); + BITS(stream, ptr[j], 6, 3, "texel format"); + FLAG(stream, ptr[j], 2, "use fence regs"); + FLAG(stream, ptr[j], 1, "tiled surface"); + FLAG(stream, ptr[j], 0, "tile walk ymajor"); + j++; + } + { + PRINTF(stream, "\t TMn.2: 0x%08x\n", ptr[j]); + BITS(stream, ptr[j], 31, 21, "dword pitch"); + BITS(stream, ptr[j], 20, 15, "cube face enables"); + BITS(stream, ptr[j], 14, 9, "max lod"); + FLAG(stream, ptr[j], 8, "mip layout right"); + BITS(stream, ptr[j], 7, 0, "depth"); + j++; + } + } + + stream->offset += len * sizeof(unsigned); + assert(j == len); + return TRUE; +} + +static boolean debug_sampler_state( struct debug_stream *stream, + const char *name, + unsigned len ) +{ + unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + unsigned j = 0; + + PRINTF(stream, "%s (%d dwords):\n", name, len); + PRINTF(stream, "\t0x%08x\n", ptr[j++]); + + { + PRINTF(stream, "\t0x%08x\n", ptr[j]); + BITS(stream, ptr[j], 15, 0, "sampler mask"); + j++; + } + + while (j < len) { + { + PRINTF(stream, "\t TSn.0: 0x%08x\n", ptr[j]); + FLAG(stream, ptr[j], 31, "reverse gamma"); + FLAG(stream, ptr[j], 30, "planar to packed"); + FLAG(stream, ptr[j], 29, "yuv->rgb"); + BITS(stream, ptr[j], 28, 27, "chromakey index"); + BITS(stream, ptr[j], 26, 22, "base mip level"); + BITS(stream, ptr[j], 21, 20, "mip mode filter"); + BITS(stream, ptr[j], 19, 17, "mag mode filter"); + BITS(stream, ptr[j], 16, 14, "min mode filter"); + BITS(stream, ptr[j], 13, 5, "lod bias (s4.4)"); + FLAG(stream, ptr[j], 4, "shadow enable"); + FLAG(stream, ptr[j], 3, "max-aniso-4"); + BITS(stream, ptr[j], 2, 0, "shadow func"); + j++; + } + + { + PRINTF(stream, "\t TSn.1: 0x%08x\n", ptr[j]); + BITS(stream, ptr[j], 31, 24, "min lod"); + MBZ( ptr[j], 23, 18 ); + FLAG(stream, ptr[j], 17, "kill pixel enable"); + FLAG(stream, ptr[j], 16, "keyed tex filter mode"); + FLAG(stream, ptr[j], 15, "chromakey enable"); + BITS(stream, ptr[j], 14, 12, "tcx wrap mode"); + BITS(stream, ptr[j], 11, 9, "tcy wrap mode"); + BITS(stream, ptr[j], 8, 6, "tcz wrap mode"); + FLAG(stream, ptr[j], 5, "normalized coords"); + BITS(stream, ptr[j], 4, 1, "map (surface) index"); + FLAG(stream, ptr[j], 0, "EAST deinterlacer enable"); + j++; + } + { + PRINTF(stream, "\t TSn.2: 0x%08x (default color)\n", ptr[j]); + j++; + } + } + + stream->offset += len * sizeof(unsigned); + assert(j == len); + return TRUE; +} + +static boolean debug_dest_vars( struct debug_stream *stream, + const char *name, + unsigned len ) +{ + unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + int j = 0; + + PRINTF(stream, "%s (%d dwords):\n", name, len); + PRINTF(stream, "\t0x%08x\n", ptr[j++]); + + { + PRINTF(stream, "\t0x%08x\n", ptr[j]); + FLAG(stream, ptr[j], 31, "early classic ztest"); + FLAG(stream, ptr[j], 30, "opengl tex default color"); + FLAG(stream, ptr[j], 29, "bypass iz"); + FLAG(stream, ptr[j], 28, "lod preclamp"); + BITS(stream, ptr[j], 27, 26, "dither pattern"); + FLAG(stream, ptr[j], 25, "linear gamma blend"); + FLAG(stream, ptr[j], 24, "debug dither"); + BITS(stream, ptr[j], 23, 20, "dstorg x"); + BITS(stream, ptr[j], 19, 16, "dstorg y"); + MBZ (ptr[j], 15, 15 ); + BITS(stream, ptr[j], 14, 12, "422 write select"); + BITS(stream, ptr[j], 11, 8, "cbuf format"); + BITS(stream, ptr[j], 3, 2, "zbuf format"); + FLAG(stream, ptr[j], 1, "vert line stride"); + FLAG(stream, ptr[j], 1, "vert line stride offset"); + j++; + } + + stream->offset += len * sizeof(unsigned); + assert(j == len); + return TRUE; +} + +static boolean debug_buf_info( struct 
debug_stream *stream, + const char *name, + unsigned len ) +{ + unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + int j = 0; + + PRINTF(stream, "%s (%d dwords):\n", name, len); + PRINTF(stream, "\t0x%08x\n", ptr[j++]); + + { + PRINTF(stream, "\t0x%08x\n", ptr[j]); + BITS(stream, ptr[j], 28, 28, "aux buffer id"); + BITS(stream, ptr[j], 27, 24, "buffer id (7=depth, 3=back)"); + FLAG(stream, ptr[j], 23, "use fence regs"); + FLAG(stream, ptr[j], 22, "tiled surface"); + FLAG(stream, ptr[j], 21, "tile walk ymajor"); + MBZ (ptr[j], 20, 14); + BITS(stream, ptr[j], 13, 2, "dword pitch"); + MBZ (ptr[j], 2, 0); + j++; + } + + PRINTF(stream, "\t0x%08x -- buffer base address\n", ptr[j++]); + + stream->offset += len * sizeof(unsigned); + assert(j == len); + return TRUE; +} + +static boolean i915_debug_packet( struct debug_stream *stream ) +{ + unsigned *ptr = (unsigned *)(stream->ptr + stream->offset); + unsigned cmd = *ptr; + + switch (((cmd >> 29) & 0x7)) { + case 0x0: + switch ((cmd >> 23) & 0x3f) { + case 0x0: + return debug(stream, "MI_NOOP", 1); + case 0x3: + return debug(stream, "MI_WAIT_FOR_EVENT", 1); + case 0x4: + return debug(stream, "MI_FLUSH", 1); + case 0xA: + debug(stream, "MI_BATCH_BUFFER_END", 1); + return FALSE; + case 0x22: + return debug(stream, "MI_LOAD_REGISTER_IMM", 3); + case 0x31: + return debug_chain(stream, "MI_BATCH_BUFFER_START", 2); + default: + (void)debug(stream, "UNKNOWN 0x0 case!", 1); + assert(0); + break; + } + break; + case 0x1: + (void) debug(stream, "UNKNOWN 0x1 case!", 1); + assert(0); + break; + case 0x2: + switch ((cmd >> 22) & 0xff) { + case 0x50: + return debug_color_blit(stream, "XY_COLOR_BLT", (cmd & 0xff) + 2); + case 0x53: + return debug_copy_blit(stream, "XY_SRC_COPY_BLT", (cmd & 0xff) + 2); + default: + return debug(stream, "blit command", (cmd & 0xff) + 2); + } + break; + case 0x3: + switch ((cmd >> 24) & 0x1f) { + case 0x6: + return debug(stream, "3DSTATE_ANTI_ALIASING", 1); + case 0x7: + return debug(stream, "3DSTATE_RASTERIZATION_RULES", 1); + case 0x8: + return debug(stream, "3DSTATE_BACKFACE_STENCIL_OPS", 2); + case 0x9: + return debug(stream, "3DSTATE_BACKFACE_STENCIL_MASKS", 1); + case 0xb: + return debug(stream, "3DSTATE_INDEPENDENT_ALPHA_BLEND", 1); + case 0xc: + return debug(stream, "3DSTATE_MODES5", 1); + case 0xd: + return debug_modes4(stream, "3DSTATE_MODES4", 1); + case 0x15: + return debug(stream, "3DSTATE_FOG_COLOR", 1); + case 0x16: + return debug(stream, "3DSTATE_COORD_SET_BINDINGS", 1); + case 0x1c: + /* 3DState16NP */ + switch((cmd >> 19) & 0x1f) { + case 0x10: + return debug(stream, "3DSTATE_SCISSOR_ENABLE", 1); + case 0x11: + return debug(stream, "3DSTATE_DEPTH_SUBRECTANGLE_DISABLE", 1); + default: + (void) debug(stream, "UNKNOWN 0x1c case!", 1); + assert(0); + break; + } + break; + case 0x1d: + /* 3DStateMW */ + switch ((cmd >> 16) & 0xff) { + case 0x0: + return debug_map_state(stream, "3DSTATE_MAP_STATE", (cmd & 0x1f) + 2); + case 0x1: + return debug_sampler_state(stream, "3DSTATE_SAMPLER_STATE", (cmd & 0x1f) + 2); + case 0x4: + return debug_load_immediate(stream, "3DSTATE_LOAD_STATE_IMMEDIATE", (cmd & 0xf) + 2); + case 0x5: + return debug_program(stream, "3DSTATE_PIXEL_SHADER_PROGRAM", (cmd & 0x1ff) + 2); + case 0x6: + return debug(stream, "3DSTATE_PIXEL_SHADER_CONSTANTS", (cmd & 0xff) + 2); + case 0x7: + return debug_load_indirect(stream, "3DSTATE_LOAD_INDIRECT", (cmd & 0xff) + 2); + case 0x80: + return debug(stream, "3DSTATE_DRAWING_RECTANGLE", (cmd & 0xffff) + 2); + case 0x81: + return debug(stream, 
"3DSTATE_SCISSOR_RECTANGLE", (cmd & 0xffff) + 2); + case 0x83: + return debug(stream, "3DSTATE_SPAN_STIPPLE", (cmd & 0xffff) + 2); + case 0x85: + return debug_dest_vars(stream, "3DSTATE_DEST_BUFFER_VARS", (cmd & 0xffff) + 2); + case 0x88: + return debug(stream, "3DSTATE_CONSTANT_BLEND_COLOR", (cmd & 0xffff) + 2); + case 0x89: + return debug(stream, "3DSTATE_FOG_MODE", (cmd & 0xffff) + 2); + case 0x8e: + return debug_buf_info(stream, "3DSTATE_BUFFER_INFO", (cmd & 0xffff) + 2); + case 0x97: + return debug(stream, "3DSTATE_DEPTH_OFFSET_SCALE", (cmd & 0xffff) + 2); + case 0x98: + return debug(stream, "3DSTATE_DEFAULT_Z", (cmd & 0xffff) + 2); + case 0x99: + return debug(stream, "3DSTATE_DEFAULT_DIFFUSE", (cmd & 0xffff) + 2); + case 0x9a: + return debug(stream, "3DSTATE_DEFAULT_SPECULAR", (cmd & 0xffff) + 2); + case 0x9c: + return debug(stream, "3DSTATE_CLEAR_PARAMETERS", (cmd & 0xffff) + 2); + default: + assert(0); + return 0; + } + break; + case 0x1e: + if (cmd & (1 << 23)) + return debug(stream, "???", (cmd & 0xffff) + 1); + else + return debug(stream, "", 1); + break; + case 0x1f: + if ((cmd & (1 << 23)) == 0) + return debug_prim(stream, "3DPRIM (inline)", 1, (cmd & 0x1ffff) + 2); + else if (cmd & (1 << 17)) + { + if ((cmd & 0xffff) == 0) + return debug_variable_length_prim(stream); + else + return debug_prim(stream, "3DPRIM (indexed)", 0, (((cmd & 0xffff) + 1) / 2) + 1); + } + else + return debug_prim(stream, "3DPRIM (indirect sequential)", 0, 2); + break; + default: + return debug(stream, "", 0); + } + default: + assert(0); + return 0; + } + + assert(0); + return 0; +} + + + +void +i915_dump_batchbuffer( struct i915_context *i915 ) +{ + struct debug_stream stream; + /* TODO fix me */ + unsigned *start = 0;/*i915->batch_start;*/ + unsigned *end = 0;/*i915->winsys->batch_start( i915->winsys, 0, 0 );*/ + unsigned long bytes = (unsigned long) (end - start) * 4; + boolean done = FALSE; + + stream.offset = 0; + stream.ptr = (char *)start; + stream.print_addresses = 0; + stream.winsys = i915->pipe.winsys; + + if (!start || !end) { + debug_printf( "\n\nBATCH: ???\n"); + return; + } + + debug_printf( "\n\nBATCH: (%d)\n", bytes / 4); + + while (!done && + stream.offset < bytes) + { + if (!i915_debug_packet( &stream )) + break; + + assert(stream.offset <= bytes && + stream.offset >= 0); + } + + debug_printf( "END-BATCH\n\n\n"); +} + + diff --git a/src/gallium/drivers/i915simple/i915_debug.h b/src/gallium/drivers/i915simple/i915_debug.h new file mode 100644 index 0000000000..afb63edabf --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_debug.h @@ -0,0 +1,115 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef I915_DEBUG_H +#define I915_DEBUG_H + +#include <stdarg.h> + +struct i915_context; + +struct debug_stream +{ + unsigned offset; /* current gtt offset */ + char *ptr; /* pointer to gtt offset zero */ + char *end; /* pointer to gtt offset zero */ + unsigned print_addresses; + struct pipe_winsys *winsys; +}; + + +/* Internal functions + */ +void i915_disassemble_program(struct debug_stream *stream, + const unsigned *program, unsigned sz); + +void i915_print_ureg(const char *msg, unsigned ureg); + + +#define DEBUG_BATCH 0x1 +#define DEBUG_BLIT 0x2 +#define DEBUG_BUFFER 0x4 +#define DEBUG_CONSTANTS 0x8 +#define DEBUG_CONTEXT 0x10 +#define DEBUG_DRAW 0x20 +#define DEBUG_DYNAMIC 0x40 +#define DEBUG_FLUSH 0x80 +#define DEBUG_MAP 0x100 +#define DEBUG_PROGRAM 0x200 +#define DEBUG_REGIONS 0x400 +#define DEBUG_SAMPLER 0x800 +#define DEBUG_STATIC 0x1000 +#define DEBUG_SURFACE 0x2000 +#define DEBUG_WINSYS 0x4000 + +#include "pipe/p_compiler.h" + +#if defined(DEBUG) && defined(FILE_DEBUG_FLAG) + +#include "pipe/p_winsys.h" + +static INLINE void +I915_DBG( + struct i915_context *i915, + const char *fmt, + ... ) +{ + if ((i915)->debug & FILE_DEBUG_FLAG) { + va_list args; + + va_start( args, fmt ); + debug_vprintf( fmt, args ); + va_end( args ); + } +} + +#else + +static INLINE void +I915_DBG( + struct i915_context *i915, + const char *fmt, + ... ) +{ + (void) i915; + (void) fmt; +} + +#endif + + +void i915_dump_batchbuffer( struct i915_context *i915 ); + + + +void i915_debug_init( struct i915_context *i915 ); + + +#endif diff --git a/src/gallium/drivers/i915simple/i915_debug_fp.c b/src/gallium/drivers/i915simple/i915_debug_fp.c new file mode 100644 index 0000000000..48be3e1472 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_debug_fp.c @@ -0,0 +1,363 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "i915_reg.h" +#include "i915_debug.h" +#include "pipe/p_winsys.h" +#include "util/u_memory.h" + + +static void +PRINTF( + struct debug_stream *stream, + const char *fmt, + ... ) +{ + va_list args; + + va_start( args, fmt ); + debug_vprintf( fmt, args ); + va_end( args ); +} + + +static const char *opcodes[0x20] = { + "NOP", + "ADD", + "MOV", + "MUL", + "MAD", + "DP2ADD", + "DP3", + "DP4", + "FRC", + "RCP", + "RSQ", + "EXP", + "LOG", + "CMP", + "MIN", + "MAX", + "FLR", + "MOD", + "TRC", + "SGE", + "SLT", + "TEXLD", + "TEXLDP", + "TEXLDB", + "TEXKILL", + "DCL", + "0x1a", + "0x1b", + "0x1c", + "0x1d", + "0x1e", + "0x1f", +}; + + +static const int args[0x20] = { + 0, /* 0 nop */ + 2, /* 1 add */ + 1, /* 2 mov */ + 2, /* 3 m ul */ + 3, /* 4 mad */ + 3, /* 5 dp2add */ + 2, /* 6 dp3 */ + 2, /* 7 dp4 */ + 1, /* 8 frc */ + 1, /* 9 rcp */ + 1, /* a rsq */ + 1, /* b exp */ + 1, /* c log */ + 3, /* d cmp */ + 2, /* e min */ + 2, /* f max */ + 1, /* 10 flr */ + 1, /* 11 mod */ + 1, /* 12 trc */ + 2, /* 13 sge */ + 2, /* 14 slt */ + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, +}; + + +static const char *regname[0x8] = { + "R", + "T", + "CONST", + "S", + "OC", + "OD", + "U", + "UNKNOWN", +}; + +static void +print_reg_type_nr(struct debug_stream *stream, unsigned type, unsigned nr) +{ + switch (type) { + case REG_TYPE_T: + switch (nr) { + case T_DIFFUSE: + PRINTF(stream, "T_DIFFUSE"); + return; + case T_SPECULAR: + PRINTF(stream, "T_SPECULAR"); + return; + case T_FOG_W: + PRINTF(stream, "T_FOG_W"); + return; + default: + PRINTF(stream, "T_TEX%d", nr); + return; + } + case REG_TYPE_OC: + if (nr == 0) { + PRINTF(stream, "oC"); + return; + } + break; + case REG_TYPE_OD: + if (nr == 0) { + PRINTF(stream, "oD"); + return; + } + break; + default: + break; + } + + PRINTF(stream, "%s[%d]", regname[type], nr); +} + +#define REG_SWIZZLE_MASK 0x7777 +#define REG_NEGATE_MASK 0x8888 + +#define REG_SWIZZLE_XYZW ((SRC_X << A2_SRC2_CHANNEL_X_SHIFT) | \ + (SRC_Y << A2_SRC2_CHANNEL_Y_SHIFT) | \ + (SRC_Z << A2_SRC2_CHANNEL_Z_SHIFT) | \ + (SRC_W << A2_SRC2_CHANNEL_W_SHIFT)) + + +static void +print_reg_neg_swizzle(struct debug_stream *stream, unsigned reg) +{ + int i; + + if ((reg & REG_SWIZZLE_MASK) == REG_SWIZZLE_XYZW && + (reg & REG_NEGATE_MASK) == 0) + return; + + PRINTF(stream, "."); + + for (i = 3; i >= 0; i--) { + if (reg & (1 << ((i * 4) + 3))) + PRINTF(stream, "-"); + + switch ((reg >> (i * 4)) & 0x7) { + case 0: + PRINTF(stream, "x"); + break; + case 1: + PRINTF(stream, "y"); + break; + case 2: + PRINTF(stream, "z"); + break; + case 3: + PRINTF(stream, "w"); + break; + case 4: + PRINTF(stream, "0"); + break; + case 5: + PRINTF(stream, "1"); + break; + default: + PRINTF(stream, "?"); + break; + } + } +} + + +static void +print_src_reg(struct debug_stream *stream, unsigned dword) +{ + unsigned nr = (dword >> A2_SRC2_NR_SHIFT) & REG_NR_MASK; + unsigned type = (dword >> A2_SRC2_TYPE_SHIFT) & REG_TYPE_MASK; + print_reg_type_nr(stream, type, nr); + print_reg_neg_swizzle(stream, dword); +} + + +static void +print_dest_reg(struct debug_stream *stream, unsigned dword) +{ + unsigned nr = (dword >> A0_DEST_NR_SHIFT) & REG_NR_MASK; + unsigned type = 
(dword >> A0_DEST_TYPE_SHIFT) & REG_TYPE_MASK; + print_reg_type_nr(stream, type, nr); + if ((dword & A0_DEST_CHANNEL_ALL) == A0_DEST_CHANNEL_ALL) + return; + PRINTF(stream, "."); + if (dword & A0_DEST_CHANNEL_X) + PRINTF(stream, "x"); + if (dword & A0_DEST_CHANNEL_Y) + PRINTF(stream, "y"); + if (dword & A0_DEST_CHANNEL_Z) + PRINTF(stream, "z"); + if (dword & A0_DEST_CHANNEL_W) + PRINTF(stream, "w"); +} + + +#define GET_SRC0_REG(r0, r1) ((r0<<14)|(r1>>A1_SRC0_CHANNEL_W_SHIFT)) +#define GET_SRC1_REG(r0, r1) ((r0<<8)|(r1>>A2_SRC1_CHANNEL_W_SHIFT)) +#define GET_SRC2_REG(r) (r) + + +static void +print_arith_op(struct debug_stream *stream, + unsigned opcode, const unsigned * program) +{ + if (opcode != A0_NOP) { + print_dest_reg(stream, program[0]); + if (program[0] & A0_DEST_SATURATE) + PRINTF(stream, " = SATURATE "); + else + PRINTF(stream, " = "); + } + + PRINTF(stream, "%s ", opcodes[opcode]); + + print_src_reg(stream, GET_SRC0_REG(program[0], program[1])); + if (args[opcode] == 1) { + PRINTF(stream, "\n"); + return; + } + + PRINTF(stream, ", "); + print_src_reg(stream, GET_SRC1_REG(program[1], program[2])); + if (args[opcode] == 2) { + PRINTF(stream, "\n"); + return; + } + + PRINTF(stream, ", "); + print_src_reg(stream, GET_SRC2_REG(program[2])); + PRINTF(stream, "\n"); + return; +} + + +static void +print_tex_op(struct debug_stream *stream, + unsigned opcode, const unsigned * program) +{ + print_dest_reg(stream, program[0] | A0_DEST_CHANNEL_ALL); + PRINTF(stream, " = "); + + PRINTF(stream, "%s ", opcodes[opcode]); + + PRINTF(stream, "S[%d],", program[0] & T0_SAMPLER_NR_MASK); + + print_reg_type_nr(stream, + (program[1] >> T1_ADDRESS_REG_TYPE_SHIFT) & + REG_TYPE_MASK, + (program[1] >> T1_ADDRESS_REG_NR_SHIFT) & REG_NR_MASK); + PRINTF(stream, "\n"); +} + +static void +print_texkil_op(struct debug_stream *stream, + unsigned opcode, const unsigned * program) +{ + PRINTF(stream, "TEXKIL "); + + print_reg_type_nr(stream, + (program[1] >> T1_ADDRESS_REG_TYPE_SHIFT) & + REG_TYPE_MASK, + (program[1] >> T1_ADDRESS_REG_NR_SHIFT) & REG_NR_MASK); + PRINTF(stream, "\n"); +} + +static void +print_dcl_op(struct debug_stream *stream, + unsigned opcode, const unsigned * program) +{ + PRINTF(stream, "%s ", opcodes[opcode]); + print_dest_reg(stream, + program[0] | A0_DEST_CHANNEL_ALL); + PRINTF(stream, "\n"); +} + + +void +i915_disassemble_program(struct debug_stream *stream, + const unsigned * program, unsigned sz) +{ + unsigned i; + + PRINTF(stream, "\t\tBEGIN\n"); + + assert((program[0] & 0x1ff) + 2 == sz); + + program++; + for (i = 1; i < sz; i += 3, program += 3) { + unsigned opcode = program[0] & (0x1f << 24); + + PRINTF(stream, "\t\t"); + + if ((int) opcode >= A0_NOP && opcode <= A0_SLT) + print_arith_op(stream, opcode >> 24, program); + else if (opcode >= T0_TEXLD && opcode < T0_TEXKILL) + print_tex_op(stream, opcode >> 24, program); + else if (opcode == T0_TEXKILL) + print_texkil_op(stream, opcode >> 24, program); + else if (opcode == D0_DCL) + print_dcl_op(stream, opcode >> 24, program); + else + PRINTF(stream, "Unknown opcode 0x%x\n", opcode); + } + + PRINTF(stream, "\t\tEND\n\n"); +} + + diff --git a/src/gallium/drivers/i915simple/i915_flush.c b/src/gallium/drivers/i915simple/i915_flush.c new file mode 100644 index 0000000000..472e0ab774 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_flush.c @@ -0,0 +1,78 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Author: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "pipe/p_defines.h" +#include "draw/draw_context.h" +#include "i915_context.h" +#include "i915_reg.h" +#include "i915_batch.h" + + +static void i915_flush( struct pipe_context *pipe, + unsigned flags, + struct pipe_fence_handle **fence ) +{ + struct i915_context *i915 = i915_context(pipe); + + draw_flush(i915->draw); + + /* Do we need to emit an MI_FLUSH command to flush the hardware + * caches? + */ + if (flags & (PIPE_FLUSH_RENDER_CACHE | PIPE_FLUSH_TEXTURE_CACHE)) { + unsigned flush = MI_FLUSH; + + if (!(flags & PIPE_FLUSH_RENDER_CACHE)) + flush |= INHIBIT_FLUSH_RENDER_CACHE; + + if (flags & PIPE_FLUSH_TEXTURE_CACHE) + flush |= FLUSH_MAP_CACHE; + + if (!BEGIN_BATCH(1, 0)) { + FLUSH_BATCH(NULL); + assert(BEGIN_BATCH(1, 0)); + } + OUT_BATCH( flush ); + } + + /* If there are no flags, just flush pending commands to hardware: + */ + FLUSH_BATCH(fence); + i915->vbo_flushed = 1; +} + + + +void i915_init_flush_functions( struct i915_context *i915 ) +{ + i915->pipe.flush = i915_flush; +} diff --git a/src/gallium/drivers/i915simple/i915_fpc.h b/src/gallium/drivers/i915simple/i915_fpc.h new file mode 100644 index 0000000000..2f0f99d046 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_fpc.h @@ -0,0 +1,207 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef I915_FPC_H +#define I915_FPC_H + + +#include "i915_context.h" +#include "i915_reg.h" + + + +#define I915_PROGRAM_SIZE 192 + + + +/** + * Program translation state + */ +struct i915_fp_compile { + struct i915_fragment_shader *shader; /* the shader we're compiling */ + + boolean used_constants[I915_MAX_CONSTANT]; + + /** maps TGSI immediate index to constant slot */ + uint num_immediates; + uint immediates_map[I915_MAX_CONSTANT]; + float immediates[I915_MAX_CONSTANT][4]; + + boolean first_instruction; + + uint declarations[I915_PROGRAM_SIZE]; + uint program[I915_PROGRAM_SIZE]; + + uint *csr; /**< Cursor, points into program. */ + + uint *decl; /**< Cursor, points into declarations. */ + + uint decl_s; /**< flags for which s regs need to be decl'd */ + uint decl_t; /**< flags for which t regs need to be decl'd */ + + uint temp_flag; /**< Tracks temporary regs which are in use */ + uint utemp_flag; /**< Tracks TYPE_U temporary regs which are in use */ + + uint nr_tex_indirect; + uint nr_tex_insn; + uint nr_alu_insn; + uint nr_decl_insn; + + boolean error; /**< Set if i915_program_error() is called */ + uint wpos_tex; + uint NumNativeInstructions; + uint NumNativeAluInstructions; + uint NumNativeTexInstructions; + uint NumNativeTexIndirections; +}; + + +/* Having zero and one in here makes the definition of swizzle a lot + * easier. 
+ */ +#define UREG_TYPE_SHIFT 29 +#define UREG_NR_SHIFT 24 +#define UREG_CHANNEL_X_NEGATE_SHIFT 23 +#define UREG_CHANNEL_X_SHIFT 20 +#define UREG_CHANNEL_Y_NEGATE_SHIFT 19 +#define UREG_CHANNEL_Y_SHIFT 16 +#define UREG_CHANNEL_Z_NEGATE_SHIFT 15 +#define UREG_CHANNEL_Z_SHIFT 12 +#define UREG_CHANNEL_W_NEGATE_SHIFT 11 +#define UREG_CHANNEL_W_SHIFT 8 +#define UREG_CHANNEL_ZERO_NEGATE_MBZ 5 +#define UREG_CHANNEL_ZERO_SHIFT 4 +#define UREG_CHANNEL_ONE_NEGATE_MBZ 1 +#define UREG_CHANNEL_ONE_SHIFT 0 + +#define UREG_BAD 0xffffffff /* not a valid ureg */ + +#define X SRC_X +#define Y SRC_Y +#define Z SRC_Z +#define W SRC_W +#define ZERO SRC_ZERO +#define ONE SRC_ONE + +/* Construct a ureg: + */ +#define UREG( type, nr ) (((type)<< UREG_TYPE_SHIFT) | \ + ((nr) << UREG_NR_SHIFT) | \ + (X << UREG_CHANNEL_X_SHIFT) | \ + (Y << UREG_CHANNEL_Y_SHIFT) | \ + (Z << UREG_CHANNEL_Z_SHIFT) | \ + (W << UREG_CHANNEL_W_SHIFT) | \ + (ZERO << UREG_CHANNEL_ZERO_SHIFT) | \ + (ONE << UREG_CHANNEL_ONE_SHIFT)) + +#define GET_CHANNEL_SRC( reg, channel ) ((reg<<(channel*4)) & (0xf<<20)) +#define CHANNEL_SRC( src, channel ) (src>>(channel*4)) + +#define GET_UREG_TYPE(reg) (((reg)>>UREG_TYPE_SHIFT)&REG_TYPE_MASK) +#define GET_UREG_NR(reg) (((reg)>>UREG_NR_SHIFT)&REG_NR_MASK) + + + +#define UREG_XYZW_CHANNEL_MASK 0x00ffff00 + +/* One neat thing about the UREG representation: + */ +static INLINE int +swizzle(int reg, uint x, uint y, uint z, uint w) +{ + assert(x <= SRC_ONE); + assert(y <= SRC_ONE); + assert(z <= SRC_ONE); + assert(w <= SRC_ONE); + return ((reg & ~UREG_XYZW_CHANNEL_MASK) | + CHANNEL_SRC(GET_CHANNEL_SRC(reg, x), 0) | + CHANNEL_SRC(GET_CHANNEL_SRC(reg, y), 1) | + CHANNEL_SRC(GET_CHANNEL_SRC(reg, z), 2) | + CHANNEL_SRC(GET_CHANNEL_SRC(reg, w), 3)); +} + + + +/*********************************************************************** + * Public interface for the compiler + */ +extern void +i915_translate_fragment_program( struct i915_context *i915, + struct i915_fragment_shader *fs); + + + +extern uint i915_get_temp(struct i915_fp_compile *p); +extern uint i915_get_utemp(struct i915_fp_compile *p); +extern void i915_release_utemps(struct i915_fp_compile *p); + + +extern uint i915_emit_texld(struct i915_fp_compile *p, + uint dest, + uint destmask, + uint sampler, uint coord, uint op); + +extern uint i915_emit_arith(struct i915_fp_compile *p, + uint op, + uint dest, + uint mask, + uint saturate, + uint src0, uint src1, uint src2); + +extern uint i915_emit_decl(struct i915_fp_compile *p, + uint type, uint nr, uint d0_flags); + + +extern uint i915_emit_const1f(struct i915_fp_compile *p, float c0); + +extern uint i915_emit_const2f(struct i915_fp_compile *p, + float c0, float c1); + +extern uint i915_emit_const4fv(struct i915_fp_compile *p, + const float * c); + +extern uint i915_emit_const4f(struct i915_fp_compile *p, + float c0, float c1, + float c2, float c3); + + +/*====================================================================== + * i915_fpc_debug.c + */ +extern void i915_disassemble_program(const uint * program, uint sz); + + +/*====================================================================== + * i915_fpc_translate.c + */ + +extern void +i915_program_error(struct i915_fp_compile *p, const char *msg, ...); + + +#endif diff --git a/src/gallium/drivers/i915simple/i915_fpc_emit.c b/src/gallium/drivers/i915simple/i915_fpc_emit.c new file mode 100644 index 0000000000..b054ce41d3 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_fpc_emit.c @@ -0,0 +1,375 @@
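/* For reference while reading the emit code that follows: a rough sketch of
 * the ureg word described by the UREG_*_SHIFT defines above (bit positions
 * come from those defines; the concrete REG_TYPE_* and SRC_* codes are
 * defined elsewhere in the driver, in i915_reg.h, and are only assumed here):
 *
 *   31..29  register type (REG_TYPE_*)
 *   28..24  register number
 *   23..8   one 4-bit field per channel: a 3-bit SRC_* selector plus a
 *           negate bit (X at bit 20, Y at 16, Z at 12, W at 8)
 *    7..0   selector slots for the constant 0.0 / 1.0 channels
 *
 * UREG(type, nr) fills in the identity swizzle, and swizzle() only rewrites
 * the 16 bits covered by UREG_XYZW_CHANNEL_MASK, so for example
 *
 *   uint src      = UREG(REG_TYPE_R, 2);          (r2.xyzw)
 *   uint src_xyz1 = swizzle(src, X, Y, Z, ONE);   (r2.xyz1)
 *
 * swaps the constant 1.0 into .w without costing an instruction -- which is
 * why zero and one live in the channel encoding at all.
 */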
+/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "i915_reg.h" +#include "i915_context.h" +#include "i915_fpc.h" +#include "util/u_math.h" + + +#define A0_DEST( reg ) (((reg)&UREG_TYPE_NR_MASK)>>UREG_A0_DEST_SHIFT_LEFT) +#define D0_DEST( reg ) (((reg)&UREG_TYPE_NR_MASK)>>UREG_A0_DEST_SHIFT_LEFT) +#define T0_DEST( reg ) (((reg)&UREG_TYPE_NR_MASK)>>UREG_A0_DEST_SHIFT_LEFT) +#define A0_SRC0( reg ) (((reg)&UREG_MASK)>>UREG_A0_SRC0_SHIFT_LEFT) +#define A1_SRC0( reg ) (((reg)&UREG_MASK)<<UREG_A1_SRC0_SHIFT_RIGHT) +#define A1_SRC1( reg ) (((reg)&UREG_MASK)>>UREG_A1_SRC1_SHIFT_LEFT) +#define A2_SRC1( reg ) (((reg)&UREG_MASK)<<UREG_A2_SRC1_SHIFT_RIGHT) +#define A2_SRC2( reg ) (((reg)&UREG_MASK)>>UREG_A2_SRC2_SHIFT_LEFT) + +/* These are special, and don't have swizzle/negate bits. + */ +#define T0_SAMPLER( reg ) (GET_UREG_NR(reg)<<T0_SAMPLER_NR_SHIFT) +#define T1_ADDRESS_REG( reg ) ((GET_UREG_NR(reg)<<T1_ADDRESS_REG_NR_SHIFT) | \ + (GET_UREG_TYPE(reg)<<T1_ADDRESS_REG_TYPE_SHIFT)) + + +/* Macros for translating UREG's into the various register fields used + * by the I915 programmable unit. + */ +#define UREG_A0_DEST_SHIFT_LEFT (UREG_TYPE_SHIFT - A0_DEST_TYPE_SHIFT) +#define UREG_A0_SRC0_SHIFT_LEFT (UREG_TYPE_SHIFT - A0_SRC0_TYPE_SHIFT) +#define UREG_A1_SRC0_SHIFT_RIGHT (A1_SRC0_CHANNEL_W_SHIFT - UREG_CHANNEL_W_SHIFT) +#define UREG_A1_SRC1_SHIFT_LEFT (UREG_TYPE_SHIFT - A1_SRC1_TYPE_SHIFT) +#define UREG_A2_SRC1_SHIFT_RIGHT (A2_SRC1_CHANNEL_W_SHIFT - UREG_CHANNEL_W_SHIFT) +#define UREG_A2_SRC2_SHIFT_LEFT (UREG_TYPE_SHIFT - A2_SRC2_TYPE_SHIFT) + +#define UREG_MASK 0xffffff00 +#define UREG_TYPE_NR_MASK ((REG_TYPE_MASK << UREG_TYPE_SHIFT) | \ + (REG_NR_MASK << UREG_NR_SHIFT)) + + +uint +i915_get_temp(struct i915_fp_compile *p) +{ + int bit = ffs(~p->temp_flag); + if (!bit) { + i915_program_error(p, "i915_get_temp: out of temporaries\n"); + return 0; + } + + p->temp_flag |= 1 << (bit - 1); + return bit - 1; +} + + +static void +i915_release_temp(struct i915_fp_compile *p, int reg) +{ + p->temp_flag &= ~(1 << reg); +} + + +/** + * Get unpreserved temporary, a temp whose value is not preserved between + * PS program phases. 
+ */ +uint +i915_get_utemp(struct i915_fp_compile * p) +{ + int bit = ffs(~p->utemp_flag); + if (!bit) { + i915_program_error(p, "i915_get_utemp: out of temporaries\n"); + return 0; + } + + p->utemp_flag |= 1 << (bit - 1); + return UREG(REG_TYPE_U, (bit - 1)); +} + +void +i915_release_utemps(struct i915_fp_compile *p) +{ + p->utemp_flag = ~0x7; +} + + +uint +i915_emit_decl(struct i915_fp_compile *p, + uint type, uint nr, uint d0_flags) +{ + uint reg = UREG(type, nr); + + if (type == REG_TYPE_T) { + if (p->decl_t & (1 << nr)) + return reg; + + p->decl_t |= (1 << nr); + } + else if (type == REG_TYPE_S) { + if (p->decl_s & (1 << nr)) + return reg; + + p->decl_s |= (1 << nr); + } + else + return reg; + + *(p->decl++) = (D0_DCL | D0_DEST(reg) | d0_flags); + *(p->decl++) = D1_MBZ; + *(p->decl++) = D2_MBZ; + + p->nr_decl_insn++; + return reg; +} + +uint +i915_emit_arith(struct i915_fp_compile * p, + uint op, + uint dest, + uint mask, + uint saturate, uint src0, uint src1, uint src2) +{ + uint c[3]; + uint nr_const = 0; + + assert(GET_UREG_TYPE(dest) != REG_TYPE_CONST); + dest = UREG(GET_UREG_TYPE(dest), GET_UREG_NR(dest)); + assert(dest); + + if (GET_UREG_TYPE(src0) == REG_TYPE_CONST) + c[nr_const++] = 0; + if (GET_UREG_TYPE(src1) == REG_TYPE_CONST) + c[nr_const++] = 1; + if (GET_UREG_TYPE(src2) == REG_TYPE_CONST) + c[nr_const++] = 2; + + /* Recursively call this function to MOV additional const values + * into temporary registers. Use utemp registers for this - + * currently shouldn't be possible to run out, but keep an eye on + * this. + */ + if (nr_const > 1) { + uint s[3], first, i, old_utemp_flag; + + s[0] = src0; + s[1] = src1; + s[2] = src2; + old_utemp_flag = p->utemp_flag; + + first = GET_UREG_NR(s[c[0]]); + for (i = 1; i < nr_const; i++) { + if (GET_UREG_NR(s[c[i]]) != first) { + uint tmp = i915_get_utemp(p); + + i915_emit_arith(p, A0_MOV, tmp, A0_DEST_CHANNEL_ALL, 0, + s[c[i]], 0, 0); + s[c[i]] = tmp; + } + } + + src0 = s[0]; + src1 = s[1]; + src2 = s[2]; + p->utemp_flag = old_utemp_flag; /* restore */ + } + + *(p->csr++) = (op | A0_DEST(dest) | mask | saturate | A0_SRC0(src0)); + *(p->csr++) = (A1_SRC0(src0) | A1_SRC1(src1)); + *(p->csr++) = (A2_SRC1(src1) | A2_SRC2(src2)); + + p->nr_alu_insn++; + return dest; +} + + +/** + * Emit a texture load or texkill instruction. + * \param dest the dest i915 register + * \param destmask the dest register writemask + * \param sampler the i915 sampler register + * \param coord the i915 source texcoord operand + * \param opcode the instruction opcode + */ +uint i915_emit_texld( struct i915_fp_compile *p, + uint dest, + uint destmask, + uint sampler, + uint coord, + uint opcode ) +{ + const uint k = UREG(GET_UREG_TYPE(coord), GET_UREG_NR(coord)); + int temp = -1; + + if (coord != k) { + /* texcoord is swizzled or negated. Need to allocate a new temporary + * register (a utemp / unpreserved temp) won't do. + */ + uint tempReg; + + temp = i915_get_temp(p); /* get temp reg index */ + tempReg = UREG(REG_TYPE_R, temp); /* make i915 register */ + + i915_emit_arith( p, A0_MOV, + tempReg, A0_DEST_CHANNEL_ALL, /* dest reg, writemask */ + 0, /* saturate */ + coord, 0, 0 ); /* src0, src1, src2 */ + + /* new src texcoord is tempReg */ + coord = tempReg; + } + + /* Don't worry about saturate as we only support + */ + if (destmask != A0_DEST_CHANNEL_ALL) { + /* if not writing to XYZW... 
*/ + uint tmp = i915_get_utemp(p); + i915_emit_texld( p, tmp, A0_DEST_CHANNEL_ALL, sampler, coord, opcode ); + i915_emit_arith( p, A0_MOV, dest, destmask, 0, tmp, 0, 0 ); + /* XXX release utemp here? */ + } + else { + assert(GET_UREG_TYPE(dest) != REG_TYPE_CONST); + assert(dest = UREG(GET_UREG_TYPE(dest), GET_UREG_NR(dest))); + + /* is the sampler coord a texcoord input reg? */ + if (GET_UREG_TYPE(coord) != REG_TYPE_T) { + p->nr_tex_indirect++; + } + + *(p->csr++) = (opcode | + T0_DEST( dest ) | + T0_SAMPLER( sampler )); + + *(p->csr++) = T1_ADDRESS_REG( coord ); + *(p->csr++) = T2_MBZ; + + p->nr_tex_insn++; + } + + if (temp >= 0) + i915_release_temp(p, temp); + + return dest; +} + + +uint +i915_emit_const1f(struct i915_fp_compile * p, float c0) +{ + struct i915_fragment_shader *ifs = p->shader; + unsigned reg, idx; + + if (c0 == 0.0) + return swizzle(UREG(REG_TYPE_R, 0), ZERO, ZERO, ZERO, ZERO); + if (c0 == 1.0) + return swizzle(UREG(REG_TYPE_R, 0), ONE, ONE, ONE, ONE); + + for (reg = 0; reg < I915_MAX_CONSTANT; reg++) { + if (ifs->constant_flags[reg] == I915_CONSTFLAG_USER) + continue; + for (idx = 0; idx < 4; idx++) { + if (!(ifs->constant_flags[reg] & (1 << idx)) || + ifs->constants[reg][idx] == c0) { + ifs->constants[reg][idx] = c0; + ifs->constant_flags[reg] |= 1 << idx; + if (reg + 1 > ifs->num_constants) + ifs->num_constants = reg + 1; + return swizzle(UREG(REG_TYPE_CONST, reg), idx, ZERO, ZERO, ONE); + } + } + } + + i915_program_error(p, "i915_emit_const1f: out of constants\n"); + return 0; +} + +uint +i915_emit_const2f(struct i915_fp_compile * p, float c0, float c1) +{ + struct i915_fragment_shader *ifs = p->shader; + unsigned reg, idx; + + if (c0 == 0.0) + return swizzle(i915_emit_const1f(p, c1), ZERO, X, Z, W); + if (c0 == 1.0) + return swizzle(i915_emit_const1f(p, c1), ONE, X, Z, W); + + if (c1 == 0.0) + return swizzle(i915_emit_const1f(p, c0), X, ZERO, Z, W); + if (c1 == 1.0) + return swizzle(i915_emit_const1f(p, c0), X, ONE, Z, W); + + for (reg = 0; reg < I915_MAX_CONSTANT; reg++) { + if (ifs->constant_flags[reg] == 0xf || + ifs->constant_flags[reg] == I915_CONSTFLAG_USER) + continue; + for (idx = 0; idx < 3; idx++) { + if (!(ifs->constant_flags[reg] & (3 << idx))) { + ifs->constants[reg][idx + 0] = c0; + ifs->constants[reg][idx + 1] = c1; + ifs->constant_flags[reg] |= 3 << idx; + if (reg + 1 > ifs->num_constants) + ifs->num_constants = reg + 1; + return swizzle(UREG(REG_TYPE_CONST, reg), idx, idx + 1, ZERO, ONE); + } + } + } + + i915_program_error(p, "i915_emit_const2f: out of constants\n"); + return 0; +} + + + +uint +i915_emit_const4f(struct i915_fp_compile * p, + float c0, float c1, float c2, float c3) +{ + struct i915_fragment_shader *ifs = p->shader; + unsigned reg; + + for (reg = 0; reg < I915_MAX_CONSTANT; reg++) { + if (ifs->constant_flags[reg] == 0xf && + ifs->constants[reg][0] == c0 && + ifs->constants[reg][1] == c1 && + ifs->constants[reg][2] == c2 && + ifs->constants[reg][3] == c3) { + return UREG(REG_TYPE_CONST, reg); + } + else if (ifs->constant_flags[reg] == 0) { + + ifs->constants[reg][0] = c0; + ifs->constants[reg][1] = c1; + ifs->constants[reg][2] = c2; + ifs->constants[reg][3] = c3; + ifs->constant_flags[reg] = 0xf; + if (reg + 1 > ifs->num_constants) + ifs->num_constants = reg + 1; + return UREG(REG_TYPE_CONST, reg); + } + } + + i915_program_error(p, "i915_emit_const4f: out of constants\n"); + return 0; +} + + +uint +i915_emit_const4fv(struct i915_fp_compile * p, const float * c) +{ + return i915_emit_const4f(p, c[0], c[1], c[2], c[3]); +} diff --git 
a/src/gallium/drivers/i915simple/i915_fpc_translate.c b/src/gallium/drivers/i915simple/i915_fpc_translate.c new file mode 100644 index 0000000000..43d62c5176 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_fpc_translate.c @@ -0,0 +1,1190 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include <stdarg.h> + +#include "i915_reg.h" +#include "i915_context.h" +#include "i915_fpc.h" + +#include "pipe/p_shader_tokens.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_string.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_dump.h" + +#include "draw/draw_vertex.h" + + +/** + * Simple pass-through fragment shader to use when we don't have + * a real shader (or it fails to compile for some reason). + */ +static unsigned passthrough[] = +{ + _3DSTATE_PIXEL_SHADER_PROGRAM | ((2*3)-1), + + /* declare input color: + */ + (D0_DCL | + (REG_TYPE_T << D0_TYPE_SHIFT) | + (T_DIFFUSE << D0_NR_SHIFT) | + D0_CHANNEL_ALL), + 0, + 0, + + /* move to output color: + */ + (A0_MOV | + (REG_TYPE_OC << A0_DEST_TYPE_SHIFT) | + A0_DEST_CHANNEL_ALL | + (REG_TYPE_T << A0_SRC0_TYPE_SHIFT) | + (T_DIFFUSE << A0_SRC0_NR_SHIFT)), + 0x01230000, /* .xyzw */ + 0 +}; + + +/* 1, -1/3!, 1/5!, -1/7! */ +static const float sin_constants[4] = { 1.0, + -1.0f / (3 * 2 * 1), + 1.0f / (5 * 4 * 3 * 2 * 1), + -1.0f / (7 * 6 * 5 * 4 * 3 * 2 * 1) +}; + +/* 1, -1/2!, 1/4!, -1/6! */ +static const float cos_constants[4] = { 1.0, + -1.0f / (2 * 1), + 1.0f / (4 * 3 * 2 * 1), + -1.0f / (6 * 5 * 4 * 3 * 2 * 1) +}; + + + +/** + * component-wise negation of ureg + */ +static INLINE int +negate(int reg, int x, int y, int z, int w) +{ + /* Another neat thing about the UREG representation */ + return reg ^ (((x & 1) << UREG_CHANNEL_X_NEGATE_SHIFT) | + ((y & 1) << UREG_CHANNEL_Y_NEGATE_SHIFT) | + ((z & 1) << UREG_CHANNEL_Z_NEGATE_SHIFT) | + ((w & 1) << UREG_CHANNEL_W_NEGATE_SHIFT)); +} + + +/** + * In the event of a translation failure, we'll generate a simple color + * pass-through program. 
+ */ +static void +i915_use_passthrough_shader(struct i915_fragment_shader *fs) +{ + fs->program = (uint *) MALLOC(sizeof(passthrough)); + if (fs->program) { + memcpy(fs->program, passthrough, sizeof(passthrough)); + fs->program_len = Elements(passthrough); + } + fs->num_constants = 0; +} + + +void +i915_program_error(struct i915_fp_compile *p, const char *msg, ...) +{ + va_list args; + char buffer[1024]; + + debug_printf("i915_program_error: "); + va_start( args, msg ); + util_vsnprintf( buffer, sizeof(buffer), msg, args ); + va_end( args ); + debug_printf(buffer); + debug_printf("\n"); + + p->error = 1; +} + + + +/** + * Construct a ureg for the given source register. Will emit + * constants, apply swizzling and negation as needed. + */ +static uint +src_vector(struct i915_fp_compile *p, + const struct tgsi_full_src_register *source) +{ + uint index = source->SrcRegister.Index; + uint src = 0, sem_name, sem_ind; + + switch (source->SrcRegister.File) { + case TGSI_FILE_TEMPORARY: + if (source->SrcRegister.Index >= I915_MAX_TEMPORARY) { + i915_program_error(p, "Exceeded max temporary reg"); + return 0; + } + src = UREG(REG_TYPE_R, index); + break; + case TGSI_FILE_INPUT: + /* XXX: Packing COL1, FOGC into a single attribute works for + * texenv programs, but will fail for real fragment programs + * that use these attributes and expect them to be a full 4 + * components wide. Could use a texcoord to pass these + * attributes if necessary, but that won't work in the general + * case. + * + * We also use a texture coordinate to pass wpos when possible. + */ + + sem_name = p->shader->info.input_semantic_name[index]; + sem_ind = p->shader->info.input_semantic_index[index]; + + switch (sem_name) { + case TGSI_SEMANTIC_POSITION: + debug_printf("SKIP SEM POS\n"); + /* + assert(p->wpos_tex != -1); + src = i915_emit_decl(p, REG_TYPE_T, p->wpos_tex, D0_CHANNEL_ALL); + */ + break; + case TGSI_SEMANTIC_COLOR: + if (sem_ind == 0) { + src = i915_emit_decl(p, REG_TYPE_T, T_DIFFUSE, D0_CHANNEL_ALL); + } + else { + /* secondary color */ + assert(sem_ind == 1); + src = i915_emit_decl(p, REG_TYPE_T, T_SPECULAR, D0_CHANNEL_XYZ); + src = swizzle(src, X, Y, Z, ONE); + } + break; + case TGSI_SEMANTIC_FOG: + src = i915_emit_decl(p, REG_TYPE_T, T_FOG_W, D0_CHANNEL_W); + src = swizzle(src, W, W, W, W); + break; + case TGSI_SEMANTIC_GENERIC: + /* usually a texcoord */ + src = i915_emit_decl(p, REG_TYPE_T, T_TEX0 + sem_ind, D0_CHANNEL_ALL); + break; + default: + i915_program_error(p, "Bad source->Index"); + return 0; + } + break; + + case TGSI_FILE_IMMEDIATE: + assert(index < p->num_immediates); + index = p->immediates_map[index]; + /* fall-through */ + case TGSI_FILE_CONSTANT: + src = UREG(REG_TYPE_CONST, index); + break; + + default: + i915_program_error(p, "Bad source->File"); + return 0; + } + + if (source->SrcRegister.Extended) { + src = swizzle(src, + source->SrcRegisterExtSwz.ExtSwizzleX, + source->SrcRegisterExtSwz.ExtSwizzleY, + source->SrcRegisterExtSwz.ExtSwizzleZ, + source->SrcRegisterExtSwz.ExtSwizzleW); + } + else { + src = swizzle(src, + source->SrcRegister.SwizzleX, + source->SrcRegister.SwizzleY, + source->SrcRegister.SwizzleZ, + source->SrcRegister.SwizzleW); + } + + + /* There's both negate-all-components and per-component negation. + * Try to handle both here. 
+ */ + { + int nx = source->SrcRegisterExtSwz.NegateX; + int ny = source->SrcRegisterExtSwz.NegateY; + int nz = source->SrcRegisterExtSwz.NegateZ; + int nw = source->SrcRegisterExtSwz.NegateW; + if (source->SrcRegister.Negate) { + nx = !nx; + ny = !ny; + nz = !nz; + nw = !nw; + } + src = negate(src, nx, ny, nz, nw); + } + + /* no abs() or post-abs negation */ +#if 0 + /* XXX assertions disabled to allow arbfplight.c to run */ + /* XXX enable these assertions, or fix things */ + assert(!source->SrcRegisterExtMod.Absolute); + assert(!source->SrcRegisterExtMod.Negate); +#endif + return src; +} + + +/** + * Construct a ureg for a destination register. + */ +static uint +get_result_vector(struct i915_fp_compile *p, + const struct tgsi_full_dst_register *dest) +{ + switch (dest->DstRegister.File) { + case TGSI_FILE_OUTPUT: + { + uint sem_name = p->shader->info.output_semantic_name[dest->DstRegister.Index]; + switch (sem_name) { + case TGSI_SEMANTIC_POSITION: + return UREG(REG_TYPE_OD, 0); + case TGSI_SEMANTIC_COLOR: + return UREG(REG_TYPE_OC, 0); + default: + i915_program_error(p, "Bad inst->DstReg.Index/semantics"); + return 0; + } + } + case TGSI_FILE_TEMPORARY: + return UREG(REG_TYPE_R, dest->DstRegister.Index); + default: + i915_program_error(p, "Bad inst->DstReg.File"); + return 0; + } +} + + +/** + * Compute flags for saturation and writemask. + */ +static uint +get_result_flags(const struct tgsi_full_instruction *inst) +{ + const uint writeMask + = inst->FullDstRegisters[0].DstRegister.WriteMask; + uint flags = 0x0; + + if (inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE) + flags |= A0_DEST_SATURATE; + + if (writeMask & TGSI_WRITEMASK_X) + flags |= A0_DEST_CHANNEL_X; + if (writeMask & TGSI_WRITEMASK_Y) + flags |= A0_DEST_CHANNEL_Y; + if (writeMask & TGSI_WRITEMASK_Z) + flags |= A0_DEST_CHANNEL_Z; + if (writeMask & TGSI_WRITEMASK_W) + flags |= A0_DEST_CHANNEL_W; + + return flags; +} + + +/** + * Convert TGSI_TEXTURE_x token to DO_SAMPLE_TYPE_x token + */ +static uint +translate_tex_src_target(struct i915_fp_compile *p, uint tex) +{ + switch (tex) { + case TGSI_TEXTURE_1D: + return D0_SAMPLE_TYPE_2D; + case TGSI_TEXTURE_2D: + return D0_SAMPLE_TYPE_2D; + case TGSI_TEXTURE_RECT: + return D0_SAMPLE_TYPE_2D; + case TGSI_TEXTURE_3D: + return D0_SAMPLE_TYPE_VOLUME; + case TGSI_TEXTURE_CUBE: + return D0_SAMPLE_TYPE_CUBE; + default: + i915_program_error(p, "TexSrc type"); + return 0; + } +} + + +/** + * Generate texel lookup instruction. + */ +static void +emit_tex(struct i915_fp_compile *p, + const struct tgsi_full_instruction *inst, + uint opcode) +{ + uint texture = inst->InstructionExtTexture.Texture; + uint unit = inst->FullSrcRegisters[1].SrcRegister.Index; + uint tex = translate_tex_src_target( p, texture ); + uint sampler = i915_emit_decl(p, REG_TYPE_S, unit, tex); + uint coord = src_vector( p, &inst->FullSrcRegisters[0]); + + i915_emit_texld( p, + get_result_vector( p, &inst->FullDstRegisters[0] ), + get_result_flags( inst ), + sampler, + coord, + opcode); +} + + +/** + * Generate a simple arithmetic instruction + * \param opcode the i915 opcode + * \param numArgs the number of input/src arguments + */ +static void +emit_simple_arith(struct i915_fp_compile *p, + const struct tgsi_full_instruction *inst, + uint opcode, uint numArgs) +{ + uint arg1, arg2, arg3; + + assert(numArgs <= 3); + + arg1 = (numArgs < 1) ? 0 : src_vector( p, &inst->FullSrcRegisters[0] ); + arg2 = (numArgs < 2) ? 0 : src_vector( p, &inst->FullSrcRegisters[1] ); + arg3 = (numArgs < 3) ? 
0 : src_vector( p, &inst->FullSrcRegisters[2] ); + + i915_emit_arith( p, + opcode, + get_result_vector( p, &inst->FullDstRegisters[0]), + get_result_flags( inst ), 0, + arg1, + arg2, + arg3 ); +} + + +/** As above, but swap the first two src regs */ +static void +emit_simple_arith_swap2(struct i915_fp_compile *p, + const struct tgsi_full_instruction *inst, + uint opcode, uint numArgs) +{ + struct tgsi_full_instruction inst2; + + assert(numArgs == 2); + + /* transpose first two registers */ + inst2 = *inst; + inst2.FullSrcRegisters[0] = inst->FullSrcRegisters[1]; + inst2.FullSrcRegisters[1] = inst->FullSrcRegisters[0]; + + emit_simple_arith(p, &inst2, opcode, numArgs); +} + + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +/* + * Translate TGSI instruction to i915 instruction. + * + * Possible concerns: + * + * SIN, COS -- could use another taylor step? + * LIT -- results seem a little different to sw mesa + * LOG -- different to mesa on negative numbers, but this is conformant. + */ +static void +i915_translate_instruction(struct i915_fp_compile *p, + const struct tgsi_full_instruction *inst) +{ + uint writemask; + uint src0, src1, src2, flags; + uint tmp = 0; + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + i915_emit_arith(p, + A0_MAX, + get_result_vector(p, &inst->FullDstRegisters[0]), + get_result_flags(inst), 0, + src0, negate(src0, 1, 1, 1, 1), 0); + break; + + case TGSI_OPCODE_ADD: + emit_simple_arith(p, inst, A0_ADD, 2); + break; + + case TGSI_OPCODE_CMP: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + src1 = src_vector(p, &inst->FullSrcRegisters[1]); + src2 = src_vector(p, &inst->FullSrcRegisters[2]); + i915_emit_arith(p, A0_CMP, + get_result_vector(p, &inst->FullDstRegisters[0]), + get_result_flags(inst), + 0, src0, src2, src1); /* NOTE: order of src2, src1 */ + break; + + case TGSI_OPCODE_COS: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + tmp = i915_get_utemp(p); + + i915_emit_arith(p, + A0_MUL, + tmp, A0_DEST_CHANNEL_X, 0, + src0, i915_emit_const1f(p, 1.0f / (float) (M_PI * 2.0)), 0); + + i915_emit_arith(p, A0_MOD, tmp, A0_DEST_CHANNEL_X, 0, tmp, 0, 0); + + /* By choosing different taylor constants, could get rid of this mul: + */ + i915_emit_arith(p, + A0_MUL, + tmp, A0_DEST_CHANNEL_X, 0, + tmp, i915_emit_const1f(p, (float) (M_PI * 2.0)), 0); + + /* + * t0.xy = MUL x.xx11, x.x1111 ; x^2, x, 1, 1 + * t0 = MUL t0.xyxy t0.xx11 ; x^4, x^3, x^2, 1 + * t0 = MUL t0.xxz1 t0.z111 ; x^6 x^4 x^2 1 + * result = DP4 t0, cos_constants + */ + i915_emit_arith(p, + A0_MUL, + tmp, A0_DEST_CHANNEL_XY, 0, + swizzle(tmp, X, X, ONE, ONE), + swizzle(tmp, X, ONE, ONE, ONE), 0); + + i915_emit_arith(p, + A0_MUL, + tmp, A0_DEST_CHANNEL_XYZ, 0, + swizzle(tmp, X, Y, X, ONE), + swizzle(tmp, X, X, ONE, ONE), 0); + + i915_emit_arith(p, + A0_MUL, + tmp, A0_DEST_CHANNEL_XYZ, 0, + swizzle(tmp, X, X, Z, ONE), + swizzle(tmp, Z, ONE, ONE, ONE), 0); + + i915_emit_arith(p, + A0_DP4, + get_result_vector(p, &inst->FullDstRegisters[0]), + get_result_flags(inst), 0, + swizzle(tmp, ONE, Z, Y, X), + i915_emit_const4fv(p, cos_constants), 0); + break; + + case TGSI_OPCODE_DP3: + emit_simple_arith(p, inst, A0_DP3, 2); + break; + + case TGSI_OPCODE_DP4: + emit_simple_arith(p, inst, A0_DP4, 2); + break; + + case TGSI_OPCODE_DPH: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + src1 = src_vector(p, &inst->FullSrcRegisters[1]); + + i915_emit_arith(p, + A0_DP4, + get_result_vector(p, &inst->FullDstRegisters[0]), + 
get_result_flags(inst), 0, + swizzle(src0, X, Y, Z, ONE), src1, 0); + break; + + case TGSI_OPCODE_DST: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + src1 = src_vector(p, &inst->FullSrcRegisters[1]); + + /* result[0] = 1 * 1; + * result[1] = a[1] * b[1]; + * result[2] = a[2] * 1; + * result[3] = 1 * b[3]; + */ + i915_emit_arith(p, + A0_MUL, + get_result_vector(p, &inst->FullDstRegisters[0]), + get_result_flags(inst), 0, + swizzle(src0, ONE, Y, Z, ONE), + swizzle(src1, ONE, Y, ONE, W), 0); + break; + + case TGSI_OPCODE_END: + /* no-op */ + break; + + case TGSI_OPCODE_EX2: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + + i915_emit_arith(p, + A0_EXP, + get_result_vector(p, &inst->FullDstRegisters[0]), + get_result_flags(inst), 0, + swizzle(src0, X, X, X, X), 0, 0); + break; + + case TGSI_OPCODE_FLR: + emit_simple_arith(p, inst, A0_FLR, 1); + break; + + case TGSI_OPCODE_FRC: + emit_simple_arith(p, inst, A0_FRC, 1); + break; + + case TGSI_OPCODE_KIL: + /* kill if src[0].x < 0 || src[0].y < 0 ... */ + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + tmp = i915_get_utemp(p); + + i915_emit_texld(p, + tmp, /* dest reg: a dummy reg */ + A0_DEST_CHANNEL_ALL, /* dest writemask */ + 0, /* sampler */ + src0, /* coord*/ + T0_TEXKILL); /* opcode */ + break; + + case TGSI_OPCODE_KILP: + assert(0); /* not tested yet */ + break; + + case TGSI_OPCODE_LG2: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + + i915_emit_arith(p, + A0_LOG, + get_result_vector(p, &inst->FullDstRegisters[0]), + get_result_flags(inst), 0, + swizzle(src0, X, X, X, X), 0, 0); + break; + + case TGSI_OPCODE_LIT: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + tmp = i915_get_utemp(p); + + /* tmp = max( a.xyzw, a.00zw ) + * XXX: Clamp tmp.w to -128..128 + * tmp.y = log(tmp.y) + * tmp.y = tmp.w * tmp.y + * tmp.y = exp(tmp.y) + * result = cmp (a.11-x1, a.1x01, a.1xy1 ) + */ + i915_emit_arith(p, A0_MAX, tmp, A0_DEST_CHANNEL_ALL, 0, + src0, swizzle(src0, ZERO, ZERO, Z, W), 0); + + i915_emit_arith(p, A0_LOG, tmp, A0_DEST_CHANNEL_Y, 0, + swizzle(tmp, Y, Y, Y, Y), 0, 0); + + i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_Y, 0, + swizzle(tmp, ZERO, Y, ZERO, ZERO), + swizzle(tmp, ZERO, W, ZERO, ZERO), 0); + + i915_emit_arith(p, A0_EXP, tmp, A0_DEST_CHANNEL_Y, 0, + swizzle(tmp, Y, Y, Y, Y), 0, 0); + + i915_emit_arith(p, A0_CMP, + get_result_vector(p, &inst->FullDstRegisters[0]), + get_result_flags(inst), 0, + negate(swizzle(tmp, ONE, ONE, X, ONE), 0, 0, 1, 0), + swizzle(tmp, ONE, X, ZERO, ONE), + swizzle(tmp, ONE, X, Y, ONE)); + + break; + + case TGSI_OPCODE_LRP: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + src1 = src_vector(p, &inst->FullSrcRegisters[1]); + src2 = src_vector(p, &inst->FullSrcRegisters[2]); + flags = get_result_flags(inst); + tmp = i915_get_utemp(p); + + /* b*a + c*(1-a) + * + * b*a + c - ca + * + * tmp = b*a + c, + * result = (-c)*a + tmp + */ + i915_emit_arith(p, A0_MAD, tmp, + flags & A0_DEST_CHANNEL_ALL, 0, src1, src0, src2); + + i915_emit_arith(p, A0_MAD, + get_result_vector(p, &inst->FullDstRegisters[0]), + flags, 0, negate(src2, 1, 1, 1, 1), src0, tmp); + break; + + case TGSI_OPCODE_MAD: + emit_simple_arith(p, inst, A0_MAD, 3); + break; + + case TGSI_OPCODE_MAX: + emit_simple_arith(p, inst, A0_MAX, 2); + break; + + case TGSI_OPCODE_MIN: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + src1 = src_vector(p, &inst->FullSrcRegisters[1]); + tmp = i915_get_utemp(p); + flags = get_result_flags(inst); + + i915_emit_arith(p, + A0_MAX, + tmp, flags & A0_DEST_CHANNEL_ALL, 0, + negate(src0, 1, 1, 
1, 1), + negate(src1, 1, 1, 1, 1), 0); + + i915_emit_arith(p, + A0_MOV, + get_result_vector(p, &inst->FullDstRegisters[0]), + flags, 0, negate(tmp, 1, 1, 1, 1), 0, 0); + break; + + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_SWZ: + emit_simple_arith(p, inst, A0_MOV, 1); + break; + + case TGSI_OPCODE_MUL: + emit_simple_arith(p, inst, A0_MUL, 2); + break; + + case TGSI_OPCODE_POW: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + src1 = src_vector(p, &inst->FullSrcRegisters[1]); + tmp = i915_get_utemp(p); + flags = get_result_flags(inst); + + /* XXX: masking on intermediate values, here and elsewhere. + */ + i915_emit_arith(p, + A0_LOG, + tmp, A0_DEST_CHANNEL_X, 0, + swizzle(src0, X, X, X, X), 0, 0); + + i915_emit_arith(p, A0_MUL, tmp, A0_DEST_CHANNEL_X, 0, tmp, src1, 0); + + i915_emit_arith(p, + A0_EXP, + get_result_vector(p, &inst->FullDstRegisters[0]), + flags, 0, swizzle(tmp, X, X, X, X), 0, 0); + break; + + case TGSI_OPCODE_RET: + /* XXX: no-op? */ + break; + + case TGSI_OPCODE_RCP: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + + i915_emit_arith(p, + A0_RCP, + get_result_vector(p, &inst->FullDstRegisters[0]), + get_result_flags(inst), 0, + swizzle(src0, X, X, X, X), 0, 0); + break; + + case TGSI_OPCODE_RSQ: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + + i915_emit_arith(p, + A0_RSQ, + get_result_vector(p, &inst->FullDstRegisters[0]), + get_result_flags(inst), 0, + swizzle(src0, X, X, X, X), 0, 0); + break; + + case TGSI_OPCODE_SCS: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + tmp = i915_get_utemp(p); + + /* + * t0.xy = MUL x.xx11, x.x1111 ; x^2, x, 1, 1 + * t0 = MUL t0.xyxy t0.xx11 ; x^4, x^3, x^2, x + * t1 = MUL t0.xyyw t0.yz11 ; x^7 x^5 x^3 x + * scs.x = DP4 t1, sin_constants + * t1 = MUL t0.xxz1 t0.z111 ; x^6 x^4 x^2 1 + * scs.y = DP4 t1, cos_constants + */ + i915_emit_arith(p, + A0_MUL, + tmp, A0_DEST_CHANNEL_XY, 0, + swizzle(src0, X, X, ONE, ONE), + swizzle(src0, X, ONE, ONE, ONE), 0); + + i915_emit_arith(p, + A0_MUL, + tmp, A0_DEST_CHANNEL_ALL, 0, + swizzle(tmp, X, Y, X, Y), + swizzle(tmp, X, X, ONE, ONE), 0); + + writemask = inst->FullDstRegisters[0].DstRegister.WriteMask; + + if (writemask & TGSI_WRITEMASK_Y) { + uint tmp1; + + if (writemask & TGSI_WRITEMASK_X) + tmp1 = i915_get_utemp(p); + else + tmp1 = tmp; + + i915_emit_arith(p, + A0_MUL, + tmp1, A0_DEST_CHANNEL_ALL, 0, + swizzle(tmp, X, Y, Y, W), + swizzle(tmp, X, Z, ONE, ONE), 0); + + i915_emit_arith(p, + A0_DP4, + get_result_vector(p, &inst->FullDstRegisters[0]), + A0_DEST_CHANNEL_Y, 0, + swizzle(tmp1, W, Z, Y, X), + i915_emit_const4fv(p, sin_constants), 0); + } + + if (writemask & TGSI_WRITEMASK_X) { + i915_emit_arith(p, + A0_MUL, + tmp, A0_DEST_CHANNEL_XYZ, 0, + swizzle(tmp, X, X, Z, ONE), + swizzle(tmp, Z, ONE, ONE, ONE), 0); + + i915_emit_arith(p, + A0_DP4, + get_result_vector(p, &inst->FullDstRegisters[0]), + A0_DEST_CHANNEL_X, 0, + swizzle(tmp, ONE, Z, Y, X), + i915_emit_const4fv(p, cos_constants), 0); + } + break; + + case TGSI_OPCODE_SGE: + emit_simple_arith(p, inst, A0_SGE, 2); + break; + + case TGSI_OPCODE_SLE: + /* like SGE, but swap reg0, reg1 */ + emit_simple_arith_swap2(p, inst, A0_SGE, 2); + break; + + case TGSI_OPCODE_SIN: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + tmp = i915_get_utemp(p); + + i915_emit_arith(p, + A0_MUL, + tmp, A0_DEST_CHANNEL_X, 0, + src0, i915_emit_const1f(p, 1.0f / (float) (M_PI * 2.0)), 0); + + i915_emit_arith(p, A0_MOD, tmp, A0_DEST_CHANNEL_X, 0, tmp, 0, 0); + + /* By choosing different taylor constants, could get rid of this mul: + */ + 
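      /* A sketch of the rest of this SIN sequence (an editor's reading of
       * the swizzles, assuming the SRC_X..SRC_ONE selector semantics used
       * by swizzle()):
       *
       *   tmp.x was scaled by 1/(2*pi) and wrapped by the MOD above; the
       *   MUL below rescales it back to an angle x in [0, 2*pi), and the
       *   following MULs build a power vector that the final DP4 dots with
       *   sin_constants:
       *
       *     sin(x) ~= x*1 + x^3*(-1/3!) + x^5*(1/5!) + x^7*(-1/7!)
       *
       *   Caveat: in the last MUL before the DP4, src1 =
       *   swizzle(tmp, X, Z, ONE, ONE) selects tmp.x (by then x^4) rather
       *   than tmp.y (x^3), so the x channel appears to come out as x^8
       *   instead of the x^7 the inline comment expects.
       */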
i915_emit_arith(p, + A0_MUL, + tmp, A0_DEST_CHANNEL_X, 0, + tmp, i915_emit_const1f(p, (float) (M_PI * 2.0)), 0); + + /* + * t0.xy = MUL x.xx11, x.x1111 ; x^2, x, 1, 1 + * t0 = MUL t0.xyxy t0.xx11 ; x^4, x^3, x^2, x + * t1 = MUL t0.xyyw t0.yz11 ; x^7 x^5 x^3 x + * result = DP4 t1.wzyx, sin_constants + */ + i915_emit_arith(p, + A0_MUL, + tmp, A0_DEST_CHANNEL_XY, 0, + swizzle(tmp, X, X, ONE, ONE), + swizzle(tmp, X, ONE, ONE, ONE), 0); + + i915_emit_arith(p, + A0_MUL, + tmp, A0_DEST_CHANNEL_ALL, 0, + swizzle(tmp, X, Y, X, Y), + swizzle(tmp, X, X, ONE, ONE), 0); + + i915_emit_arith(p, + A0_MUL, + tmp, A0_DEST_CHANNEL_ALL, 0, + swizzle(tmp, X, Y, Y, W), + swizzle(tmp, X, Z, ONE, ONE), 0); + + i915_emit_arith(p, + A0_DP4, + get_result_vector(p, &inst->FullDstRegisters[0]), + get_result_flags(inst), 0, + swizzle(tmp, W, Z, Y, X), + i915_emit_const4fv(p, sin_constants), 0); + break; + + case TGSI_OPCODE_SLT: + emit_simple_arith(p, inst, A0_SLT, 2); + break; + + case TGSI_OPCODE_SGT: + /* like SLT, but swap reg0, reg1 */ + emit_simple_arith_swap2(p, inst, A0_SLT, 2); + break; + + case TGSI_OPCODE_SUB: + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + src1 = src_vector(p, &inst->FullSrcRegisters[1]); + + i915_emit_arith(p, + A0_ADD, + get_result_vector(p, &inst->FullDstRegisters[0]), + get_result_flags(inst), 0, + src0, negate(src1, 1, 1, 1, 1), 0); + break; + + case TGSI_OPCODE_TEX: + emit_tex(p, inst, T0_TEXLD); + break; + + case TGSI_OPCODE_TXB: + emit_tex(p, inst, T0_TEXLDB); + break; + + case TGSI_OPCODE_TXP: + emit_tex(p, inst, T0_TEXLDP); + break; + + case TGSI_OPCODE_XPD: + /* Cross product: + * result.x = src0.y * src1.z - src0.z * src1.y; + * result.y = src0.z * src1.x - src0.x * src1.z; + * result.z = src0.x * src1.y - src0.y * src1.x; + * result.w = undef; + */ + src0 = src_vector(p, &inst->FullSrcRegisters[0]); + src1 = src_vector(p, &inst->FullSrcRegisters[1]); + tmp = i915_get_utemp(p); + + i915_emit_arith(p, + A0_MUL, + tmp, A0_DEST_CHANNEL_ALL, 0, + swizzle(src0, Z, X, Y, ONE), + swizzle(src1, Y, Z, X, ONE), 0); + + i915_emit_arith(p, + A0_MAD, + get_result_vector(p, &inst->FullDstRegisters[0]), + get_result_flags(inst), 0, + swizzle(src0, Y, Z, X, ONE), + swizzle(src1, Z, X, Y, ONE), + negate(tmp, 1, 1, 1, 0)); + break; + + default: + i915_program_error(p, "bad opcode %d", inst->Instruction.Opcode); + p->error = 1; + return; + } + + i915_release_utemps(p); +} + + +/** + * Translate TGSI fragment shader into i915 hardware instructions. 
+ * \param p the translation state + * \param tokens the TGSI token array + */ +static void +i915_translate_instructions(struct i915_fp_compile *p, + const struct tgsi_token *tokens) +{ + struct i915_fragment_shader *ifs = p->shader; + struct tgsi_parse_context parse; + + tgsi_parse_init( &parse, tokens ); + + while( !tgsi_parse_end_of_tokens( &parse ) ) { + + tgsi_parse_token( &parse ); + + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + if (parse.FullToken.FullDeclaration.Declaration.File + == TGSI_FILE_CONSTANT) { + uint i; + for (i = parse.FullToken.FullDeclaration.DeclarationRange.First; + i <= parse.FullToken.FullDeclaration.DeclarationRange.Last; + i++) { + assert(ifs->constant_flags[i] == 0x0); + ifs->constant_flags[i] = I915_CONSTFLAG_USER; + ifs->num_constants = MAX2(ifs->num_constants, i + 1); + } + } + else if (parse.FullToken.FullDeclaration.Declaration.File + == TGSI_FILE_TEMPORARY) { + uint i; + for (i = parse.FullToken.FullDeclaration.DeclarationRange.First; + i <= parse.FullToken.FullDeclaration.DeclarationRange.Last; + i++) { + assert(i < I915_MAX_TEMPORARY); + /* XXX just use shader->info->file_mask[TGSI_FILE_TEMPORARY] */ + p->temp_flag |= (1 << i); /* mark temp as used */ + } + } + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + const struct tgsi_full_immediate *imm + = &parse.FullToken.FullImmediate; + const uint pos = p->num_immediates++; + uint j; + for (j = 0; j < imm->Immediate.Size; j++) { + p->immediates[pos][j] = imm->u.ImmediateFloat32[j].Float; + } + } + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + if (p->first_instruction) { + /* resolve location of immediates */ + uint i, j; + for (i = 0; i < p->num_immediates; i++) { + /* find constant slot for this immediate */ + for (j = 0; j < I915_MAX_CONSTANT; j++) { + if (ifs->constant_flags[j] == 0x0) { + memcpy(ifs->constants[j], + p->immediates[i], + 4 * sizeof(float)); + /*printf("immediate %d maps to const %d\n", i, j);*/ + ifs->constant_flags[j] = 0xf; /* all four comps used */ + p->immediates_map[i] = j; + ifs->num_constants = MAX2(ifs->num_constants, j + 1); + break; + } + } + } + + p->first_instruction = FALSE; + } + + i915_translate_instruction(p, &parse.FullToken.FullInstruction); + break; + + default: + assert( 0 ); + } + + } /* while */ + + tgsi_parse_free (&parse); +} + + +static struct i915_fp_compile * +i915_init_compile(struct i915_context *i915, + struct i915_fragment_shader *ifs) +{ + struct i915_fp_compile *p = CALLOC_STRUCT(i915_fp_compile); + + p->shader = ifs; + + /* Put new constants at end of const buffer, growing downward. + * The problem is we don't know how many user-defined constants might + * be specified with pipe->set_constant_buffer(). + * Should pre-scan the user's program to determine the highest-numbered + * constant referenced. + */ + ifs->num_constants = 0; + memset(ifs->constant_flags, 0, sizeof(ifs->constant_flags)); + + p->first_instruction = TRUE; + + p->nr_tex_indirect = 1; /* correct? */ + p->nr_tex_insn = 0; + p->nr_alu_insn = 0; + p->nr_decl_insn = 0; + + p->csr = p->program; + p->decl = p->declarations; + p->decl_s = 0; + p->decl_t = 0; + p->temp_flag = ~0x0 << I915_MAX_TEMPORARY; + p->utemp_flag = ~0x7; + + p->wpos_tex = -1; + + /* initialize the first program word */ + *(p->decl++) = _3DSTATE_PIXEL_SHADER_PROGRAM; + + return p; +} + + +/* Copy compile results to the fragment program struct and destroy the + * compilation context. 
+ */ +static void +i915_fini_compile(struct i915_context *i915, struct i915_fp_compile *p) +{ + struct i915_fragment_shader *ifs = p->shader; + unsigned long program_size = (unsigned long) (p->csr - p->program); + unsigned long decl_size = (unsigned long) (p->decl - p->declarations); + + if (p->nr_tex_indirect > I915_MAX_TEX_INDIRECT) + i915_program_error(p, "Exceeded max nr indirect texture lookups"); + + if (p->nr_tex_insn > I915_MAX_TEX_INSN) + i915_program_error(p, "Exceeded max TEX instructions"); + + if (p->nr_alu_insn > I915_MAX_ALU_INSN) + i915_program_error(p, "Exceeded max ALU instructions"); + + if (p->nr_decl_insn > I915_MAX_DECL_INSN) + i915_program_error(p, "Exceeded max DECL instructions"); + + if (p->error) { + p->NumNativeInstructions = 0; + p->NumNativeAluInstructions = 0; + p->NumNativeTexInstructions = 0; + p->NumNativeTexIndirections = 0; + + i915_use_passthrough_shader(ifs); + } + else { + p->NumNativeInstructions + = p->nr_alu_insn + p->nr_tex_insn + p->nr_decl_insn; + p->NumNativeAluInstructions = p->nr_alu_insn; + p->NumNativeTexInstructions = p->nr_tex_insn; + p->NumNativeTexIndirections = p->nr_tex_indirect; + + /* patch in the program length */ + p->declarations[0] |= program_size + decl_size - 2; + + /* Copy compilation results to fragment program struct: + */ + assert(!ifs->program); + ifs->program + = (uint *) MALLOC((program_size + decl_size) * sizeof(uint)); + if (ifs->program) { + ifs->program_len = program_size + decl_size; + + memcpy(ifs->program, + p->declarations, + decl_size * sizeof(uint)); + + memcpy(ifs->program + decl_size, + p->program, + program_size * sizeof(uint)); + } + } + + /* Release the compilation struct: + */ + FREE(p); +} + + +/** + * Find an unused texture coordinate slot to use for fragment WPOS. + * Update p->fp->wpos_tex with the result (-1 if no used texcoord slot is found). + */ +static void +i915_find_wpos_space(struct i915_fp_compile *p) +{ +#if 0 + const uint inputs + = p->shader->inputs_read | (1 << TGSI_ATTRIB_POS); /*XXX hack*/ + uint i; + + p->wpos_tex = -1; + + if (inputs & (1 << TGSI_ATTRIB_POS)) { + for (i = 0; i < I915_TEX_UNITS; i++) { + if ((inputs & (1 << (TGSI_ATTRIB_TEX0 + i))) == 0) { + p->wpos_tex = i; + return; + } + } + + i915_program_error(p, "No free texcoord for wpos value"); + } +#else + if (p->shader->info.input_semantic_name[0] == TGSI_SEMANTIC_POSITION) { + /* frag shader using the fragment position input */ +#if 0 + assert(0); +#endif + } +#endif +} + + + + +/** + * Rather than trying to intercept and jiggle depth writes during + * emit, just move the value into its correct position at the end of + * the program: + */ +static void +i915_fixup_depth_write(struct i915_fp_compile *p) +{ + /* XXX assuming pos/depth is always in output[0] */ + if (p->shader->info.output_semantic_name[0] == TGSI_SEMANTIC_POSITION) { + const uint depth = UREG(REG_TYPE_OD, 0); + + i915_emit_arith(p, + A0_MOV, /* opcode */ + depth, /* dest reg */ + A0_DEST_CHANNEL_W, /* write mask */ + 0, /* saturate? 
*/ + swizzle(depth, X, Y, Z, Z), /* src0 */ + 0, 0 /* src1, src2 */); + } +} + + +void +i915_translate_fragment_program( struct i915_context *i915, + struct i915_fragment_shader *fs) +{ + struct i915_fp_compile *p = i915_init_compile(i915, fs); + const struct tgsi_token *tokens = fs->state.tokens; + + i915_find_wpos_space(p); + +#if 0 + tgsi_dump(tokens, 0); +#endif + + i915_translate_instructions(p, tokens); + i915_fixup_depth_write(p); + + i915_fini_compile(i915, p); +} diff --git a/src/gallium/drivers/i915simple/i915_prim_emit.c b/src/gallium/drivers/i915simple/i915_prim_emit.c new file mode 100644 index 0000000000..8f1f58b2dd --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_prim_emit.c @@ -0,0 +1,220 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "draw/draw_pipe.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_pack_color.h" + +#include "i915_context.h" +#include "i915_winsys.h" +#include "i915_reg.h" +#include "i915_state.h" +#include "i915_batch.h" + + + +/** + * Primitive emit to hardware. No support for vertex buffers or any + * nice fast paths. + */ +struct setup_stage { + struct draw_stage stage; /**< This must be first (base class) */ + + struct i915_context *i915; +}; + + + +/** + * Basically a cast wrapper. + */ +static INLINE struct setup_stage *setup_stage( struct draw_stage *stage ) +{ + return (struct setup_stage *)stage; +} + + +/** + * Extract the needed fields from vertex_header and emit i915 dwords. + * Recall that the vertices are constructed by the 'draw' module and + * have a couple of slots at the beginning (1-dword header, 4-dword + * clip pos) that we ignore here. 
+ */ +static INLINE void +emit_hw_vertex( struct i915_context *i915, + const struct vertex_header *vertex) +{ + const struct vertex_info *vinfo = &i915->current.vertex_info; + uint i; + uint count = 0; /* for debug/sanity */ + + assert(!i915->dirty); + + for (i = 0; i < vinfo->num_attribs; i++) { + const uint j = vinfo->attrib[i].src_index; + const float *attrib = vertex->data[j]; + switch (vinfo->attrib[i].emit) { + case EMIT_1F: + OUT_BATCH( fui(attrib[0]) ); + count++; + break; + case EMIT_2F: + OUT_BATCH( fui(attrib[0]) ); + OUT_BATCH( fui(attrib[1]) ); + count += 2; + break; + case EMIT_3F: + OUT_BATCH( fui(attrib[0]) ); + OUT_BATCH( fui(attrib[1]) ); + OUT_BATCH( fui(attrib[2]) ); + count += 3; + break; + case EMIT_4F: + OUT_BATCH( fui(attrib[0]) ); + OUT_BATCH( fui(attrib[1]) ); + OUT_BATCH( fui(attrib[2]) ); + OUT_BATCH( fui(attrib[3]) ); + count += 4; + break; + case EMIT_4UB: + OUT_BATCH( pack_ub4(float_to_ubyte( attrib[2] ), + float_to_ubyte( attrib[1] ), + float_to_ubyte( attrib[0] ), + float_to_ubyte( attrib[3] )) ); + count += 1; + break; + default: + assert(0); + } + } + assert(count == vinfo->size); +} + + + +static INLINE void +emit_prim( struct draw_stage *stage, + struct prim_header *prim, + unsigned hwprim, + unsigned nr ) +{ + struct i915_context *i915 = setup_stage(stage)->i915; + unsigned vertex_size; + unsigned i; + + if (i915->dirty) + i915_update_derived( i915 ); + + if (i915->hardware_dirty) + i915_emit_hardware_state( i915 ); + + /* need to do this after validation! */ + vertex_size = i915->current.vertex_info.size * 4; /* in bytes */ + assert(vertex_size >= 12); /* never smaller than 12 bytes */ + + if (!BEGIN_BATCH( 1 + nr * vertex_size / 4, 0 )) { + FLUSH_BATCH(NULL); + + /* Make sure state is re-emitted after a flush: + */ + i915_update_derived( i915 ); + i915_emit_hardware_state( i915 ); + + if (!BEGIN_BATCH( 1 + nr * vertex_size / 4, 0 )) { + assert(0); + return; + } + } + + /* Emit each triangle as a single primitive. I told you this was + * simple. + */ + OUT_BATCH(_3DPRIMITIVE | + hwprim | + ((4 + vertex_size * nr)/4 - 2)); + + for (i = 0; i < nr; i++) + emit_hw_vertex(i915, prim->v[i]); +} + + +static void +setup_tri( struct draw_stage *stage, struct prim_header *prim ) +{ + emit_prim( stage, prim, PRIM3D_TRILIST, 3 ); +} + + +static void +setup_line(struct draw_stage *stage, struct prim_header *prim) +{ + emit_prim( stage, prim, PRIM3D_LINELIST, 2 ); +} + + +static void +setup_point(struct draw_stage *stage, struct prim_header *prim) +{ + emit_prim( stage, prim, PRIM3D_POINTLIST, 1 ); +} + + +static void setup_flush( struct draw_stage *stage, unsigned flags ) +{ +} + +static void reset_stipple_counter( struct draw_stage *stage ) +{ +} + +static void render_destroy( struct draw_stage *stage ) +{ + FREE( stage ); +} + + +/** + * Create a new primitive setup/render stage. This gets plugged into + * the 'draw' module's pipeline. 
+ */ +struct draw_stage *i915_draw_render_stage( struct i915_context *i915 ) +{ + struct setup_stage *setup = CALLOC_STRUCT(setup_stage); + + setup->i915 = i915; + setup->stage.draw = i915->draw; + setup->stage.point = setup_point; + setup->stage.line = setup_line; + setup->stage.tri = setup_tri; + setup->stage.flush = setup_flush; + setup->stage.reset_stipple_counter = reset_stipple_counter; + setup->stage.destroy = render_destroy; + + return &setup->stage; +} diff --git a/src/gallium/drivers/i915simple/i915_prim_vbuf.c b/src/gallium/drivers/i915simple/i915_prim_vbuf.c new file mode 100644 index 0000000000..4fda1ab64f --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_prim_vbuf.c @@ -0,0 +1,547 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Build post-transformation, post-clipping vertex buffers and element + * lists by hooking into the end of the primitive pipeline and + * manipulating the vertex_id field in the vertex headers. + * + * XXX: work in progress + * + * \author José Fonseca <jrfonseca@tungstengraphics.com> + * \author Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "draw/draw_context.h" +#include "draw/draw_vbuf.h" +#include "pipe/p_debug.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "i915_context.h" +#include "i915_reg.h" +#include "i915_winsys.h" +#include "i915_batch.h" +#include "i915_state.h" + + +/** + * Primitive renderer for i915. + */ +struct i915_vbuf_render { + struct vbuf_render base; + + struct i915_context *i915; + + /** Vertex size in bytes */ + unsigned vertex_size; + + /** Software primitive */ + unsigned prim; + + /** Hardware primitive */ + unsigned hwprim; + + /** Genereate a vertex list */ + unsigned fallback; + + /* Stuff for the vbo */ + struct pipe_buffer *vbo; + size_t vbo_size; + size_t vbo_offset; + void *vbo_ptr; + size_t vbo_alloc_size; +}; + + +/** + * Basically a cast wrapper. 
+ */ +static INLINE struct i915_vbuf_render * +i915_vbuf_render( struct vbuf_render *render ) +{ + assert(render); + return (struct i915_vbuf_render *)render; +} + + +static const struct vertex_info * +i915_vbuf_render_get_vertex_info( struct vbuf_render *render ) +{ + struct i915_vbuf_render *i915_render = i915_vbuf_render(render); + struct i915_context *i915 = i915_render->i915; + + if (i915->dirty) { + /* make sure we have up to date vertex layout */ + i915_update_derived( i915 ); + } + + return &i915->current.vertex_info; +} + + +static void * +i915_vbuf_render_allocate_vertices( struct vbuf_render *render, + ushort vertex_size, + ushort nr_vertices ) +{ + struct i915_vbuf_render *i915_render = i915_vbuf_render(render); + struct i915_context *i915 = i915_render->i915; + struct pipe_screen *screen = i915->pipe.screen; + size_t size = (size_t)vertex_size * (size_t)nr_vertices; + + /* FIXME: handle failure */ + assert(!i915->vbo); + + if (i915_render->vbo_size > size + i915_render->vbo_offset && !i915->vbo_flushed) { + } else { + i915->vbo_flushed = 0; + pipe_buffer_reference(screen, &i915_render->vbo, NULL); + } + + if (!i915_render->vbo) { + i915_render->vbo_size = MAX2(size, i915_render->vbo_alloc_size); + i915_render->vbo_offset = 0; + i915_render->vbo = pipe_buffer_create(screen, + 64, + I915_BUFFER_USAGE_LIT_VERTEX, + i915_render->vbo_size); + i915_render->vbo_ptr = pipe_buffer_map(screen, + i915_render->vbo, + PIPE_BUFFER_USAGE_CPU_WRITE); + pipe_buffer_unmap(screen, i915_render->vbo); + } + + i915->vbo = i915_render->vbo; + i915->vbo_offset = i915_render->vbo_offset; + i915->dirty |= I915_NEW_VBO; + + return (unsigned char *)i915_render->vbo_ptr + i915->vbo_offset; +} + + +static boolean +i915_vbuf_render_set_primitive( struct vbuf_render *render, + unsigned prim ) +{ + struct i915_vbuf_render *i915_render = i915_vbuf_render(render); + i915_render->prim = prim; + + switch(prim) { + case PIPE_PRIM_POINTS: + i915_render->hwprim = PRIM3D_POINTLIST; + i915_render->fallback = 0; + return TRUE; + case PIPE_PRIM_LINES: + i915_render->hwprim = PRIM3D_LINELIST; + i915_render->fallback = 0; + return TRUE; + case PIPE_PRIM_LINE_LOOP: + i915_render->hwprim = PRIM3D_LINELIST; + i915_render->fallback = PIPE_PRIM_LINE_LOOP; + return TRUE; + case PIPE_PRIM_LINE_STRIP: + i915_render->hwprim = PRIM3D_LINESTRIP; + i915_render->fallback = 0; + return TRUE; + case PIPE_PRIM_TRIANGLES: + i915_render->hwprim = PRIM3D_TRILIST; + i915_render->fallback = 0; + return TRUE; + case PIPE_PRIM_TRIANGLE_STRIP: + i915_render->hwprim = PRIM3D_TRISTRIP; + i915_render->fallback = 0; + return TRUE; + case PIPE_PRIM_TRIANGLE_FAN: + i915_render->hwprim = PRIM3D_TRIFAN; + i915_render->fallback = 0; + return TRUE; + case PIPE_PRIM_QUADS: + i915_render->hwprim = PRIM3D_TRILIST; + i915_render->fallback = PIPE_PRIM_QUADS; + return TRUE; + case PIPE_PRIM_QUAD_STRIP: + i915_render->hwprim = PRIM3D_TRILIST; + i915_render->fallback = PIPE_PRIM_QUAD_STRIP; + return TRUE; + case PIPE_PRIM_POLYGON: + i915_render->hwprim = PRIM3D_POLY; + i915_render->fallback = 0; + return TRUE; + default: + assert((int)"Error unkown primtive type" & 0); + /* Actually, can handle a lot more just fine... Fixme. 
+ */
+ return FALSE;
+ }
+}
+
+
+
+/**
+ * Used for fallbacks in draw_arrays
+ */
+static void
+draw_arrays_generate_indices( struct vbuf_render *render,
+ unsigned start, uint nr,
+ unsigned type )
+{
+ struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+ struct i915_context *i915 = i915_render->i915;
+ unsigned i;
+ unsigned end = start + nr;
+ switch(type) {
+ case 0:
+ for (i = start; i+1 < end; i += 2)
+ OUT_BATCH( (i+0) | (i+1) << 16 );
+ if (i < end)
+ OUT_BATCH( i );
+ break;
+ case PIPE_PRIM_LINE_LOOP:
+ if (nr >= 2) {
+ for (i = start + 1; i < end; i++)
+ OUT_BATCH( (i-1) | (i+0) << 16 );
+ OUT_BATCH( (i-1) | ( start) << 16 );
+ }
+ break;
+ case PIPE_PRIM_QUADS:
+ for (i = start; i + 3 < end; i += 4) {
+ OUT_BATCH( (i+0) | (i+1) << 16 );
+ OUT_BATCH( (i+3) | (i+1) << 16 );
+ OUT_BATCH( (i+2) | (i+3) << 16 );
+ }
+ break;
+ case PIPE_PRIM_QUAD_STRIP:
+ for (i = start; i + 3 < end; i += 2) {
+ OUT_BATCH( (i+0) | (i+1) << 16 );
+ OUT_BATCH( (i+3) | (i+2) << 16 );
+ OUT_BATCH( (i+0) | (i+3) << 16 );
+ }
+ break;
+ default:
+ assert(0);
+ }
+}
+
+static unsigned
+draw_arrays_calc_nr_indices( uint nr, unsigned type )
+{
+ switch (type) {
+ case 0:
+ return nr;
+ case PIPE_PRIM_LINE_LOOP:
+ if (nr >= 2)
+ return nr * 2;
+ else
+ return 0;
+ case PIPE_PRIM_QUADS:
+ return (nr / 4) * 6;
+ case PIPE_PRIM_QUAD_STRIP:
+ return ((nr - 2) / 2) * 6;
+ default:
+ assert(0);
+ return 0;
+ }
+}
+
+static void
+draw_arrays_fallback( struct vbuf_render *render,
+ unsigned start,
+ uint nr )
+{
+ struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+ struct i915_context *i915 = i915_render->i915;
+ unsigned nr_indices;
+
+ if (i915->dirty)
+ i915_update_derived( i915 );
+
+ if (i915->hardware_dirty)
+ i915_emit_hardware_state( i915 );
+
+ nr_indices = draw_arrays_calc_nr_indices( nr, i915_render->fallback );
+ if (!nr_indices)
+ return;
+
+ if (!BEGIN_BATCH( 1 + (nr_indices + 1)/2, 1 )) {
+ FLUSH_BATCH(NULL);
+
+ /* Make sure state is re-emitted after a flush:
+ */
+ i915_update_derived( i915 );
+ i915_emit_hardware_state( i915 );
+ i915->vbo_flushed = 1;
+
+ if (!BEGIN_BATCH( 1 + (nr_indices + 1)/2, 1 )) {
+ assert(0);
+ goto out;
+ }
+ }
+ OUT_BATCH( _3DPRIMITIVE |
+ PRIM_INDIRECT |
+ i915_render->hwprim |
+ PRIM_INDIRECT_ELTS |
+ nr_indices );
+
+ draw_arrays_generate_indices( render, start, nr, i915_render->fallback );
+
+out:
+ return;
+}
+
+static void
+i915_vbuf_render_draw_arrays( struct vbuf_render *render,
+ unsigned start,
+ uint nr )
+{
+ struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+
+ if (i915_render->fallback) {
+ draw_arrays_fallback( render, start, nr );
+ return;
+ }
+
+ /* JB: TODO submit direct cmds */
+ draw_arrays_fallback( render, start, nr );
+}
+
+/**
+ * Used for normal and fallback emitting of indices.
+ * If type is zero, normal (non-fallback) operation is assumed.
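+ * For PIPE_PRIM_QUADS each quad (v0,v1,v2,v3) is emitted as the two
+ * triangles (v0,v1,v3) and (v1,v2,v3); for PIPE_PRIM_LINE_LOOP an extra
+ * closing segment back to the first index is appended.  Two 16-bit
+ * indices are packed into each batch dword.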
+ */ +static void +draw_generate_indices( struct vbuf_render *render, + const ushort *indices, + uint nr_indices, + unsigned type ) +{ + struct i915_vbuf_render *i915_render = i915_vbuf_render(render); + struct i915_context *i915 = i915_render->i915; + unsigned i; + + switch(type) { + case 0: + for (i = 0; i + 1 < nr_indices; i += 2) { + OUT_BATCH( indices[i] | indices[i+1] << 16 ); + } + if (i < nr_indices) { + OUT_BATCH( indices[i] ); + } + break; + case PIPE_PRIM_LINE_LOOP: + if (nr_indices >= 2) { + for (i = 1; i < nr_indices; i++) + OUT_BATCH( indices[i-1] | indices[i] << 16 ); + OUT_BATCH( indices[i-1] | indices[0] << 16 ); + } + break; + case PIPE_PRIM_QUADS: + for (i = 0; i + 3 < nr_indices; i += 4) { + OUT_BATCH( indices[i+0] | indices[i+1] << 16 ); + OUT_BATCH( indices[i+3] | indices[i+1] << 16 ); + OUT_BATCH( indices[i+2] | indices[i+3] << 16 ); + } + break; + case PIPE_PRIM_QUAD_STRIP: + for (i = 0; i + 3 < nr_indices; i += 2) { + OUT_BATCH( indices[i+0] | indices[i+1] << 16 ); + OUT_BATCH( indices[i+3] | indices[i+2] << 16 ); + OUT_BATCH( indices[i+0] | indices[i+3] << 16 ); + } + break; + default: + assert(0); + break; + } +} + +static unsigned +draw_calc_nr_indices( uint nr_indices, unsigned type ) +{ + switch (type) { + case 0: + return nr_indices; + case PIPE_PRIM_LINE_LOOP: + if (nr_indices >= 2) + return nr_indices * 2; + else + return 0; + case PIPE_PRIM_QUADS: + return (nr_indices / 4) * 6; + case PIPE_PRIM_QUAD_STRIP: + return ((nr_indices - 2) / 2) * 6; + default: + assert(0); + return 0; + } +} + +static void +i915_vbuf_render_draw( struct vbuf_render *render, + const ushort *indices, + uint nr_indices) +{ + struct i915_vbuf_render *i915_render = i915_vbuf_render(render); + struct i915_context *i915 = i915_render->i915; + unsigned save_nr_indices; + + save_nr_indices = nr_indices; + + nr_indices = draw_calc_nr_indices( nr_indices, i915_render->fallback ); + if (!nr_indices) + return; + + if (i915->dirty) + i915_update_derived( i915 ); + + if (i915->hardware_dirty) + i915_emit_hardware_state( i915 ); + + if (!BEGIN_BATCH( 1 + (nr_indices + 1)/2, 1 )) { + FLUSH_BATCH(NULL); + + /* Make sure state is re-emitted after a flush: + */ + i915_update_derived( i915 ); + i915_emit_hardware_state( i915 ); + i915->vbo_flushed = 1; + + if (!BEGIN_BATCH( 1 + (nr_indices + 1)/2, 1 )) { + assert(0); + goto out; + } + } + + OUT_BATCH( _3DPRIMITIVE | + PRIM_INDIRECT | + i915_render->hwprim | + PRIM_INDIRECT_ELTS | + nr_indices ); + draw_generate_indices( render, + indices, + save_nr_indices, + i915_render->fallback ); + +out: + return; +} + + +static void +i915_vbuf_render_release_vertices( struct vbuf_render *render, + void *vertices, + unsigned vertex_size, + unsigned vertices_used ) +{ + struct i915_vbuf_render *i915_render = i915_vbuf_render(render); + struct i915_context *i915 = i915_render->i915; + size_t size = (size_t)vertex_size * (size_t)vertices_used; + + assert(i915->vbo); + + i915_render->vbo_offset += size; + i915->vbo = NULL; + i915->dirty |= I915_NEW_VBO; +} + + +static void +i915_vbuf_render_destroy( struct vbuf_render *render ) +{ + struct i915_vbuf_render *i915_render = i915_vbuf_render(render); + FREE(i915_render); +} + + +/** + * Create a new primitive render. 
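+ * Sets up the vbuf_render callbacks and pre-allocates a persistent
+ * vertex buffer that allocate_vertices() sub-allocates from.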
+ */ +static struct vbuf_render * +i915_vbuf_render_create( struct i915_context *i915 ) +{ + struct i915_vbuf_render *i915_render = CALLOC_STRUCT(i915_vbuf_render); + struct pipe_screen *screen = i915->pipe.screen; + + i915_render->i915 = i915; + + i915_render->base.max_vertex_buffer_bytes = 128*1024; + + /* NOTE: it must be such that state and vertices indices fit in a single + * batch buffer. + */ + i915_render->base.max_indices = 16*1024; + + i915_render->base.get_vertex_info = i915_vbuf_render_get_vertex_info; + i915_render->base.allocate_vertices = i915_vbuf_render_allocate_vertices; + i915_render->base.set_primitive = i915_vbuf_render_set_primitive; + i915_render->base.draw = i915_vbuf_render_draw; + i915_render->base.draw_arrays = i915_vbuf_render_draw_arrays; + i915_render->base.release_vertices = i915_vbuf_render_release_vertices; + i915_render->base.destroy = i915_vbuf_render_destroy; + + i915_render->vbo_alloc_size = 128 * 4096; + i915_render->vbo_size = i915_render->vbo_alloc_size; + i915_render->vbo_offset = 0; + i915_render->vbo = pipe_buffer_create(screen, + 64, + I915_BUFFER_USAGE_LIT_VERTEX, + i915_render->vbo_size); + i915_render->vbo_ptr = pipe_buffer_map(screen, + i915_render->vbo, + PIPE_BUFFER_USAGE_CPU_WRITE); + pipe_buffer_unmap(screen, i915_render->vbo); + + return &i915_render->base; +} + + +/** + * Create a new primitive vbuf/render stage. + */ +struct draw_stage *i915_draw_vbuf_stage( struct i915_context *i915 ) +{ + struct vbuf_render *render; + struct draw_stage *stage; + + render = i915_vbuf_render_create(i915); + if(!render) + return NULL; + + stage = draw_vbuf_stage( i915->draw, render ); + if(!stage) { + render->destroy(render); + return NULL; + } + /** TODO JB: this shouldn't be here */ + draw_set_render(i915->draw, render); + + return stage; +} diff --git a/src/gallium/drivers/i915simple/i915_reg.h b/src/gallium/drivers/i915simple/i915_reg.h new file mode 100644 index 0000000000..04620fec68 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_reg.h @@ -0,0 +1,978 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#ifndef I915_REG_H +#define I915_REG_H + + +#define I915_SET_FIELD( var, mask, value ) (var &= ~(mask), var |= value) + +#define CMD_3D (0x3<<29) + +#define PRIM3D_INLINE (CMD_3D | (0x1f<<24)) +#define PRIM3D_TRILIST (0x0<<18) +#define PRIM3D_TRISTRIP (0x1<<18) +#define PRIM3D_TRISTRIP_RVRSE (0x2<<18) +#define PRIM3D_TRIFAN (0x3<<18) +#define PRIM3D_POLY (0x4<<18) +#define PRIM3D_LINELIST (0x5<<18) +#define PRIM3D_LINESTRIP (0x6<<18) +#define PRIM3D_RECTLIST (0x7<<18) +#define PRIM3D_POINTLIST (0x8<<18) +#define PRIM3D_DIB (0x9<<18) +#define PRIM3D_CLEAR_RECT (0xa<<18) +#define PRIM3D_ZONE_INIT (0xd<<18) +#define PRIM3D_MASK (0x1f<<18) + +/* p137 */ +#define _3DSTATE_AA_CMD (CMD_3D | (0x06<<24)) +#define AA_LINE_ECAAR_WIDTH_ENABLE (1<<16) +#define AA_LINE_ECAAR_WIDTH_0_5 0 +#define AA_LINE_ECAAR_WIDTH_1_0 (1<<14) +#define AA_LINE_ECAAR_WIDTH_2_0 (2<<14) +#define AA_LINE_ECAAR_WIDTH_4_0 (3<<14) +#define AA_LINE_REGION_WIDTH_ENABLE (1<<8) +#define AA_LINE_REGION_WIDTH_0_5 0 +#define AA_LINE_REGION_WIDTH_1_0 (1<<6) +#define AA_LINE_REGION_WIDTH_2_0 (2<<6) +#define AA_LINE_REGION_WIDTH_4_0 (3<<6) + +/* 3DSTATE_BACKFACE_STENCIL_OPS, p138*/ +#define _3DSTATE_BACKFACE_STENCIL_OPS (CMD_3D | (0x8<<24)) +#define BFO_ENABLE_STENCIL_REF (1<<23) +#define BFO_STENCIL_REF_SHIFT 15 +#define BFO_STENCIL_REF_MASK (0xff<<15) +#define BFO_ENABLE_STENCIL_FUNCS (1<<14) +#define BFO_STENCIL_TEST_SHIFT 11 +#define BFO_STENCIL_TEST_MASK (0x7<<11) +#define BFO_STENCIL_FAIL_SHIFT 8 +#define BFO_STENCIL_FAIL_MASK (0x7<<8) +#define BFO_STENCIL_PASS_Z_FAIL_SHIFT 5 +#define BFO_STENCIL_PASS_Z_FAIL_MASK (0x7<<5) +#define BFO_STENCIL_PASS_Z_PASS_SHIFT 2 +#define BFO_STENCIL_PASS_Z_PASS_MASK (0x7<<2) +#define BFO_ENABLE_STENCIL_TWO_SIDE (1<<1) +#define BFO_STENCIL_TWO_SIDE (1<<0) + + +/* 3DSTATE_BACKFACE_STENCIL_MASKS, p140 */ +#define _3DSTATE_BACKFACE_STENCIL_MASKS (CMD_3D | (0x9<<24)) +#define BFM_ENABLE_STENCIL_TEST_MASK (1<<17) +#define BFM_ENABLE_STENCIL_WRITE_MASK (1<<16) +#define BFM_STENCIL_TEST_MASK_SHIFT 8 +#define BFM_STENCIL_TEST_MASK_MASK (0xff<<8) +#define BFM_STENCIL_WRITE_MASK_SHIFT 0 +#define BFM_STENCIL_WRITE_MASK_MASK (0xff<<0) + + + +/* 3DSTATE_BIN_CONTROL p141 */ + +/* p143 */ +#define _3DSTATE_BUF_INFO_CMD (CMD_3D | (0x1d<<24) | (0x8e<<16) | 1) +/* Dword 1 */ +#define BUF_3D_ID_COLOR_BACK (0x3<<24) +#define BUF_3D_ID_DEPTH (0x7<<24) +#define BUF_3D_USE_FENCE (1<<23) +#define BUF_3D_TILED_SURFACE (1<<22) +#define BUF_3D_TILE_WALK_X 0 +#define BUF_3D_TILE_WALK_Y (1<<21) +#define BUF_3D_PITCH(x) (((x)/4)<<2) +/* Dword 2 */ +#define BUF_3D_ADDR(x) ((x) & ~0x3) + + +/* 3DSTATE_CHROMA_KEY */ + +/* 3DSTATE_CLEAR_PARAMETERS, p150 */ +#define _3DSTATE_CLEAR_PARAMETERS (CMD_3D | (0x1d<<24) | (0x9c<<16) | 5) +/* Dword 1 */ +#define CLEARPARAM_CLEAR_RECT (1 << 16) +#define CLEARPARAM_ZONE_INIT (0 << 16) +#define CLEARPARAM_WRITE_COLOR (1 << 2) +#define CLEARPARAM_WRITE_DEPTH (1 << 1) +#define CLEARPARAM_WRITE_STENCIL (1 << 0) + +/* 3DSTATE_CONSTANT_BLEND_COLOR, p153 */ +#define _3DSTATE_CONST_BLEND_COLOR_CMD (CMD_3D | (0x1d<<24) | (0x88<<16)) + + + +/* 3DSTATE_COORD_SET_BINDINGS, p154 */ +#define _3DSTATE_COORD_SET_BINDINGS (CMD_3D | (0x16<<24)) +#define CSB_TCB(iunit, eunit) ((eunit)<<(iunit*3)) + +/* p156 */ +#define _3DSTATE_DFLT_DIFFUSE_CMD (CMD_3D | (0x1d<<24) | (0x99<<16)) + +/* p157 */ +#define _3DSTATE_DFLT_SPEC_CMD (CMD_3D | (0x1d<<24) | (0x9a<<16)) + +/* p158 */ +#define _3DSTATE_DFLT_Z_CMD (CMD_3D | (0x1d<<24) | 
(0x98<<16)) + + +/* 3DSTATE_DEPTH_OFFSET_SCALE, p159 */ +#define _3DSTATE_DEPTH_OFFSET_SCALE (CMD_3D | (0x1d<<24) | (0x97<<16)) +/* scale in dword 1 */ + + +/* 3DSTATE_DEPTH_SUBRECT_DISABLE, p160 */ +#define _3DSTATE_DEPTH_SUBRECT_DISABLE (CMD_3D | (0x1c<<24) | (0x11<<19) | 0x2) + +/* p161 */ +#define _3DSTATE_DST_BUF_VARS_CMD (CMD_3D | (0x1d<<24) | (0x85<<16)) +/* Dword 1 */ +#define TEX_DEFAULT_COLOR_OGL (0<<30) +#define TEX_DEFAULT_COLOR_D3D (1<<30) +#define ZR_EARLY_DEPTH (1<<29) +#define LOD_PRECLAMP_OGL (1<<28) +#define LOD_PRECLAMP_D3D (0<<28) +#define DITHER_FULL_ALWAYS (0<<26) +#define DITHER_FULL_ON_FB_BLEND (1<<26) +#define DITHER_CLAMPED_ALWAYS (2<<26) +#define LINEAR_GAMMA_BLEND_32BPP (1<<25) +#define DEBUG_DISABLE_ENH_DITHER (1<<24) +#define DSTORG_HORT_BIAS(x) ((x)<<20) +#define DSTORG_VERT_BIAS(x) ((x)<<16) +#define COLOR_4_2_2_CHNL_WRT_ALL 0 +#define COLOR_4_2_2_CHNL_WRT_Y (1<<12) +#define COLOR_4_2_2_CHNL_WRT_CR (2<<12) +#define COLOR_4_2_2_CHNL_WRT_CB (3<<12) +#define COLOR_4_2_2_CHNL_WRT_CRCB (4<<12) +#define COLOR_BUF_8BIT 0 +#define COLOR_BUF_RGB555 (1<<8) +#define COLOR_BUF_RGB565 (2<<8) +#define COLOR_BUF_ARGB8888 (3<<8) +#define DEPTH_FRMT_16_FIXED 0 +#define DEPTH_FRMT_16_FLOAT (1<<2) +#define DEPTH_FRMT_24_FIXED_8_OTHER (2<<2) +#define VERT_LINE_STRIDE_1 (1<<1) +#define VERT_LINE_STRIDE_0 (0<<1) +#define VERT_LINE_STRIDE_OFS_1 1 +#define VERT_LINE_STRIDE_OFS_0 0 + +/* p166 */ +#define _3DSTATE_DRAW_RECT_CMD (CMD_3D|(0x1d<<24)|(0x80<<16)|3) +/* Dword 1 */ +#define DRAW_RECT_DIS_DEPTH_OFS (1<<30) +#define DRAW_DITHER_OFS_X(x) ((x)<<26) +#define DRAW_DITHER_OFS_Y(x) ((x)<<24) +/* Dword 2 */ +#define DRAW_YMIN(x) ((x)<<16) +#define DRAW_XMIN(x) (x) +/* Dword 3 */ +#define DRAW_YMAX(x) ((x)<<16) +#define DRAW_XMAX(x) (x) +/* Dword 4 */ +#define DRAW_YORG(x) ((x)<<16) +#define DRAW_XORG(x) (x) + + +/* 3DSTATE_FILTER_COEFFICIENTS_4X4, p170 */ + +/* 3DSTATE_FILTER_COEFFICIENTS_6X5, p172 */ + + +/* _3DSTATE_FOG_COLOR, p173 */ +#define _3DSTATE_FOG_COLOR_CMD (CMD_3D|(0x15<<24)) +#define FOG_COLOR_RED(x) ((x)<<16) +#define FOG_COLOR_GREEN(x) ((x)<<8) +#define FOG_COLOR_BLUE(x) (x) + +/* _3DSTATE_FOG_MODE, p174 */ +#define _3DSTATE_FOG_MODE_CMD (CMD_3D|(0x1d<<24)|(0x89<<16)|2) +/* Dword 1 */ +#define FMC1_FOGFUNC_MODIFY_ENABLE (1<<31) +#define FMC1_FOGFUNC_VERTEX (0<<28) +#define FMC1_FOGFUNC_PIXEL_EXP (1<<28) +#define FMC1_FOGFUNC_PIXEL_EXP2 (2<<28) +#define FMC1_FOGFUNC_PIXEL_LINEAR (3<<28) +#define FMC1_FOGFUNC_MASK (3<<28) +#define FMC1_FOGINDEX_MODIFY_ENABLE (1<<27) +#define FMC1_FOGINDEX_Z (0<<25) +#define FMC1_FOGINDEX_W (1<<25) +#define FMC1_C1_C2_MODIFY_ENABLE (1<<24) +#define FMC1_DENSITY_MODIFY_ENABLE (1<<23) +#define FMC1_C1_ONE (1<<13) +#define FMC1_C1_MASK (0xffff<<4) +/* Dword 2 */ +#define FMC2_C2_ONE (1<<16) +/* Dword 3 */ +#define FMC3_D_ONE (1<<16) + + + +/* _3DSTATE_INDEPENDENT_ALPHA_BLEND, p177 */ +#define _3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD (CMD_3D|(0x0b<<24)) +#define IAB_MODIFY_ENABLE (1<<23) +#define IAB_ENABLE (1<<22) +#define IAB_MODIFY_FUNC (1<<21) +#define IAB_FUNC_SHIFT 16 +#define IAB_MODIFY_SRC_FACTOR (1<<11) +#define IAB_SRC_FACTOR_SHIFT 6 +#define IAB_SRC_FACTOR_MASK (BLENDFACT_MASK<<6) +#define IAB_MODIFY_DST_FACTOR (1<<5) +#define IAB_DST_FACTOR_SHIFT 0 +#define IAB_DST_FACTOR_MASK (BLENDFACT_MASK<<0) + + +#define BLENDFUNC_ADD 0x0 +#define BLENDFUNC_SUBTRACT 0x1 +#define BLENDFUNC_REVERSE_SUBTRACT 0x2 +#define BLENDFUNC_MIN 0x3 +#define BLENDFUNC_MAX 0x4 +#define BLENDFUNC_MASK 0x7 + +/* 3DSTATE_LOAD_INDIRECT, p180 */ + +#define 
_3DSTATE_LOAD_INDIRECT (CMD_3D|(0x1d<<24)|(0x7<<16)) +#define LI0_STATE_STATIC_INDIRECT (0x01<<8) +#define LI0_STATE_DYNAMIC_INDIRECT (0x02<<8) +#define LI0_STATE_SAMPLER (0x04<<8) +#define LI0_STATE_MAP (0x08<<8) +#define LI0_STATE_PROGRAM (0x10<<8) +#define LI0_STATE_CONSTANTS (0x20<<8) + +#define SIS0_BUFFER_ADDRESS(x) ((x)&~0x3) +#define SIS0_FORCE_LOAD (1<<1) +#define SIS0_BUFFER_VALID (1<<0) +#define SIS1_BUFFER_LENGTH(x) ((x)&0xff) + +#define DIS0_BUFFER_ADDRESS(x) ((x)&~0x3) +#define DIS0_BUFFER_RESET (1<<1) +#define DIS0_BUFFER_VALID (1<<0) + +#define SSB0_BUFFER_ADDRESS(x) ((x)&~0x3) +#define SSB0_FORCE_LOAD (1<<1) +#define SSB0_BUFFER_VALID (1<<0) +#define SSB1_BUFFER_LENGTH(x) ((x)&0xff) + +#define MSB0_BUFFER_ADDRESS(x) ((x)&~0x3) +#define MSB0_FORCE_LOAD (1<<1) +#define MSB0_BUFFER_VALID (1<<0) +#define MSB1_BUFFER_LENGTH(x) ((x)&0xff) + +#define PSP0_BUFFER_ADDRESS(x) ((x)&~0x3) +#define PSP0_FORCE_LOAD (1<<1) +#define PSP0_BUFFER_VALID (1<<0) +#define PSP1_BUFFER_LENGTH(x) ((x)&0xff) + +#define PSC0_BUFFER_ADDRESS(x) ((x)&~0x3) +#define PSC0_FORCE_LOAD (1<<1) +#define PSC0_BUFFER_VALID (1<<0) +#define PSC1_BUFFER_LENGTH(x) ((x)&0xff) + + + + + +/* _3DSTATE_RASTERIZATION_RULES */ +#define _3DSTATE_RASTER_RULES_CMD (CMD_3D|(0x07<<24)) +#define ENABLE_POINT_RASTER_RULE (1<<15) +#define OGL_POINT_RASTER_RULE (1<<13) +#define ENABLE_TEXKILL_3D_4D (1<<10) +#define TEXKILL_3D (0<<9) +#define TEXKILL_4D (1<<9) +#define ENABLE_LINE_STRIP_PROVOKE_VRTX (1<<8) +#define ENABLE_TRI_FAN_PROVOKE_VRTX (1<<5) +#define LINE_STRIP_PROVOKE_VRTX(x) ((x)<<6) +#define TRI_FAN_PROVOKE_VRTX(x) ((x)<<3) + +/* _3DSTATE_SCISSOR_ENABLE, p256 */ +#define _3DSTATE_SCISSOR_ENABLE_CMD (CMD_3D|(0x1c<<24)|(0x10<<19)) +#define ENABLE_SCISSOR_RECT ((1<<1) | 1) +#define DISABLE_SCISSOR_RECT (1<<1) + +/* _3DSTATE_SCISSOR_RECTANGLE_0, p257 */ +#define _3DSTATE_SCISSOR_RECT_0_CMD (CMD_3D|(0x1d<<24)|(0x81<<16)|1) +/* Dword 1 */ +#define SCISSOR_RECT_0_YMIN(x) ((x)<<16) +#define SCISSOR_RECT_0_XMIN(x) (x) +/* Dword 2 */ +#define SCISSOR_RECT_0_YMAX(x) ((x)<<16) +#define SCISSOR_RECT_0_XMAX(x) (x) + +/* p189 */ +#define _3DSTATE_LOAD_STATE_IMMEDIATE_1 ((0x3<<29)|(0x1d<<24)|(0x04<<16)) +#define I1_LOAD_S(n) (1<<(4+n)) + +#define S0_VB_OFFSET_MASK 0xffffffc +#define S0_AUTO_CACHE_INV_DISABLE (1<<0) + +#define S1_VERTEX_WIDTH_SHIFT 24 +#define S1_VERTEX_WIDTH_MASK (0x3f<<24) +#define S1_VERTEX_PITCH_SHIFT 16 +#define S1_VERTEX_PITCH_MASK (0x3f<<16) + +#define TEXCOORDFMT_2D 0x0 +#define TEXCOORDFMT_3D 0x1 +#define TEXCOORDFMT_4D 0x2 +#define TEXCOORDFMT_1D 0x3 +#define TEXCOORDFMT_2D_16 0x4 +#define TEXCOORDFMT_4D_16 0x5 +#define TEXCOORDFMT_NOT_PRESENT 0xf +#define S2_TEXCOORD_FMT0_MASK 0xf +#define S2_TEXCOORD_FMT1_SHIFT 4 +#define S2_TEXCOORD_FMT(unit, type) ((type)<<(unit*4)) +#define S2_TEXCOORD_NONE (~0) + +/* S3 not interesting */ + +#define S4_POINT_WIDTH_SHIFT 23 +#define S4_POINT_WIDTH_MASK (0x1ff<<23) +#define S4_LINE_WIDTH_SHIFT 19 +#define S4_LINE_WIDTH_ONE (0x2<<19) +#define S4_LINE_WIDTH_MASK (0xf<<19) +#define S4_FLATSHADE_ALPHA (1<<18) +#define S4_FLATSHADE_FOG (1<<17) +#define S4_FLATSHADE_SPECULAR (1<<16) +#define S4_FLATSHADE_COLOR (1<<15) +#define S4_CULLMODE_BOTH (0<<13) +#define S4_CULLMODE_NONE (1<<13) +#define S4_CULLMODE_CW (2<<13) +#define S4_CULLMODE_CCW (3<<13) +#define S4_CULLMODE_MASK (3<<13) +#define S4_VFMT_POINT_WIDTH (1<<12) +#define S4_VFMT_SPEC_FOG (1<<11) +#define S4_VFMT_COLOR (1<<10) +#define S4_VFMT_DEPTH_OFFSET (1<<9) +#define S4_VFMT_XYZ (1<<6) +#define S4_VFMT_XYZW (2<<6) 
+#define S4_VFMT_XY (3<<6) +#define S4_VFMT_XYW (4<<6) +#define S4_VFMT_XYZW_MASK (7<<6) +#define S4_FORCE_DEFAULT_DIFFUSE (1<<5) +#define S4_FORCE_DEFAULT_SPECULAR (1<<4) +#define S4_LOCAL_DEPTH_OFFSET_ENABLE (1<<3) +#define S4_VFMT_FOG_PARAM (1<<2) +#define S4_SPRITE_POINT_ENABLE (1<<1) +#define S4_LINE_ANTIALIAS_ENABLE (1<<0) + +#define S4_VFMT_MASK (S4_VFMT_POINT_WIDTH | \ + S4_VFMT_SPEC_FOG | \ + S4_VFMT_COLOR | \ + S4_VFMT_DEPTH_OFFSET | \ + S4_VFMT_XYZW_MASK | \ + S4_VFMT_FOG_PARAM) + + +#define S5_WRITEDISABLE_ALPHA (1<<31) +#define S5_WRITEDISABLE_RED (1<<30) +#define S5_WRITEDISABLE_GREEN (1<<29) +#define S5_WRITEDISABLE_BLUE (1<<28) +#define S5_WRITEDISABLE_MASK (0xf<<28) +#define S5_FORCE_DEFAULT_POINT_SIZE (1<<27) +#define S5_LAST_PIXEL_ENABLE (1<<26) +#define S5_GLOBAL_DEPTH_OFFSET_ENABLE (1<<25) +#define S5_FOG_ENABLE (1<<24) +#define S5_STENCIL_REF_SHIFT 16 +#define S5_STENCIL_REF_MASK (0xff<<16) +#define S5_STENCIL_TEST_FUNC_SHIFT 13 +#define S5_STENCIL_TEST_FUNC_MASK (0x7<<13) +#define S5_STENCIL_FAIL_SHIFT 10 +#define S5_STENCIL_FAIL_MASK (0x7<<10) +#define S5_STENCIL_PASS_Z_FAIL_SHIFT 7 +#define S5_STENCIL_PASS_Z_FAIL_MASK (0x7<<7) +#define S5_STENCIL_PASS_Z_PASS_SHIFT 4 +#define S5_STENCIL_PASS_Z_PASS_MASK (0x7<<4) +#define S5_STENCIL_WRITE_ENABLE (1<<3) +#define S5_STENCIL_TEST_ENABLE (1<<2) +#define S5_COLOR_DITHER_ENABLE (1<<1) +#define S5_LOGICOP_ENABLE (1<<0) + + +#define S6_ALPHA_TEST_ENABLE (1<<31) +#define S6_ALPHA_TEST_FUNC_SHIFT 28 +#define S6_ALPHA_TEST_FUNC_MASK (0x7<<28) +#define S6_ALPHA_REF_SHIFT 20 +#define S6_ALPHA_REF_MASK (0xff<<20) +#define S6_DEPTH_TEST_ENABLE (1<<19) +#define S6_DEPTH_TEST_FUNC_SHIFT 16 +#define S6_DEPTH_TEST_FUNC_MASK (0x7<<16) +#define S6_CBUF_BLEND_ENABLE (1<<15) +#define S6_CBUF_BLEND_FUNC_SHIFT 12 +#define S6_CBUF_BLEND_FUNC_MASK (0x7<<12) +#define S6_CBUF_SRC_BLEND_FACT_SHIFT 8 +#define S6_CBUF_SRC_BLEND_FACT_MASK (0xf<<8) +#define S6_CBUF_DST_BLEND_FACT_SHIFT 4 +#define S6_CBUF_DST_BLEND_FACT_MASK (0xf<<4) +#define S6_DEPTH_WRITE_ENABLE (1<<3) +#define S6_COLOR_WRITE_ENABLE (1<<2) +#define S6_TRISTRIP_PV_SHIFT 0 +#define S6_TRISTRIP_PV_MASK (0x3<<0) + +#define S7_DEPTH_OFFSET_CONST_MASK ~0 + + + +#define DST_BLND_FACT(f) ((f)<<S6_CBUF_DST_BLEND_FACT_SHIFT) +#define SRC_BLND_FACT(f) ((f)<<S6_CBUF_SRC_BLEND_FACT_SHIFT) +#define DST_ABLND_FACT(f) ((f)<<IAB_DST_FACTOR_SHIFT) +#define SRC_ABLND_FACT(f) ((f)<<IAB_SRC_FACTOR_SHIFT) + + + + +/* 3DSTATE_MAP_DEINTERLACER_PARAMETERS */ + +/* 3DSTATE_MAP_PALETTE_LOAD_32, p206 */ +#define _3DSTATE_MAP_PALETTE_LOAD_32 (CMD_3D|(0x1d<<24)|(0x8f<<16)) +/* subsequent dwords up to length (max 16) are ARGB8888 color values */ + +/* _3DSTATE_MODES_4, p218 */ +#define _3DSTATE_MODES_4_CMD (CMD_3D|(0x0d<<24)) +#define ENABLE_LOGIC_OP_FUNC (1<<23) +#define LOGIC_OP_FUNC(x) ((x)<<18) +#define LOGICOP_MASK (0xf<<18) +#define MODE4_ENABLE_STENCIL_TEST_MASK ((1<<17)|(0xff00)) +#define ENABLE_STENCIL_TEST_MASK (1<<17) +#define STENCIL_TEST_MASK(x) (((x)&0xff)<<8) +#define MODE4_ENABLE_STENCIL_WRITE_MASK ((1<<16)|(0x00ff)) +#define ENABLE_STENCIL_WRITE_MASK (1<<16) +#define STENCIL_WRITE_MASK(x) ((x)&0xff) + +/* _3DSTATE_MODES_5, p220 */ +#define _3DSTATE_MODES_5_CMD (CMD_3D|(0x0c<<24)) +#define PIPELINE_FLUSH_RENDER_CACHE (1<<18) +#define PIPELINE_FLUSH_TEXTURE_CACHE (1<<16) + + +/* p221 */ +#define _3DSTATE_PIXEL_SHADER_CONSTANTS (CMD_3D|(0x1d<<24)|(0x6<<16)) +#define PS1_REG(n) (1<<(n)) +#define PS2_CONST_X(n) (n) +#define PS3_CONST_Y(n) (n) +#define PS4_CONST_Z(n) (n) +#define PS5_CONST_W(n) (n) + 
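+/* As an illustration only (the local name below is arbitrary), a basic
+ * alpha-blending setup for the S6 dword above might be assembled from
+ * these fields as:
+ *
+ *    unsigned LIS6 = S6_CBUF_BLEND_ENABLE |
+ *                    (BLENDFUNC_ADD << S6_CBUF_BLEND_FUNC_SHIFT) |
+ *                    SRC_BLND_FACT(BLENDFACT_SRC_ALPHA) |
+ *                    DST_BLND_FACT(BLENDFACT_INV_SRC_ALPHA) |
+ *                    S6_COLOR_WRITE_ENABLE |
+ *                    S6_DEPTH_TEST_ENABLE | S6_DEPTH_WRITE_ENABLE;
+ *
+ * and would typically be loaded with a _3DSTATE_LOAD_STATE_IMMEDIATE_1
+ * packet that has I1_LOAD_S(6) set.
+ */
+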
+/* p222 */ + + +#define I915_MAX_TEX_INDIRECT 4 +#define I915_MAX_TEX_INSN 32 +#define I915_MAX_ALU_INSN 64 +#define I915_MAX_DECL_INSN 27 +#define I915_MAX_TEMPORARY 16 + + +/* Each instruction is 3 dwords long, though most don't require all + * this space. Maximum of 123 instructions. Smaller maxes per insn + * type. + */ +#define _3DSTATE_PIXEL_SHADER_PROGRAM (CMD_3D|(0x1d<<24)|(0x5<<16)) + +#define REG_TYPE_R 0 /* temporary regs, no need to + * dcl, must be written before + * read -- Preserved between + * phases. + */ +#define REG_TYPE_T 1 /* Interpolated values, must be + * dcl'ed before use. + * + * 0..7: texture coord, + * 8: diffuse spec, + * 9: specular color, + * 10: fog parameter in w. + */ +#define REG_TYPE_CONST 2 /* Restriction: only one const + * can be referenced per + * instruction, though it may be + * selected for multiple inputs. + * Constants not initialized + * default to zero. + */ +#define REG_TYPE_S 3 /* sampler */ +#define REG_TYPE_OC 4 /* output color (rgba) */ +#define REG_TYPE_OD 5 /* output depth (w), xyz are + * temporaries. If not written, + * interpolated depth is used? + */ +#define REG_TYPE_U 6 /* unpreserved temporaries */ +#define REG_TYPE_MASK 0x7 +#define REG_NR_MASK 0xf + + +/* REG_TYPE_T: + */ +#define T_TEX0 0 +#define T_TEX1 1 +#define T_TEX2 2 +#define T_TEX3 3 +#define T_TEX4 4 +#define T_TEX5 5 +#define T_TEX6 6 +#define T_TEX7 7 +#define T_DIFFUSE 8 +#define T_SPECULAR 9 +#define T_FOG_W 10 /* interpolated fog is in W coord */ + +/* Arithmetic instructions */ + +/* .replicate_swizzle == selection and replication of a particular + * scalar channel, ie., .xxxx, .yyyy, .zzzz or .wwww + */ +#define A0_NOP (0x0<<24) /* no operation */ +#define A0_ADD (0x1<<24) /* dst = src0 + src1 */ +#define A0_MOV (0x2<<24) /* dst = src0 */ +#define A0_MUL (0x3<<24) /* dst = src0 * src1 */ +#define A0_MAD (0x4<<24) /* dst = src0 * src1 + src2 */ +#define A0_DP2ADD (0x5<<24) /* dst.xyzw = src0.xy dot src1.xy + src2.replicate_swizzle */ +#define A0_DP3 (0x6<<24) /* dst.xyzw = src0.xyz dot src1.xyz */ +#define A0_DP4 (0x7<<24) /* dst.xyzw = src0.xyzw dot src1.xyzw */ +#define A0_FRC (0x8<<24) /* dst = src0 - floor(src0) */ +#define A0_RCP (0x9<<24) /* dst.xyzw = 1/(src0.replicate_swizzle) */ +#define A0_RSQ (0xa<<24) /* dst.xyzw = 1/(sqrt(abs(src0.replicate_swizzle))) */ +#define A0_EXP (0xb<<24) /* dst.xyzw = exp2(src0.replicate_swizzle) */ +#define A0_LOG (0xc<<24) /* dst.xyzw = log2(abs(src0.replicate_swizzle)) */ +#define A0_CMP (0xd<<24) /* dst = (src0 >= 0.0) ? src1 : src2 */ +#define A0_MIN (0xe<<24) /* dst = (src0 < src1) ? src0 : src1 */ +#define A0_MAX (0xf<<24) /* dst = (src0 >= src1) ? src0 : src1 */ +#define A0_FLR (0x10<<24) /* dst = floor(src0) */ +#define A0_MOD (0x11<<24) /* dst = src0 fmod 1.0 */ +#define A0_TRC (0x12<<24) /* dst = int(src0) */ +#define A0_SGE (0x13<<24) /* dst = src0 >= src1 ? 1.0 : 0.0 */ +#define A0_SLT (0x14<<24) /* dst = src0 < src1 ? 
1.0 : 0.0 */ +#define A0_DEST_SATURATE (1<<22) +#define A0_DEST_TYPE_SHIFT 19 +/* Allow: R, OC, OD, U */ +#define A0_DEST_NR_SHIFT 14 +/* Allow R: 0..15, OC,OD: 0..0, U: 0..2 */ +#define A0_DEST_CHANNEL_X (1<<10) +#define A0_DEST_CHANNEL_Y (2<<10) +#define A0_DEST_CHANNEL_Z (4<<10) +#define A0_DEST_CHANNEL_W (8<<10) +#define A0_DEST_CHANNEL_ALL (0xf<<10) +#define A0_DEST_CHANNEL_SHIFT 10 +#define A0_SRC0_TYPE_SHIFT 7 +#define A0_SRC0_NR_SHIFT 2 + +#define A0_DEST_CHANNEL_XY (A0_DEST_CHANNEL_X|A0_DEST_CHANNEL_Y) +#define A0_DEST_CHANNEL_XYZ (A0_DEST_CHANNEL_XY|A0_DEST_CHANNEL_Z) + + +#define SRC_X 0 +#define SRC_Y 1 +#define SRC_Z 2 +#define SRC_W 3 +#define SRC_ZERO 4 +#define SRC_ONE 5 + +#define A1_SRC0_CHANNEL_X_NEGATE (1<<31) +#define A1_SRC0_CHANNEL_X_SHIFT 28 +#define A1_SRC0_CHANNEL_Y_NEGATE (1<<27) +#define A1_SRC0_CHANNEL_Y_SHIFT 24 +#define A1_SRC0_CHANNEL_Z_NEGATE (1<<23) +#define A1_SRC0_CHANNEL_Z_SHIFT 20 +#define A1_SRC0_CHANNEL_W_NEGATE (1<<19) +#define A1_SRC0_CHANNEL_W_SHIFT 16 +#define A1_SRC1_TYPE_SHIFT 13 +#define A1_SRC1_NR_SHIFT 8 +#define A1_SRC1_CHANNEL_X_NEGATE (1<<7) +#define A1_SRC1_CHANNEL_X_SHIFT 4 +#define A1_SRC1_CHANNEL_Y_NEGATE (1<<3) +#define A1_SRC1_CHANNEL_Y_SHIFT 0 + +#define A2_SRC1_CHANNEL_Z_NEGATE (1<<31) +#define A2_SRC1_CHANNEL_Z_SHIFT 28 +#define A2_SRC1_CHANNEL_W_NEGATE (1<<27) +#define A2_SRC1_CHANNEL_W_SHIFT 24 +#define A2_SRC2_TYPE_SHIFT 21 +#define A2_SRC2_NR_SHIFT 16 +#define A2_SRC2_CHANNEL_X_NEGATE (1<<15) +#define A2_SRC2_CHANNEL_X_SHIFT 12 +#define A2_SRC2_CHANNEL_Y_NEGATE (1<<11) +#define A2_SRC2_CHANNEL_Y_SHIFT 8 +#define A2_SRC2_CHANNEL_Z_NEGATE (1<<7) +#define A2_SRC2_CHANNEL_Z_SHIFT 4 +#define A2_SRC2_CHANNEL_W_NEGATE (1<<3) +#define A2_SRC2_CHANNEL_W_SHIFT 0 + + + +/* Texture instructions */ +#define T0_TEXLD (0x15<<24) /* Sample texture using predeclared + * sampler and address, and output + * filtered texel data to destination + * register */ +#define T0_TEXLDP (0x16<<24) /* Same as texld but performs a + * perspective divide of the texture + * coordinate .xyz values by .w before + * sampling. */ +#define T0_TEXLDB (0x17<<24) /* Same as texld but biases the + * computed LOD by w. Only S4.6 two's + * comp is used. This implies that a + * float to fixed conversion is + * done. */ +#define T0_TEXKILL (0x18<<24) /* Does not perform a sampling + * operation. Simply kills the pixel + * if any channel of the address + * register is < 0.0. */ +#define T0_DEST_TYPE_SHIFT 19 +/* Allow: R, OC, OD, U */ +/* Note: U (unpreserved) regs do not retain their values between + * phases (cannot be used for feedback) + * + * Note: oC and OD registers can only be used as the destination of a + * texture instruction once per phase (this is an implementation + * restriction). + */ +#define T0_DEST_NR_SHIFT 14 +/* Allow R: 0..15, OC,OD: 0..0, U: 0..2 */ +#define T0_SAMPLER_NR_SHIFT 0 /* This field ignored for TEXKILL */ +#define T0_SAMPLER_NR_MASK (0xf<<0) + +#define T1_ADDRESS_REG_TYPE_SHIFT 24 /* Reg to use as texture coord */ +/* Allow R, T, OC, OD -- R, OC, OD are 'dependent' reads, new program phase */ +#define T1_ADDRESS_REG_NR_SHIFT 17 +#define T2_MBZ 0 + +/* Declaration instructions */ +#define D0_DCL (0x19<<24) /* Declare a t (interpolated attrib) + * register or an s (sampler) + * register. 
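+ * Like the arithmetic and texture instructions above, a declaration
+ * occupies three dwords; the trailing two must be zero (D1_MBZ and
+ * D2_MBZ below).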
*/ +#define D0_SAMPLE_TYPE_SHIFT 22 +#define D0_SAMPLE_TYPE_2D (0x0<<22) +#define D0_SAMPLE_TYPE_CUBE (0x1<<22) +#define D0_SAMPLE_TYPE_VOLUME (0x2<<22) +#define D0_SAMPLE_TYPE_MASK (0x3<<22) + +#define D0_TYPE_SHIFT 19 +/* Allow: T, S */ +#define D0_NR_SHIFT 14 +/* Allow T: 0..10, S: 0..15 */ +#define D0_CHANNEL_X (1<<10) +#define D0_CHANNEL_Y (2<<10) +#define D0_CHANNEL_Z (4<<10) +#define D0_CHANNEL_W (8<<10) +#define D0_CHANNEL_ALL (0xf<<10) +#define D0_CHANNEL_NONE (0<<10) + +#define D0_CHANNEL_XY (D0_CHANNEL_X|D0_CHANNEL_Y) +#define D0_CHANNEL_XYZ (D0_CHANNEL_XY|D0_CHANNEL_Z) + +/* I915 Errata: Do not allow (xz), (xw), (xzw) combinations for diffuse + * or specular declarations. + * + * For T dcls, only allow: (x), (xy), (xyz), (w), (xyzw) + * + * Must be zero for S (sampler) dcls + */ +#define D1_MBZ 0 +#define D2_MBZ 0 + + + +/* p207 */ +#define _3DSTATE_MAP_STATE (CMD_3D|(0x1d<<24)|(0x0<<16)) + +#define MS1_MAPMASK_SHIFT 0 +#define MS1_MAPMASK_MASK (0x8fff<<0) + +#define MS2_UNTRUSTED_SURFACE (1<<31) +#define MS2_ADDRESS_MASK 0xfffffffc +#define MS2_VERTICAL_LINE_STRIDE (1<<1) +#define MS2_VERTICAL_OFFSET (1<<1) + +#define MS3_HEIGHT_SHIFT 21 +#define MS3_WIDTH_SHIFT 10 +#define MS3_PALETTE_SELECT (1<<9) +#define MS3_MAPSURF_FORMAT_SHIFT 7 +#define MS3_MAPSURF_FORMAT_MASK (0x7<<7) +#define MAPSURF_8BIT (1<<7) +#define MAPSURF_16BIT (2<<7) +#define MAPSURF_32BIT (3<<7) +#define MAPSURF_422 (5<<7) +#define MAPSURF_COMPRESSED (6<<7) +#define MAPSURF_4BIT_INDEXED (7<<7) +#define MS3_MT_FORMAT_MASK (0x7 << 3) +#define MS3_MT_FORMAT_SHIFT 3 +#define MT_4BIT_IDX_ARGB8888 (7<<3) /* SURFACE_4BIT_INDEXED */ +#define MT_8BIT_I8 (0<<3) /* SURFACE_8BIT */ +#define MT_8BIT_L8 (1<<3) +#define MT_8BIT_A8 (4<<3) +#define MT_8BIT_MONO8 (5<<3) +#define MT_16BIT_RGB565 (0<<3) /* SURFACE_16BIT */ +#define MT_16BIT_ARGB1555 (1<<3) +#define MT_16BIT_ARGB4444 (2<<3) +#define MT_16BIT_AY88 (3<<3) +#define MT_16BIT_88DVDU (5<<3) +#define MT_16BIT_BUMP_655LDVDU (6<<3) +#define MT_16BIT_I16 (7<<3) +#define MT_16BIT_L16 (8<<3) +#define MT_16BIT_A16 (9<<3) +#define MT_32BIT_ARGB8888 (0<<3) /* SURFACE_32BIT */ +#define MT_32BIT_ABGR8888 (1<<3) +#define MT_32BIT_XRGB8888 (2<<3) +#define MT_32BIT_XBGR8888 (3<<3) +#define MT_32BIT_QWVU8888 (4<<3) +#define MT_32BIT_AXVU8888 (5<<3) +#define MT_32BIT_LXVU8888 (6<<3) +#define MT_32BIT_XLVU8888 (7<<3) +#define MT_32BIT_ARGB2101010 (8<<3) +#define MT_32BIT_ABGR2101010 (9<<3) +#define MT_32BIT_AWVU2101010 (0xA<<3) +#define MT_32BIT_GR1616 (0xB<<3) +#define MT_32BIT_VU1616 (0xC<<3) +#define MT_32BIT_xI824 (0xD<<3) +#define MT_32BIT_xA824 (0xE<<3) +#define MT_32BIT_xL824 (0xF<<3) +#define MT_422_YCRCB_SWAPY (0<<3) /* SURFACE_422 */ +#define MT_422_YCRCB_NORMAL (1<<3) +#define MT_422_YCRCB_SWAPUV (2<<3) +#define MT_422_YCRCB_SWAPUVY (3<<3) +#define MT_COMPRESS_DXT1 (0<<3) /* SURFACE_COMPRESSED */ +#define MT_COMPRESS_DXT2_3 (1<<3) +#define MT_COMPRESS_DXT4_5 (2<<3) +#define MT_COMPRESS_FXT1 (3<<3) +#define MT_COMPRESS_DXT1_RGB (4<<3) +#define MS3_USE_FENCE_REGS (1<<2) +#define MS3_TILED_SURFACE (1<<1) +#define MS3_TILE_WALK (1<<0) + +#define MS4_PITCH_SHIFT 21 +#define MS4_CUBE_FACE_ENA_NEGX (1<<20) +#define MS4_CUBE_FACE_ENA_POSX (1<<19) +#define MS4_CUBE_FACE_ENA_NEGY (1<<18) +#define MS4_CUBE_FACE_ENA_POSY (1<<17) +#define MS4_CUBE_FACE_ENA_NEGZ (1<<16) +#define MS4_CUBE_FACE_ENA_POSZ (1<<15) +#define MS4_CUBE_FACE_ENA_MASK (0x3f<<15) +#define MS4_MAX_LOD_SHIFT 9 +#define MS4_MAX_LOD_MASK (0x3f<<9) +#define MS4_MIP_LAYOUT_LEGACY (0<<8) +#define MS4_MIP_LAYOUT_BELOW_LPT 
(0<<8) +#define MS4_MIP_LAYOUT_RIGHT_LPT (1<<8) +#define MS4_VOLUME_DEPTH_SHIFT 0 +#define MS4_VOLUME_DEPTH_MASK (0xff<<0) + +/* p244 */ +#define _3DSTATE_SAMPLER_STATE (CMD_3D|(0x1d<<24)|(0x1<<16)) + +#define SS1_MAPMASK_SHIFT 0 +#define SS1_MAPMASK_MASK (0x8fff<<0) + +#define SS2_REVERSE_GAMMA_ENABLE (1<<31) +#define SS2_PACKED_TO_PLANAR_ENABLE (1<<30) +#define SS2_COLORSPACE_CONVERSION (1<<29) +#define SS2_CHROMAKEY_SHIFT 27 +#define SS2_BASE_MIP_LEVEL_SHIFT 22 +#define SS2_BASE_MIP_LEVEL_MASK (0x1f<<22) +#define SS2_MIP_FILTER_SHIFT 20 +#define SS2_MIP_FILTER_MASK (0x3<<20) +#define MIPFILTER_NONE 0 +#define MIPFILTER_NEAREST 1 +#define MIPFILTER_LINEAR 3 +#define SS2_MAG_FILTER_SHIFT 17 +#define SS2_MAG_FILTER_MASK (0x7<<17) +#define FILTER_NEAREST 0 +#define FILTER_LINEAR 1 +#define FILTER_ANISOTROPIC 2 +#define FILTER_4X4_1 3 +#define FILTER_4X4_2 4 +#define FILTER_4X4_FLAT 5 +#define FILTER_6X5_MONO 6 /* XXX - check */ +#define SS2_MIN_FILTER_SHIFT 14 +#define SS2_MIN_FILTER_MASK (0x7<<14) +#define SS2_LOD_BIAS_SHIFT 5 +#define SS2_LOD_BIAS_ONE (0x10<<5) +#define SS2_LOD_BIAS_MASK (0x1ff<<5) +/* Shadow requires: + * MT_X8{I,L,A}24 or MT_{I,L,A}16 texture format + * FILTER_4X4_x MIN and MAG filters + */ +#define SS2_SHADOW_ENABLE (1<<4) +#define SS2_MAX_ANISO_MASK (1<<3) +#define SS2_MAX_ANISO_2 (0<<3) +#define SS2_MAX_ANISO_4 (1<<3) +#define SS2_SHADOW_FUNC_SHIFT 0 +#define SS2_SHADOW_FUNC_MASK (0x7<<0) +/* SS2_SHADOW_FUNC values: see COMPAREFUNC_* */ + +#define SS3_MIN_LOD_SHIFT 24 +#define SS3_MIN_LOD_ONE (0x10<<24) +#define SS3_MIN_LOD_MASK (0xff<<24) +#define SS3_KILL_PIXEL_ENABLE (1<<17) +#define SS3_TCX_ADDR_MODE_SHIFT 12 +#define SS3_TCX_ADDR_MODE_MASK (0x7<<12) +#define TEXCOORDMODE_WRAP 0 +#define TEXCOORDMODE_MIRROR 1 +#define TEXCOORDMODE_CLAMP_EDGE 2 +#define TEXCOORDMODE_CUBE 3 +#define TEXCOORDMODE_CLAMP_BORDER 4 +#define TEXCOORDMODE_MIRROR_ONCE 5 +#define SS3_TCY_ADDR_MODE_SHIFT 9 +#define SS3_TCY_ADDR_MODE_MASK (0x7<<9) +#define SS3_TCZ_ADDR_MODE_SHIFT 6 +#define SS3_TCZ_ADDR_MODE_MASK (0x7<<6) +#define SS3_NORMALIZED_COORDS (1<<5) +#define SS3_TEXTUREMAP_INDEX_SHIFT 1 +#define SS3_TEXTUREMAP_INDEX_MASK (0xf<<1) +#define SS3_DEINTERLACER_ENABLE (1<<0) + +#define SS4_BORDER_COLOR_MASK (~0) + +/* 3DSTATE_SPAN_STIPPLE, p258 + */ +#define _3DSTATE_STIPPLE ((0x3<<29)|(0x1d<<24)|(0x83<<16)) +#define ST1_ENABLE (1<<16) +#define ST1_MASK (0xffff) + +#define _3DSTATE_DEFAULT_Z ((0x3<<29)|(0x1d<<24)|(0x98<<16)) +#define _3DSTATE_DEFAULT_DIFFUSE ((0x3<<29)|(0x1d<<24)|(0x99<<16)) +#define _3DSTATE_DEFAULT_SPECULAR ((0x3<<29)|(0x1d<<24)|(0x9a<<16)) + + +#define MI_FLUSH ((0<<29)|(4<<23)) +#define FLUSH_MAP_CACHE (1<<0) +#define INHIBIT_FLUSH_RENDER_CACHE (1<<2) + + +#define CMD_3D (0x3<<29) + + +#define _3DPRIMITIVE ((0x3<<29)|(0x1f<<24)) +#define PRIM_INDIRECT (1<<23) +#define PRIM_INLINE (0<<23) +#define PRIM_INDIRECT_SEQUENTIAL (0<<17) +#define PRIM_INDIRECT_ELTS (1<<17) + +#define PRIM3D_TRILIST (0x0<<18) +#define PRIM3D_TRISTRIP (0x1<<18) +#define PRIM3D_TRISTRIP_RVRSE (0x2<<18) +#define PRIM3D_TRIFAN (0x3<<18) +#define PRIM3D_POLY (0x4<<18) +#define PRIM3D_LINELIST (0x5<<18) +#define PRIM3D_LINESTRIP (0x6<<18) +#define PRIM3D_RECTLIST (0x7<<18) +#define PRIM3D_POINTLIST (0x8<<18) +#define PRIM3D_DIB (0x9<<18) +#define PRIM3D_MASK (0x1f<<18) + +#define I915PACKCOLOR4444(r,g,b,a) \ + ((((a) & 0xf0) << 8) | (((r) & 0xf0) << 4) | ((g) & 0xf0) | ((b) >> 4)) + +#define I915PACKCOLOR1555(r,g,b,a) \ + ((((r) & 0xf8) << 7) | (((g) & 0xf8) << 2) | (((b) & 0xf8) >> 3) | \ + ((a) ? 
0x8000 : 0)) + +#define I915PACKCOLOR565(r,g,b) \ + ((((r) & 0xf8) << 8) | (((g) & 0xfc) << 3) | (((b) & 0xf8) >> 3)) + +#define I915PACKCOLOR8888(r,g,b,a) \ + ((a<<24) | (r<<16) | (g<<8) | b) + + + + +#define BR00_BITBLT_CLIENT 0x40000000 +#define BR00_OP_COLOR_BLT 0x10000000 +#define BR00_OP_SRC_COPY_BLT 0x10C00000 +#define BR13_SOLID_PATTERN 0x80000000 + +#define XY_COLOR_BLT_CMD ((2<<29)|(0x50<<22)|0x4) +#define XY_COLOR_BLT_WRITE_ALPHA (1<<21) +#define XY_COLOR_BLT_WRITE_RGB (1<<20) + +#define XY_SRC_COPY_BLT_CMD ((2<<29)|(0x53<<22)|6) +#define XY_SRC_COPY_BLT_WRITE_ALPHA (1<<21) +#define XY_SRC_COPY_BLT_WRITE_RGB (1<<20) + +#define MI_WAIT_FOR_EVENT ((0x3<<23)) +#define MI_WAIT_FOR_PLANE_B_FLIP (1<<6) +#define MI_WAIT_FOR_PLANE_A_FLIP (1<<2) + +#define MI_BATCH_BUFFER (0x30<<23) +#define MI_BATCH_BUFFER_START (0x31<<23) +#define MI_BATCH_BUFFER_END (0xa<<23) + + + +#define COMPAREFUNC_ALWAYS 0 +#define COMPAREFUNC_NEVER 0x1 +#define COMPAREFUNC_LESS 0x2 +#define COMPAREFUNC_EQUAL 0x3 +#define COMPAREFUNC_LEQUAL 0x4 +#define COMPAREFUNC_GREATER 0x5 +#define COMPAREFUNC_NOTEQUAL 0x6 +#define COMPAREFUNC_GEQUAL 0x7 + +#define STENCILOP_KEEP 0 +#define STENCILOP_ZERO 0x1 +#define STENCILOP_REPLACE 0x2 +#define STENCILOP_INCRSAT 0x3 +#define STENCILOP_DECRSAT 0x4 +#define STENCILOP_INCR 0x5 +#define STENCILOP_DECR 0x6 +#define STENCILOP_INVERT 0x7 + +#define LOGICOP_CLEAR 0 +#define LOGICOP_NOR 0x1 +#define LOGICOP_AND_INV 0x2 +#define LOGICOP_COPY_INV 0x3 +#define LOGICOP_AND_RVRSE 0x4 +#define LOGICOP_INV 0x5 +#define LOGICOP_XOR 0x6 +#define LOGICOP_NAND 0x7 +#define LOGICOP_AND 0x8 +#define LOGICOP_EQUIV 0x9 +#define LOGICOP_NOOP 0xa +#define LOGICOP_OR_INV 0xb +#define LOGICOP_COPY 0xc +#define LOGICOP_OR_RVRSE 0xd +#define LOGICOP_OR 0xe +#define LOGICOP_SET 0xf + +#define BLENDFACT_ZERO 0x01 +#define BLENDFACT_ONE 0x02 +#define BLENDFACT_SRC_COLR 0x03 +#define BLENDFACT_INV_SRC_COLR 0x04 +#define BLENDFACT_SRC_ALPHA 0x05 +#define BLENDFACT_INV_SRC_ALPHA 0x06 +#define BLENDFACT_DST_ALPHA 0x07 +#define BLENDFACT_INV_DST_ALPHA 0x08 +#define BLENDFACT_DST_COLR 0x09 +#define BLENDFACT_INV_DST_COLR 0x0a +#define BLENDFACT_SRC_ALPHA_SATURATE 0x0b +#define BLENDFACT_CONST_COLOR 0x0c +#define BLENDFACT_INV_CONST_COLOR 0x0d +#define BLENDFACT_CONST_ALPHA 0x0e +#define BLENDFACT_INV_CONST_ALPHA 0x0f +#define BLENDFACT_MASK 0x0f + +#define PCI_CHIP_I915_G 0x2582 +#define PCI_CHIP_I915_GM 0x2592 +#define PCI_CHIP_I945_G 0x2772 +#define PCI_CHIP_I945_GM 0x27A2 +#define PCI_CHIP_I945_GME 0x27AE +#define PCI_CHIP_G33_G 0x29C2 +#define PCI_CHIP_Q35_G 0x29B2 +#define PCI_CHIP_Q33_G 0x29D2 + + +#endif diff --git a/src/gallium/drivers/i915simple/i915_screen.c b/src/gallium/drivers/i915simple/i915_screen.c new file mode 100644 index 0000000000..1c976082df --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_screen.c @@ -0,0 +1,284 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "util/u_memory.h" +#include "pipe/p_winsys.h" +#include "pipe/p_inlines.h" +#include "util/u_string.h" + +#include "i915_reg.h" +#include "i915_context.h" +#include "i915_screen.h" +#include "i915_texture.h" + + +static const char * +i915_get_vendor( struct pipe_screen *pscreen ) +{ + return "Tungsten Graphics, Inc."; +} + + +static const char * +i915_get_name( struct pipe_screen *pscreen ) +{ + static char buffer[128]; + const char *chipset; + + switch (i915_screen(pscreen)->pci_id) { + case PCI_CHIP_I915_G: + chipset = "915G"; + break; + case PCI_CHIP_I915_GM: + chipset = "915GM"; + break; + case PCI_CHIP_I945_G: + chipset = "945G"; + break; + case PCI_CHIP_I945_GM: + chipset = "945GM"; + break; + case PCI_CHIP_I945_GME: + chipset = "945GME"; + break; + case PCI_CHIP_G33_G: + chipset = "G33"; + break; + case PCI_CHIP_Q35_G: + chipset = "Q35"; + break; + case PCI_CHIP_Q33_G: + chipset = "Q33"; + break; + default: + chipset = "unknown"; + break; + } + + util_snprintf(buffer, sizeof(buffer), "i915 (chipset: %s)", chipset); + return buffer; +} + + +static int +i915_get_param(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return 8; + case PIPE_CAP_NPOT_TEXTURES: + return 1; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 1; + case PIPE_CAP_GLSL: + return 0; + case PIPE_CAP_S3TC: + return 0; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 0; + case PIPE_CAP_POINT_SPRITE: + return 0; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 1; + case PIPE_CAP_OCCLUSION_QUERY: + return 0; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 1; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return 11; /* max 1024x1024 */ + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 8; /* max 128x128x128 */ + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 11; /* max 1024x1024 */ + default: + return 0; + } +} + + +static float +i915_get_paramf(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + /* fall-through */ + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 7.5; + + case PIPE_CAP_MAX_POINT_WIDTH: + /* fall-through */ + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 255.0; + + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 4.0; + + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 16.0; + + default: + return 0; + } +} + + +static boolean 
+i915_is_format_supported( struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, + unsigned geom_flags ) +{ + static const enum pipe_format tex_supported[] = { + PIPE_FORMAT_R8G8B8A8_UNORM, + PIPE_FORMAT_A8R8G8B8_UNORM, + PIPE_FORMAT_R5G6B5_UNORM, + PIPE_FORMAT_L8_UNORM, + PIPE_FORMAT_A8_UNORM, + PIPE_FORMAT_I8_UNORM, + PIPE_FORMAT_A8L8_UNORM, + PIPE_FORMAT_YCBCR, + PIPE_FORMAT_YCBCR_REV, + PIPE_FORMAT_S8Z24_UNORM, + PIPE_FORMAT_NONE /* list terminator */ + }; + static const enum pipe_format surface_supported[] = { + PIPE_FORMAT_A8R8G8B8_UNORM, + PIPE_FORMAT_R5G6B5_UNORM, + PIPE_FORMAT_S8Z24_UNORM, + /*PIPE_FORMAT_R16G16B16A16_SNORM,*/ + PIPE_FORMAT_NONE /* list terminator */ + }; + const enum pipe_format *list; + uint i; + + if(tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) + list = surface_supported; + else + list = tex_supported; + + for (i = 0; list[i] != PIPE_FORMAT_NONE; i++) { + if (list[i] == format) + return TRUE; + } + + return FALSE; +} + + +static void +i915_destroy_screen( struct pipe_screen *screen ) +{ + struct pipe_winsys *winsys = screen->winsys; + + if(winsys->destroy) + winsys->destroy(winsys); + + FREE(screen); +} + + +static void * +i915_surface_map( struct pipe_screen *screen, + struct pipe_surface *surface, + unsigned flags ) +{ + char *map = pipe_buffer_map( screen, surface->buffer, flags ); + if (map == NULL) + return NULL; + + if (surface->texture && + (flags & PIPE_BUFFER_USAGE_CPU_WRITE)) + { + /* Do something to notify contexts of a texture change. + */ + /* i915_screen(screen)->timestamp++; */ + } + + return map + surface->offset; +} + +static void +i915_surface_unmap(struct pipe_screen *screen, + struct pipe_surface *surface) +{ + pipe_buffer_unmap( screen, surface->buffer ); +} + + + +/** + * Create a new i915_screen object + */ +struct pipe_screen * +i915_create_screen(struct pipe_winsys *winsys, uint pci_id) +{ + struct i915_screen *i915screen = CALLOC_STRUCT(i915_screen); + + if (!i915screen) + return NULL; + + switch (pci_id) { + case PCI_CHIP_I915_G: + case PCI_CHIP_I915_GM: + i915screen->is_i945 = FALSE; + break; + + case PCI_CHIP_I945_G: + case PCI_CHIP_I945_GM: + case PCI_CHIP_I945_GME: + case PCI_CHIP_G33_G: + case PCI_CHIP_Q33_G: + case PCI_CHIP_Q35_G: + i915screen->is_i945 = TRUE; + break; + + default: + debug_printf("%s: unknown pci id 0x%x, cannot create screen\n", + __FUNCTION__, pci_id); + return NULL; + } + + i915screen->pci_id = pci_id; + + i915screen->screen.winsys = winsys; + + i915screen->screen.destroy = i915_destroy_screen; + + i915screen->screen.get_name = i915_get_name; + i915screen->screen.get_vendor = i915_get_vendor; + i915screen->screen.get_param = i915_get_param; + i915screen->screen.get_paramf = i915_get_paramf; + i915screen->screen.is_format_supported = i915_is_format_supported; + i915screen->screen.surface_map = i915_surface_map; + i915screen->screen.surface_unmap = i915_surface_unmap; + + i915_init_screen_texture_functions(&i915screen->screen); + + return &i915screen->screen; +} diff --git a/src/gallium/drivers/i915simple/i915_screen.h b/src/gallium/drivers/i915simple/i915_screen.h new file mode 100644 index 0000000000..73b0ff05ce --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_screen.h @@ -0,0 +1,69 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef I915_SCREEN_H +#define I915_SCREEN_H + + +#include "pipe/p_screen.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * Subclass of pipe_screen + */ +struct i915_screen +{ + struct pipe_screen screen; + + boolean is_i945; + uint pci_id; +}; + + +/** cast wrapper */ +static INLINE struct i915_screen * +i915_screen(struct pipe_screen *pscreen) +{ + return (struct i915_screen *) pscreen; +} + + +extern struct pipe_screen * +i915_create_screen(struct pipe_winsys *winsys, uint pci_id); + + +#ifdef __cplusplus +} +#endif + +#endif /* I915_SCREEN_H */ diff --git a/src/gallium/drivers/i915simple/i915_state.c b/src/gallium/drivers/i915simple/i915_state.c new file mode 100644 index 0000000000..d2487d8277 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state.c @@ -0,0 +1,788 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "draw/draw_context.h" +#include "pipe/p_winsys.h" +#include "pipe/p_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "tgsi/tgsi_parse.h" + +#include "i915_context.h" +#include "i915_reg.h" +#include "i915_state.h" +#include "i915_state_inlines.h" +#include "i915_fpc.h" + +/* The i915 (and related graphics cores) do not support GL_CLAMP. The + * Intel drivers for "other operating systems" implement GL_CLAMP as + * GL_CLAMP_TO_EDGE, so the same is done here. + */ +static unsigned +translate_wrap_mode(unsigned wrap) +{ + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + return TEXCOORDMODE_WRAP; + case PIPE_TEX_WRAP_CLAMP: + return TEXCOORDMODE_CLAMP_EDGE; /* not quite correct */ + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return TEXCOORDMODE_CLAMP_EDGE; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + return TEXCOORDMODE_CLAMP_BORDER; +// case PIPE_TEX_WRAP_MIRRORED_REPEAT: +// return TEXCOORDMODE_MIRROR; + default: + return TEXCOORDMODE_WRAP; + } +} + +static unsigned translate_img_filter( unsigned filter ) +{ + switch (filter) { + case PIPE_TEX_FILTER_NEAREST: + return FILTER_NEAREST; + case PIPE_TEX_FILTER_LINEAR: + return FILTER_LINEAR; + case PIPE_TEX_FILTER_ANISO: + return FILTER_ANISOTROPIC; + default: + assert(0); + return FILTER_NEAREST; + } +} + +static unsigned translate_mip_filter( unsigned filter ) +{ + switch (filter) { + case PIPE_TEX_MIPFILTER_NONE: + return MIPFILTER_NONE; + case PIPE_TEX_MIPFILTER_NEAREST: + return MIPFILTER_NEAREST; + case PIPE_TEX_MIPFILTER_LINEAR: + return MIPFILTER_LINEAR; + default: + assert(0); + return MIPFILTER_NONE; + } +} + + +/* None of this state is actually used for anything yet. + */ +static void * +i915_create_blend_state(struct pipe_context *pipe, + const struct pipe_blend_state *blend) +{ + struct i915_blend_state *cso_data = CALLOC_STRUCT( i915_blend_state ); + + { + unsigned eqRGB = blend->rgb_func; + unsigned srcRGB = blend->rgb_src_factor; + unsigned dstRGB = blend->rgb_dst_factor; + + unsigned eqA = blend->alpha_func; + unsigned srcA = blend->alpha_src_factor; + unsigned dstA = blend->alpha_dst_factor; + + /* Special handling for MIN/MAX filter modes handled at + * state_tracker level. 
+ */ + + if (srcA != srcRGB || + dstA != dstRGB || + eqA != eqRGB) { + + cso_data->iab = (_3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD | + IAB_MODIFY_ENABLE | + IAB_ENABLE | + IAB_MODIFY_FUNC | + IAB_MODIFY_SRC_FACTOR | + IAB_MODIFY_DST_FACTOR | + SRC_ABLND_FACT(i915_translate_blend_factor(srcA)) | + DST_ABLND_FACT(i915_translate_blend_factor(dstA)) | + (i915_translate_blend_func(eqA) << IAB_FUNC_SHIFT)); + } + else { + cso_data->iab = (_3DSTATE_INDEPENDENT_ALPHA_BLEND_CMD | + IAB_MODIFY_ENABLE | + 0); + } + } + + cso_data->modes4 |= (_3DSTATE_MODES_4_CMD | + ENABLE_LOGIC_OP_FUNC | + LOGIC_OP_FUNC(i915_translate_logic_op(blend->logicop_func))); + + if (blend->logicop_enable) + cso_data->LIS5 |= S5_LOGICOP_ENABLE; + + if (blend->dither) + cso_data->LIS5 |= S5_COLOR_DITHER_ENABLE; + + if ((blend->colormask & PIPE_MASK_R) == 0) + cso_data->LIS5 |= S5_WRITEDISABLE_RED; + + if ((blend->colormask & PIPE_MASK_G) == 0) + cso_data->LIS5 |= S5_WRITEDISABLE_GREEN; + + if ((blend->colormask & PIPE_MASK_B) == 0) + cso_data->LIS5 |= S5_WRITEDISABLE_BLUE; + + if ((blend->colormask & PIPE_MASK_A) == 0) + cso_data->LIS5 |= S5_WRITEDISABLE_ALPHA; + + if (blend->blend_enable) { + unsigned funcRGB = blend->rgb_func; + unsigned srcRGB = blend->rgb_src_factor; + unsigned dstRGB = blend->rgb_dst_factor; + + cso_data->LIS6 |= (S6_CBUF_BLEND_ENABLE | + SRC_BLND_FACT(i915_translate_blend_factor(srcRGB)) | + DST_BLND_FACT(i915_translate_blend_factor(dstRGB)) | + (i915_translate_blend_func(funcRGB) << S6_CBUF_BLEND_FUNC_SHIFT)); + } + + return cso_data; +} + +static void i915_bind_blend_state(struct pipe_context *pipe, + void *blend) +{ + struct i915_context *i915 = i915_context(pipe); + draw_flush(i915->draw); + + i915->blend = (struct i915_blend_state*)blend; + + i915->dirty |= I915_NEW_BLEND; +} + + +static void i915_delete_blend_state(struct pipe_context *pipe, void *blend) +{ + FREE(blend); +} + +static void i915_set_blend_color( struct pipe_context *pipe, + const struct pipe_blend_color *blend_color ) +{ + struct i915_context *i915 = i915_context(pipe); + draw_flush(i915->draw); + + i915->blend_color = *blend_color; + + i915->dirty |= I915_NEW_BLEND; +} + +static void * +i915_create_sampler_state(struct pipe_context *pipe, + const struct pipe_sampler_state *sampler) +{ + struct i915_sampler_state *cso = CALLOC_STRUCT( i915_sampler_state ); + const unsigned ws = sampler->wrap_s; + const unsigned wt = sampler->wrap_t; + const unsigned wr = sampler->wrap_r; + unsigned minFilt, magFilt; + unsigned mipFilt; + + cso->templ = sampler; + + mipFilt = translate_mip_filter(sampler->min_mip_filter); + minFilt = translate_img_filter( sampler->min_img_filter ); + magFilt = translate_img_filter( sampler->mag_img_filter ); + + if (sampler->max_anisotropy > 2.0) { + cso->state[0] |= SS2_MAX_ANISO_4; + } + + { + int b = (int) (sampler->lod_bias * 16.0); + b = CLAMP(b, -256, 255); + cso->state[0] |= ((b << SS2_LOD_BIAS_SHIFT) & SS2_LOD_BIAS_MASK); + } + + /* Shadow: + */ + if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) + { + cso->state[0] |= (SS2_SHADOW_ENABLE | + i915_translate_compare_func(sampler->compare_func)); + + minFilt = FILTER_4X4_FLAT; + magFilt = FILTER_4X4_FLAT; + } + + cso->state[0] |= ((minFilt << SS2_MIN_FILTER_SHIFT) | + (mipFilt << SS2_MIP_FILTER_SHIFT) | + (magFilt << SS2_MAG_FILTER_SHIFT)); + + cso->state[1] |= + ((translate_wrap_mode(ws) << SS3_TCX_ADDR_MODE_SHIFT) | + (translate_wrap_mode(wt) << SS3_TCY_ADDR_MODE_SHIFT) | + (translate_wrap_mode(wr) << SS3_TCZ_ADDR_MODE_SHIFT)); + + if 
(sampler->normalized_coords) + cso->state[1] |= SS3_NORMALIZED_COORDS; + + { + int minlod = (int) (16.0 * sampler->min_lod); + int maxlod = (int) (16.0 * sampler->max_lod); + minlod = CLAMP(minlod, 0, 16 * 11); + maxlod = CLAMP(maxlod, 0, 16 * 11); + + if (minlod > maxlod) + maxlod = minlod; + + cso->minlod = minlod; + cso->maxlod = maxlod; + } + + { + ubyte r = float_to_ubyte(sampler->border_color[0]); + ubyte g = float_to_ubyte(sampler->border_color[1]); + ubyte b = float_to_ubyte(sampler->border_color[2]); + ubyte a = float_to_ubyte(sampler->border_color[3]); + cso->state[2] = I915PACKCOLOR8888(r, g, b, a); + } + return cso; +} + +static void i915_bind_sampler_states(struct pipe_context *pipe, + unsigned num, void **sampler) +{ + struct i915_context *i915 = i915_context(pipe); + unsigned i; + + assert(num <= PIPE_MAX_SAMPLERS); + + /* Check for no-op */ + if (num == i915->num_samplers && + !memcmp(i915->sampler, sampler, num * sizeof(void *))) + return; + + draw_flush(i915->draw); + + for (i = 0; i < num; ++i) + i915->sampler[i] = sampler[i]; + for (i = num; i < PIPE_MAX_SAMPLERS; ++i) + i915->sampler[i] = NULL; + + i915->num_samplers = num; + + i915->dirty |= I915_NEW_SAMPLER; +} + +static void i915_delete_sampler_state(struct pipe_context *pipe, + void *sampler) +{ + FREE(sampler); +} + + +/** XXX move someday? Or consolidate all these simple state setters + * into one file. + */ + +static void * +i915_create_depth_stencil_state(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *depth_stencil) +{ + struct i915_depth_stencil_state *cso = CALLOC_STRUCT( i915_depth_stencil_state ); + + { + int testmask = depth_stencil->stencil[0].value_mask & 0xff; + int writemask = depth_stencil->stencil[0].write_mask & 0xff; + + cso->stencil_modes4 |= (_3DSTATE_MODES_4_CMD | + ENABLE_STENCIL_TEST_MASK | + STENCIL_TEST_MASK(testmask) | + ENABLE_STENCIL_WRITE_MASK | + STENCIL_WRITE_MASK(writemask)); + } + + if (depth_stencil->stencil[0].enabled) { + int test = i915_translate_compare_func(depth_stencil->stencil[0].func); + int fop = i915_translate_stencil_op(depth_stencil->stencil[0].fail_op); + int dfop = i915_translate_stencil_op(depth_stencil->stencil[0].zfail_op); + int dpop = i915_translate_stencil_op(depth_stencil->stencil[0].zpass_op); + int ref = depth_stencil->stencil[0].ref_value & 0xff; + + cso->stencil_LIS5 |= (S5_STENCIL_TEST_ENABLE | + S5_STENCIL_WRITE_ENABLE | + (ref << S5_STENCIL_REF_SHIFT) | + (test << S5_STENCIL_TEST_FUNC_SHIFT) | + (fop << S5_STENCIL_FAIL_SHIFT) | + (dfop << S5_STENCIL_PASS_Z_FAIL_SHIFT) | + (dpop << S5_STENCIL_PASS_Z_PASS_SHIFT)); + } + + if (depth_stencil->stencil[1].enabled) { + int test = i915_translate_compare_func(depth_stencil->stencil[1].func); + int fop = i915_translate_stencil_op(depth_stencil->stencil[1].fail_op); + int dfop = i915_translate_stencil_op(depth_stencil->stencil[1].zfail_op); + int dpop = i915_translate_stencil_op(depth_stencil->stencil[1].zpass_op); + int ref = depth_stencil->stencil[1].ref_value & 0xff; + int tmask = depth_stencil->stencil[1].value_mask & 0xff; + int wmask = depth_stencil->stencil[1].write_mask & 0xff; + + cso->bfo[0] = (_3DSTATE_BACKFACE_STENCIL_OPS | + BFO_ENABLE_STENCIL_FUNCS | + BFO_ENABLE_STENCIL_TWO_SIDE | + BFO_ENABLE_STENCIL_REF | + BFO_STENCIL_TWO_SIDE | + (ref << BFO_STENCIL_REF_SHIFT) | + (test << BFO_STENCIL_TEST_SHIFT) | + (fop << BFO_STENCIL_FAIL_SHIFT) | + (dfop << BFO_STENCIL_PASS_Z_FAIL_SHIFT) | + (dpop << BFO_STENCIL_PASS_Z_PASS_SHIFT)); + + cso->bfo[1] = (_3DSTATE_BACKFACE_STENCIL_MASKS 
| + BFM_ENABLE_STENCIL_TEST_MASK | + BFM_ENABLE_STENCIL_WRITE_MASK | + (tmask << BFM_STENCIL_TEST_MASK_SHIFT) | + (wmask << BFM_STENCIL_WRITE_MASK_SHIFT)); + } + else { + /* This actually disables two-side stencil: The bit set is a + * modify-enable bit to indicate we are changing the two-side + * setting. Then there is a symbolic zero to show that we are + * setting the flag to zero/off. + */ + cso->bfo[0] = (_3DSTATE_BACKFACE_STENCIL_OPS | + BFO_ENABLE_STENCIL_TWO_SIDE | + 0); + cso->bfo[1] = 0; + } + + if (depth_stencil->depth.enabled) { + int func = i915_translate_compare_func(depth_stencil->depth.func); + + cso->depth_LIS6 |= (S6_DEPTH_TEST_ENABLE | + (func << S6_DEPTH_TEST_FUNC_SHIFT)); + + if (depth_stencil->depth.writemask) + cso->depth_LIS6 |= S6_DEPTH_WRITE_ENABLE; + } + + if (depth_stencil->alpha.enabled) { + int test = i915_translate_compare_func(depth_stencil->alpha.func); + ubyte refByte = float_to_ubyte(depth_stencil->alpha.ref); + + cso->depth_LIS6 |= (S6_ALPHA_TEST_ENABLE | + (test << S6_ALPHA_TEST_FUNC_SHIFT) | + (((unsigned) refByte) << S6_ALPHA_REF_SHIFT)); + } + + return cso; +} + +static void i915_bind_depth_stencil_state(struct pipe_context *pipe, + void *depth_stencil) +{ + struct i915_context *i915 = i915_context(pipe); + draw_flush(i915->draw); + + i915->depth_stencil = (const struct i915_depth_stencil_state *)depth_stencil; + + i915->dirty |= I915_NEW_DEPTH_STENCIL; +} + +static void i915_delete_depth_stencil_state(struct pipe_context *pipe, + void *depth_stencil) +{ + FREE(depth_stencil); +} + + +static void i915_set_scissor_state( struct pipe_context *pipe, + const struct pipe_scissor_state *scissor ) +{ + struct i915_context *i915 = i915_context(pipe); + draw_flush(i915->draw); + + memcpy( &i915->scissor, scissor, sizeof(*scissor) ); + i915->dirty |= I915_NEW_SCISSOR; +} + + +static void i915_set_polygon_stipple( struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple ) +{ +} + + + +static void * +i915_create_fs_state(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct i915_context *i915 = i915_context(pipe); + struct i915_fragment_shader *ifs = CALLOC_STRUCT(i915_fragment_shader); + if (!ifs) + return NULL; + + ifs->state.tokens = tgsi_dup_tokens(templ->tokens); + + tgsi_scan_shader(templ->tokens, &ifs->info); + + /* The shader's compiled to i915 instructions here */ + i915_translate_fragment_program(i915, ifs); + + return ifs; +} + +static void +i915_bind_fs_state(struct pipe_context *pipe, void *shader) +{ + struct i915_context *i915 = i915_context(pipe); + draw_flush(i915->draw); + + i915->fs = (struct i915_fragment_shader*) shader; + + i915->dirty |= I915_NEW_FS; +} + +static +void i915_delete_fs_state(struct pipe_context *pipe, void *shader) +{ + struct i915_fragment_shader *ifs = (struct i915_fragment_shader *) shader; + + if (ifs->program) + FREE(ifs->program); + ifs->program_len = 0; + + FREE((struct tgsi_token *)ifs->state.tokens); + + FREE(ifs); +} + + +static void * +i915_create_vs_state(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct i915_context *i915 = i915_context(pipe); + + /* just pass-through to draw module */ + return draw_create_vertex_shader(i915->draw, templ); +} + +static void i915_bind_vs_state(struct pipe_context *pipe, void *shader) +{ + struct i915_context *i915 = i915_context(pipe); + + /* just pass-through to draw module */ + draw_bind_vertex_shader(i915->draw, (struct draw_vertex_shader *) shader); + + i915->dirty |= I915_NEW_VS; +} + +static void 
i915_delete_vs_state(struct pipe_context *pipe, void *shader) +{ + struct i915_context *i915 = i915_context(pipe); + + /* just pass-through to draw module */ + draw_delete_vertex_shader(i915->draw, (struct draw_vertex_shader *) shader); +} + +static void i915_set_constant_buffer(struct pipe_context *pipe, + uint shader, uint index, + const struct pipe_constant_buffer *buf) +{ + struct i915_context *i915 = i915_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + draw_flush(i915->draw); + + assert(shader < PIPE_SHADER_TYPES); + assert(index == 0); + + /* Make a copy of shader constants. + * During fragment program translation we may add additional + * constants to the array. + * + * We want to consider the situation where some user constants + * (ex: a material color) may change frequently but the shader program + * stays the same. In that case we should only be updating the first + * N constants, leaving any extras from shader translation alone. + */ + if (buf) { + void *mapped; + if (buf->size && + (mapped = ws->buffer_map(ws, buf->buffer, + PIPE_BUFFER_USAGE_CPU_READ))) { + memcpy(i915->current.constants[shader], mapped, buf->size); + ws->buffer_unmap(ws, buf->buffer); + i915->current.num_user_constants[shader] + = buf->size / (4 * sizeof(float)); + } + else { + i915->current.num_user_constants[shader] = 0; + } + } + + i915->dirty |= I915_NEW_CONSTANTS; +} + + +static void i915_set_sampler_textures(struct pipe_context *pipe, + unsigned num, + struct pipe_texture **texture) +{ + struct i915_context *i915 = i915_context(pipe); + uint i; + + assert(num <= PIPE_MAX_SAMPLERS); + + /* Check for no-op */ + if (num == i915->num_textures && + !memcmp(i915->texture, texture, num * sizeof(struct pipe_texture *))) + return; + + /* Fixes wrong texture in texobj with VBUF */ + draw_flush(i915->draw); + + for (i = 0; i < num; i++) + pipe_texture_reference((struct pipe_texture **) &i915->texture[i], + texture[i]); + + for (i = num; i < i915->num_textures; i++) + pipe_texture_reference((struct pipe_texture **) &i915->texture[i], + NULL); + + i915->num_textures = num; + + i915->dirty |= I915_NEW_TEXTURE; +} + + + +static void i915_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct i915_context *i915 = i915_context(pipe); + draw_flush(i915->draw); + + i915->framebuffer = *fb; /* struct copy */ + + i915->dirty |= I915_NEW_FRAMEBUFFER; +} + + + +static void i915_set_clip_state( struct pipe_context *pipe, + const struct pipe_clip_state *clip ) +{ + struct i915_context *i915 = i915_context(pipe); + draw_flush(i915->draw); + + draw_set_clip_state(i915->draw, clip); + + i915->dirty |= I915_NEW_CLIP; +} + + + +/* Called when driver state tracker notices changes to the viewport + * matrix: + */ +static void i915_set_viewport_state( struct pipe_context *pipe, + const struct pipe_viewport_state *viewport ) +{ + struct i915_context *i915 = i915_context(pipe); + + i915->viewport = *viewport; /* struct copy */ + + /* pass the viewport info to the draw module */ + draw_set_viewport_state(i915->draw, &i915->viewport); + + i915->dirty |= I915_NEW_VIEWPORT; +} + + +static void * +i915_create_rasterizer_state(struct pipe_context *pipe, + const struct pipe_rasterizer_state *rasterizer) +{ + struct i915_rasterizer_state *cso = CALLOC_STRUCT( i915_rasterizer_state ); + + cso->templ = rasterizer; + cso->color_interp = rasterizer->flatshade ? 
INTERP_CONSTANT : INTERP_LINEAR; + cso->light_twoside = rasterizer->light_twoside; + cso->ds[0].u = _3DSTATE_DEPTH_OFFSET_SCALE; + cso->ds[1].f = rasterizer->offset_scale; + if (rasterizer->poly_stipple_enable) { + cso->st |= ST1_ENABLE; + } + + if (rasterizer->scissor) + cso->sc[0] = _3DSTATE_SCISSOR_ENABLE_CMD | ENABLE_SCISSOR_RECT; + else + cso->sc[0] = _3DSTATE_SCISSOR_ENABLE_CMD | DISABLE_SCISSOR_RECT; + + switch (rasterizer->cull_mode) { + case PIPE_WINDING_NONE: + cso->LIS4 |= S4_CULLMODE_NONE; + break; + case PIPE_WINDING_CW: + cso->LIS4 |= S4_CULLMODE_CW; + break; + case PIPE_WINDING_CCW: + cso->LIS4 |= S4_CULLMODE_CCW; + break; + case PIPE_WINDING_BOTH: + cso->LIS4 |= S4_CULLMODE_BOTH; + break; + } + + { + int line_width = CLAMP((int)(rasterizer->line_width * 2), 1, 0xf); + + cso->LIS4 |= line_width << S4_LINE_WIDTH_SHIFT; + + if (rasterizer->line_smooth) + cso->LIS4 |= S4_LINE_ANTIALIAS_ENABLE; + } + + { + int point_size = CLAMP((int) rasterizer->point_size, 1, 0xff); + + cso->LIS4 |= point_size << S4_POINT_WIDTH_SHIFT; + } + + if (rasterizer->flatshade) { + cso->LIS4 |= (S4_FLATSHADE_ALPHA | + S4_FLATSHADE_COLOR | + S4_FLATSHADE_SPECULAR); + } + + cso->LIS7 = fui( rasterizer->offset_units ); + + + return cso; +} + +static void i915_bind_rasterizer_state( struct pipe_context *pipe, + void *raster ) +{ + struct i915_context *i915 = i915_context(pipe); + + i915->rasterizer = (struct i915_rasterizer_state *)raster; + + /* pass-through to draw module */ + draw_set_rasterizer_state(i915->draw, + (i915->rasterizer ? i915->rasterizer->templ : NULL)); + + i915->dirty |= I915_NEW_RASTERIZER; +} + +static void i915_delete_rasterizer_state(struct pipe_context *pipe, + void *raster) +{ + FREE(raster); +} + +static void i915_set_vertex_buffers(struct pipe_context *pipe, + unsigned count, + const struct pipe_vertex_buffer *buffers) +{ + struct i915_context *i915 = i915_context(pipe); + /* Because we change state before the draw_set_vertex_buffers call + * we need a flush here, just to be sure. + */ + draw_flush(i915->draw); + + memcpy(i915->vertex_buffer, buffers, count * sizeof(buffers[0])); + i915->num_vertex_buffers = count; + + /* pass-through to draw module */ + draw_set_vertex_buffers(i915->draw, count, buffers); +} + +static void i915_set_vertex_elements(struct pipe_context *pipe, + unsigned count, + const struct pipe_vertex_element *elements) +{ + struct i915_context *i915 = i915_context(pipe); + /* Because we change state before the draw_set_vertex_buffers call + * we need a flush here, just to be sure. 
+ */ + draw_flush(i915->draw); + + i915->num_vertex_elements = count; + /* pass-through to draw module */ + draw_set_vertex_elements(i915->draw, count, elements); +} + + +static void i915_set_edgeflags(struct pipe_context *pipe, + const unsigned *bitfield) +{ + /* TODO do something here */ +} + +void +i915_init_state_functions( struct i915_context *i915 ) +{ + i915->pipe.set_edgeflags = i915_set_edgeflags; + i915->pipe.create_blend_state = i915_create_blend_state; + i915->pipe.bind_blend_state = i915_bind_blend_state; + i915->pipe.delete_blend_state = i915_delete_blend_state; + + i915->pipe.create_sampler_state = i915_create_sampler_state; + i915->pipe.bind_sampler_states = i915_bind_sampler_states; + i915->pipe.delete_sampler_state = i915_delete_sampler_state; + + i915->pipe.create_depth_stencil_alpha_state = i915_create_depth_stencil_state; + i915->pipe.bind_depth_stencil_alpha_state = i915_bind_depth_stencil_state; + i915->pipe.delete_depth_stencil_alpha_state = i915_delete_depth_stencil_state; + + i915->pipe.create_rasterizer_state = i915_create_rasterizer_state; + i915->pipe.bind_rasterizer_state = i915_bind_rasterizer_state; + i915->pipe.delete_rasterizer_state = i915_delete_rasterizer_state; + i915->pipe.create_fs_state = i915_create_fs_state; + i915->pipe.bind_fs_state = i915_bind_fs_state; + i915->pipe.delete_fs_state = i915_delete_fs_state; + i915->pipe.create_vs_state = i915_create_vs_state; + i915->pipe.bind_vs_state = i915_bind_vs_state; + i915->pipe.delete_vs_state = i915_delete_vs_state; + + i915->pipe.set_blend_color = i915_set_blend_color; + i915->pipe.set_clip_state = i915_set_clip_state; + i915->pipe.set_constant_buffer = i915_set_constant_buffer; + i915->pipe.set_framebuffer_state = i915_set_framebuffer_state; + + i915->pipe.set_polygon_stipple = i915_set_polygon_stipple; + i915->pipe.set_scissor_state = i915_set_scissor_state; + i915->pipe.set_sampler_textures = i915_set_sampler_textures; + i915->pipe.set_viewport_state = i915_set_viewport_state; + i915->pipe.set_vertex_buffers = i915_set_vertex_buffers; + i915->pipe.set_vertex_elements = i915_set_vertex_elements; +} diff --git a/src/gallium/drivers/i915simple/i915_state.h b/src/gallium/drivers/i915simple/i915_state.h new file mode 100644 index 0000000000..86c6b0027d --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state.h @@ -0,0 +1,50 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef I915_STATE_H +#define I915_STATE_H + +struct i915_context; + + +struct i915_tracked_state { + unsigned dirty; + void (*update)( struct i915_context * ); +}; + +void i915_update_immediate( struct i915_context *i915 ); +void i915_update_dynamic( struct i915_context *i915 ); +void i915_update_derived( struct i915_context *i915 ); +void i915_update_samplers( struct i915_context *i915 ); +void i915_update_textures(struct i915_context *i915); + +void i915_emit_hardware_state( struct i915_context *i915 ); + +#endif diff --git a/src/gallium/drivers/i915simple/i915_state_derived.c b/src/gallium/drivers/i915simple/i915_state_derived.c new file mode 100644 index 0000000000..178d4e8781 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state_derived.c @@ -0,0 +1,183 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "draw/draw_context.h" +#include "draw/draw_vertex.h" +#include "i915_context.h" +#include "i915_state.h" +#include "i915_reg.h" +#include "i915_fpc.h" + + + +/** + * Determine the hardware vertex layout. + * Depends on vertex/fragment shader state. + */ +static void calculate_vertex_layout( struct i915_context *i915 ) +{ + const struct i915_fragment_shader *fs = i915->fs; + const enum interp_mode colorInterp = i915->rasterizer->color_interp; + struct vertex_info vinfo; + boolean texCoords[8], colors[2], fog, needW; + uint i; + int src; + + memset(texCoords, 0, sizeof(texCoords)); + colors[0] = colors[1] = fog = needW = FALSE; + memset(&vinfo, 0, sizeof(vinfo)); + + /* Determine which fragment program inputs are needed. Setup HW vertex + * layout below, in the HW-specific attribute order. 
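+ *
+ * As a rough sketch (illustrative values, not emitted verbatim by this
+ * code): a fragment shader reading position, one COLOR input and one
+ * GENERIC (texcoord) input in unit 0 sets needW, so the layout works out
+ * to approximately
+ *
+ *    vinfo.hwfmt[0] = S4_VFMT_XYZW | S4_VFMT_COLOR;
+ *    vinfo.hwfmt[1] = (TEXCOORDFMT_4D << 0)
+ *                     plus TEXCOORDFMT_NOT_PRESENT in the nibbles
+ *                     for units 1..7
+ *
+ * i.e. a 4-float position, a 4-ubyte color and one 4D texcoord, emitted
+ * in that order.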
+ */ + for (i = 0; i < fs->info.num_inputs; i++) { + switch (fs->info.input_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + break; + case TGSI_SEMANTIC_COLOR: + assert(fs->info.input_semantic_index[i] < 2); + colors[fs->info.input_semantic_index[i]] = TRUE; + break; + case TGSI_SEMANTIC_GENERIC: + /* usually a texcoord */ + { + const uint unit = fs->info.input_semantic_index[i]; + assert(unit < 8); + texCoords[unit] = TRUE; + needW = TRUE; + } + break; + case TGSI_SEMANTIC_FOG: + fog = TRUE; + break; + default: + assert(0); + } + } + + + /* pos */ + src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_POSITION, 0); + if (needW) { + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src); + vinfo.hwfmt[0] |= S4_VFMT_XYZW; + vinfo.attrib[0].emit = EMIT_4F; + } + else { + draw_emit_vertex_attr(&vinfo, EMIT_3F, INTERP_LINEAR, src); + vinfo.hwfmt[0] |= S4_VFMT_XYZ; + vinfo.attrib[0].emit = EMIT_3F; + } + + /* hardware point size */ + /* XXX todo */ + + /* primary color */ + if (colors[0]) { + src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_COLOR, 0); + draw_emit_vertex_attr(&vinfo, EMIT_4UB, colorInterp, src); + vinfo.hwfmt[0] |= S4_VFMT_COLOR; + } + + /* secondary color */ + if (colors[1]) { + src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_COLOR, 1); + draw_emit_vertex_attr(&vinfo, EMIT_4UB, colorInterp, src); + vinfo.hwfmt[0] |= S4_VFMT_SPEC_FOG; + } + + /* fog coord, not fog blend factor */ + if (fog) { + src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_FOG, 0); + draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_PERSPECTIVE, src); + vinfo.hwfmt[0] |= S4_VFMT_FOG_PARAM; + } + + /* texcoords */ + for (i = 0; i < 8; i++) { + uint hwtc; + if (texCoords[i]) { + hwtc = TEXCOORDFMT_4D; + src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_GENERIC, i); + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); + } + else { + hwtc = TEXCOORDFMT_NOT_PRESENT; + } + vinfo.hwfmt[1] |= hwtc << (i * 4); + } + + draw_compute_vertex_size(&vinfo); + + if (memcmp(&i915->current.vertex_info, &vinfo, sizeof(vinfo))) { + /* Need to set this flag so that the LIS2/4 registers get set. + * It also means the i915_update_immediate() function must be called + * after this one, in i915_update_derived(). + */ + i915->dirty |= I915_NEW_VERTEX_FORMAT; + + memcpy(&i915->current.vertex_info, &vinfo, sizeof(vinfo)); + } +} + + + + +/* Hopefully this will remain quite simple, otherwise need to pull in + * something like the state tracker mechanism. + */ +void i915_update_derived( struct i915_context *i915 ) +{ + if (i915->dirty & (I915_NEW_RASTERIZER | I915_NEW_FS | I915_NEW_VS)) + calculate_vertex_layout( i915 ); + + if (i915->dirty & (I915_NEW_SAMPLER | I915_NEW_TEXTURE)) + i915_update_samplers(i915); + + if (i915->dirty & I915_NEW_TEXTURE) + i915_update_textures(i915); + + if (i915->dirty) + i915_update_immediate( i915 ); + + if (i915->dirty) + i915_update_dynamic( i915 ); + + if (i915->dirty & I915_NEW_FS) { + i915->hardware_dirty |= I915_HW_PROGRAM; /* XXX right? 
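+ * (Looking at i915_emit_hardware_state(), the I915_HW_PROGRAM block
+ * emits both the pixel shader constants and the program dwords, so
+ * re-emitting it on a fragment shader change does look necessary.)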
*/ + } + + /* HW emit currently references framebuffer state directly: + */ + if (i915->dirty & I915_NEW_FRAMEBUFFER) + i915->hardware_dirty |= I915_HW_STATIC; + + i915->dirty = 0; +} diff --git a/src/gallium/drivers/i915simple/i915_state_dynamic.c b/src/gallium/drivers/i915simple/i915_state_dynamic.c new file mode 100644 index 0000000000..86126a5a15 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state_dynamic.c @@ -0,0 +1,310 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "i915_batch.h" +#include "i915_state_inlines.h" +#include "i915_context.h" +#include "i915_reg.h" +#include "i915_state.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_pack_color.h" + +#define FILE_DEBUG_FLAG DEBUG_STATE + +/* State that we have chosen to store in the DYNAMIC segment of the + * i915 indirect state mechanism. + * + * Can't cache these in the way we do the static state, as there is no + * start/size in the command packet, instead an 'end' value that gets + * incremented. + * + * Additionally, there seems to be a requirement to re-issue the full + * (active) state every time a 4kb boundary is crossed. 
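+ *
+ * Each chunk of dynamic state below is wrapped in an i915_tracked_state
+ * "atom": a dirty-flag mask plus an update callback that writes its
+ * dwords via set_dynamic_indirect().  A sketch of a new atom
+ * (upload_FOO, I915_DYNAMIC_FOO and I915_NEW_FOO are hypothetical names)
+ * would look like:
+ *
+ *    static void upload_FOO( struct i915_context *i915 )
+ *    {
+ *       unsigned foo = 0;   filled from whatever pipe state the atom tracks
+ *       set_dynamic_indirect( i915, I915_DYNAMIC_FOO, &foo, 1 );
+ *    }
+ *
+ *    const struct i915_tracked_state i915_upload_FOO = {
+ *       I915_NEW_FOO,
+ *       upload_FOO
+ *    };
+ *
+ * plus an entry in the atoms[] table at the bottom of this file, so that
+ * i915_update_dynamic() runs it whenever I915_NEW_FOO is dirty.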
+ */ + +static INLINE void set_dynamic_indirect( struct i915_context *i915, + unsigned offset, + const unsigned *src, + unsigned dwords ) +{ + unsigned i; + + for (i = 0; i < dwords; i++) + i915->current.dynamic[offset + i] = src[i]; + + i915->hardware_dirty |= I915_HW_DYNAMIC; +} + + +/*********************************************************************** + * Modes4: stencil masks and logicop + */ +static void upload_MODES4( struct i915_context *i915 ) +{ + unsigned modes4 = 0; + + /* I915_NEW_STENCIL */ + modes4 |= i915->depth_stencil->stencil_modes4; + /* I915_NEW_BLEND */ + modes4 |= i915->blend->modes4; + + /* Always, so that we know when state is in-active: + */ + set_dynamic_indirect( i915, + I915_DYNAMIC_MODES4, + &modes4, + 1 ); +} + +const struct i915_tracked_state i915_upload_MODES4 = { + I915_NEW_BLEND | I915_NEW_DEPTH_STENCIL, + upload_MODES4 +}; + + + + +/*********************************************************************** + */ + +static void upload_BFO( struct i915_context *i915 ) +{ + set_dynamic_indirect( i915, + I915_DYNAMIC_BFO_0, + &(i915->depth_stencil->bfo[0]), + 2 ); +} + +const struct i915_tracked_state i915_upload_BFO = { + I915_NEW_DEPTH_STENCIL, + upload_BFO +}; + + +/*********************************************************************** + */ + + +static void upload_BLENDCOLOR( struct i915_context *i915 ) +{ + unsigned bc[2]; + + memset( bc, 0, sizeof(bc) ); + + /* I915_NEW_BLEND {_COLOR} + */ + { + const float *color = i915->blend_color.color; + + bc[0] = _3DSTATE_CONST_BLEND_COLOR_CMD; + bc[1] = pack_ui32_float4( color[0], + color[1], + color[2], + color[3] ); + } + + set_dynamic_indirect( i915, + I915_DYNAMIC_BC_0, + bc, + 2 ); +} + +const struct i915_tracked_state i915_upload_BLENDCOLOR = { + I915_NEW_BLEND, + upload_BLENDCOLOR +}; + +/*********************************************************************** + */ + + +static void upload_IAB( struct i915_context *i915 ) +{ + unsigned iab = i915->blend->iab; + + + set_dynamic_indirect( i915, + I915_DYNAMIC_IAB, + &iab, + 1 ); +} + +const struct i915_tracked_state i915_upload_IAB = { + I915_NEW_BLEND, + upload_IAB +}; + + +/*********************************************************************** + */ + + + +static void upload_DEPTHSCALE( struct i915_context *i915 ) +{ + set_dynamic_indirect( i915, + I915_DYNAMIC_DEPTHSCALE_0, + &(i915->rasterizer->ds[0].u), + 2 ); +} + +const struct i915_tracked_state i915_upload_DEPTHSCALE = { + I915_NEW_RASTERIZER, + upload_DEPTHSCALE +}; + + + +/*********************************************************************** + * Polygon stipple + * + * The i915 supports a 4x4 stipple natively, GL wants 32x32. + * Fortunately stipple is usually a repeating pattern. + * + * XXX: does stipple pattern need to be adjusted according to + * the window position? + * + * XXX: possibly need workaround for conform paths test. 
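+ *
+ * The packing below takes the low nibble of one byte from each of the
+ * first four rows of the 32x32 GL pattern (byte offsets 12, 8, 4 and 0
+ * of poly_stipple.stipple) and folds them into the low 16 bits of ST1;
+ * the code boils down to:
+ *
+ *    st[1] |= (mask[12] & 0xf) << 0 |
+ *             (mask[8]  & 0xf) << 4 |
+ *             (mask[4]  & 0xf) << 8 |
+ *             (mask[0]  & 0xf) << 12;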
+ */ + +static void upload_STIPPLE( struct i915_context *i915 ) +{ + unsigned st[2]; + + st[0] = _3DSTATE_STIPPLE; + st[1] = 0; + + /* I915_NEW_RASTERIZER + */ + st[1] |= i915->rasterizer->st; + + + /* I915_NEW_STIPPLE + */ + { + const ubyte *mask = (const ubyte *)i915->poly_stipple.stipple; + ubyte p[4]; + + p[0] = mask[12] & 0xf; + p[1] = mask[8] & 0xf; + p[2] = mask[4] & 0xf; + p[3] = mask[0] & 0xf; + + /* Not sure what to do about fallbacks, so for now just dont: + */ + st[1] |= ((p[0] << 0) | + (p[1] << 4) | + (p[2] << 8) | + (p[3] << 12)); + } + + + set_dynamic_indirect( i915, + I915_DYNAMIC_STP_0, + &st[0], + 2 ); +} + + +const struct i915_tracked_state i915_upload_STIPPLE = { + I915_NEW_RASTERIZER | I915_NEW_STIPPLE, + upload_STIPPLE +}; + + + +/*********************************************************************** + * Scissor. + */ +static void upload_SCISSOR_ENABLE( struct i915_context *i915 ) +{ + set_dynamic_indirect( i915, + I915_DYNAMIC_SC_ENA_0, + &(i915->rasterizer->sc[0]), + 1 ); +} + +const struct i915_tracked_state i915_upload_SCISSOR_ENABLE = { + I915_NEW_RASTERIZER, + upload_SCISSOR_ENABLE +}; + + + +static void upload_SCISSOR_RECT( struct i915_context *i915 ) +{ + unsigned x1 = i915->scissor.minx; + unsigned y1 = i915->scissor.miny; + unsigned x2 = i915->scissor.maxx; + unsigned y2 = i915->scissor.maxy; + unsigned sc[3]; + + sc[0] = _3DSTATE_SCISSOR_RECT_0_CMD; + sc[1] = (y1 << 16) | (x1 & 0xffff); + sc[2] = (y2 << 16) | (x2 & 0xffff); + + set_dynamic_indirect( i915, + I915_DYNAMIC_SC_RECT_0, + &sc[0], + 3 ); +} + + +const struct i915_tracked_state i915_upload_SCISSOR_RECT = { + I915_NEW_SCISSOR, + upload_SCISSOR_RECT +}; + + + + + + +static const struct i915_tracked_state *atoms[] = { + &i915_upload_MODES4, + &i915_upload_BFO, + &i915_upload_BLENDCOLOR, + &i915_upload_IAB, + &i915_upload_DEPTHSCALE, + &i915_upload_STIPPLE, + &i915_upload_SCISSOR_ENABLE, + &i915_upload_SCISSOR_RECT +}; + +/* These will be dynamic indirect state commands, but for now just end + * up on the batch buffer with everything else. + */ +void i915_update_dynamic( struct i915_context *i915 ) +{ + int i; + + for (i = 0; i < Elements(atoms); i++) + if (i915->dirty & atoms[i]->dirty) + atoms[i]->update( i915 ); +} + diff --git a/src/gallium/drivers/i915simple/i915_state_emit.c b/src/gallium/drivers/i915simple/i915_state_emit.c new file mode 100644 index 0000000000..9bd6f92323 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state_emit.c @@ -0,0 +1,402 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "i915_reg.h" +#include "i915_context.h" +#include "i915_winsys.h" +#include "i915_batch.h" +#include "i915_reg.h" + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" + +static unsigned translate_format( enum pipe_format format ) +{ + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + return COLOR_BUF_ARGB8888; + case PIPE_FORMAT_R5G6B5_UNORM: + return COLOR_BUF_RGB565; + default: + assert(0); + return 0; + } +} + +static unsigned translate_depth_format( enum pipe_format zformat ) +{ + switch (zformat) { + case PIPE_FORMAT_S8Z24_UNORM: + return DEPTH_FRMT_24_FIXED_8_OTHER; + case PIPE_FORMAT_Z16_UNORM: + return DEPTH_FRMT_16_FIXED; + default: + assert(0); + return 0; + } +} + + +/** + * Examine framebuffer state to determine width, height. + */ +static boolean +framebuffer_size(const struct pipe_framebuffer_state *fb, + uint *width, uint *height) +{ + if (fb->cbufs[0]) { + *width = fb->cbufs[0]->width; + *height = fb->cbufs[0]->height; + return TRUE; + } + else if (fb->zsbuf) { + *width = fb->zsbuf->width; + *height = fb->zsbuf->height; + return TRUE; + } + else { + *width = *height = 0; + return FALSE; + } +} + + +/* Push the state into the sarea and/or texture memory. + */ +void +i915_emit_hardware_state(struct i915_context *i915 ) +{ + /* XXX: there must be an easier way */ + const unsigned dwords = ( 14 + + 7 + + I915_MAX_DYNAMIC + + 8 + + 2 + I915_TEX_UNITS*3 + + 2 + I915_TEX_UNITS*3 + + 2 + I915_MAX_CONSTANT*4 + +#if 0 + i915->current.program_len + +#else + i915->fs->program_len + +#endif + 6 + ) * 3/2; /* plus 50% margin */ + const unsigned relocs = ( I915_TEX_UNITS + + 3 + ) * 3/2; /* plus 50% margin */ + +#if 0 + debug_printf("i915_emit_hardware_state: %d dwords, %d relocs\n", dwords, relocs); +#endif + + if(!BEGIN_BATCH(dwords, relocs)) { + FLUSH_BATCH(NULL); + assert(BEGIN_BATCH(dwords, relocs)); + } + + /* 14 dwords, 0 relocs */ + if (i915->hardware_dirty & I915_HW_INVARIENT) + { + OUT_BATCH(_3DSTATE_AA_CMD | + AA_LINE_ECAAR_WIDTH_ENABLE | + AA_LINE_ECAAR_WIDTH_1_0 | + AA_LINE_REGION_WIDTH_ENABLE | AA_LINE_REGION_WIDTH_1_0); + + OUT_BATCH(_3DSTATE_DFLT_DIFFUSE_CMD); + OUT_BATCH(0); + + OUT_BATCH(_3DSTATE_DFLT_SPEC_CMD); + OUT_BATCH(0); + + OUT_BATCH(_3DSTATE_DFLT_Z_CMD); + OUT_BATCH(0); + + OUT_BATCH(_3DSTATE_COORD_SET_BINDINGS | + CSB_TCB(0, 0) | + CSB_TCB(1, 1) | + CSB_TCB(2, 2) | + CSB_TCB(3, 3) | + CSB_TCB(4, 4) | + CSB_TCB(5, 5) | + CSB_TCB(6, 6) | + CSB_TCB(7, 7)); + + OUT_BATCH(_3DSTATE_RASTER_RULES_CMD | + ENABLE_POINT_RASTER_RULE | + OGL_POINT_RASTER_RULE | + ENABLE_LINE_STRIP_PROVOKE_VRTX | + ENABLE_TRI_FAN_PROVOKE_VRTX | + LINE_STRIP_PROVOKE_VRTX(1) | + TRI_FAN_PROVOKE_VRTX(2) | + ENABLE_TEXKILL_3D_4D | + TEXKILL_4D); + + /* Need to initialize this to zero. 
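+ *
+ * (Judging from the two LOAD_STATE_IMMEDIATE_1 packets in this function,
+ * the trailing literal is the number of state dwords minus one: "(0)"
+ * for the single S3 dword here, "(5)" for the six S0/S1/S2/S4/S5/S6
+ * dwords in the I915_HW_IMMEDIATE block below.)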
+ */ + OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | I1_LOAD_S(3) | (0)); + OUT_BATCH(0); + + OUT_BATCH(_3DSTATE_DEPTH_SUBRECT_DISABLE); + + /* disable indirect state for now + */ + OUT_BATCH(_3DSTATE_LOAD_INDIRECT | 0); + OUT_BATCH(0); + } + + /* 7 dwords, 1 relocs */ + if (i915->hardware_dirty & I915_HW_IMMEDIATE) + { + OUT_BATCH(_3DSTATE_LOAD_STATE_IMMEDIATE_1 | + I1_LOAD_S(0) | + I1_LOAD_S(1) | + I1_LOAD_S(2) | + I1_LOAD_S(4) | + I1_LOAD_S(5) | + I1_LOAD_S(6) | + (5)); + + if(i915->vbo) + OUT_RELOC(i915->vbo, + I915_BUFFER_ACCESS_READ, + i915->current.immediate[I915_IMMEDIATE_S0]); + else + /* FIXME: we should not do this */ + OUT_BATCH(0); + OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S1]); + OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S2]); + OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S4]); + OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S5]); + OUT_BATCH(i915->current.immediate[I915_IMMEDIATE_S6]); + } + + /* I915_MAX_DYNAMIC dwords, 0 relocs */ + if (i915->hardware_dirty & I915_HW_DYNAMIC) + { + int i; + for (i = 0; i < I915_MAX_DYNAMIC; i++) { + OUT_BATCH(i915->current.dynamic[i]); + } + } + + /* 8 dwords, 2 relocs */ + if (i915->hardware_dirty & I915_HW_STATIC) + { + struct pipe_surface *cbuf_surface = i915->framebuffer.cbufs[0]; + struct pipe_surface *depth_surface = i915->framebuffer.zsbuf; + + if (cbuf_surface) { + unsigned cpitch = cbuf_surface->stride; + unsigned ctile = BUF_3D_USE_FENCE; + if (cbuf_surface->texture && + ((struct i915_texture*)(cbuf_surface->texture))->tiled) { + ctile = BUF_3D_TILED_SURFACE; + } + + OUT_BATCH(_3DSTATE_BUF_INFO_CMD); + + OUT_BATCH(BUF_3D_ID_COLOR_BACK | + BUF_3D_PITCH(cpitch) | /* pitch in bytes */ + ctile); + + OUT_RELOC(cbuf_surface->buffer, + I915_BUFFER_ACCESS_WRITE, + cbuf_surface->offset); + } + + /* What happens if no zbuf?? 
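+ *
+ * (As the code stands, the depth BUF_INFO packet is simply skipped and
+ * zformat stays 0 in the DST_BUF_VARS packet further down.)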
+ */ + if (depth_surface) { + unsigned zpitch = depth_surface->stride; + unsigned ztile = BUF_3D_USE_FENCE; + if (depth_surface->texture && + ((struct i915_texture*)(depth_surface->texture))->tiled) { + ztile = BUF_3D_TILED_SURFACE; + } + + OUT_BATCH(_3DSTATE_BUF_INFO_CMD); + + OUT_BATCH(BUF_3D_ID_DEPTH | + BUF_3D_PITCH(zpitch) | /* pitch in bytes */ + ztile); + + OUT_RELOC(depth_surface->buffer, + I915_BUFFER_ACCESS_WRITE, + depth_surface->offset); + } + + { + unsigned cformat, zformat = 0; + + if (cbuf_surface) + cformat = cbuf_surface->format; + else + cformat = PIPE_FORMAT_A8R8G8B8_UNORM; /* arbitrary */ + cformat = translate_format(cformat); + + if (depth_surface) + zformat = translate_depth_format( i915->framebuffer.zsbuf->format ); + + OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD); + OUT_BATCH(DSTORG_HORT_BIAS(0x8) | /* .5 */ + DSTORG_VERT_BIAS(0x8) | /* .5 */ + LOD_PRECLAMP_OGL | + TEX_DEFAULT_COLOR_OGL | + cformat | + zformat ); + } + } + +#if 01 + /* texture images */ + /* 2 + I915_TEX_UNITS*3 dwords, I915_TEX_UNITS relocs */ + if (i915->hardware_dirty & (I915_HW_MAP | I915_HW_SAMPLER)) + { + const uint nr = i915->current.sampler_enable_nr; + if (nr) { + const uint enabled = i915->current.sampler_enable_flags; + uint unit; + uint count = 0; + OUT_BATCH(_3DSTATE_MAP_STATE | (3 * nr)); + OUT_BATCH(enabled); + for (unit = 0; unit < I915_TEX_UNITS; unit++) { + if (enabled & (1 << unit)) { + struct pipe_buffer *buf = + i915->texture[unit]->buffer; + uint offset = 0; + assert(buf); + + count++; + + OUT_RELOC(buf, + I915_BUFFER_ACCESS_READ, + offset); + OUT_BATCH(i915->current.texbuffer[unit][0]); /* MS3 */ + OUT_BATCH(i915->current.texbuffer[unit][1]); /* MS4 */ + } + } + assert(count == nr); + } + } +#endif + +#if 01 + /* samplers */ + /* 2 + I915_TEX_UNITS*3 dwords, 0 relocs */ + if (i915->hardware_dirty & I915_HW_SAMPLER) + { + if (i915->current.sampler_enable_nr) { + int i; + + OUT_BATCH( _3DSTATE_SAMPLER_STATE | + (3 * i915->current.sampler_enable_nr) ); + + OUT_BATCH( i915->current.sampler_enable_flags ); + + for (i = 0; i < I915_TEX_UNITS; i++) { + if (i915->current.sampler_enable_flags & (1<<i)) { + OUT_BATCH( i915->current.sampler[i][0] ); + OUT_BATCH( i915->current.sampler[i][1] ); + OUT_BATCH( i915->current.sampler[i][2] ); + } + } + } + } +#endif + + /* constants */ + /* 2 + I915_MAX_CONSTANT*4 dwords, 0 relocs */ + if (i915->hardware_dirty & I915_HW_PROGRAM) + { + /* Collate the user-defined constants with the fragment shader's + * immediates according to the constant_flags[] array. + */ + const uint nr = i915->fs->num_constants; + if (nr) { + uint i; + + OUT_BATCH( _3DSTATE_PIXEL_SHADER_CONSTANTS | (nr * 4) ); + OUT_BATCH( (1 << (nr - 1)) | ((1 << (nr - 1)) - 1) ); + + for (i = 0; i < nr; i++) { + const uint *c; + if (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER) { + /* grab user-defined constant */ + c = (uint *) i915->current.constants[PIPE_SHADER_FRAGMENT][i]; + } + else { + /* emit program constant */ + c = (uint *) i915->fs->constants[i]; + } +#if 0 /* debug */ + { + float *f = (float *) c; + printf("Const %2d: %f %f %f %f %s\n", i, f[0], f[1], f[2], f[3], + (i915->fs->constant_flags[i] == I915_CONSTFLAG_USER + ? 
"user" : "immediate")); + } +#endif + OUT_BATCH(*c++); + OUT_BATCH(*c++); + OUT_BATCH(*c++); + OUT_BATCH(*c++); + } + } + } + + /* Fragment program */ + /* i915->current.program_len dwords, 0 relocs */ + if (i915->hardware_dirty & I915_HW_PROGRAM) + { + uint i; + /* we should always have, at least, a pass-through program */ + assert(i915->fs->program_len > 0); + for (i = 0; i < i915->fs->program_len; i++) { + OUT_BATCH(i915->fs->program[i]); + } + } + + /* drawing surface size */ + /* 6 dwords, 0 relocs */ + { + uint w, h; + boolean k = framebuffer_size(&i915->framebuffer, &w, &h); + (void)k; + assert(k); + + OUT_BATCH(_3DSTATE_DRAW_RECT_CMD); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(((w - 1) & 0xffff) | ((h - 1) << 16)); + OUT_BATCH(0); + OUT_BATCH(0); + } + + + i915->hardware_dirty = 0; +} diff --git a/src/gallium/drivers/i915simple/i915_state_immediate.c b/src/gallium/drivers/i915simple/i915_state_immediate.c new file mode 100644 index 0000000000..8c16bb4e27 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state_immediate.c @@ -0,0 +1,225 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "i915_state_inlines.h" +#include "i915_context.h" +#include "i915_state.h" +#include "i915_reg.h" +#include "util/u_memory.h" + + +/* All state expressable with the LOAD_STATE_IMMEDIATE_1 packet. + * Would like to opportunistically recombine all these fragments into + * a single packet containing only what has changed, but for now emit + * as multiple packets. + */ + + + + +/*********************************************************************** + * S0,S1: Vertex buffer state. + */ +static void upload_S0S1(struct i915_context *i915) +{ + unsigned LIS0, LIS1; + + /* INTEL_NEW_VBO */ + /* TODO: re-use vertex buffers here? */ + LIS0 = i915->vbo_offset; + + /* INTEL_NEW_VERTEX_SIZE -- do this where the vertex size is calculated! 
+ */ + { + unsigned vertex_size = i915->current.vertex_info.size; + + LIS1 = ((vertex_size << 24) | + (vertex_size << 16)); + } + + /* INTEL_NEW_VBO */ + /* TODO: use a vertex generation number to track vbo changes */ + if (1 || + i915->current.immediate[I915_IMMEDIATE_S0] != LIS0 || + i915->current.immediate[I915_IMMEDIATE_S1] != LIS1) + { + i915->current.immediate[I915_IMMEDIATE_S0] = LIS0; + i915->current.immediate[I915_IMMEDIATE_S1] = LIS1; + i915->hardware_dirty |= I915_HW_IMMEDIATE; + } +} + +const struct i915_tracked_state i915_upload_S0S1 = { + I915_NEW_VBO | I915_NEW_VERTEX_FORMAT, + upload_S0S1 +}; + + + + +/*********************************************************************** + * S4: Vertex format, rasterization state + */ +static void upload_S2S4(struct i915_context *i915) +{ + unsigned LIS2, LIS4; + + /* I915_NEW_VERTEX_FORMAT */ + { + LIS2 = i915->current.vertex_info.hwfmt[1]; + LIS4 = i915->current.vertex_info.hwfmt[0]; + /* + debug_printf("LIS2: 0x%x LIS4: 0x%x\n", LIS2, LIS4); + */ + assert(LIS4); /* should never be zero? */ + } + + LIS4 |= i915->rasterizer->LIS4; + + if (LIS2 != i915->current.immediate[I915_IMMEDIATE_S2] || + LIS4 != i915->current.immediate[I915_IMMEDIATE_S4]) { + + i915->current.immediate[I915_IMMEDIATE_S2] = LIS2; + i915->current.immediate[I915_IMMEDIATE_S4] = LIS4; + i915->hardware_dirty |= I915_HW_IMMEDIATE; + } +} + + +const struct i915_tracked_state i915_upload_S2S4 = { + I915_NEW_RASTERIZER | I915_NEW_VERTEX_FORMAT, + upload_S2S4 +}; + + + +/*********************************************************************** + * + */ +static void upload_S5( struct i915_context *i915 ) +{ + unsigned LIS5 = 0; + + LIS5 |= i915->depth_stencil->stencil_LIS5; + + LIS5 |= i915->blend->LIS5; + +#if 0 + /* I915_NEW_RASTERIZER */ + if (i915->state.Polygon->OffsetFill) { + LIS5 |= S5_GLOBAL_DEPTH_OFFSET_ENABLE; + } +#endif + + + if (LIS5 != i915->current.immediate[I915_IMMEDIATE_S5]) { + i915->current.immediate[I915_IMMEDIATE_S5] = LIS5; + i915->hardware_dirty |= I915_HW_IMMEDIATE; + } +} + +const struct i915_tracked_state i915_upload_S5 = { + (I915_NEW_DEPTH_STENCIL | I915_NEW_BLEND | I915_NEW_RASTERIZER), + upload_S5 +}; + + +/*********************************************************************** + */ +static void upload_S6( struct i915_context *i915 ) +{ + unsigned LIS6 = (2 << S6_TRISTRIP_PV_SHIFT); + + /* I915_NEW_FRAMEBUFFER + */ + if (i915->framebuffer.cbufs[0]) + LIS6 |= S6_COLOR_WRITE_ENABLE; + + /* I915_NEW_BLEND + */ + LIS6 |= i915->blend->LIS6; + + /* I915_NEW_DEPTH + */ + LIS6 |= i915->depth_stencil->depth_LIS6; + + if (LIS6 != i915->current.immediate[I915_IMMEDIATE_S6]) { + i915->current.immediate[I915_IMMEDIATE_S6] = LIS6; + i915->hardware_dirty |= I915_HW_IMMEDIATE; + } +} + +const struct i915_tracked_state i915_upload_S6 = { + I915_NEW_BLEND | I915_NEW_DEPTH_STENCIL | I915_NEW_FRAMEBUFFER, + upload_S6 +}; + + +/*********************************************************************** + */ +static void upload_S7( struct i915_context *i915 ) +{ + unsigned LIS7; + + /* I915_NEW_RASTERIZER + */ + LIS7 = i915->rasterizer->LIS7; + + if (LIS7 != i915->current.immediate[I915_IMMEDIATE_S7]) { + i915->current.immediate[I915_IMMEDIATE_S7] = LIS7; + i915->hardware_dirty |= I915_HW_IMMEDIATE; + } +} + +const struct i915_tracked_state i915_upload_S7 = { + I915_NEW_RASTERIZER, + upload_S7 +}; + + +static const struct i915_tracked_state *atoms[] = { + &i915_upload_S0S1, + &i915_upload_S2S4, + &i915_upload_S5, + &i915_upload_S6, + &i915_upload_S7 +}; + +/* + */ +void 
i915_update_immediate( struct i915_context *i915 ) +{ + int i; + + for (i = 0; i < Elements(atoms); i++) + if (i915->dirty & atoms[i]->dirty) + atoms[i]->update( i915 ); +} diff --git a/src/gallium/drivers/i915simple/i915_state_inlines.h b/src/gallium/drivers/i915simple/i915_state_inlines.h new file mode 100644 index 0000000000..378de8f9c4 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state_inlines.h @@ -0,0 +1,230 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef I915_STATE_INLINES_H +#define I915_STATE_INLINES_H + +#include "pipe/p_compiler.h" +#include "pipe/p_defines.h" +#include "i915_reg.h" + + +static INLINE unsigned +i915_translate_compare_func(unsigned func) +{ + switch (func) { + case PIPE_FUNC_NEVER: + return COMPAREFUNC_NEVER; + case PIPE_FUNC_LESS: + return COMPAREFUNC_LESS; + case PIPE_FUNC_LEQUAL: + return COMPAREFUNC_LEQUAL; + case PIPE_FUNC_GREATER: + return COMPAREFUNC_GREATER; + case PIPE_FUNC_GEQUAL: + return COMPAREFUNC_GEQUAL; + case PIPE_FUNC_NOTEQUAL: + return COMPAREFUNC_NOTEQUAL; + case PIPE_FUNC_EQUAL: + return COMPAREFUNC_EQUAL; + case PIPE_FUNC_ALWAYS: + return COMPAREFUNC_ALWAYS; + default: + return COMPAREFUNC_ALWAYS; + } +} + +static INLINE unsigned +i915_translate_stencil_op(unsigned op) +{ + switch (op) { + case PIPE_STENCIL_OP_KEEP: + return STENCILOP_KEEP; + case PIPE_STENCIL_OP_ZERO: + return STENCILOP_ZERO; + case PIPE_STENCIL_OP_REPLACE: + return STENCILOP_REPLACE; + case PIPE_STENCIL_OP_INCR: + return STENCILOP_INCRSAT; + case PIPE_STENCIL_OP_DECR: + return STENCILOP_DECRSAT; + case PIPE_STENCIL_OP_INCR_WRAP: + return STENCILOP_INCR; + case PIPE_STENCIL_OP_DECR_WRAP: + return STENCILOP_DECR; + case PIPE_STENCIL_OP_INVERT: + return STENCILOP_INVERT; + default: + return STENCILOP_ZERO; + } +} + +static INLINE unsigned +i915_translate_blend_factor(unsigned factor) +{ + switch (factor) { + case PIPE_BLENDFACTOR_ZERO: + return BLENDFACT_ZERO; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return BLENDFACT_SRC_ALPHA; + case PIPE_BLENDFACTOR_ONE: + return BLENDFACT_ONE; + case PIPE_BLENDFACTOR_SRC_COLOR: + return BLENDFACT_SRC_COLR; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return BLENDFACT_INV_SRC_COLR; + case 
PIPE_BLENDFACTOR_DST_COLOR: + return BLENDFACT_DST_COLR; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return BLENDFACT_INV_DST_COLR; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return BLENDFACT_INV_SRC_ALPHA; + case PIPE_BLENDFACTOR_DST_ALPHA: + return BLENDFACT_DST_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return BLENDFACT_INV_DST_ALPHA; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return BLENDFACT_SRC_ALPHA_SATURATE; + case PIPE_BLENDFACTOR_CONST_COLOR: + return BLENDFACT_CONST_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return BLENDFACT_INV_CONST_COLOR; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return BLENDFACT_CONST_ALPHA; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return BLENDFACT_INV_CONST_ALPHA; + default: + return BLENDFACT_ZERO; + } +} + +static INLINE unsigned +i915_translate_blend_func(unsigned mode) +{ + switch (mode) { + case PIPE_BLEND_ADD: + return BLENDFUNC_ADD; + case PIPE_BLEND_MIN: + return BLENDFUNC_MIN; + case PIPE_BLEND_MAX: + return BLENDFUNC_MAX; + case PIPE_BLEND_SUBTRACT: + return BLENDFUNC_SUBTRACT; + case PIPE_BLEND_REVERSE_SUBTRACT: + return BLENDFUNC_REVERSE_SUBTRACT; + default: + return 0; + } +} + + +static INLINE unsigned +i915_translate_logic_op(unsigned opcode) +{ + switch (opcode) { + case PIPE_LOGICOP_CLEAR: + return LOGICOP_CLEAR; + case PIPE_LOGICOP_AND: + return LOGICOP_AND; + case PIPE_LOGICOP_AND_REVERSE: + return LOGICOP_AND_RVRSE; + case PIPE_LOGICOP_COPY: + return LOGICOP_COPY; + case PIPE_LOGICOP_COPY_INVERTED: + return LOGICOP_COPY_INV; + case PIPE_LOGICOP_AND_INVERTED: + return LOGICOP_AND_INV; + case PIPE_LOGICOP_NOOP: + return LOGICOP_NOOP; + case PIPE_LOGICOP_XOR: + return LOGICOP_XOR; + case PIPE_LOGICOP_OR: + return LOGICOP_OR; + case PIPE_LOGICOP_OR_INVERTED: + return LOGICOP_OR_INV; + case PIPE_LOGICOP_NOR: + return LOGICOP_NOR; + case PIPE_LOGICOP_EQUIV: + return LOGICOP_EQUIV; + case PIPE_LOGICOP_INVERT: + return LOGICOP_INV; + case PIPE_LOGICOP_OR_REVERSE: + return LOGICOP_OR_RVRSE; + case PIPE_LOGICOP_NAND: + return LOGICOP_NAND; + case PIPE_LOGICOP_SET: + return LOGICOP_SET; + default: + return LOGICOP_SET; + } +} + + + +static INLINE boolean i915_validate_vertices( unsigned hw_prim, unsigned nr ) +{ + boolean ok; + + switch (hw_prim) { + case PRIM3D_POINTLIST: + ok = (nr >= 1); + assert(ok); + break; + case PRIM3D_LINELIST: + ok = (nr >= 2) && (nr % 2) == 0; + assert(ok); + break; + case PRIM3D_LINESTRIP: + ok = (nr >= 2); + assert(ok); + break; + case PRIM3D_TRILIST: + ok = (nr >= 3) && (nr % 3) == 0; + assert(ok); + break; + case PRIM3D_TRISTRIP: + ok = (nr >= 3); + assert(ok); + break; + case PRIM3D_TRIFAN: + ok = (nr >= 3); + assert(ok); + break; + case PRIM3D_POLY: + ok = (nr >= 3); + assert(ok); + break; + default: + assert(0); + ok = 0; + break; + } + + return ok; +} + +#endif diff --git a/src/gallium/drivers/i915simple/i915_state_sampler.c b/src/gallium/drivers/i915simple/i915_state_sampler.c new file mode 100644 index 0000000000..c09c10601b --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_state_sampler.c @@ -0,0 +1,299 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_context.h" +#include "pipe/p_state.h" +#include "util/u_memory.h" + +#include "i915_state_inlines.h" +#include "i915_context.h" +#include "i915_reg.h" +#include "i915_state.h" + + +/* + * A note about min_lod & max_lod. + * + * There is a circular dependancy between the sampler state + * and the map state to be submitted to hw. + * + * Two condition must be meet: + * min_lod =< max_lod == true + * max_lod =< last_level == true + * + * + * This is all fine and dandy if it where for the fact that max_lod + * is set on the map state instead of the sampler state. That is + * the max_lod we submit on map is: + * max_lod = MIN2(last_level, max_lod); + * + * So we need to update the map state when we change samplers and + * we need to be change the sampler state when map state is changed. + * The first part is done by calling i915_update_texture in + * i915_update_samplers and the second part is done else where in + * code tracking the state changes. + */ + +static void +i915_update_texture(struct i915_context *i915, + uint unit, + const struct i915_texture *tex, + const struct i915_sampler_state *sampler, + uint state[6]); +/** + * Compute i915 texture sampling state. + * + * Recalculate all state from scratch. Perhaps not the most + * efficient, but this has gotten complex enough that we need + * something which is understandable and reliable. + * \param state returns the 3 words of compute state + */ +static void update_sampler(struct i915_context *i915, + uint unit, + const struct i915_sampler_state *sampler, + const struct i915_texture *tex, + unsigned state[3] ) +{ + const struct pipe_texture *pt = &tex->base; + unsigned minlod, lastlod; + + /* Need to do this after updating the maps, which call the + * intel_finalize_mipmap_tree and hence can update firstLevel: + */ + state[0] = sampler->state[0]; + state[1] = sampler->state[1]; + state[2] = sampler->state[2]; + + if (pt->format == PIPE_FORMAT_YCBCR || + pt->format == PIPE_FORMAT_YCBCR_REV) + state[0] |= SS2_COLORSPACE_CONVERSION; + + /* 3D textures don't seem to respect the border color. + * Fallback if there's ever a danger that they might refer to + * it. + * + * Effectively this means fallback on 3D clamp or + * clamp_to_border. + * + * XXX: Check if this is true on i945. 
+ * XXX: Check if this bug got fixed in release silicon. + */ +#if 0 + { + const unsigned ws = sampler->templ->wrap_s; + const unsigned wt = sampler->templ->wrap_t; + const unsigned wr = sampler->templ->wrap_r; + if (pt->target == PIPE_TEXTURE_3D && + (sampler->templ->min_img_filter != PIPE_TEX_FILTER_NEAREST || + sampler->templ->mag_img_filter != PIPE_TEX_FILTER_NEAREST) && + (ws == PIPE_TEX_WRAP_CLAMP || + wt == PIPE_TEX_WRAP_CLAMP || + wr == PIPE_TEX_WRAP_CLAMP || + ws == PIPE_TEX_WRAP_CLAMP_TO_BORDER || + wt == PIPE_TEX_WRAP_CLAMP_TO_BORDER || + wr == PIPE_TEX_WRAP_CLAMP_TO_BORDER)) { + if (i915->strict_conformance) { + assert(0); + /* sampler->fallback = true; */ + /* TODO */ + } + } + } +#endif + + /* See note at the top of file */ + minlod = sampler->minlod; + lastlod = pt->last_level << 4; + + if (lastlod < minlod) { + minlod = lastlod; + } + + state[1] |= (sampler->minlod << SS3_MIN_LOD_SHIFT); + state[1] |= (unit << SS3_TEXTUREMAP_INDEX_SHIFT); +} + + +void i915_update_samplers( struct i915_context *i915 ) +{ + uint unit; + + i915->current.sampler_enable_nr = 0; + i915->current.sampler_enable_flags = 0x0; + + for (unit = 0; unit < i915->num_textures && unit < i915->num_samplers; + unit++) { + /* determine unit enable/disable by looking for a bound texture */ + /* could also examine the fragment program? */ + if (i915->texture[unit]) { + update_sampler( i915, + unit, + i915->sampler[unit], /* sampler state */ + i915->texture[unit], /* texture */ + i915->current.sampler[unit] /* the result */ + ); + i915_update_texture( i915, + unit, + i915->texture[unit], /* texture */ + i915->sampler[unit], /* sampler state */ + i915->current.texbuffer[unit] ); + + i915->current.sampler_enable_nr++; + i915->current.sampler_enable_flags |= (1 << unit); + } + } + + i915->hardware_dirty |= I915_HW_SAMPLER | I915_HW_MAP; +} + + +static uint +translate_texture_format(enum pipe_format pipeFormat) +{ + switch (pipeFormat) { + case PIPE_FORMAT_L8_UNORM: + return MAPSURF_8BIT | MT_8BIT_L8; + case PIPE_FORMAT_I8_UNORM: + return MAPSURF_8BIT | MT_8BIT_I8; + case PIPE_FORMAT_A8_UNORM: + return MAPSURF_8BIT | MT_8BIT_A8; + case PIPE_FORMAT_A8L8_UNORM: + return MAPSURF_16BIT | MT_16BIT_AY88; + case PIPE_FORMAT_R5G6B5_UNORM: + return MAPSURF_16BIT | MT_16BIT_RGB565; + case PIPE_FORMAT_A1R5G5B5_UNORM: + return MAPSURF_16BIT | MT_16BIT_ARGB1555; + case PIPE_FORMAT_A4R4G4B4_UNORM: + return MAPSURF_16BIT | MT_16BIT_ARGB4444; + case PIPE_FORMAT_A8R8G8B8_UNORM: + return MAPSURF_32BIT | MT_32BIT_ARGB8888; + case PIPE_FORMAT_YCBCR_REV: + return (MAPSURF_422 | MT_422_YCRCB_NORMAL); + case PIPE_FORMAT_YCBCR: + return (MAPSURF_422 | MT_422_YCRCB_SWAPY); +#if 0 + case PIPE_FORMAT_RGB_FXT1: + case PIPE_FORMAT_RGBA_FXT1: + return (MAPSURF_COMPRESSED | MT_COMPRESS_FXT1); +#endif + case PIPE_FORMAT_Z16_UNORM: + return (MAPSURF_16BIT | MT_16BIT_L16); +#if 0 + case PIPE_FORMAT_RGBA_DXT1: + case PIPE_FORMAT_RGB_DXT1: + return (MAPSURF_COMPRESSED | MT_COMPRESS_DXT1); + case PIPE_FORMAT_RGBA_DXT3: + return (MAPSURF_COMPRESSED | MT_COMPRESS_DXT2_3); + case PIPE_FORMAT_RGBA_DXT5: + return (MAPSURF_COMPRESSED | MT_COMPRESS_DXT4_5); +#endif + case PIPE_FORMAT_S8Z24_UNORM: + return (MAPSURF_32BIT | MT_32BIT_xI824); + default: + debug_printf("i915: translate_texture_format() bad image format %x\n", + pipeFormat); + assert(0); + return 0; + } +} + + +static void +i915_update_texture(struct i915_context *i915, + uint unit, + const struct i915_texture *tex, + const struct i915_sampler_state *sampler, + uint state[6]) +{ + const struct 
pipe_texture *pt = &tex->base; + uint format, pitch; + const uint width = pt->width[0], height = pt->height[0], depth = pt->depth[0]; + const uint num_levels = pt->last_level; + unsigned max_lod = num_levels * 4; + unsigned tiled = MS3_USE_FENCE_REGS; + + assert(tex); + assert(width); + assert(height); + assert(depth); + + format = translate_texture_format(pt->format); + pitch = tex->stride; + + assert(format); + assert(pitch); + + if (tex->tiled) { + assert(!((pitch - 1) & pitch)); + tiled = MS3_TILED_SURFACE; + } + + /* MS3 state */ + state[0] = + (((height - 1) << MS3_HEIGHT_SHIFT) + | ((width - 1) << MS3_WIDTH_SHIFT) + | format + | tiled); + + /* + * XXX When min_filter != mag_filter and there's just one mipmap level, + * set max_lod = 1 to make sure i915 chooses between min/mag filtering. + */ + + /* See note at the top of file */ + if (max_lod > (sampler->maxlod >> 2)) + max_lod = sampler->maxlod >> 2; + + /* MS4 state */ + state[1] = + ((((pitch / 4) - 1) << MS4_PITCH_SHIFT) + | MS4_CUBE_FACE_ENA_MASK + | ((max_lod) << MS4_MAX_LOD_SHIFT) + | ((depth - 1) << MS4_VOLUME_DEPTH_SHIFT)); +} + + +void +i915_update_textures(struct i915_context *i915) +{ + uint unit; + + for (unit = 0; unit < i915->num_textures && unit < i915->num_samplers; + unit++) { + /* determine unit enable/disable by looking for a bound texture */ + /* could also examine the fragment program? */ + if (i915->texture[unit]) { + i915_update_texture( i915, + unit, + i915->texture[unit], /* texture */ + i915->sampler[unit], /* sampler state */ + i915->current.texbuffer[unit] ); + } + } + + i915->hardware_dirty |= I915_HW_MAP; +} diff --git a/src/gallium/drivers/i915simple/i915_surface.c b/src/gallium/drivers/i915simple/i915_surface.c new file mode 100644 index 0000000000..62f1926644 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_surface.c @@ -0,0 +1,123 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "i915_context.h" +#include "i915_blit.h" +#include "i915_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" +#include "util/u_tile.h" +#include "util/u_rect.h" + + +/* Assumes all values are within bounds -- no checking at this level - + * do it higher up if required. + */ +static void +i915_surface_copy(struct pipe_context *pipe, + boolean do_flip, + struct pipe_surface *dst, + unsigned dstx, unsigned dsty, + struct pipe_surface *src, + unsigned srcx, unsigned srcy, unsigned width, unsigned height) +{ + assert( dst != src ); + assert( dst->block.size == src->block.size ); + assert( dst->block.width == src->block.height ); + assert( dst->block.height == src->block.height ); + + if (0) { + void *dst_map = pipe->screen->surface_map( pipe->screen, + dst, + PIPE_BUFFER_USAGE_CPU_WRITE ); + + const void *src_map = pipe->screen->surface_map( pipe->screen, + src, + PIPE_BUFFER_USAGE_CPU_READ ); + + pipe_copy_rect(dst_map, + &dst->block, + dst->stride, + dstx, dsty, + width, height, + src_map, + do_flip ? -(int) src->stride : src->stride, + srcx, do_flip ? height - 1 - srcy : srcy); + + pipe->screen->surface_unmap(pipe->screen, src); + pipe->screen->surface_unmap(pipe->screen, dst); + } + else { + assert(dst->block.width == 1); + assert(dst->block.height == 1); + i915_copy_blit( i915_context(pipe), + do_flip, + dst->block.size, + (short) src->stride, src->buffer, src->offset, + (short) dst->stride, dst->buffer, dst->offset, + (short) srcx, (short) srcy, (short) dstx, (short) dsty, (short) width, (short) height ); + } +} + + +static void +i915_surface_fill(struct pipe_context *pipe, + struct pipe_surface *dst, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height, unsigned value) +{ + if (0) { + void *dst_map = pipe->screen->surface_map( pipe->screen, + dst, + PIPE_BUFFER_USAGE_CPU_WRITE ); + + pipe_fill_rect(dst_map, &dst->block, dst->stride, dstx, dsty, width, height, value); + + pipe->screen->surface_unmap(pipe->screen, dst); + } + else { + assert(dst->block.width == 1); + assert(dst->block.height == 1); + i915_fill_blit( i915_context(pipe), + dst->block.size, + (short) dst->stride, + dst->buffer, dst->offset, + (short) dstx, (short) dsty, + (short) width, (short) height, + value ); + } +} + + +void +i915_init_surface_functions(struct i915_context *i915) +{ + i915->pipe.surface_copy = i915_surface_copy; + i915->pipe.surface_fill = i915_surface_fill; +} diff --git a/src/gallium/drivers/i915simple/i915_texture.c b/src/gallium/drivers/i915simple/i915_texture.c new file mode 100644 index 0000000000..2f5459af67 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_texture.c @@ -0,0 +1,775 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + * Michel Dänzer <michel@tungstengraphics.com> + */ + +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "i915_context.h" +#include "i915_texture.h" +#include "i915_debug.h" +#include "i915_screen.h" + +/* + * Helper function and arrays + */ + +/** + * Initial offset for Cube map. + */ +static const int initial_offsets[6][2] = { + {0, 0}, + {0, 2}, + {1, 0}, + {1, 2}, + {1, 1}, + {1, 3} +}; + +/** + * Step offsets for Cube map. 
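+ *
+ * These work together with initial_offsets above: each face origin is
+ * initial_offsets[face] scaled by the face size in blocks, and after each
+ * mipmap level the cursor advances by step_offsets[face] scaled by the
+ * halved level size.  See the face/level loops in i915_miptree_layout()
+ * and i945_miptree_layout_cube() below.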
+ */ +static const int step_offsets[6][2] = { + {0, 2}, + {0, 2}, + {-1, 2}, + {-1, 2}, + {-1, 1}, + {-1, 1} +}; + +static unsigned minify( unsigned d ) +{ + return MAX2(1, d>>1); +} + +static unsigned +power_of_two(unsigned x) +{ + unsigned value = 1; + while (value < x) + value = value << 1; + return value; +} + +static unsigned +round_up(unsigned n, unsigned multiple) +{ + return (n + multiple - 1) & ~(multiple - 1); +} + + +/* + * More advanced helper funcs + */ + + +static void +i915_miptree_set_level_info(struct i915_texture *tex, + unsigned level, + unsigned nr_images, + unsigned w, unsigned h, unsigned d) +{ + struct pipe_texture *pt = &tex->base; + + assert(level < PIPE_MAX_TEXTURE_LEVELS); + + pt->width[level] = w; + pt->height[level] = h; + pt->depth[level] = d; + + pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w); + pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h); + + tex->nr_images[level] = nr_images; + + /* + DBG("%s level %d size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__, + level, w, h, d, x, y, tex->level_offset[level]); + */ + + /* Not sure when this would happen, but anyway: + */ + if (tex->image_offset[level]) { + FREE(tex->image_offset[level]); + tex->image_offset[level] = NULL; + } + + assert(nr_images); + assert(!tex->image_offset[level]); + + tex->image_offset[level] = (unsigned *) MALLOC(nr_images * sizeof(unsigned)); + tex->image_offset[level][0] = 0; +} + +static void +i915_miptree_set_image_offset(struct i915_texture *tex, + unsigned level, unsigned img, unsigned x, unsigned y) +{ + if (img == 0 && level == 0) + assert(x == 0 && y == 0); + + assert(img < tex->nr_images[level]); + + tex->image_offset[level][img] = y * tex->stride + x * tex->base.block.size; + + /* + printf("%s level %d img %d pos %d,%d image_offset %x\n", + __FUNCTION__, level, img, x, y, tex->image_offset[level][img]); + */ +} + + +/* + * Layout functions + */ + + +/** + * Special case to deal with display targets. + */ +static boolean +i915_displaytarget_layout(struct i915_texture *tex) +{ + struct pipe_texture *pt = &tex->base; + + if (pt->last_level > 0 || pt->block.size != 4) + return 0; + + i915_miptree_set_level_info( tex, 0, 1, + tex->base.width[0], + tex->base.height[0], + 1 ); + i915_miptree_set_image_offset( tex, 0, 0, 0, 0 ); + + if (tex->base.width[0] >= 128) { + tex->stride = power_of_two(tex->base.nblocksx[0] * pt->block.size); + tex->total_nblocksy = round_up(tex->base.nblocksy[0], 8); +#if 0 /* used for tiled display targets */ + tex->tiled = 1; +#endif + } else { + tex->stride = round_up(tex->base.nblocksx[0] * pt->block.size, 64); + tex->total_nblocksy = tex->base.nblocksy[0]; + } + + /* + printf("%s size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__, + tex->base.width[0], tex->base.height[0], pt->block.size, + tex->stride, tex->total_nblocksy, tex->stride * tex->total_nblocksy); + */ + + return 1; +} + +static void +i945_miptree_layout_2d( struct i915_texture *tex ) +{ + struct pipe_texture *pt = &tex->base; + const int align_x = 2, align_y = 4; + unsigned level; + unsigned x = 0; + unsigned y = 0; + unsigned width = pt->width[0]; + unsigned height = pt->height[0]; + unsigned nblocksx = pt->nblocksx[0]; + unsigned nblocksy = pt->nblocksy[0]; + + /* used for tiled display targets */ + if (0) + if (i915_displaytarget_layout(tex)) + return; + + tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4); + + /* May need to adjust pitch to accomodate the placement of + * the 2nd mipmap level. 
This occurs when the alignment + * constraints of mipmap placement push the right edge of the + * 2nd mipmap level out past the width of its parent. + */ + if (pt->last_level > 0) { + unsigned mip1_nblocksx + = align(pf_get_nblocksx(&pt->block, minify(width)), align_x) + + pf_get_nblocksx(&pt->block, minify(minify(width))); + + if (mip1_nblocksx > nblocksx) + tex->stride = mip1_nblocksx * pt->block.size; + } + + /* Pitch must be a whole number of dwords + */ + tex->stride = align(tex->stride, 64); + tex->total_nblocksy = 0; + + for (level = 0; level <= pt->last_level; level++) { + i915_miptree_set_level_info(tex, level, 1, width, height, 1); + i915_miptree_set_image_offset(tex, level, 0, x, y); + + nblocksy = align(nblocksy, align_y); + + /* Because the images are packed better, the final offset + * might not be the maximal one: + */ + tex->total_nblocksy = MAX2(tex->total_nblocksy, y + nblocksy); + + /* Layout_below: step right after second mipmap level. + */ + if (level == 1) { + x += align(nblocksx, align_x); + } + else { + y += nblocksy; + } + + width = minify(width); + height = minify(height); + nblocksx = pf_get_nblocksx(&pt->block, width); + nblocksy = pf_get_nblocksy(&pt->block, height); + } +} + +static void +i945_miptree_layout_cube(struct i915_texture *tex) +{ + struct pipe_texture *pt = &tex->base; + unsigned level; + + const unsigned nblocks = pt->nblocksx[0]; + unsigned face; + unsigned width = pt->width[0]; + unsigned height = pt->height[0]; + + /* + printf("%s %i, %i\n", __FUNCTION__, pt->width[0], pt->height[0]); + */ + + assert(width == height); /* cubemap images are square */ + + /* + * XXX Should only be used for compressed formats. But lets + * keep this code active just in case. + * + * Depending on the size of the largest images, pitch can be + * determined either by the old-style packing of cubemap faces, + * or the final row of 4x4, 2x2 and 1x1 faces below this. + */ + if (nblocks > 32) + tex->stride = round_up(nblocks * pt->block.size * 2, 4); + else + tex->stride = 14 * 8 * pt->block.size; + + tex->total_nblocksy = nblocks * 4; + + /* Set all the levels to effectively occupy the whole rectangular region. 
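+ * (The per-face x/y offsets are assigned in the face loop further below;
+ * width and height are still halved per level, while the stride and
+ * total_nblocksy computed above already cover the whole packed region.)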
+ */ + for (level = 0; level <= pt->last_level; level++) { + i915_miptree_set_level_info(tex, level, 6, width, height, 1); + width /= 2; + height /= 2; + } + + for (face = 0; face < 6; face++) { + unsigned x = initial_offsets[face][0] * nblocks; + unsigned y = initial_offsets[face][1] * nblocks; + unsigned d = nblocks; + +#if 0 /* Fix and enable this code for compressed formats */ + if (nblocks == 4 && face >= 4) { + y = tex->total_height - 4; + x = (face - 4) * 8; + } + else if (nblocks < 4 && (face > 0)) { + y = tex->total_height - 4; + x = face * 8; + } +#endif + + for (level = 0; level <= pt->last_level; level++) { + i915_miptree_set_image_offset(tex, level, face, x, y); + + d >>= 1; + +#if 0 /* Fix and enable this code for compressed formats */ + switch (d) { + case 4: + switch (face) { + case PIPE_TEX_FACE_POS_X: + case PIPE_TEX_FACE_NEG_X: + x += step_offsets[face][0] * d; + y += step_offsets[face][1] * d; + break; + case PIPE_TEX_FACE_POS_Y: + case PIPE_TEX_FACE_NEG_Y: + y += 12; + x -= 8; + break; + case PIPE_TEX_FACE_POS_Z: + case PIPE_TEX_FACE_NEG_Z: + y = tex->total_height - 4; + x = (face - 4) * 8; + break; + } + case 2: + y = tex->total_height - 4; + x = 16 + face * 8; + break; + + case 1: + x += 48; + break; + default: +#endif + x += step_offsets[face][0] * d; + y += step_offsets[face][1] * d; +#if 0 + break; + } +#endif + } + } +} + +static boolean +i915_miptree_layout(struct i915_texture * tex) +{ + struct pipe_texture *pt = &tex->base; + unsigned level; + + switch (pt->target) { + case PIPE_TEXTURE_CUBE: { + const unsigned nblocks = pt->nblocksx[0]; + unsigned face; + unsigned width = pt->width[0], height = pt->height[0]; + + assert(width == height); /* cubemap images are square */ + + /* double pitch for cube layouts */ + tex->stride = round_up(nblocks * pt->block.size * 2, 4); + tex->total_nblocksy = nblocks * 4; + + for (level = 0; level <= pt->last_level; level++) { + i915_miptree_set_level_info(tex, level, 6, + width, height, + 1); + width /= 2; + height /= 2; + } + + for (face = 0; face < 6; face++) { + unsigned x = initial_offsets[face][0] * nblocks; + unsigned y = initial_offsets[face][1] * nblocks; + unsigned d = nblocks; + + for (level = 0; level <= pt->last_level; level++) { + i915_miptree_set_image_offset(tex, level, face, x, y); + d >>= 1; + x += step_offsets[face][0] * d; + y += step_offsets[face][1] * d; + } + } + break; + } + case PIPE_TEXTURE_3D:{ + unsigned width = pt->width[0]; + unsigned height = pt->height[0]; + unsigned depth = pt->depth[0]; + unsigned nblocksx = pt->nblocksx[0]; + unsigned nblocksy = pt->nblocksy[0]; + unsigned stack_nblocksy = 0; + + /* Calculate the size of a single slice. + */ + tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4); + + /* XXX: hardware expects/requires 9 levels at minimum. + */ + for (level = 0; level <= MAX2(8, pt->last_level); + level++) { + i915_miptree_set_level_info(tex, level, depth, + width, height, depth); + + + stack_nblocksy += MAX2(2, nblocksy); + + width = minify(width); + height = minify(height); + depth = minify(depth); + nblocksx = pf_get_nblocksx(&pt->block, width); + nblocksy = pf_get_nblocksy(&pt->block, height); + } + + /* Fixup depth image_offsets: + */ + depth = pt->depth[0]; + for (level = 0; level <= pt->last_level; level++) { + unsigned i; + for (i = 0; i < depth; i++) + i915_miptree_set_image_offset(tex, level, i, + 0, i * stack_nblocksy); + + depth = minify(depth); + } + + + /* Multiply slice size by texture depth for total size. 
It's + * remarkable how wasteful of memory the i915 texture layouts + * are. They are largely fixed in the i945. + */ + tex->total_nblocksy = stack_nblocksy * pt->depth[0]; + break; + } + + default:{ + unsigned width = pt->width[0]; + unsigned height = pt->height[0]; + unsigned nblocksx = pt->nblocksx[0]; + unsigned nblocksy = pt->nblocksy[0]; + + tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4); + tex->total_nblocksy = 0; + + for (level = 0; level <= pt->last_level; level++) { + i915_miptree_set_level_info(tex, level, 1, + width, height, 1); + i915_miptree_set_image_offset(tex, level, 0, + 0, tex->total_nblocksy); + + nblocksy = round_up(MAX2(2, nblocksy), 2); + + tex->total_nblocksy += nblocksy; + + width = minify(width); + height = minify(height); + nblocksx = pf_get_nblocksx(&pt->block, width); + nblocksy = pf_get_nblocksy(&pt->block, height); + } + break; + } + } + /* + DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__, + tex->pitch, + tex->total_nblocksy, pt->block.size, tex->stride * tex->total_nblocksy); + */ + + return TRUE; +} + + +static boolean +i945_miptree_layout(struct i915_texture * tex) +{ + struct pipe_texture *pt = &tex->base; + unsigned level; + + switch (pt->target) { + case PIPE_TEXTURE_CUBE: + i945_miptree_layout_cube(tex); + break; + case PIPE_TEXTURE_3D:{ + unsigned width = pt->width[0]; + unsigned height = pt->height[0]; + unsigned depth = pt->depth[0]; + unsigned nblocksx = pt->nblocksx[0]; + unsigned nblocksy = pt->nblocksy[0]; + unsigned pack_x_pitch, pack_x_nr; + unsigned pack_y_pitch; + + tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4); + tex->total_nblocksy = 0; + + pack_y_pitch = MAX2(pt->nblocksy[0], 2); + pack_x_pitch = tex->stride / pt->block.size; + pack_x_nr = 1; + + for (level = 0; level <= pt->last_level; level++) { + unsigned nr_images = pt->target == PIPE_TEXTURE_3D ? 
depth : 6; + int x = 0; + int y = 0; + unsigned q, j; + + i915_miptree_set_level_info(tex, level, nr_images, + width, height, depth); + + for (q = 0; q < nr_images;) { + for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) { + i915_miptree_set_image_offset(tex, level, q, x, y + tex->total_nblocksy); + x += pack_x_pitch; + } + + x = 0; + y += pack_y_pitch; + } + + + tex->total_nblocksy += y; + + if (pack_x_pitch > 4) { + pack_x_pitch >>= 1; + pack_x_nr <<= 1; + assert(pack_x_pitch * pack_x_nr * pt->block.size <= tex->stride); + } + + if (pack_y_pitch > 2) { + pack_y_pitch >>= 1; + } + + width = minify(width); + height = minify(height); + depth = minify(depth); + nblocksx = pf_get_nblocksx(&pt->block, width); + nblocksy = pf_get_nblocksy(&pt->block, height); + } + break; + } + + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_2D: +// case PIPE_TEXTURE_RECTANGLE: + i945_miptree_layout_2d(tex); + break; + default: + assert(0); + return FALSE; + } + + /* + DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__, + tex->pitch, + tex->total_nblocksy, pt->block.size, tex->stride * tex->total_nblocksy); + */ + + return TRUE; +} + + +static struct pipe_texture * +i915_texture_create(struct pipe_screen *screen, + const struct pipe_texture *templat) +{ + struct i915_screen *i915screen = i915_screen(screen); + struct pipe_winsys *ws = screen->winsys; + struct i915_texture *tex = CALLOC_STRUCT(i915_texture); + size_t tex_size; + + if (!tex) + return NULL; + + tex->base = *templat; + tex->base.refcount = 1; + tex->base.screen = screen; + + tex->base.nblocksx[0] = pf_get_nblocksx(&tex->base.block, tex->base.width[0]); + tex->base.nblocksy[0] = pf_get_nblocksy(&tex->base.block, tex->base.height[0]); + + if (i915screen->is_i945) { + if (!i945_miptree_layout(tex)) + goto fail; + } else { + if (!i915_miptree_layout(tex)) + goto fail; + } + + tex_size = tex->stride * tex->total_nblocksy; + + tex->buffer = ws->buffer_create(ws, 64, + PIPE_BUFFER_USAGE_PIXEL, + tex_size); + + if (!tex->buffer) + goto fail; + +#if 0 + void *ptr = ws->buffer_map(ws, tex->buffer, + PIPE_BUFFER_USAGE_CPU_WRITE); + memset(ptr, 0x80, tex_size); + ws->buffer_unmap(ws, tex->buffer); +#endif + + return &tex->base; + +fail: + FREE(tex); + return NULL; +} + + +static void +i915_texture_release(struct pipe_screen *screen, + struct pipe_texture **pt) +{ + if (!*pt) + return; + + /* + DBG("%s %p refcount will be %d\n", + __FUNCTION__, (void *) *pt, (*pt)->refcount - 1); + */ + if (--(*pt)->refcount <= 0) { + struct i915_texture *tex = (struct i915_texture *)*pt; + uint i; + + /* + DBG("%s deleting %p\n", __FUNCTION__, (void *) tex); + */ + + pipe_buffer_reference(screen, &tex->buffer, NULL); + + for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++) + if (tex->image_offset[i]) + FREE(tex->image_offset[i]); + + FREE(tex); + } + *pt = NULL; +} + +static struct pipe_surface * +i915_get_tex_surface(struct pipe_screen *screen, + struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned flags) +{ + struct i915_texture *tex = (struct i915_texture *)pt; + struct pipe_winsys *ws = screen->winsys; + struct pipe_surface *ps; + unsigned offset; /* in bytes */ + + if (pt->target == PIPE_TEXTURE_CUBE) { + offset = tex->image_offset[level][face]; + } + else if (pt->target == PIPE_TEXTURE_3D) { + offset = tex->image_offset[level][zslice]; + } + else { + offset = tex->image_offset[level][0]; + assert(face == 0); + assert(zslice == 0); + } + + ps = CALLOC_STRUCT(pipe_surface); + if (ps) { + ps->refcount = 1; + ps->winsys = ws; + 
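+      /* The new surface aliases the texture's backing buffer at the image
+       * offset computed above; take references on both the texture and its
+       * buffer so they outlive this surface.
+       */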
pipe_texture_reference(&ps->texture, pt); + pipe_buffer_reference(screen, &ps->buffer, tex->buffer); + ps->format = pt->format; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->block = pt->block; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = tex->stride; + ps->offset = offset; + ps->usage = flags; + ps->status = PIPE_SURFACE_STATUS_DEFINED; + } + return ps; +} + +static struct pipe_texture * +i915_texture_blanket(struct pipe_screen * screen, + const struct pipe_texture *base, + const unsigned *stride, + struct pipe_buffer *buffer) +{ + struct i915_texture *tex; + assert(screen); + + /* Only supports one type */ + if (base->target != PIPE_TEXTURE_2D || + base->last_level != 0 || + base->depth[0] != 1) { + return NULL; + } + + tex = CALLOC_STRUCT(i915_texture); + if (!tex) + return NULL; + + tex->base = *base; + tex->base.refcount = 1; + tex->base.screen = screen; + + tex->stride = stride[0]; + + i915_miptree_set_level_info(tex, 0, 1, base->width[0], base->height[0], 1); + i915_miptree_set_image_offset(tex, 0, 0, 0, 0); + + pipe_buffer_reference(screen, &tex->buffer, buffer); + + return &tex->base; +} + +void +i915_init_texture_functions(struct i915_context *i915) +{ +// i915->pipe.texture_update = i915_texture_update; +} + +static void +i915_tex_surface_release(struct pipe_screen *screen, + struct pipe_surface **surface) +{ + struct pipe_surface *surf = *surface; + + if (--surf->refcount == 0) { + + /* This really should not be possible, but it's actually + * happening quite a bit... Will fix. + */ + if (surf->status == PIPE_SURFACE_STATUS_CLEAR) { + debug_printf("XXX destroying a surface with pending clears...\n"); + assert(0); + } + + pipe_texture_reference(&surf->texture, NULL); + pipe_buffer_reference(screen, &surf->buffer, NULL); + FREE(surf); + } + + *surface = NULL; +} + +void +i915_init_screen_texture_functions(struct pipe_screen *screen) +{ + screen->texture_create = i915_texture_create; + screen->texture_release = i915_texture_release; + screen->get_tex_surface = i915_get_tex_surface; + screen->texture_blanket = i915_texture_blanket; + screen->tex_surface_release = i915_tex_surface_release; +} diff --git a/src/gallium/drivers/i915simple/i915_texture.h b/src/gallium/drivers/i915simple/i915_texture.h new file mode 100644 index 0000000000..7225016a9f --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_texture.h @@ -0,0 +1,43 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef I915_TEXTURE_H +#define I915_TEXTURE_H + +struct i915_context; +struct pipe_screen; + + +extern void +i915_init_texture_functions(struct i915_context *i915); + + +extern void +i915_init_screen_texture_functions(struct pipe_screen *screen); + + +#endif /* I915_TEXTURE_H */ diff --git a/src/gallium/drivers/i915simple/i915_winsys.h b/src/gallium/drivers/i915simple/i915_winsys.h new file mode 100644 index 0000000000..81904c2a74 --- /dev/null +++ b/src/gallium/drivers/i915simple/i915_winsys.h @@ -0,0 +1,121 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * This is the interface that i915simple requires any window system + * hosting it to implement. This is the only include file in i915simple + * which is public. + * + */ + +#ifndef I915_WINSYS_H +#define I915_WINSYS_H + + +#include "pipe/p_defines.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Pipe drivers are (meant to be!) independent of both GL and the + * window system. The window system provides a buffer manager and a + * set of additional hooks for things like command buffer submission, + * etc. + * + * There clearly has to be some agreement between the window system + * driver and the hardware driver about the format of command buffers, + * etc. + */ + +struct i915_batchbuffer; +struct pipe_buffer; +struct pipe_fence_handle; +struct pipe_winsys; +struct pipe_screen; + + +/** + * Additional winsys interface for i915simple. + * + * It is an over-simple batchbuffer mechanism. Will want to improve the + * performance of this, perhaps based on the cmdstream stuff. It + * would be pretty impossible to implement swz on top of this + * interface. + * + * Will also need additions/changes to implement static/dynamic + * indirect state. 
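+ *
+ * A rough usage sketch (assumes a winsys instance "sws"; buf, delta and
+ * fence are placeholders, and the command dwords themselves are written
+ * through the i915_batchbuffer returned by batch_get(), whose helpers are
+ * defined elsewhere):
+ *
+ *    struct i915_batchbuffer *batch = sws->batch_get(sws);
+ *    ...fill batch with state and command dwords...
+ *    sws->batch_reloc(sws, buf, I915_BUFFER_ACCESS_READ, delta);
+ *    sws->batch_flush(sws, &fence);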
+ */ +struct i915_winsys { + + void (*destroy)( struct i915_winsys *sws ); + + /** + * Get the current batch buffer from the winsys. + */ + struct i915_batchbuffer *(*batch_get)( struct i915_winsys *sws ); + + /** + * Emit a relocation to a buffer. + * + * Used not only when the buffer addresses are not pinned, but also to + * ensure refered buffers will not be destroyed until the current batch + * buffer execution is finished. + * + * The access flags is a combination of I915_BUFFER_ACCESS_WRITE and + * I915_BUFFER_ACCESS_READ macros. + */ + void (*batch_reloc)( struct i915_winsys *sws, + struct pipe_buffer *buf, + unsigned access_flags, + unsigned delta ); + + /** + * Flush the batch. + */ + void (*batch_flush)( struct i915_winsys *sws, + struct pipe_fence_handle **fence ); +}; + +#define I915_BUFFER_ACCESS_WRITE 0x1 +#define I915_BUFFER_ACCESS_READ 0x2 + +#define I915_BUFFER_USAGE_LIT_VERTEX (PIPE_BUFFER_USAGE_CUSTOM << 0) + + +struct pipe_context *i915_create_context( struct pipe_screen *, + struct pipe_winsys *, + struct i915_winsys * ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gallium/drivers/i965simple/Makefile b/src/gallium/drivers/i965simple/Makefile new file mode 100644 index 0000000000..e97146e57c --- /dev/null +++ b/src/gallium/drivers/i965simple/Makefile @@ -0,0 +1,54 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = i965simple + +C_SOURCES = \ + brw_blit.c \ + brw_flush.c \ + brw_screen.c \ + brw_surface.c \ + brw_cc.c \ + brw_clip.c \ + brw_clip_line.c \ + brw_clip_point.c \ + brw_clip_state.c \ + brw_clip_tri.c \ + brw_clip_util.c \ + brw_context.c \ + brw_curbe.c \ + brw_draw.c \ + brw_draw_upload.c \ + brw_eu.c \ + brw_eu_debug.c \ + brw_eu_emit.c \ + brw_eu_util.c \ + brw_gs.c \ + brw_gs_emit.c \ + brw_gs_state.c \ + brw_misc_state.c \ + brw_sf.c \ + brw_sf_emit.c \ + brw_sf_state.c \ + brw_state.c \ + brw_state_batch.c \ + brw_state_cache.c \ + brw_state_pool.c \ + brw_state_upload.c \ + brw_tex_layout.c \ + brw_urb.c \ + brw_util.c \ + brw_vs.c \ + brw_vs_emit.c \ + brw_vs_state.c \ + brw_wm.c \ + brw_wm_iz.c \ + brw_wm_decl.c \ + brw_wm_glsl.c \ + brw_wm_sampler_state.c \ + brw_wm_state.c \ + brw_wm_surface_state.c + +include ../../Makefile.template + +symlinks: diff --git a/src/gallium/drivers/i965simple/SConscript b/src/gallium/drivers/i965simple/SConscript new file mode 100644 index 0000000000..43fc2a4005 --- /dev/null +++ b/src/gallium/drivers/i965simple/SConscript @@ -0,0 +1,54 @@ +Import('*') + +env = env.Clone() + +i965simple = env.ConvenienceLibrary( + target = 'i965simple', + source = [ + 'brw_blit.c', + 'brw_cc.c', + 'brw_clip.c', + 'brw_clip_line.c', + 'brw_clip_point.c', + 'brw_clip_state.c', + 'brw_clip_tri.c', + 'brw_clip_util.c', + 'brw_context.c', + 'brw_curbe.c', + 'brw_draw.c', + 'brw_draw_upload.c', + 'brw_eu.c', + 'brw_eu_debug.c', + 'brw_eu_emit.c', + 'brw_eu_util.c', + 'brw_flush.c', + 'brw_gs.c', + 'brw_gs_emit.c', + 'brw_gs_state.c', + 'brw_misc_state.c', + 'brw_screen.c', + 'brw_sf.c', + 'brw_sf_emit.c', + 'brw_sf_state.c', + 'brw_state.c', + 'brw_state_batch.c', + 'brw_state_cache.c', + 'brw_state_pool.c', + 'brw_state_upload.c', + 'brw_surface.c', + 'brw_tex_layout.c', + 'brw_urb.c', + 'brw_util.c', + 'brw_vs.c', + 'brw_vs_emit.c', + 'brw_vs_state.c', + 'brw_wm.c', + 'brw_wm_decl.c', + 'brw_wm_glsl.c', + 'brw_wm_iz.c', + 'brw_wm_sampler_state.c', + 'brw_wm_state.c', + 'brw_wm_surface_state.c', + ]) + +Export('i965simple') diff --git a/src/gallium/drivers/i965simple/brw_batch.h 
b/src/gallium/drivers/i965simple/brw_batch.h new file mode 100644 index 0000000000..5f5932a488 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_batch.h @@ -0,0 +1,59 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef BRW_BATCH_H +#define BRW_BATCH_H + +#include "brw_winsys.h" + +#define BATCH_LOCALS + +#define INTEL_BATCH_NO_CLIPRECTS 0x1 +#define INTEL_BATCH_CLIPRECTS 0x2 + +#define BEGIN_BATCH( dwords, relocs ) \ + brw->winsys->batch_start(brw->winsys, dwords, relocs) + +#define OUT_BATCH( dword ) \ + brw->winsys->batch_dword(brw->winsys, dword) + +#define OUT_RELOC( buf, flags, delta ) \ + brw->winsys->batch_reloc(brw->winsys, buf, flags, delta) + +#define ADVANCE_BATCH() \ + brw->winsys->batch_end( brw->winsys ) + +/* XXX: this is bogus - need proper handling for out-of-memory in batchbuffer. + */ +#define FLUSH_BATCH(fence) do { \ + brw->winsys->batch_flush(brw->winsys, fence); \ + brw->hardware_dirty = ~0; \ +} while (0) + +#define BRW_BATCH_STRUCT(brw, s) brw_batchbuffer_data( brw->winsys, (s), sizeof(*(s))) + +#endif diff --git a/src/gallium/drivers/i965simple/brw_blit.c b/src/gallium/drivers/i965simple/brw_blit.c new file mode 100644 index 0000000000..8494f70493 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_blit.c @@ -0,0 +1,218 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include <stdio.h> +#include <errno.h> + +#include "brw_batch.h" +#include "brw_blit.h" +#include "brw_context.h" +#include "brw_reg.h" + +#include "pipe/p_context.h" +#include "pipe/p_winsys.h" + +#define FILE_DEBUG_FLAG DEBUG_BLIT + +void brw_fill_blit(struct brw_context *brw, + unsigned cpp, + short dst_pitch, + struct pipe_buffer *dst_buffer, + unsigned dst_offset, + boolean dst_tiled, + short x, short y, + short w, short h, + unsigned color) +{ + unsigned BR13, CMD; + BATCH_LOCALS; + + dst_pitch *= cpp; + + switch(cpp) { + case 1: + case 2: + case 3: + BR13 = (0xF0 << 16) | (1<<24); + CMD = XY_COLOR_BLT_CMD; + break; + case 4: + BR13 = (0xF0 << 16) | (1<<24) | (1<<25); + CMD = XY_COLOR_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB; + break; + default: + return; + } + + if (dst_tiled) { + CMD |= XY_DST_TILED; + dst_pitch /= 4; + } + + BEGIN_BATCH(6, INTEL_BATCH_NO_CLIPRECTS); + OUT_BATCH( CMD ); + OUT_BATCH( dst_pitch | BR13 ); + OUT_BATCH( (y << 16) | x ); + OUT_BATCH( ((y+h) << 16) | (x+w) ); + OUT_RELOC( dst_buffer, BRW_BUFFER_ACCESS_WRITE, dst_offset ); + OUT_BATCH( color ); + ADVANCE_BATCH(); +} + +static unsigned translate_raster_op(unsigned logicop) +{ + switch(logicop) { + case PIPE_LOGICOP_CLEAR: return 0x00; + case PIPE_LOGICOP_AND: return 0x88; + case PIPE_LOGICOP_AND_REVERSE: return 0x44; + case PIPE_LOGICOP_COPY: return 0xCC; + case PIPE_LOGICOP_AND_INVERTED: return 0x22; + case PIPE_LOGICOP_NOOP: return 0xAA; + case PIPE_LOGICOP_XOR: return 0x66; + case PIPE_LOGICOP_OR: return 0xEE; + case PIPE_LOGICOP_NOR: return 0x11; + case PIPE_LOGICOP_EQUIV: return 0x99; + case PIPE_LOGICOP_INVERT: return 0x55; + case PIPE_LOGICOP_OR_REVERSE: return 0xDD; + case PIPE_LOGICOP_COPY_INVERTED: return 0x33; + case PIPE_LOGICOP_OR_INVERTED: return 0xBB; + case PIPE_LOGICOP_NAND: return 0x77; + case PIPE_LOGICOP_SET: return 0xFF; + default: return 0; + } +} + + +/* Copy BitBlt + */ +void brw_copy_blit(struct brw_context *brw, + unsigned do_flip, + unsigned cpp, + short src_pitch, + struct pipe_buffer *src_buffer, + unsigned src_offset, + boolean src_tiled, + short dst_pitch, + struct pipe_buffer *dst_buffer, + unsigned dst_offset, + boolean dst_tiled, + short src_x, short src_y, + short dst_x, short dst_y, + short w, short h, + unsigned logic_op) +{ + unsigned CMD, BR13; + int dst_y2 = dst_y + h; + int dst_x2 = dst_x + w; + BATCH_LOCALS; + + + DBG("%s src:buf(%d)/%d %d,%d dst:buf(%d)/%d %d,%d sz:%dx%d op:%d\n", + __FUNCTION__, + src_buffer, src_pitch, src_x, src_y, + dst_buffer, dst_pitch, dst_x, dst_y, + w,h,logic_op); + + assert( logic_op - PIPE_LOGICOP_CLEAR >= 0 ); + assert( logic_op - PIPE_LOGICOP_CLEAR < 0x10 ); + + src_pitch *= cpp; + dst_pitch *= cpp; + + switch(cpp) { + case 1: + case 2: + case 3: + BR13 = (translate_raster_op(logic_op) << 16) | (1<<24); + CMD = XY_SRC_COPY_BLT_CMD; + break; + case 4: + BR13 = (translate_raster_op(logic_op) << 16) | (1<<24) | + (1<<25); + CMD = XY_SRC_COPY_BLT_CMD | 
XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB; + break; + default: + return; + } + + if (src_tiled) { + CMD |= XY_SRC_TILED; + src_pitch /= 4; + } + + if (dst_tiled) { + CMD |= XY_DST_TILED; + dst_pitch /= 4; + } + + if (dst_y2 < dst_y || + dst_x2 < dst_x) { + return; + } + + dst_pitch &= 0xffff; + src_pitch &= 0xffff; + + /* Initial y values don't seem to work with negative pitches. If + * we adjust the offsets manually (below), it seems to work fine. + * + * On the other hand, if we always adjust, the hardware doesn't + * know which blit directions to use, so overlapping copypixels get + * the wrong result. + */ + if (dst_pitch > 0 && src_pitch > 0) { + BEGIN_BATCH(8, INTEL_BATCH_NO_CLIPRECTS); + OUT_BATCH( CMD ); + OUT_BATCH( dst_pitch | BR13 ); + OUT_BATCH( (dst_y << 16) | dst_x ); + OUT_BATCH( (dst_y2 << 16) | dst_x2 ); + OUT_RELOC( dst_buffer, BRW_BUFFER_ACCESS_WRITE, + dst_offset ); + OUT_BATCH( (src_y << 16) | src_x ); + OUT_BATCH( src_pitch ); + OUT_RELOC( src_buffer, BRW_BUFFER_ACCESS_READ, + src_offset ); + ADVANCE_BATCH(); + } + else { + BEGIN_BATCH(8, INTEL_BATCH_NO_CLIPRECTS); + OUT_BATCH( CMD ); + OUT_BATCH( (dst_pitch & 0xffff) | BR13 ); + OUT_BATCH( (0 << 16) | dst_x ); + OUT_BATCH( (h << 16) | dst_x2 ); + OUT_RELOC( dst_buffer, BRW_BUFFER_ACCESS_WRITE, + dst_offset + dst_y * dst_pitch ); + OUT_BATCH( (src_pitch & 0xffff) ); + OUT_RELOC( src_buffer, BRW_BUFFER_ACCESS_READ, + src_offset + src_y * src_pitch ); + ADVANCE_BATCH(); + } +} + + + diff --git a/src/gallium/drivers/i965simple/brw_blit.h b/src/gallium/drivers/i965simple/brw_blit.h new file mode 100644 index 0000000000..111c5d91d3 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_blit.h @@ -0,0 +1,33 @@ +#ifndef BRW_BLIT_H +#define BRW_BLIT_H + +#include "pipe/p_compiler.h" + +struct pipe_buffer; +struct brw_context; + +void brw_fill_blit(struct brw_context *intel, + unsigned cpp, + short dst_pitch, + struct pipe_buffer *dst_buffer, + unsigned dst_offset, + boolean dst_tiled, + short x, short y, + short w, short h, + unsigned color); +void brw_copy_blit(struct brw_context *intel, + unsigned do_flip, + unsigned cpp, + short src_pitch, + struct pipe_buffer *src_buffer, + unsigned src_offset, + boolean src_tiled, + short dst_pitch, + struct pipe_buffer *dst_buffer, + unsigned dst_offset, + boolean dst_tiled, + short src_x, short src_y, + short dst_x, short dst_y, + short w, short h, + unsigned logic_op); +#endif diff --git a/src/gallium/drivers/i965simple/brw_cc.c b/src/gallium/drivers/i965simple/brw_cc.c new file mode 100644 index 0000000000..79d4150383 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_cc.c @@ -0,0 +1,269 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "brw_util.h" + + +static int brw_translate_compare_func(int func) +{ + switch(func) { + case PIPE_FUNC_NEVER: + return BRW_COMPAREFUNCTION_NEVER; + case PIPE_FUNC_LESS: + return BRW_COMPAREFUNCTION_LESS; + case PIPE_FUNC_LEQUAL: + return BRW_COMPAREFUNCTION_LEQUAL; + case PIPE_FUNC_GREATER: + return BRW_COMPAREFUNCTION_GREATER; + case PIPE_FUNC_GEQUAL: + return BRW_COMPAREFUNCTION_GEQUAL; + case PIPE_FUNC_NOTEQUAL: + return BRW_COMPAREFUNCTION_NOTEQUAL; + case PIPE_FUNC_EQUAL: + return BRW_COMPAREFUNCTION_EQUAL; + case PIPE_FUNC_ALWAYS: + return BRW_COMPAREFUNCTION_ALWAYS; + } + + debug_printf("Unknown value in %s: %x\n", __FUNCTION__, func); + return BRW_COMPAREFUNCTION_ALWAYS; +} + +static int brw_translate_stencil_op(int op) +{ + switch(op) { + case PIPE_STENCIL_OP_KEEP: + return BRW_STENCILOP_KEEP; + case PIPE_STENCIL_OP_ZERO: + return BRW_STENCILOP_ZERO; + case PIPE_STENCIL_OP_REPLACE: + return BRW_STENCILOP_REPLACE; + case PIPE_STENCIL_OP_INCR: + return BRW_STENCILOP_INCRSAT; + case PIPE_STENCIL_OP_DECR: + return BRW_STENCILOP_DECRSAT; + case PIPE_STENCIL_OP_INCR_WRAP: + return BRW_STENCILOP_INCR; + case PIPE_STENCIL_OP_DECR_WRAP: + return BRW_STENCILOP_DECR; + case PIPE_STENCIL_OP_INVERT: + return BRW_STENCILOP_INVERT; + default: + return BRW_STENCILOP_ZERO; + } +} + + +static int brw_translate_logic_op(int opcode) +{ + switch(opcode) { + case PIPE_LOGICOP_CLEAR: + return BRW_LOGICOPFUNCTION_CLEAR; + case PIPE_LOGICOP_AND: + return BRW_LOGICOPFUNCTION_AND; + case PIPE_LOGICOP_AND_REVERSE: + return BRW_LOGICOPFUNCTION_AND_REVERSE; + case PIPE_LOGICOP_COPY: + return BRW_LOGICOPFUNCTION_COPY; + case PIPE_LOGICOP_COPY_INVERTED: + return BRW_LOGICOPFUNCTION_COPY_INVERTED; + case PIPE_LOGICOP_AND_INVERTED: + return BRW_LOGICOPFUNCTION_AND_INVERTED; + case PIPE_LOGICOP_NOOP: + return BRW_LOGICOPFUNCTION_NOOP; + case PIPE_LOGICOP_XOR: + return BRW_LOGICOPFUNCTION_XOR; + case PIPE_LOGICOP_OR: + return BRW_LOGICOPFUNCTION_OR; + case PIPE_LOGICOP_OR_INVERTED: + return BRW_LOGICOPFUNCTION_OR_INVERTED; + case PIPE_LOGICOP_NOR: + return BRW_LOGICOPFUNCTION_NOR; + case PIPE_LOGICOP_EQUIV: + return BRW_LOGICOPFUNCTION_EQUIV; + case PIPE_LOGICOP_INVERT: + return BRW_LOGICOPFUNCTION_INVERT; + case PIPE_LOGICOP_OR_REVERSE: + return BRW_LOGICOPFUNCTION_OR_REVERSE; + case PIPE_LOGICOP_NAND: + return BRW_LOGICOPFUNCTION_NAND; + case PIPE_LOGICOP_SET: + return BRW_LOGICOPFUNCTION_SET; + default: + return BRW_LOGICOPFUNCTION_SET; + } +} + + +static void upload_cc_vp( struct brw_context *brw ) +{ + struct brw_cc_viewport ccv; + + memset(&ccv, 0, sizeof(ccv)); + + ccv.min_depth = 0.0; + ccv.max_depth = 1.0; + + brw->cc.vp_gs_offset = brw_cache_data( &brw->cache[BRW_CC_VP], &ccv ); +} + +const struct brw_tracked_state brw_cc_vp = { + .dirty = { 
+ .brw = BRW_NEW_SCENE, + .cache = 0 + }, + .update = upload_cc_vp +}; + + +static void upload_cc_unit( struct brw_context *brw ) +{ + struct brw_cc_unit_state cc; + + memset(&cc, 0, sizeof(cc)); + + /* BRW_NEW_DEPTH_STENCIL */ + if (brw->attribs.DepthStencil->stencil[0].enabled) { + cc.cc0.stencil_enable = brw->attribs.DepthStencil->stencil[0].enabled; + cc.cc0.stencil_func = brw_translate_compare_func(brw->attribs.DepthStencil->stencil[0].func); + cc.cc0.stencil_fail_op = brw_translate_stencil_op(brw->attribs.DepthStencil->stencil[0].fail_op); + cc.cc0.stencil_pass_depth_fail_op = brw_translate_stencil_op( + brw->attribs.DepthStencil->stencil[0].zfail_op); + cc.cc0.stencil_pass_depth_pass_op = brw_translate_stencil_op( + brw->attribs.DepthStencil->stencil[0].zpass_op); + cc.cc1.stencil_ref = brw->attribs.DepthStencil->stencil[0].ref_value; + cc.cc1.stencil_write_mask = brw->attribs.DepthStencil->stencil[0].write_mask; + cc.cc1.stencil_test_mask = brw->attribs.DepthStencil->stencil[0].value_mask; + + if (brw->attribs.DepthStencil->stencil[1].enabled) { + cc.cc0.bf_stencil_enable = brw->attribs.DepthStencil->stencil[1].enabled; + cc.cc0.bf_stencil_func = brw_translate_compare_func( + brw->attribs.DepthStencil->stencil[1].func); + cc.cc0.bf_stencil_fail_op = brw_translate_stencil_op( + brw->attribs.DepthStencil->stencil[1].fail_op); + cc.cc0.bf_stencil_pass_depth_fail_op = brw_translate_stencil_op( + brw->attribs.DepthStencil->stencil[1].zfail_op); + cc.cc0.bf_stencil_pass_depth_pass_op = brw_translate_stencil_op( + brw->attribs.DepthStencil->stencil[1].zpass_op); + cc.cc1.bf_stencil_ref = brw->attribs.DepthStencil->stencil[1].ref_value; + cc.cc2.bf_stencil_write_mask = brw->attribs.DepthStencil->stencil[1].write_mask; + cc.cc2.bf_stencil_test_mask = brw->attribs.DepthStencil->stencil[1].value_mask; + } + + /* Not really sure about this: + */ + if (brw->attribs.DepthStencil->stencil[0].write_mask || + brw->attribs.DepthStencil->stencil[1].write_mask) + cc.cc0.stencil_write_enable = 1; + } + + /* BRW_NEW_BLEND */ + if (brw->attribs.Blend->logicop_enable) { + cc.cc2.logicop_enable = 1; + cc.cc5.logicop_func = brw_translate_logic_op( brw->attribs.Blend->logicop_func ); + } + else if (brw->attribs.Blend->blend_enable) { + int eqRGB = brw->attribs.Blend->rgb_func; + int eqA = brw->attribs.Blend->alpha_func; + int srcRGB = brw->attribs.Blend->rgb_src_factor; + int dstRGB = brw->attribs.Blend->rgb_dst_factor; + int srcA = brw->attribs.Blend->alpha_src_factor; + int dstA = brw->attribs.Blend->alpha_dst_factor; + + if (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX) { + srcRGB = dstRGB = PIPE_BLENDFACTOR_ONE; + } + + if (eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX) { + srcA = dstA = PIPE_BLENDFACTOR_ONE; + } + + cc.cc6.dest_blend_factor = brw_translate_blend_factor(dstRGB); + cc.cc6.src_blend_factor = brw_translate_blend_factor(srcRGB); + cc.cc6.blend_function = brw_translate_blend_equation( eqRGB ); + + cc.cc5.ia_dest_blend_factor = brw_translate_blend_factor(dstA); + cc.cc5.ia_src_blend_factor = brw_translate_blend_factor(srcA); + cc.cc5.ia_blend_function = brw_translate_blend_equation( eqA ); + + cc.cc3.blend_enable = 1; + cc.cc3.ia_blend_enable = (srcA != srcRGB || + dstA != dstRGB || + eqA != eqRGB); + } + + /* BRW_NEW_ALPHATEST + */ + if (brw->attribs.DepthStencil->alpha.enabled) { + cc.cc3.alpha_test = 1; + cc.cc3.alpha_test_func = + brw_translate_compare_func(brw->attribs.DepthStencil->alpha.func); + + cc.cc7.alpha_ref.ub[0] = float_to_ubyte(brw->attribs.DepthStencil->alpha.ref); + + 
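+      /* The reference value is compared as an 8-bit unorm, hence the
+       * float_to_ubyte() conversion above and the UNORM8 format below.
+       */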
cc.cc3.alpha_test_format = BRW_ALPHATEST_FORMAT_UNORM8; + } + + if (brw->attribs.Blend->dither) { + cc.cc5.dither_enable = 1; + cc.cc6.y_dither_offset = 0; + cc.cc6.x_dither_offset = 0; + } + + if (brw->attribs.DepthStencil->depth.enabled) { + cc.cc2.depth_test = brw->attribs.DepthStencil->depth.enabled; + cc.cc2.depth_test_function = brw_translate_compare_func(brw->attribs.DepthStencil->depth.func); + cc.cc2.depth_write_enable = brw->attribs.DepthStencil->depth.writemask; + } + + /* CACHE_NEW_CC_VP */ + cc.cc4.cc_viewport_state_offset = brw->cc.vp_gs_offset >> 5; + + if (BRW_DEBUG & DEBUG_STATS) + cc.cc5.statistics_enable = 1; + + brw->cc.state_gs_offset = brw_cache_data( &brw->cache[BRW_CC_UNIT], &cc ); +} + +const struct brw_tracked_state brw_cc_unit = { + .dirty = { + .brw = BRW_NEW_DEPTH_STENCIL | BRW_NEW_BLEND | BRW_NEW_ALPHA_TEST, + .cache = CACHE_NEW_CC_VP + }, + .update = upload_cc_unit +}; + diff --git a/src/gallium/drivers/i965simple/brw_clip.c b/src/gallium/drivers/i965simple/brw_clip.c new file mode 100644 index 0000000000..268124cc53 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip.c @@ -0,0 +1,206 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_state.h" +#include "brw_clip.h" + +#define FRONT_UNFILLED_BIT 0x1 +#define BACK_UNFILLED_BIT 0x2 + + +static void compile_clip_prog( struct brw_context *brw, + struct brw_clip_prog_key *key ) +{ + struct brw_clip_compile c; + const unsigned *program; + unsigned program_size; + unsigned delta; + unsigned i; + + memset(&c, 0, sizeof(c)); + + /* Begin the compilation: + */ + brw_init_compile(&c.func); + + c.func.single_program_flow = 1; + + c.key = *key; + + + /* Need to locate the two positions present in vertex + header. + * These are currently hardcoded: + */ + c.header_position_offset = ATTR_SIZE; + + for (i = 0, delta = REG_SIZE; i < PIPE_MAX_SHADER_OUTPUTS; i++) + if (c.key.attrs & (1<<i)) { + c.offset[i] = delta; + delta += ATTR_SIZE; + } + + c.nr_attrs = brw_count_bits(c.key.attrs); + c.nr_regs = (c.nr_attrs + 1) / 2 + 1; /* are vertices packed, or reg-aligned? 
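+ * (Presumably packed: two ATTR_SIZE attributes per register, with one
+ * extra register reserved for the vertex header.)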
*/ + c.nr_bytes = c.nr_regs * REG_SIZE; + + c.prog_data.clip_mode = c.key.clip_mode; /* XXX */ + + /* For some reason the thread is spawned with only 4 channels + * unmasked. + */ + brw_set_mask_control(&c.func, BRW_MASK_DISABLE); + + + /* Would ideally have the option of producing a program which could + * do all three: + */ + switch (key->primitive) { + case PIPE_PRIM_TRIANGLES: +#if 0 + if (key->do_unfilled) + brw_emit_unfilled_clip( &c ); + else +#endif + brw_emit_tri_clip( &c ); + break; + case PIPE_PRIM_LINES: + brw_emit_line_clip( &c ); + break; + case PIPE_PRIM_POINTS: + brw_emit_point_clip( &c ); + break; + default: + assert(0); + return; + } + + + + /* get the program + */ + program = brw_get_program(&c.func, &program_size); + + /* Upload + */ + brw->clip.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_CLIP_PROG], + &c.key, + sizeof(c.key), + program, + program_size, + &c.prog_data, + &brw->clip.prog_data ); +} + + +static boolean search_cache( struct brw_context *brw, + struct brw_clip_prog_key *key ) +{ + return brw_search_cache(&brw->cache[BRW_CLIP_PROG], + key, sizeof(*key), + &brw->clip.prog_data, + &brw->clip.prog_gs_offset); +} + + + + +/* Calculate interpolants for triangle and line rasterization. + */ +static void upload_clip_prog(struct brw_context *brw) +{ + struct brw_clip_prog_key key; + + memset(&key, 0, sizeof(key)); + + /* Populate the key: + */ + /* BRW_NEW_REDUCED_PRIMITIVE */ + key.primitive = brw->reduced_primitive; + /* CACHE_NEW_VS_PROG */ + key.attrs = brw->vs.prog_data->outputs_written; + /* BRW_NEW_RASTER */ + key.do_flat_shading = (brw->attribs.Raster->flatshade); + /* BRW_NEW_CLIP */ + key.nr_userclip = brw->attribs.Clip.nr; /* XXX */ + +#if 0 + key.clip_mode = BRW_CLIPMODE_NORMAL; + + if (key.primitive == PIPE_PRIM_TRIANGLES) { + if (brw->attribs.Raster->cull_mode == PIPE_WINDING_BOTH) + key.clip_mode = BRW_CLIPMODE_REJECT_ALL; + else { + if (brw->attribs.Raster->fill_cw != PIPE_POLYGON_MODE_FILL || + brw->attribs.Raster->fill_ccw != PIPE_POLYGON_MODE_FILL) + key.do_unfilled = 1; + + /* Most cases the fixed function units will handle. Cases where + * one or more polygon faces are unfilled will require help: + */ + if (key.do_unfilled) { + key.clip_mode = BRW_CLIPMODE_CLIP_NON_REJECTED; + + if (brw->attribs.Raster->offset_cw || + brw->attribs.Raster->offset_ccw) { + key.offset_units = brw->attribs.Raster->offset_units; + key.offset_factor = brw->attribs.Raster->offset_scale; + } + key.fill_ccw = brw->attribs.Raster->fill_ccw; + key.fill_cw = brw->attribs.Raster->fill_cw; + key.offset_ccw = brw->attribs.Raster->offset_ccw; + key.offset_cw = brw->attribs.Raster->offset_cw; + if (brw->attribs.Raster->light_twoside && + key.fill_cw != CLIP_CULL) + key.copy_bfc_cw = 1; + } + } + } +#else + key.clip_mode = BRW_CLIPMODE_ACCEPT_ALL; +#endif + + if (!search_cache(brw, &key)) + compile_clip_prog( brw, &key ); +} + +const struct brw_tracked_state brw_clip_prog = { + .dirty = { + .brw = (BRW_NEW_RASTERIZER | + BRW_NEW_CLIP | + BRW_NEW_REDUCED_PRIMITIVE), + .cache = CACHE_NEW_VS_PROG + }, + .update = upload_clip_prog +}; diff --git a/src/gallium/drivers/i965simple/brw_clip.h b/src/gallium/drivers/i965simple/brw_clip.h new file mode 100644 index 0000000000..d70fc094ff --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip.h @@ -0,0 +1,170 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef BRW_CLIP_H +#define BRW_CLIP_H + + +#include "brw_context.h" +#include "brw_eu.h" + +#define MAX_VERTS (3+6+6) + +/* Note that if unfilled primitives are being emitted, we have to fix + * up polygon offset and flatshading at this point: + */ +struct brw_clip_prog_key { + unsigned attrs:32; + unsigned primitive:4; + unsigned nr_userclip:3; + unsigned do_flat_shading:1; + unsigned do_unfilled:1; + unsigned fill_cw:2; /* includes cull information */ + unsigned fill_ccw:2; /* includes cull information */ + unsigned offset_cw:1; + unsigned offset_ccw:1; + unsigned pad0:17; + + unsigned copy_bfc_cw:1; + unsigned copy_bfc_ccw:1; + unsigned clip_mode:3; + unsigned pad1:27; + + float offset_factor; + float offset_units; +}; + + +#define CLIP_LINE 0 +#define CLIP_POINT 1 +#define CLIP_FILL 2 +#define CLIP_CULL 3 + + +#define PRIM_MASK (0x1f) + +struct brw_clip_compile { + struct brw_compile func; + struct brw_clip_prog_key key; + struct brw_clip_prog_data prog_data; + + struct { + struct brw_reg R0; + struct brw_reg vertex[MAX_VERTS]; + + struct brw_reg t; + struct brw_reg t0, t1; + struct brw_reg dp0, dp1; + + struct brw_reg dpPrev; + struct brw_reg dp; + struct brw_reg loopcount; + struct brw_reg nr_verts; + struct brw_reg planemask; + + struct brw_reg inlist; + struct brw_reg outlist; + struct brw_reg freelist; + + struct brw_reg dir; + struct brw_reg tmp0, tmp1; + struct brw_reg offset; + + struct brw_reg fixed_planes; + struct brw_reg plane_equation; + } reg; + + /* 3 different ways of expressing vertex size: + */ + unsigned nr_attrs; + unsigned nr_regs; + unsigned nr_bytes; + + unsigned first_tmp; + unsigned last_tmp; + + boolean need_direction; + + unsigned last_mrf; + + unsigned header_position_offset; + unsigned offset[PIPE_MAX_ATTRIBS]; +}; + +#define ATTR_SIZE (4*4) + +/* Points are only culled, so no need for a clip routine, however it + * works out easier to have a dummy one. 
+ */ +void brw_emit_unfilled_clip( struct brw_clip_compile *c ); +void brw_emit_tri_clip( struct brw_clip_compile *c ); +void brw_emit_line_clip( struct brw_clip_compile *c ); +void brw_emit_point_clip( struct brw_clip_compile *c ); + +/* brw_clip_tri.c, for use by the unfilled clip routine: + */ +void brw_clip_tri_init_vertices( struct brw_clip_compile *c ); +void brw_clip_tri_flat_shade( struct brw_clip_compile *c ); +void brw_clip_tri( struct brw_clip_compile *c ); +void brw_clip_tri_emit_polygon( struct brw_clip_compile *c ); +void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, + unsigned nr_verts ); + + +/* Utils: + */ + +void brw_clip_interp_vertex( struct brw_clip_compile *c, + struct brw_indirect dest_ptr, + struct brw_indirect v0_ptr, /* from */ + struct brw_indirect v1_ptr, /* to */ + struct brw_reg t0, + boolean force_edgeflag ); + +void brw_clip_init_planes( struct brw_clip_compile *c ); + +void brw_clip_emit_vue(struct brw_clip_compile *c, + struct brw_indirect vert, + boolean allocate, + boolean eot, + unsigned header); + +void brw_clip_kill_thread(struct brw_clip_compile *c); + +struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c ); +struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c ); + +void brw_clip_copy_colors( struct brw_clip_compile *c, + unsigned to, unsigned from ); + +void brw_clip_init_clipmask( struct brw_clip_compile *c ); + +#endif diff --git a/src/gallium/drivers/i965simple/brw_clip_line.c b/src/gallium/drivers/i965simple/brw_clip_line.c new file mode 100644 index 0000000000..75d9e5fcda --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip_line.c @@ -0,0 +1,245 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_clip.h" + + + +static void brw_clip_line_alloc_regs( struct brw_clip_compile *c ) +{ + unsigned i = 0,j; + + /* Register usage is static, precompute here: + */ + c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; + + if (c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec4_grf(i, 0); + i += (6 + c->key.nr_userclip + 1) / 2; + + c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2; + } + else + c->prog_data.curb_read_length = 0; + + + /* Payload vertices plus space for more generated vertices: + */ + for (j = 0; j < 4; j++) { + c->reg.vertex[j] = brw_vec4_grf(i, 0); + i += c->nr_regs; + } + + c->reg.t = brw_vec1_grf(i, 0); + c->reg.t0 = brw_vec1_grf(i, 1); + c->reg.t1 = brw_vec1_grf(i, 2); + c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD); + c->reg.plane_equation = brw_vec4_grf(i, 4); + i++; + + c->reg.dp0 = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */ + c->reg.dp1 = brw_vec1_grf(i, 4); + i++; + + if (!c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec8_grf(i, 0); + i++; + } + + + c->first_tmp = i; + c->last_tmp = i; + + c->prog_data.urb_read_length = c->nr_regs; /* ? */ + c->prog_data.total_grf = i; +} + + + +/* Line clipping, more or less following the following algorithm: + * + * for (p=0;p<MAX_PLANES;p++) { + * if (clipmask & (1 << p)) { + * float dp0 = DOTPROD( vtx0, plane[p] ); + * float dp1 = DOTPROD( vtx1, plane[p] ); + * + * if (IS_NEGATIVE(dp1)) { + * float t = dp1 / (dp1 - dp0); + * if (t > t1) t1 = t; + * } else { + * float t = dp0 / (dp0 - dp1); + * if (t > t0) t0 = t; + * } + * + * if (t0 + t1 >= 1.0) + * return; + * } + * } + * + * interp( ctx, newvtx0, vtx0, vtx1, t0 ); + * interp( ctx, newvtx1, vtx1, vtx0, t1 ); + * + */ +static void clip_and_emit_line( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_indirect vtx0 = brw_indirect(0, 0); + struct brw_indirect vtx1 = brw_indirect(1, 0); + struct brw_indirect newvtx0 = brw_indirect(2, 0); + struct brw_indirect newvtx1 = brw_indirect(3, 0); + struct brw_indirect plane_ptr = brw_indirect(4, 0); + struct brw_instruction *plane_loop; + struct brw_instruction *plane_active; + struct brw_instruction *is_negative; + struct brw_instruction *is_neg2; + struct brw_instruction *not_culled; + struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD); + + brw_MOV(p, get_addr_reg(vtx0), brw_address(c->reg.vertex[0])); + brw_MOV(p, get_addr_reg(vtx1), brw_address(c->reg.vertex[1])); + brw_MOV(p, get_addr_reg(newvtx0), brw_address(c->reg.vertex[2])); + brw_MOV(p, get_addr_reg(newvtx1), brw_address(c->reg.vertex[3])); + brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c)); + + /* Note: init t0, t1 together: + */ + brw_MOV(p, vec2(c->reg.t0), brw_imm_f(0)); + + brw_clip_init_planes(c); + brw_clip_init_clipmask(c); + + /* -ve rhw workaround */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), + brw_imm_ud(1<<20)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(0x3f)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + plane_loop = brw_DO(p, BRW_EXECUTE_1); + { + /* if (planemask & 1) + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_AND(p, v1_null_ud, c->reg.planemask, 
brw_imm_ud(1)); + + plane_active = brw_IF(p, BRW_EXECUTE_1); + { + if (c->key.nr_userclip) + brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0)); + else + brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0)); + +#if 0 + /* dp = DP4(vtx->position, plane) + */ + brw_DP4(p, vec4(c->reg.dp0), deref_4f(vtx0, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation); + + /* if (IS_NEGATIVE(dp1)) + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_L); + brw_DP4(p, vec4(c->reg.dp1), deref_4f(vtx1, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation); +#else + #warning "disabled" +#endif + is_negative = brw_IF(p, BRW_EXECUTE_1); + { + brw_ADD(p, c->reg.t, c->reg.dp1, negate(c->reg.dp0)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp1); + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t1 ); + brw_MOV(p, c->reg.t1, c->reg.t); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } + is_negative = brw_ELSE(p, is_negative); + { + /* Coming back in. We know that both cannot be negative + * because the line would have been culled in that case. + */ + + /* If both are positive, do nothing */ + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0)); + is_neg2 = brw_IF(p, BRW_EXECUTE_1); + { + brw_ADD(p, c->reg.t, c->reg.dp0, negate(c->reg.dp1)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp0); + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_G, c->reg.t, c->reg.t0 ); + brw_MOV(p, c->reg.t0, c->reg.t); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } + brw_ENDIF(p, is_neg2); + } + brw_ENDIF(p, is_negative); + } + brw_ENDIF(p, plane_active); + + /* plane_ptr++; + */ + brw_ADD(p, get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c)); + + /* while (planemask>>=1) != 0 + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1)); + } + brw_WHILE(p, plane_loop); + + brw_ADD(p, c->reg.t, c->reg.t0, c->reg.t1); + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.t, brw_imm_f(1.0)); + not_culled = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_interp_vertex(c, newvtx0, vtx0, vtx1, c->reg.t0, FALSE); + brw_clip_interp_vertex(c, newvtx1, vtx1, vtx0, c->reg.t1, FALSE); + + brw_clip_emit_vue(c, newvtx0, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START); + brw_clip_emit_vue(c, newvtx1, 0, 1, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END); + } + brw_ENDIF(p, not_culled); + brw_clip_kill_thread(c); +} + + + +void brw_emit_line_clip( struct brw_clip_compile *c ) +{ + brw_clip_line_alloc_regs(c); + + if (c->key.do_flat_shading) + brw_clip_copy_colors(c, 0, 1); + + clip_and_emit_line(c); +} diff --git a/src/gallium/drivers/i965simple/brw_clip_point.c b/src/gallium/drivers/i965simple/brw_clip_point.c new file mode 100644 index 0000000000..6fce7210d1 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip_point.c @@ -0,0 +1,47 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
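/* For illustration only (not part of this commit): a scalar C rendition of
 * the parametric line clip that clip_and_emit_line() above encodes in GEN4
 * EU instructions.  The plane representation (float[4], with
 * dot(plane, vertex) >= 0 meaning "inside") and the function name are
 * assumptions of this sketch.
 */
static int
clip_line(const float v0[4], const float v1[4],
          const float planes[][4], int nr_planes, unsigned clipmask,
          float *t0_out, float *t1_out)
{
   float t0 = 0.0f, t1 = 0.0f;   /* parametric amount clipped off each end */
   int p;

   for (p = 0; p < nr_planes; p++) {
      if (clipmask & (1u << p)) {
         float dp0 = planes[p][0]*v0[0] + planes[p][1]*v0[1] +
                     planes[p][2]*v0[2] + planes[p][3]*v0[3];
         float dp1 = planes[p][0]*v1[0] + planes[p][1]*v1[1] +
                     planes[p][2]*v1[2] + planes[p][3]*v1[3];

         if (dp1 < 0.0f) {
            float t = dp1 / (dp1 - dp0);   /* v1 end is outside: trim it */
            if (t > t1) t1 = t;
         } else if (dp0 < 0.0f) {
            float t = dp0 / (dp0 - dp1);   /* v0 end is outside: trim it */
            if (t > t0) t0 = t;
         }

         if (t0 + t1 >= 1.0f)
            return 0;                      /* nothing left: cull the line */
      }
   }

   *t0_out = t0;   /* caller emits newvtx0 = lerp(v0, v1, t0)             */
   *t1_out = t1;   /*          and    newvtx1 = lerp(v1, v0, t1)          */
   return 1;
}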
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_clip.h" + + +/* Point clipping, nothing to do? + */ +void brw_emit_point_clip( struct brw_clip_compile *c ) +{ + /* Send an empty message to kill the thread: + */ + brw_clip_tri_alloc_regs(c, 0); + brw_clip_kill_thread(c); +} diff --git a/src/gallium/drivers/i965simple/brw_clip_state.c b/src/gallium/drivers/i965simple/brw_clip_state.c new file mode 100644 index 0000000000..8e78dd51be --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip_state.c @@ -0,0 +1,93 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "util/u_math.h" +#include "util/u_memory.h" + + +static void upload_clip_unit( struct brw_context *brw ) +{ + struct brw_clip_unit_state clip; + + memset(&clip, 0, sizeof(clip)); + + /* CACHE_NEW_CLIP_PROG */ + clip.thread0.grf_reg_count = + align(brw->clip.prog_data->total_grf, 16) / 16 - 1; + clip.thread0.kernel_start_pointer = brw->clip.prog_gs_offset >> 6; + clip.thread3.urb_entry_read_length = brw->clip.prog_data->urb_read_length; + clip.thread3.const_urb_entry_read_length = brw->clip.prog_data->curb_read_length; + clip.clip5.clip_mode = brw->clip.prog_data->clip_mode; + + /* BRW_NEW_CURBE_OFFSETS */ + clip.thread3.const_urb_entry_read_offset = brw->curbe.clip_start * 2; + + /* BRW_NEW_URB_FENCE */ + clip.thread4.nr_urb_entries = brw->urb.nr_clip_entries; + clip.thread4.urb_entry_allocation_size = brw->urb.vsize - 1; + clip.thread4.max_threads = 1; /* 2 threads */ + + if (BRW_DEBUG & DEBUG_STATS) + clip.thread4.stats_enable = 1; + + /* CONSTANT */ + clip.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; + clip.thread1.single_program_flow = 1; + clip.thread3.dispatch_grf_start_reg = 1; + clip.thread3.urb_entry_read_offset = 0; + clip.clip5.userclip_enable_flags = 0x7f; + clip.clip5.userclip_must_clip = 1; + clip.clip5.guard_band_enable = 0; + clip.clip5.viewport_z_clip_enable = 1; + clip.clip5.viewport_xy_clip_enable = 1; + clip.clip5.vertex_position_space = BRW_CLIP_NDCSPACE; + clip.clip5.api_mode = BRW_CLIP_API_OGL; + clip.clip6.clipper_viewport_state_ptr = 0; + clip.viewport_xmin = -1; + clip.viewport_xmax = 1; + clip.viewport_ymin = -1; + clip.viewport_ymax = 1; + + brw->clip.state_gs_offset = brw_cache_data( &brw->cache[BRW_CLIP_UNIT], &clip ); +} + + +const struct brw_tracked_state brw_clip_unit = { + .dirty = { + .brw = (BRW_NEW_CURBE_OFFSETS | + BRW_NEW_URB_FENCE), + .cache = CACHE_NEW_CLIP_PROG + }, + .update = upload_clip_unit +}; diff --git a/src/gallium/drivers/i965simple/brw_clip_tri.c b/src/gallium/drivers/i965simple/brw_clip_tri.c new file mode 100644 index 0000000000..c5da7b825e --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip_tri.c @@ -0,0 +1,566 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
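/* For illustration only (not part of this commit): the shifts and "minus
 * one" adjustments in upload_clip_unit() above are hardware field encodings,
 * not arithmetic quirks.  This hypothetical helper spells out the units
 * being assumed: kernel pointers in 64-byte blocks, GRF counts in blocks of
 * 16 less one, and URB allocation / thread counts stored biased by one.
 */
#include <assert.h>

#define ALIGN_POT(x, a)  (((x) + (a) - 1) & ~((a) - 1))

struct clip_unit_encoding {
   unsigned kernel_start_pointer;      /* program offset in 64-byte units  */
   unsigned grf_reg_count;             /* blocks of 16 GRF registers, - 1  */
   unsigned urb_entry_allocation_size; /* URB rows per entry, - 1          */
   unsigned max_threads;               /* thread count, - 1                */
};

static struct clip_unit_encoding
encode_clip_unit(unsigned prog_offset_bytes, unsigned total_grf,
                 unsigned urb_rows_per_entry, unsigned nr_threads)
{
   struct clip_unit_encoding e;

   assert(prog_offset_bytes % 64 == 0);   /* kernels assumed 64-byte aligned */
   e.kernel_start_pointer      = prog_offset_bytes >> 6;
   e.grf_reg_count             = ALIGN_POT(total_grf, 16) / 16 - 1;
   e.urb_entry_allocation_size = urb_rows_per_entry - 1;
   e.max_threads               = nr_threads - 1;
   return e;
}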
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_clip.h" + +static struct brw_reg get_tmp( struct brw_clip_compile *c ) +{ + struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0); + + if (++c->last_tmp > c->prog_data.total_grf) + c->prog_data.total_grf = c->last_tmp; + + return tmp; +} + +static void release_tmps( struct brw_clip_compile *c ) +{ + c->last_tmp = c->first_tmp; +} + + +void brw_clip_tri_alloc_regs( struct brw_clip_compile *c, + unsigned nr_verts ) +{ + unsigned i = 0,j; + + /* Register usage is static, precompute here: + */ + c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++; + + if (c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec4_grf(i, 0); + i += (6 + c->key.nr_userclip + 1) / 2; + + c->prog_data.curb_read_length = (6 + c->key.nr_userclip + 1) / 2; + } + else + c->prog_data.curb_read_length = 0; + + + /* Payload vertices plus space for more generated vertices: + */ + for (j = 0; j < nr_verts; j++) { + c->reg.vertex[j] = brw_vec4_grf(i, 0); + i += c->nr_regs; + } + + if (c->nr_attrs & 1) { + for (j = 0; j < 3; j++) { + unsigned delta = c->nr_attrs*16 + 32; + brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0)); + } + } + + c->reg.t = brw_vec1_grf(i, 0); + c->reg.loopcount = retype(brw_vec1_grf(i, 1), BRW_REGISTER_TYPE_UD); + c->reg.nr_verts = retype(brw_vec1_grf(i, 2), BRW_REGISTER_TYPE_UD); + c->reg.planemask = retype(brw_vec1_grf(i, 3), BRW_REGISTER_TYPE_UD); + c->reg.plane_equation = brw_vec4_grf(i, 4); + i++; + + c->reg.dpPrev = brw_vec1_grf(i, 0); /* fixme - dp4 will clobber r.1,2,3 */ + c->reg.dp = brw_vec1_grf(i, 4); + i++; + + c->reg.inlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0); + i++; + + c->reg.outlist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0); + i++; + + c->reg.freelist = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, i, 0); + i++; + + if (!c->key.nr_userclip) { + c->reg.fixed_planes = brw_vec8_grf(i, 0); + i++; + } + + if (c->key.do_unfilled) { + c->reg.dir = brw_vec4_grf(i, 0); + c->reg.offset = brw_vec4_grf(i, 4); + i++; + c->reg.tmp0 = brw_vec4_grf(i, 0); + c->reg.tmp1 = brw_vec4_grf(i, 4); + i++; + } + + c->first_tmp = i; + c->last_tmp = i; + + c->prog_data.urb_read_length = c->nr_regs; /* ? */ + c->prog_data.total_grf = i; +} + + + +void brw_clip_tri_init_vertices( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */ + struct brw_instruction *is_rev; + + /* Initial list of indices for incoming vertexes: + */ + brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_EQ, + tmp0, + brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE)); + + /* XXX: Is there an easier way to do this? Need to reverse every + * second tristrip element: Can ignore sometimes? 
+ */ + is_rev = brw_IF(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[1]) ); + brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[0]) ); + if (c->need_direction) + brw_MOV(p, c->reg.dir, brw_imm_f(-1)); + } + is_rev = brw_ELSE(p, is_rev); + { + brw_MOV(p, get_element(c->reg.inlist, 0), brw_address(c->reg.vertex[0]) ); + brw_MOV(p, get_element(c->reg.inlist, 1), brw_address(c->reg.vertex[1]) ); + if (c->need_direction) + brw_MOV(p, c->reg.dir, brw_imm_f(1)); + } + brw_ENDIF(p, is_rev); + + brw_MOV(p, get_element(c->reg.inlist, 2), brw_address(c->reg.vertex[2]) ); + brw_MOV(p, brw_vec8_grf(c->reg.outlist.nr, 0), brw_imm_f(0)); + brw_MOV(p, c->reg.nr_verts, brw_imm_ud(3)); +} + + + +void brw_clip_tri_flat_shade( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *is_poly; + struct brw_reg tmp0 = c->reg.loopcount; /* handy temporary */ + + brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_EQ, + tmp0, + brw_imm_ud(_3DPRIM_POLYGON)); + + is_poly = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_copy_colors(c, 1, 0); + brw_clip_copy_colors(c, 2, 0); + } + is_poly = brw_ELSE(p, is_poly); + { + brw_clip_copy_colors(c, 0, 2); + brw_clip_copy_colors(c, 1, 2); + } + brw_ENDIF(p, is_poly); +} + + + +/* Use mesa's clipping algorithms, translated to GEN4 assembly. + */ +void brw_clip_tri( struct brw_clip_compile *c ) +{ +#if 0 + struct brw_compile *p = &c->func; + struct brw_indirect vtx = brw_indirect(0, 0); + struct brw_indirect vtxPrev = brw_indirect(1, 0); + struct brw_indirect vtxOut = brw_indirect(2, 0); + struct brw_indirect plane_ptr = brw_indirect(3, 0); + struct brw_indirect inlist_ptr = brw_indirect(4, 0); + struct brw_indirect outlist_ptr = brw_indirect(5, 0); + struct brw_indirect freelist_ptr = brw_indirect(6, 0); + struct brw_instruction *plane_loop; + struct brw_instruction *plane_active; + struct brw_instruction *vertex_loop; + struct brw_instruction *next_test; + struct brw_instruction *prev_test; + + brw_MOV(p, get_addr_reg(vtxPrev), brw_address(c->reg.vertex[2]) ); + brw_MOV(p, get_addr_reg(plane_ptr), brw_clip_plane0_address(c)); + brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist)); + brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist)); + + brw_MOV(p, get_addr_reg(freelist_ptr), brw_address(c->reg.vertex[3]) ); + + plane_loop = brw_DO(p, BRW_EXECUTE_1); + { + /* if (planemask & 1) + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_AND(p, vec1(brw_null_reg()), c->reg.planemask, brw_imm_ud(1)); + + plane_active = brw_IF(p, BRW_EXECUTE_1); + { + /* vtxOut = freelist_ptr++ + */ + brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(freelist_ptr) ); + brw_ADD(p, get_addr_reg(freelist_ptr), get_addr_reg(freelist_ptr), brw_imm_uw(c->nr_regs * REG_SIZE)); + + if (c->key.nr_userclip) + brw_MOV(p, c->reg.plane_equation, deref_4f(plane_ptr, 0)); + else + brw_MOV(p, c->reg.plane_equation, deref_4b(plane_ptr, 0)); + + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, c->reg.nr_verts, brw_imm_ud(0)); + + vertex_loop = brw_DO(p, BRW_EXECUTE_1); + { + /* vtx = *input_ptr; + */ + brw_MOV(p, get_addr_reg(vtx), deref_1uw(inlist_ptr, 0)); + + /* IS_NEGATIVE(prev) */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_L); + brw_DP4(p, vec4(c->reg.dpPrev), deref_4f(vtxPrev, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation); + prev_test = brw_IF(p, BRW_EXECUTE_1); + { + /* IS_POSITIVE(next) + */ + 
brw_set_conditionalmod(p, BRW_CONDITIONAL_GE); + brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation); + next_test = brw_IF(p, BRW_EXECUTE_1); + { + + /* Coming back in. + */ + brw_ADD(p, c->reg.t, c->reg.dpPrev, negate(c->reg.dp)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dpPrev); + + /* If (vtxOut == 0) vtxOut = vtxPrev + */ + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) ); + brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtxPrev) ); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + brw_clip_interp_vertex(c, vtxOut, vtxPrev, vtx, c->reg.t, FALSE); + + /* *outlist_ptr++ = vtxOut; + * nr_verts++; + * vtxOut = 0; + */ + brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut)); + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short))); + brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1)); + brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) ); + } + brw_ENDIF(p, next_test); + + } + prev_test = brw_ELSE(p, prev_test); + { + /* *outlist_ptr++ = vtxPrev; + * nr_verts++; + */ + brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxPrev)); + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short))); + brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1)); + + /* IS_NEGATIVE(next) + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_L); + brw_DP4(p, vec4(c->reg.dp), deref_4f(vtx, c->offset[VERT_RESULT_HPOS]), c->reg.plane_equation); + next_test = brw_IF(p, BRW_EXECUTE_1); + { + /* Going out of bounds. Avoid division by zero as we + * know dp != dpPrev from DIFFERENT_SIGNS, above. + */ + brw_ADD(p, c->reg.t, c->reg.dp, negate(c->reg.dpPrev)); + brw_math_invert(p, c->reg.t, c->reg.t); + brw_MUL(p, c->reg.t, c->reg.t, c->reg.dp); + + /* If (vtxOut == 0) vtxOut = vtx + */ + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_EQ, get_addr_reg(vtxOut), brw_imm_uw(0) ); + brw_MOV(p, get_addr_reg(vtxOut), get_addr_reg(vtx) ); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + brw_clip_interp_vertex(c, vtxOut, vtx, vtxPrev, c->reg.t, TRUE); + + /* *outlist_ptr++ = vtxOut; + * nr_verts++; + * vtxOut = 0; + */ + brw_MOV(p, deref_1uw(outlist_ptr, 0), get_addr_reg(vtxOut)); + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_uw(sizeof(short))); + brw_ADD(p, c->reg.nr_verts, c->reg.nr_verts, brw_imm_ud(1)); + brw_MOV(p, get_addr_reg(vtxOut), brw_imm_uw(0) ); + } + brw_ENDIF(p, next_test); + } + brw_ENDIF(p, prev_test); + + /* vtxPrev = vtx; + * inlist_ptr++; + */ + brw_MOV(p, get_addr_reg(vtxPrev), get_addr_reg(vtx)); + brw_ADD(p, get_addr_reg(inlist_ptr), get_addr_reg(inlist_ptr), brw_imm_uw(sizeof(short))); + + /* while (--loopcount != 0) + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + } + brw_WHILE(p, vertex_loop); + + /* vtxPrev = *(outlist_ptr-1) OR: outlist[nr_verts-1] + * inlist = outlist + * inlist_ptr = &inlist[0] + * outlist_ptr = &outlist[0] + */ + brw_ADD(p, get_addr_reg(outlist_ptr), get_addr_reg(outlist_ptr), brw_imm_w(-2)); + brw_MOV(p, get_addr_reg(vtxPrev), deref_1uw(outlist_ptr, 0)); + brw_MOV(p, brw_vec8_grf(c->reg.inlist.nr, 0), brw_vec8_grf(c->reg.outlist.nr, 0)); + brw_MOV(p, get_addr_reg(inlist_ptr), brw_address(c->reg.inlist)); + brw_MOV(p, get_addr_reg(outlist_ptr), brw_address(c->reg.outlist)); + } + brw_ENDIF(p, plane_active); + + /* plane_ptr++; + */ + brw_ADD(p, 
get_addr_reg(plane_ptr), get_addr_reg(plane_ptr), brw_clip_plane_stride(c)); + + /* nr_verts >= 3 + */ + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_GE, + c->reg.nr_verts, + brw_imm_ud(3)); + + /* && (planemask>>=1) != 0 + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_SHR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud(1)); + } + brw_WHILE(p, plane_loop); +#else + #warning "disabled" +#endif +} + + + +void brw_clip_tri_emit_polygon(struct brw_clip_compile *c) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *loop, *if_insn; + + /* for (loopcount = nr_verts-2; loopcount > 0; loopcount--) + */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_G); + brw_ADD(p, + c->reg.loopcount, + c->reg.nr_verts, + brw_imm_d(-2)); + + if_insn = brw_IF(p, BRW_EXECUTE_1); + { + struct brw_indirect v0 = brw_indirect(0, 0); + struct brw_indirect vptr = brw_indirect(1, 0); + + brw_MOV(p, get_addr_reg(vptr), brw_address(c->reg.inlist)); + brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0)); + + brw_clip_emit_vue(c, v0, 1, 0, ((_3DPRIM_TRIFAN << 2) | R02_PRIM_START)); + + brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2)); + brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0)); + + loop = brw_DO(p, BRW_EXECUTE_1); + { + brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_TRIFAN << 2)); + + brw_ADD(p, get_addr_reg(vptr), get_addr_reg(vptr), brw_imm_uw(2)); + brw_MOV(p, get_addr_reg(v0), deref_1uw(vptr, 0)); + + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + } + brw_WHILE(p, loop); + + brw_clip_emit_vue(c, v0, 0, 1, ((_3DPRIM_TRIFAN << 2) | R02_PRIM_END)); + } + brw_ENDIF(p, if_insn); +} + +static void do_clip_tri( struct brw_clip_compile *c ) +{ + brw_clip_init_planes(c); + + brw_clip_tri(c); +} + + +static void maybe_do_clip_tri( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *do_clip; + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0)); + do_clip = brw_IF(p, BRW_EXECUTE_1); + { + do_clip_tri(c); + } + brw_ENDIF(p, do_clip); +} + +static void brw_clip_test( struct brw_clip_compile *c ) +{ +#if 0 + struct brw_reg t = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + struct brw_reg t1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + struct brw_reg t2 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + struct brw_reg t3 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + + struct brw_reg v0 = get_tmp(c); + struct brw_reg v1 = get_tmp(c); + struct brw_reg v2 = get_tmp(c); + + struct brw_indirect vt0 = brw_indirect(0, 0); + struct brw_indirect vt1 = brw_indirect(1, 0); + struct brw_indirect vt2 = brw_indirect(2, 0); + + struct brw_compile *p = &c->func; + + brw_MOV(p, get_addr_reg(vt0), brw_address(c->reg.vertex[0])); + brw_MOV(p, get_addr_reg(vt1), brw_address(c->reg.vertex[1])); + brw_MOV(p, get_addr_reg(vt2), brw_address(c->reg.vertex[2])); + brw_MOV(p, v0, deref_4f(vt0, c->offset[VERT_RESULT_HPOS])); + brw_MOV(p, v1, deref_4f(vt1, c->offset[VERT_RESULT_HPOS])); + brw_MOV(p, v2, deref_4f(vt2, c->offset[VERT_RESULT_HPOS])); + + /* test nearz, xmin, ymin plane */ + brw_CMP(p, t1, BRW_CONDITIONAL_LE, negate(v0), get_element(v0, 3)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, t2, BRW_CONDITIONAL_LE, negate(v1), get_element(v1, 3)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, t3, BRW_CONDITIONAL_LE, negate(v2), get_element(v2, 3)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_XOR(p, t, t1, t2); + brw_XOR(p, t1, 
t2, t3); + brw_OR(p, t, t, t1); + + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 0), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<5))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 1), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<3))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 2), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<1))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + /* test farz, xmax, ymax plane */ + brw_CMP(p, t1, BRW_CONDITIONAL_L, v0, get_element(v0, 3)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, t2, BRW_CONDITIONAL_L, v1, get_element(v1, 3)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, t3, BRW_CONDITIONAL_L, v2, get_element(v2, 3)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + brw_XOR(p, t, t1, t2); + brw_XOR(p, t1, t2, t3); + brw_OR(p, t, t, t1); + + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 0), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<4))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 1), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<2))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, + get_element(t, 2), brw_imm_ud(0)); + brw_OR(p, c->reg.planemask, c->reg.planemask, brw_imm_ud((1<<0))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + release_tmps(c); +#else + #warning "disabled" +#endif +} + + +void brw_emit_tri_clip( struct brw_clip_compile *c ) +{ + struct brw_instruction *neg_rhw; + struct brw_compile *p = &c->func; + brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6); + brw_clip_tri_init_vertices(c); + brw_clip_init_clipmask(c); + + /* if -ve rhw workaround bit is set, + do cliptest */ + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), + brw_imm_ud(1<<20)); + neg_rhw = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_test(c); + } + brw_ENDIF(p, neg_rhw); + + /* Can't push into do_clip_tri because with polygon (or quad) + * flatshading, need to apply the flatshade here because we don't + * respect the PV when converting to trifan for emit: + */ + if (c->key.do_flat_shading) + brw_clip_tri_flat_shade(c); + + if (c->key.clip_mode == BRW_CLIPMODE_NORMAL) + do_clip_tri(c); + else + maybe_do_clip_tri(c); + + brw_clip_tri_emit_polygon(c); + + /* Send an empty message to kill the thread: + */ + brw_clip_kill_thread(c); +} diff --git a/src/gallium/drivers/i965simple/brw_clip_unfilled.c b/src/gallium/drivers/i965simple/brw_clip_unfilled.c new file mode 100644 index 0000000000..b774a76dd6 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip_unfilled.c @@ -0,0 +1,477 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
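/* For illustration only (not part of this commit): a scalar sketch of what
 * the currently #if 0'd brw_clip_test() above computes on the negative-RHW
 * workaround path.  For each fixed frustum bound, if the three vertices do
 * not all fall on the same side, the corresponding plane bit is OR'd into
 * planemask so the thread will clip against it.  The bit numbering copies
 * the 1<<5 .. 1<<0 pattern used above; treat the exact plane-to-bit mapping
 * and the boundary comparisons as illustrative.
 */
static unsigned
tri_straddle_planemask(const float v[3][4])   /* v[i] = clip-space x,y,z,w */
{
   unsigned mask = 0;
   int axis, i;

   for (axis = 0; axis < 3; axis++) {
      int outside_min = 0, outside_max = 0;

      for (i = 0; i < 3; i++) {
         if (-v[i][axis] >  v[i][3]) outside_min++;  /* fails coord >= -w */
         if ( v[i][axis] >= v[i][3]) outside_max++;  /* fails coord <  +w */
      }
      if (outside_min != 0 && outside_min != 3)      /* vertices disagree */
         mask |= 1u << (5 - 2*axis);
      if (outside_max != 0 && outside_max != 3)      /* vertices disagree */
         mask |= 1u << (4 - 2*axis);
   }
   return mask;
}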
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_clip.h" + + + +/* This is performed against the original triangles, so no indirection + * required: +BZZZT! + */ +static void compute_tri_direction( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_reg e = c->reg.tmp0; + struct brw_reg f = c->reg.tmp1; + struct brw_reg v0 = byte_offset(c->reg.vertex[0], c->offset[VERT_RESULT_HPOS]); + struct brw_reg v1 = byte_offset(c->reg.vertex[1], c->offset[VERT_RESULT_HPOS]); + struct brw_reg v2 = byte_offset(c->reg.vertex[2], c->offset[VERT_RESULT_HPOS]); + + + /* Calculate the vectors of two edges of the triangle: + */ + brw_ADD(p, e, v0, negate(v2)); + brw_ADD(p, f, v1, negate(v2)); + + /* Take their crossproduct: + */ + brw_set_access_mode(p, BRW_ALIGN_16); + brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, 1,2,0,3), brw_swizzle(f,2,0,1,3)); + brw_MAC(p, vec4(e), negate(brw_swizzle(e, 2,0,1,3)), brw_swizzle(f,1,2,0,3)); + brw_set_access_mode(p, BRW_ALIGN_1); + + brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e)); +} + + +static void cull_direction( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *ccw; + unsigned conditional; + + assert (!(c->key.fill_ccw == CLIP_CULL && + c->key.fill_cw == CLIP_CULL)); + + if (c->key.fill_ccw == CLIP_CULL) + conditional = BRW_CONDITIONAL_GE; + else + conditional = BRW_CONDITIONAL_L; + + brw_CMP(p, + vec1(brw_null_reg()), + conditional, + get_element(c->reg.dir, 2), + brw_imm_f(0)); + + ccw = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p, ccw); +} + + + +static void copy_bfc( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *ccw; + unsigned conditional; + + /* Do we have any colors to copy? + */ + if (!(c->offset[VERT_RESULT_COL0] && c->offset[VERT_RESULT_BFC0]) && + !(c->offset[VERT_RESULT_COL1] && c->offset[VERT_RESULT_BFC1])) + return; + + /* In some wierd degnerate cases we can end up testing the + * direction twice, once for culling and once for bfc copying. Oh + * well, that's what you get for setting wierd GL state. 
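/* For illustration only (not part of this commit): a scalar version of what
 * compute_tri_direction() above does with a swizzled MUL/MAC pair.  It takes
 * two triangle edges, forms the z component of their cross product, and
 * folds in the +/-1 winding sign that brw_clip_tri_init_vertices() stored in
 * c->reg.dir for reversed tristrip elements.  Which sign counts as
 * front-facing is a convention assumed by this sketch; cull_direction() and
 * copy_bfc() only test the sign.
 */
static float
tri_direction_z(const float v0[4], const float v1[4], const float v2[4],
                float winding_sign /* +1.0f or -1.0f */)
{
   float ex = v0[0] - v2[0], ey = v0[1] - v2[1];   /* edge e = v0 - v2 */
   float fx = v1[0] - v2[0], fy = v1[1] - v2[1];   /* edge f = v1 - v2 */
   float cross_z = ex * fy - ey * fx;              /* z of  e x f      */

   return winding_sign * cross_z;
}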
+ */ + if (c->key.copy_bfc_ccw) + conditional = BRW_CONDITIONAL_GE; + else + conditional = BRW_CONDITIONAL_L; + + brw_CMP(p, + vec1(brw_null_reg()), + conditional, + get_element(c->reg.dir, 2), + brw_imm_f(0)); + + ccw = brw_IF(p, BRW_EXECUTE_1); + { + unsigned i; + + for (i = 0; i < 3; i++) { + if (c->offset[VERT_RESULT_COL0] && c->offset[VERT_RESULT_BFC0]) + brw_MOV(p, + byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_COL0]), + byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_BFC0])); + + if (c->offset[VERT_RESULT_COL1] && c->offset[VERT_RESULT_BFC1]) + brw_MOV(p, + byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_COL1]), + byte_offset(c->reg.vertex[i], c->offset[VERT_RESULT_BFC1])); + } + } + brw_ENDIF(p, ccw); +} + + + + +/* + float iz = 1.0 / dir.z; + float ac = dir.x * iz; + float bc = dir.y * iz; + offset = ctx->Polygon.OffsetUnits * DEPTH_SCALE; + offset += MAX2( abs(ac), abs(bc) ) * ctx->Polygon.OffsetFactor; + offset *= MRD; +*/ +static void compute_offset( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_reg off = c->reg.offset; + struct brw_reg dir = c->reg.dir; + + brw_math_invert(p, get_element(off, 2), get_element(dir, 2)); + brw_MUL(p, vec2(off), dir, get_element(off, 2)); + + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_GE, + brw_abs(get_element(off, 0)), + brw_abs(get_element(off, 1))); + + brw_SEL(p, vec1(off), brw_abs(get_element(off, 0)), brw_abs(get_element(off, 1))); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + brw_MUL(p, vec1(off), off, brw_imm_f(c->key.offset_factor)); + brw_ADD(p, vec1(off), off, brw_imm_f(c->key.offset_units)); +} + + +static void merge_edgeflags( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *is_poly; + struct brw_reg tmp0 = get_element_ud(c->reg.tmp0, 0); + + brw_AND(p, tmp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(PRIM_MASK)); + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_EQ, + tmp0, + brw_imm_ud(_3DPRIM_POLYGON)); + + /* Get away with using reg.vertex because we know that this is not + * a _3DPRIM_TRISTRIP_REVERSE: + */ + is_poly = brw_IF(p, BRW_EXECUTE_1); + { + brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ); + brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<8)); + brw_MOV(p, byte_offset(c->reg.vertex[0], c->offset[VERT_RESULT_EDGE]), brw_imm_f(0)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + brw_set_conditionalmod(p, BRW_CONDITIONAL_EQ); + brw_AND(p, vec1(brw_null_reg()), get_element_ud(c->reg.R0, 2), brw_imm_ud(1<<9)); + brw_MOV(p, byte_offset(c->reg.vertex[2], c->offset[VERT_RESULT_EDGE]), brw_imm_f(0)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } + brw_ENDIF(p, is_poly); +} + + + +static void apply_one_offset( struct brw_clip_compile *c, + struct brw_indirect vert ) +{ + struct brw_compile *p = &c->func; + struct brw_reg pos = deref_4f(vert, c->offset[VERT_RESULT_HPOS]); + struct brw_reg z = get_element(pos, 2); + + brw_ADD(p, z, z, vec1(c->reg.offset)); +} + + + +/*********************************************************************** + * Output clipped polygon as an unfilled primitive: + */ +static void emit_lines(struct brw_clip_compile *c, + boolean do_offset) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *loop; + struct brw_instruction *draw_edge; + struct brw_indirect v0 = brw_indirect(0, 0); + struct brw_indirect v1 = brw_indirect(1, 0); + struct brw_indirect v0ptr = brw_indirect(2, 0); + struct brw_indirect v1ptr = brw_indirect(3, 0); + + /* 
Need a seperate loop for offset: + */ + if (do_offset) { + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist)); + + loop = brw_DO(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0)); + brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2)); + + apply_one_offset(c, v0); + + brw_set_conditionalmod(p, BRW_CONDITIONAL_G); + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + } + brw_WHILE(p, loop); + } + + /* v1ptr = &inlist[nr_verts] + * *v1ptr = v0 + */ + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist)); + brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v0ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW)); + brw_ADD(p, get_addr_reg(v1ptr), get_addr_reg(v1ptr), retype(c->reg.nr_verts, BRW_REGISTER_TYPE_UW)); + brw_MOV(p, deref_1uw(v1ptr, 0), deref_1uw(v0ptr, 0)); + + loop = brw_DO(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0)); + brw_MOV(p, get_addr_reg(v1), deref_1uw(v0ptr, 2)); + brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2)); + + /* draw edge if edgeflag != 0 */ + brw_CMP(p, + vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, + deref_1f(v0, c->offset[VERT_RESULT_EDGE]), + brw_imm_f(0)); + draw_edge = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_START); + brw_clip_emit_vue(c, v1, 1, 0, (_3DPRIM_LINESTRIP << 2) | R02_PRIM_END); + } + brw_ENDIF(p, draw_edge); + + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + } + brw_WHILE(p, loop); +} + + + +static void emit_points(struct brw_clip_compile *c, + boolean do_offset ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *loop; + struct brw_instruction *draw_point; + + struct brw_indirect v0 = brw_indirect(0, 0); + struct brw_indirect v0ptr = brw_indirect(2, 0); + + brw_MOV(p, c->reg.loopcount, c->reg.nr_verts); + brw_MOV(p, get_addr_reg(v0ptr), brw_address(c->reg.inlist)); + + loop = brw_DO(p, BRW_EXECUTE_1); + { + brw_MOV(p, get_addr_reg(v0), deref_1uw(v0ptr, 0)); + brw_ADD(p, get_addr_reg(v0ptr), get_addr_reg(v0ptr), brw_imm_uw(2)); + + /* draw if edgeflag != 0 + */ + brw_CMP(p, + vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, + deref_1f(v0, c->offset[VERT_RESULT_EDGE]), + brw_imm_f(0)); + draw_point = brw_IF(p, BRW_EXECUTE_1); + { + if (do_offset) + apply_one_offset(c, v0); + + brw_clip_emit_vue(c, v0, 1, 0, (_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END); + } + brw_ENDIF(p, draw_point); + + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + brw_ADD(p, c->reg.loopcount, c->reg.loopcount, brw_imm_d(-1)); + } + brw_WHILE(p, loop); +} + + + + + + + +static void emit_primitives( struct brw_clip_compile *c, + unsigned mode, + boolean do_offset ) +{ + switch (mode) { + case CLIP_FILL: + brw_clip_tri_emit_polygon(c); + break; + + case CLIP_LINE: + emit_lines(c, do_offset); + break; + + case CLIP_POINT: + emit_points(c, do_offset); + break; + + case CLIP_CULL: + assert(0); + break; + } +} + + + +static void emit_unfilled_primitives( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *ccw; + + /* Direction culling has already been done. 
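/* For illustration only (not part of this commit): a scalar rendition of
 * compute_offset() earlier in this file.  The depth slope comes from the
 * triangle direction vector, ac = dir.x/dir.z and bc = dir.y/dir.z, and
 *
 *     offset = max(|ac|, |bc|) * offset_factor + offset_units
 *
 * Any DEPTH_SCALE / minimum-resolvable-difference scaling of offset_units
 * is assumed to have been folded in by whoever filled in the program key.
 */
#include <math.h>

static float
polygon_offset(const float dir[3], float offset_factor, float offset_units)
{
   float iz = 1.0f / dir[2];
   float ac = fabsf(dir[0] * iz);
   float bc = fabsf(dir[1] * iz);
   float m  = ac > bc ? ac : bc;       /* max depth slope across x and y */

   return m * offset_factor + offset_units;   /* added to z of each vertex */
}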
+ */ + if (c->key.fill_ccw != c->key.fill_cw && + c->key.fill_ccw != CLIP_CULL && + c->key.fill_cw != CLIP_CULL) + { + brw_CMP(p, + vec1(brw_null_reg()), + BRW_CONDITIONAL_GE, + get_element(c->reg.dir, 2), + brw_imm_f(0)); + + ccw = brw_IF(p, BRW_EXECUTE_1); + { + emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw); + } + ccw = brw_ELSE(p, ccw); + { + emit_primitives(c, c->key.fill_cw, c->key.offset_cw); + } + brw_ENDIF(p, ccw); + } + else if (c->key.fill_cw != CLIP_CULL) { + emit_primitives(c, c->key.fill_cw, c->key.offset_cw); + } + else if (c->key.fill_ccw != CLIP_CULL) { + emit_primitives(c, c->key.fill_ccw, c->key.offset_ccw); + } +} + + + + +static void check_nr_verts( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *if_insn; + + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.nr_verts, brw_imm_d(3)); + if_insn = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_kill_thread(c); + } + brw_ENDIF(p, if_insn); +} + + +void brw_emit_unfilled_clip( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *do_clip; + + + c->need_direction = ((c->key.offset_ccw || c->key.offset_cw) || + (c->key.fill_ccw != c->key.fill_cw) || + c->key.fill_ccw == CLIP_CULL || + c->key.fill_cw == CLIP_CULL || + c->key.copy_bfc_cw || + c->key.copy_bfc_ccw); + + brw_clip_tri_alloc_regs(c, 3 + c->key.nr_userclip + 6); + brw_clip_tri_init_vertices(c); + + assert(c->offset[VERT_RESULT_EDGE]); + + if (c->key.fill_ccw == CLIP_CULL && + c->key.fill_cw == CLIP_CULL) { + brw_clip_kill_thread(c); + return; + } + + merge_edgeflags(c); + + /* Need to use the inlist indirection here: + */ + if (c->need_direction) + compute_tri_direction(c); + + if (c->key.fill_ccw == CLIP_CULL || + c->key.fill_cw == CLIP_CULL) + cull_direction(c); + + if (c->key.offset_ccw || + c->key.offset_cw) + compute_offset(c); + + if (c->key.copy_bfc_ccw || + c->key.copy_bfc_cw) + copy_bfc(c); + + /* Need to do this whether we clip or not: + */ + if (c->key.do_flat_shading) + brw_clip_tri_flat_shade(c); + + brw_clip_init_clipmask(c); + brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_NZ, c->reg.planemask, brw_imm_ud(0)); + do_clip = brw_IF(p, BRW_EXECUTE_1); + { + brw_clip_init_planes(c); + brw_clip_tri(c); + check_nr_verts(c); + } + brw_ENDIF(p, do_clip); + + emit_unfilled_primitives(c); + brw_clip_kill_thread(c); +} + + + diff --git a/src/gallium/drivers/i965simple/brw_clip_util.c b/src/gallium/drivers/i965simple/brw_clip_util.c new file mode 100644 index 0000000000..6d58ceafff --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_clip_util.c @@ -0,0 +1,351 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_clip.h" + + + + + +static struct brw_reg get_tmp( struct brw_clip_compile *c ) +{ + struct brw_reg tmp = brw_vec4_grf(c->last_tmp, 0); + + if (++c->last_tmp > c->prog_data.total_grf) + c->prog_data.total_grf = c->last_tmp; + + return tmp; +} + +static void release_tmp( struct brw_clip_compile *c, struct brw_reg tmp ) +{ + if (tmp.nr == c->last_tmp-1) + c->last_tmp--; +} + + +static struct brw_reg make_plane_ud(unsigned x, unsigned y, unsigned z, unsigned w) +{ + return brw_imm_ud((w<<24) | (z<<16) | (y<<8) | x); +} + + +void brw_clip_init_planes( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + + if (!c->key.nr_userclip) { + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 0), make_plane_ud( 0, 0, 0xff, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 1), make_plane_ud( 0, 0, 1, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 2), make_plane_ud( 0, 0xff, 0, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 3), make_plane_ud( 0, 1, 0, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 4), make_plane_ud(0xff, 0, 0, 1)); + brw_MOV(p, get_element_ud(c->reg.fixed_planes, 5), make_plane_ud( 1, 0, 0, 1)); + } +} + + + +#define W 3 + +/* Project 'pos' to screen space (or back again), overwrite with results: + */ +static void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos ) +{ + struct brw_compile *p = &c->func; + + /* calc rhw + */ + brw_math_invert(p, get_element(pos, W), get_element(pos, W)); + + /* value.xyz *= value.rhw + */ + brw_set_access_mode(p, BRW_ALIGN_16); + brw_MUL(p, brw_writemask(pos, TGSI_WRITEMASK_XYZ), pos, brw_swizzle1(pos, W)); + brw_set_access_mode(p, BRW_ALIGN_1); +} + + +static void brw_clip_project_vertex( struct brw_clip_compile *c, + struct brw_indirect vert_addr ) +{ +#if 0 + struct brw_compile *p = &c->func; + struct brw_reg tmp = get_tmp(c); + + /* Fixup position. Extract from the original vertex and re-project + * to screen space: + */ + brw_MOV(p, tmp, deref_4f(vert_addr, c->offset[VERT_RESULT_HPOS])); + brw_clip_project_position(c, tmp); + brw_MOV(p, deref_4f(vert_addr, c->header_position_offset), tmp); + + release_tmp(c, tmp); +#else + #warning "disabled" +#endif +} + + + + +/* Interpolate between two vertices and put the result into a0.0. + * Increment a0.0 accordingly. + */ +void brw_clip_interp_vertex( struct brw_clip_compile *c, + struct brw_indirect dest_ptr, + struct brw_indirect v0_ptr, /* from */ + struct brw_indirect v1_ptr, /* to */ + struct brw_reg t0, + boolean force_edgeflag) +{ +#if 0 + struct brw_compile *p = &c->func; + struct brw_reg tmp = get_tmp(c); + unsigned i; + + /* Just copy the vertex header: + */ + brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1); + + /* Iterate over each attribute (could be done in pairs?) 
+ */ + for (i = 0; i < c->nr_attrs; i++) { + unsigned delta = i*16 + 32; + + if (delta == c->offset[VERT_RESULT_EDGE]) { + if (force_edgeflag) + brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1)); + else + brw_MOV(p, deref_4f(dest_ptr, delta), deref_4f(v0_ptr, delta)); + } + else { + /* Interpolate: + * + * New = attr0 + t*attr1 - t*attr0 + */ + brw_MUL(p, + vec4(brw_null_reg()), + deref_4f(v1_ptr, delta), + t0); + + brw_MAC(p, + tmp, + negate(deref_4f(v0_ptr, delta)), + t0); + + brw_ADD(p, + deref_4f(dest_ptr, delta), + deref_4f(v0_ptr, delta), + tmp); + } + } + + if (i & 1) { + unsigned delta = i*16 + 32; + brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0)); + } + + release_tmp(c, tmp); + + /* Recreate the projected (NDC) coordinate in the new vertex + * header: + */ + brw_clip_project_vertex(c, dest_ptr ); +#else + #warning "disabled" +#endif +} + + + + +#define MAX_MRF 16 + +void brw_clip_emit_vue(struct brw_clip_compile *c, + struct brw_indirect vert, + boolean allocate, + boolean eot, + unsigned header) +{ + struct brw_compile *p = &c->func; + unsigned start = c->last_mrf; + + assert(!(allocate && eot)); + + /* Cycle through mrf regs - probably futile as we have to wait for + * the allocation response anyway. Also, the order this function + * is invoked doesn't correspond to the order the instructions will + * be executed, so it won't have any effect in many cases. + */ +#if 0 + if (start + c->nr_regs + 1 >= MAX_MRF) + start = 0; + + c->last_mrf = start + c->nr_regs + 1; +#endif + + /* Copy the vertex from vertn into m1..mN+1: + */ + brw_copy_from_indirect(p, brw_message_reg(start+1), vert, c->nr_regs); + + /* Overwrite PrimType and PrimStart in the message header, for + * each vertex in turn: + */ + brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header)); + + + /* Send each vertex as a seperate write to the urb. This + * is different to the concept in brw_sf_emit.c, where + * subsequent writes are used to build up a single urb + * entry. Each of these writes instantiates a seperate + * urb entry - (I think... what about 'allocate'?) + */ + brw_urb_WRITE(p, + allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + start, + c->reg.R0, + allocate, + 1, /* used */ + c->nr_regs + 1, /* msg length */ + allocate ? 1 : 0, /* response_length */ + eot, /* eot */ + 1, /* writes_complete */ + 0, /* urb offset */ + BRW_URB_SWIZZLE_NONE); +} + + + +void brw_clip_kill_thread(struct brw_clip_compile *c) +{ + struct brw_compile *p = &c->func; + + /* Send an empty message to kill the thread and release any + * allocated urb entry: + */ + brw_urb_WRITE(p, + retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), + 0, + c->reg.R0, + 0, /* allocate */ + 0, /* used */ + 0, /* msg len */ + 0, /* response len */ + 1, /* eot */ + 1, /* writes complete */ + 0, + BRW_URB_SWIZZLE_NONE); +} + + + + +struct brw_reg brw_clip_plane0_address( struct brw_clip_compile *c ) +{ + return brw_address(c->reg.fixed_planes); +} + + +struct brw_reg brw_clip_plane_stride( struct brw_clip_compile *c ) +{ + if (c->key.nr_userclip) { + return brw_imm_uw(16); + } + else { + return brw_imm_uw(4); + } +} + + +/* If flatshading, distribute color from provoking vertex prior to + * clipping. 
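/* For illustration only (not part of this commit): brw_clip_interp_vertex()
 * above (currently #if 0'd) blends each 4-float attribute with a MUL/MAC/ADD
 * sequence equivalent to the scalar lerp below; edge flags are copied or
 * forced rather than blended, and the position is re-projected afterwards.
 */
static void
lerp_attr4(float dest[4], const float a0[4], const float a1[4], float t)
{
   int i;
   for (i = 0; i < 4; i++)
      dest[i] = a0[i] + t * (a1[i] - a0[i]);   /* == a0 + t*a1 - t*a0 */
}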
+ */ +void brw_clip_copy_colors( struct brw_clip_compile *c, + unsigned to, unsigned from ) +{ +#if 0 + struct brw_compile *p = &c->func; + + if (c->offset[VERT_RESULT_COL0]) + brw_MOV(p, + byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_COL0]), + byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_COL0])); + + if (c->offset[VERT_RESULT_COL1]) + brw_MOV(p, + byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_COL1]), + byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_COL1])); + + if (c->offset[VERT_RESULT_BFC0]) + brw_MOV(p, + byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_BFC0]), + byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_BFC0])); + + if (c->offset[VERT_RESULT_BFC1]) + brw_MOV(p, + byte_offset(c->reg.vertex[to], c->offset[VERT_RESULT_BFC1]), + byte_offset(c->reg.vertex[from], c->offset[VERT_RESULT_BFC1])); +#else + #warning "disabled" +#endif +} + + + +void brw_clip_init_clipmask( struct brw_clip_compile *c ) +{ + struct brw_compile *p = &c->func; + struct brw_reg incoming = get_element_ud(c->reg.R0, 2); + + /* Shift so that lowest outcode bit is rightmost: + */ + brw_SHR(p, c->reg.planemask, incoming, brw_imm_ud(26)); + + if (c->key.nr_userclip) { + struct brw_reg tmp = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UD); + + /* Rearrange userclip outcodes so that they come directly after + * the fixed plane bits. + */ + brw_AND(p, tmp, incoming, brw_imm_ud(0x3f<<14)); + brw_SHR(p, tmp, tmp, brw_imm_ud(8)); + brw_OR(p, c->reg.planemask, c->reg.planemask, tmp); + + release_tmp(c, tmp); + } +} + diff --git a/src/gallium/drivers/i965simple/brw_context.c b/src/gallium/drivers/i965simple/brw_context.c new file mode 100644 index 0000000000..96920df008 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_context.c @@ -0,0 +1,114 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
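/* For illustration only (not part of this commit): the same bit shuffling
 * that brw_clip_init_clipmask() above performs on R0.2, written as plain C.
 */
static unsigned
build_planemask(unsigned r0_dw2, unsigned nr_userclip)
{
   unsigned mask = r0_dw2 >> 26;                 /* fixed outcodes -> bits 0..5 */

   if (nr_userclip) {
      unsigned user = r0_dw2 & (0x3f << 14);     /* user-clip outcodes at 14..19 */
      mask |= user >> 8;                         /* place them after the fixed 6 */
   }
   return mask;
}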
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_draw.h" +#include "brw_vs.h" +#include "brw_tex_layout.h" +#include "brw_winsys.h" + +#include "pipe/p_winsys.h" +#include "pipe/p_context.h" +#include "util/u_memory.h" +#include "pipe/p_screen.h" + + +#ifndef BRW_DEBUG +int BRW_DEBUG = (0); +#endif + + +static void brw_destroy(struct pipe_context *pipe) +{ + struct brw_context *brw = brw_context(pipe); + + if(brw->winsys->destroy) + brw->winsys->destroy(brw->winsys); + + FREE(brw); +} + + +static void brw_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + int x, y, w, h; + /* FIXME: corny... */ + + x = 0; + y = 0; + w = ps->width; + h = ps->height; + + pipe->surface_fill(pipe, ps, x, y, w, h, clearValue); +} + + +struct pipe_context *brw_create(struct pipe_screen *screen, + struct brw_winsys *brw_winsys, + unsigned pci_id) +{ + struct brw_context *brw; + + debug_printf("%s: creating brw_context with pci id 0x%x\n", + __FUNCTION__, pci_id); + + brw = CALLOC_STRUCT(brw_context); + if (brw == NULL) + return NULL; + + brw->winsys = brw_winsys; + brw->pipe.winsys = screen->winsys; + brw->pipe.screen = screen; + + brw->pipe.destroy = brw_destroy; + brw->pipe.clear = brw_clear; + + brw_init_surface_functions(brw); + brw_init_texture_functions(brw); + brw_init_state_functions(brw); + brw_init_flush_functions(brw); + brw_init_draw_functions( brw ); + + + brw_init_state( brw ); + + brw->pci_id = pci_id; + brw->dirty = ~0; + brw->hardware_dirty = ~0; + + memset(&brw->wm.bind, ~0, sizeof(brw->wm.bind)); + + return &brw->pipe; +} + diff --git a/src/gallium/drivers/i965simple/brw_context.h b/src/gallium/drivers/i965simple/brw_context.h new file mode 100644 index 0000000000..3079485180 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_context.h @@ -0,0 +1,684 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+ **********************************************************************/
+ /*
+ * Authors:
+ * Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+
+#ifndef BRWCONTEXT_INC
+#define BRWCONTEXT_INC
+
+
+#include "pipe/p_context.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+
+#include "tgsi/tgsi_scan.h"
+
+#include "brw_structs.h"
+#include "brw_winsys.h"
+
+
+/* Glossary:
+ *
+ * URB - uniform resource buffer. A mid-sized buffer which is
+ * partitioned between the fixed function units and used for passing
+ * values (vertices, primitives, constants) between them.
+ *
+ * CURBE - constant URB entry. An urb region (entry) used to hold
+ * constant values which the fixed function units can be instructed to
+ * preload into the GRF when spawning a thread.
+ *
+ * VUE - vertex URB entry. An urb entry holding a vertex and usually
+ * a vertex header. The header contains control information and
+ * things like primitive type, begin/end flags and clip codes.
+ *
+ * PUE - primitive URB entry. An urb entry produced by the setup (SF)
+ * unit holding rasterization and interpolation parameters.
+ *
+ * GRF - general register file. One of several register files
+ * addressable by programmed threads. The inputs (r0, payload, curbe,
+ * urb) of the thread are preloaded to this area before the thread is
+ * spawned. The registers are individually 8 dwords wide and suitable
+ * for general usage. Registers holding thread input values are not
+ * special and may be overwritten.
+ *
+ * MRF - message register file. Threads communicate (and terminate)
+ * by sending messages. Message parameters are placed in contiguous
+ * MRF registers. All program output is via these messages. URB
+ * entries are populated by sending a message to the shared URB
+ * function containing the new data, together with a control word,
+ * often an unmodified copy of R0.
+ *
+ * R0 - GRF register 0. Typically holds control information used when
+ * sending messages to other threads.
+ *
+ * EU or GEN4 EU: The name of the programmable subsystem of the
+ * i965 hardware. Threads are executed by the EU, the registers
+ * described above are part of the EU architecture.
+ *
+ * Fixed function units:
+ *
+ * CS - Command streamer. Notional first unit, little software
+ * interaction. Holds the URB entries used for constant data, i.e. the
+ * CURBEs.
+ *
+ * VF/VS - Vertex Fetch / Vertex Shader. The fixed function part of
+ * this unit is responsible for pulling vertices out of vertex buffers
+ * in VRAM and injecting them into the processing pipe as VUEs. If
+ * enabled, it first passes them to a VS thread which is a good place
+ * for the driver to implement any active vertex shader.
+ *
+ * GS - Geometry Shader. This corresponds to a new DX10 concept. If
+ * enabled, incoming strips etc. are passed to GS threads in individual
+ * line/triangle/point units. The GS thread may perform arbitrary
+ * computation and emit whatever primitives with whatever vertices it
+ * chooses. This makes GS an excellent place to implement GL's
+ * unfilled polygon modes, though of course it is capable of much
+ * more. Additionally, GS is used to translate away primitives not
+ * handled by later units, including Quads and Lineloops.
+ *
+ * CLIP - Clipper. Mesa's clipping algorithms are imported to run on
+ * this unit. The fixed function part performs cliptesting against
+ * the 6 fixed clipplanes and makes decisions on whether or not the
+ * incoming primitive needs to be passed to a thread for clipping.
+ * User clip planes are handled via cooperation with the VS thread. + * + * SF - Strips Fans or Setup: Triangles are prepared for + * rasterization. Interpolation coefficients are calculated. + * Flatshading and two-side lighting usually performed here. + * + * WM - Windower. Interpolation of vertex attributes performed here. + * Fragment shader implemented here. SIMD aspects of EU taken full + * advantage of, as pixels are processed in blocks of 16. + * + * CC - Color Calculator. No EU threads associated with this unit. + * Handles blending and (presumably) depth and stencil testing. + */ + +#define BRW_MAX_CURBE (32*16) + +struct brw_context; +struct brw_winsys; + + +/* Raised when we receive new state across the pipe interface: + */ +#define BRW_NEW_VIEWPORT 0x1 +#define BRW_NEW_RASTERIZER 0x2 +#define BRW_NEW_FS 0x4 +#define BRW_NEW_BLEND 0x8 +#define BRW_NEW_CLIP 0x10 +#define BRW_NEW_SCISSOR 0x20 +#define BRW_NEW_STIPPLE 0x40 +#define BRW_NEW_FRAMEBUFFER 0x80 +#define BRW_NEW_ALPHA_TEST 0x100 +#define BRW_NEW_DEPTH_STENCIL 0x200 +#define BRW_NEW_SAMPLER 0x400 +#define BRW_NEW_TEXTURE 0x800 +#define BRW_NEW_CONSTANTS 0x1000 +#define BRW_NEW_VBO 0x2000 +#define BRW_NEW_VS 0x4000 + +/* Raised for other internal events: + */ +#define BRW_NEW_URB_FENCE 0x10000 +#define BRW_NEW_PSP 0x20000 +#define BRW_NEW_CURBE_OFFSETS 0x40000 +#define BRW_NEW_REDUCED_PRIMITIVE 0x80000 +#define BRW_NEW_PRIMITIVE 0x100000 +#define BRW_NEW_SCENE 0x200000 +#define BRW_NEW_SF_LINKAGE 0x400000 + +extern int BRW_DEBUG; + +#define DEBUG_TEXTURE 0x1 +#define DEBUG_STATE 0x2 +#define DEBUG_IOCTL 0x4 +#define DEBUG_PRIMS 0x8 +#define DEBUG_VERTS 0x10 +#define DEBUG_FALLBACKS 0x20 +#define DEBUG_VERBOSE 0x40 +#define DEBUG_DRI 0x80 +#define DEBUG_DMA 0x100 +#define DEBUG_SANITY 0x200 +#define DEBUG_SYNC 0x400 +#define DEBUG_SLEEP 0x800 +#define DEBUG_PIXEL 0x1000 +#define DEBUG_STATS 0x2000 +#define DEBUG_TILE 0x4000 +#define DEBUG_SINGLE_THREAD 0x8000 +#define DEBUG_WM 0x10000 +#define DEBUG_URB 0x20000 +#define DEBUG_VS 0x40000 +#define DEBUG_BATCH 0x80000 +#define DEBUG_BUFMGR 0x100000 +#define DEBUG_BLIT 0x200000 +#define DEBUG_REGION 0x400000 +#define DEBUG_MIPTREE 0x800000 + +#define DBG(...) do { \ + if (BRW_DEBUG & FILE_DEBUG_FLAG) \ + debug_printf(__VA_ARGS__); \ +} while(0) + +#define PRINT(...) do { \ + debug_printf(__VA_ARGS__); \ +} while(0) + +struct brw_state_flags { + unsigned cache; + unsigned brw; +}; + + +struct brw_vertex_program { + struct pipe_shader_state program; + struct tgsi_shader_info info; + int id; +}; + + +struct brw_fragment_program { + struct pipe_shader_state program; + struct tgsi_shader_info info; + + boolean UsesDepth; /* XXX add this to tgsi_shader_info? */ + int id; +}; + + +struct pipe_setup_linkage { + struct { + unsigned vp_output:5; + unsigned interp_mode:4; + unsigned bf_vp_output:5; + } fp_input[PIPE_MAX_SHADER_INPUTS]; + + unsigned fp_input_count:5; + unsigned max_vp_output:5; +}; + + + +struct brw_texture { + struct pipe_texture base; + + /* Derived from the above: + */ + unsigned stride; + unsigned depth_pitch; /* per-image on i945? */ + unsigned total_nblocksy; + + unsigned nr_images[PIPE_MAX_TEXTURE_LEVELS]; + + /* Explicitly store the offset of each image for each cube face or + * depth value. 
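+ * (Illustrative example only, with the exact convention an
+ * assumption rather than something this header pins down: the image
+ * for cube face F or depth slice F of mipmap level L would be found
+ * by combining level_offset[L] and image_offset[L][F], e.g. roughly
+ * level_offset[L] + image_offset[L][F] bytes into 'buffer'.)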
Pretty much have to accept that hardware formats + * are going to be so diverse that there is no unified way to + * compute the offsets of depth/cube images within a mipmap level, + * so have to store them as a lookup table: + */ + unsigned *image_offset[PIPE_MAX_TEXTURE_LEVELS]; /**< array [depth] of offsets */ + + /* Includes image offset tables: + */ + unsigned level_offset[PIPE_MAX_TEXTURE_LEVELS]; + + /* The data is held here: + */ + struct pipe_buffer *buffer; +}; + +/* Data about a particular attempt to compile a program. Note that + * there can be many of these, each in a different GL state + * corresponding to a different brw_wm_prog_key struct, with different + * compiled programs: + */ +/* Data about a particular attempt to compile a program. Note that + * there can be many of these, each in a different GL state + * corresponding to a different brw_wm_prog_key struct, with different + * compiled programs: + */ + +struct brw_wm_prog_data { + unsigned curb_read_length; + unsigned urb_read_length; + + unsigned first_curbe_grf; + unsigned total_grf; + unsigned total_scratch; + + /* Internally generated constants for the CURBE. These are loaded + * ahead of the data from the constant buffer. + */ + const float internal_const[8]; + unsigned nr_internal_consts; + unsigned max_const; + + boolean error; +}; + +struct brw_sf_prog_data { + unsigned urb_read_length; + unsigned total_grf; + + /* Each vertex may have upto 12 attributes, 4 components each, + * except WPOS which requires only 2. (11*4 + 2) == 44 ==> 11 + * rows. + * + * Actually we use 4 for each, so call it 12 rows. + */ + unsigned urb_entry_size; +}; + +struct brw_clip_prog_data { + unsigned curb_read_length; /* user planes? */ + unsigned clip_mode; + unsigned urb_read_length; + unsigned total_grf; +}; + +struct brw_gs_prog_data { + unsigned urb_read_length; + unsigned total_grf; +}; + +struct brw_vs_prog_data { + unsigned curb_read_length; + unsigned urb_read_length; + unsigned total_grf; + unsigned outputs_written; + + unsigned inputs_read; + + unsigned max_const; + + float imm_buf[PIPE_MAX_CONSTANT][4]; + unsigned num_imm; + unsigned num_consts; + + /* Used for calculating urb partitions: + */ + unsigned urb_entry_size; +}; + + +#define BRW_MAX_TEX_UNIT 8 +#define BRW_WM_MAX_SURF BRW_MAX_TEX_UNIT + 1 + +/* Create a fixed sized struct for caching binding tables: + */ +struct brw_surface_binding_table { + unsigned surf_ss_offset[BRW_WM_MAX_SURF]; +}; + + +struct brw_cache; + +struct brw_mem_pool { + struct pipe_buffer *buffer; + + unsigned size; + unsigned offset; /* offset of first free byte */ + + struct brw_context *brw; +}; + +struct brw_cache_item { + unsigned hash; + unsigned key_size; /* for variable-sized keys */ + const void *key; + + unsigned offset; /* offset within pool's buffer */ + unsigned data_size; + + struct brw_cache_item *next; +}; + + + +struct brw_cache { + unsigned id; + + const char *name; + + struct brw_context *brw; + struct brw_mem_pool *pool; + + struct brw_cache_item **items; + unsigned size, n_items; + + unsigned key_size; /* for fixed-size keys */ + unsigned aux_size; + + unsigned last_addr; /* offset of active item */ +}; + + + + +/* Considered adding a member to this struct to document which flags + * an update might raise so that ordering of the state atoms can be + * checked or derived at runtime. Dropped the idea in favor of having + * a debug mode where the state is monitored for flags which are + * raised that have already been tested against. 
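+ *
+ * Illustrative sketch of how an atom like this is meant to be
+ * consumed by brw_validate_state() -- not the actual loop, just the
+ * idea, with 'atom' standing in for any tracked-state entry:
+ *
+ *    if ((atom->dirty.brw & brw->state.dirty.brw) ||
+ *        (atom->dirty.cache & brw->state.dirty.cache))
+ *       atom->update( brw );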
+ */ +struct brw_tracked_state { + struct brw_state_flags dirty; + void (*update)( struct brw_context *brw ); +}; + + +/* Flags for brw->state.cache. + */ +#define CACHE_NEW_CC_VP (1<<BRW_CC_VP) +#define CACHE_NEW_CC_UNIT (1<<BRW_CC_UNIT) +#define CACHE_NEW_WM_PROG (1<<BRW_WM_PROG) +#define CACHE_NEW_SAMPLER_DEFAULT_COLOR (1<<BRW_SAMPLER_DEFAULT_COLOR) +#define CACHE_NEW_SAMPLER (1<<BRW_SAMPLER) +#define CACHE_NEW_WM_UNIT (1<<BRW_WM_UNIT) +#define CACHE_NEW_SF_PROG (1<<BRW_SF_PROG) +#define CACHE_NEW_SF_VP (1<<BRW_SF_VP) +#define CACHE_NEW_SF_UNIT (1<<BRW_SF_UNIT) +#define CACHE_NEW_VS_UNIT (1<<BRW_VS_UNIT) +#define CACHE_NEW_VS_PROG (1<<BRW_VS_PROG) +#define CACHE_NEW_GS_UNIT (1<<BRW_GS_UNIT) +#define CACHE_NEW_GS_PROG (1<<BRW_GS_PROG) +#define CACHE_NEW_CLIP_VP (1<<BRW_CLIP_VP) +#define CACHE_NEW_CLIP_UNIT (1<<BRW_CLIP_UNIT) +#define CACHE_NEW_CLIP_PROG (1<<BRW_CLIP_PROG) +#define CACHE_NEW_SURFACE (1<<BRW_SS_SURFACE) +#define CACHE_NEW_SURF_BIND (1<<BRW_SS_SURF_BIND) + + + + +enum brw_mempool_id { + BRW_GS_POOL, + BRW_SS_POOL, + BRW_MAX_POOL +}; + + +struct brw_cached_batch_item { + struct header *header; + unsigned sz; + struct brw_cached_batch_item *next; +}; + + + +/* Protect against a future where PIPE_MAX_ATTRIBS > 32. Wouldn't life + * be easier if C allowed arrays of packed elements? + */ +#define ATTRIB_BIT_DWORDS ((PIPE_MAX_ATTRIBS+31)/32) + + + + +struct brw_vertex_info { + unsigned varying; /* varying:1[PIPE_MAX_ATTRIBS] */ + unsigned sizes[ATTRIB_BIT_DWORDS * 2]; /* sizes:2[PIPE_MAX_ATTRIBS] */ +}; + + + + + +struct brw_context +{ + struct pipe_context pipe; + struct brw_winsys *winsys; + + unsigned primitive; + unsigned reduced_primitive; + + boolean emit_state_always; + + struct { + struct brw_state_flags dirty; + } state; + + + struct { + const struct pipe_blend_state *Blend; + const struct pipe_depth_stencil_alpha_state *DepthStencil; + const struct pipe_poly_stipple *PolygonStipple; + const struct pipe_rasterizer_state *Raster; + const struct pipe_sampler_state *Samplers[PIPE_MAX_SAMPLERS]; + const struct brw_vertex_program *VertexProgram; + const struct brw_fragment_program *FragmentProgram; + + struct pipe_clip_state Clip; + struct pipe_blend_color BlendColor; + struct pipe_scissor_state Scissor; + struct pipe_viewport_state Viewport; + struct pipe_framebuffer_state FrameBuffer; + + const struct pipe_constant_buffer *Constants[2]; + const struct brw_texture *Texture[PIPE_MAX_SAMPLERS]; + } attribs; + + unsigned num_samplers; + unsigned num_textures; + + struct brw_mem_pool pool[BRW_MAX_POOL]; + struct brw_cache cache[BRW_MAX_CACHE]; + struct brw_cached_batch_item *cached_batch_items; + + struct { + + /* Arrays with buffer objects to copy non-bufferobj arrays into + * for upload: + */ + const struct pipe_vertex_buffer *vbo_array[PIPE_MAX_ATTRIBS]; + + struct brw_vertex_element_state inputs[PIPE_MAX_ATTRIBS]; + +#define BRW_NR_UPLOAD_BUFS 17 +#define BRW_UPLOAD_INIT_SIZE (128*1024) + + /* Summary of size and varying of active arrays, so we can check + * for changes to this state: + */ + struct brw_vertex_info info; + } vb; + + + unsigned hardware_dirty; + unsigned dirty; + unsigned pci_id; + /* BRW_NEW_URB_ALLOCATIONS: + */ + struct { + unsigned vsize; /* vertex size plus header in urb registers */ + unsigned csize; /* constant buffer size in urb registers */ + unsigned sfsize; /* setup data size in urb registers */ + + boolean constrained; + + unsigned nr_vs_entries; + unsigned nr_gs_entries; + unsigned nr_clip_entries; + unsigned nr_sf_entries; + unsigned 
nr_cs_entries; + +/* unsigned vs_size; */ +/* unsigned gs_size; */ +/* unsigned clip_size; */ +/* unsigned sf_size; */ +/* unsigned cs_size; */ + + unsigned vs_start; + unsigned gs_start; + unsigned clip_start; + unsigned sf_start; + unsigned cs_start; + } urb; + + + /* BRW_NEW_CURBE_OFFSETS: + */ + struct { + unsigned wm_start; + unsigned wm_size; + unsigned clip_start; + unsigned clip_size; + unsigned vs_start; + unsigned vs_size; + unsigned total_size; + + unsigned gs_offset; + + float *last_buf; + unsigned last_bufsz; + } curbe; + + struct { + struct brw_vs_prog_data *prog_data; + + unsigned prog_gs_offset; + unsigned state_gs_offset; + } vs; + + struct { + struct brw_gs_prog_data *prog_data; + + boolean prog_active; + unsigned prog_gs_offset; + unsigned state_gs_offset; + } gs; + + struct { + struct brw_clip_prog_data *prog_data; + + unsigned prog_gs_offset; + unsigned vp_gs_offset; + unsigned state_gs_offset; + } clip; + + + struct { + struct brw_sf_prog_data *prog_data; + + struct pipe_setup_linkage linkage; + + unsigned prog_gs_offset; + unsigned vp_gs_offset; + unsigned state_gs_offset; + } sf; + + struct { + struct brw_wm_prog_data *prog_data; + +// struct brw_wm_compiler *compile_data; + + + /** + * Array of sampler state uploaded at sampler_gs_offset of BRW_SAMPLER + * cache + */ + struct brw_sampler_state sampler[BRW_MAX_TEX_UNIT]; + + unsigned render_surf; + unsigned nr_surfaces; + + unsigned max_threads; + struct pipe_buffer *scratch_buffer; + unsigned scratch_buffer_size; + + unsigned sampler_count; + unsigned sampler_gs_offset; + + struct brw_surface_binding_table bind; + unsigned bind_ss_offset; + + unsigned prog_gs_offset; + unsigned state_gs_offset; + } wm; + + + struct { + unsigned vp_gs_offset; + unsigned state_gs_offset; + } cc; + + + /* Used to give every program string a unique id + */ + unsigned program_id; +}; + + +#define BRW_PACKCOLOR8888(r,g,b,a) ((r<<24) | (g<<16) | (b<<8) | a) + + +/*====================================================================== + * brw_vtbl.c + */ +void brw_do_flush( struct brw_context *brw, + unsigned flags ); + + +/*====================================================================== + * brw_state.c + */ +void brw_validate_state(struct brw_context *brw); +void brw_init_state(struct brw_context *brw); +void brw_destroy_state(struct brw_context *brw); + + +/*====================================================================== + * brw_tex.c + */ +void brwUpdateTextureState( struct brw_context *brw ); + + +/* brw_urb.c + */ +void brw_upload_urb_fence(struct brw_context *brw); + +void brw_upload_constant_buffer_state(struct brw_context *brw); + +void brw_init_surface_functions(struct brw_context *brw); +void brw_init_state_functions(struct brw_context *brw); +void brw_init_flush_functions(struct brw_context *brw); +void brw_init_string_functions(struct brw_context *brw); + +/*====================================================================== + * Inline conversion functions. These are better-typed than the + * macros used previously: + */ +static inline struct brw_context * +brw_context( struct pipe_context *ctx ) +{ + return (struct brw_context *)ctx; +} + +#endif + diff --git a/src/gallium/drivers/i965simple/brw_curbe.c b/src/gallium/drivers/i965simple/brw_curbe.c new file mode 100644 index 0000000000..824ee7fd6d --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_curbe.c @@ -0,0 +1,369 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. 
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_state.h" +#include "brw_batch.h" +#include "brw_util.h" +#include "brw_wm.h" +#include "pipe/p_state.h" +#include "pipe/p_winsys.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#define FILE_DEBUG_FLAG DEBUG_FALLBACKS + +/* Partition the CURBE between the various users of constant values: + */ +static void calculate_curbe_offsets( struct brw_context *brw ) +{ + /* CACHE_NEW_WM_PROG */ + unsigned nr_fp_regs = align(brw->wm.prog_data->max_const, 16); + + /* BRW_NEW_VERTEX_PROGRAM */ + unsigned nr_vp_regs = align(brw->vs.prog_data->max_const, 16); + unsigned nr_clip_regs = 0; + unsigned total_regs; + +#if 0 + /* BRW_NEW_CLIP ? */ + if (brw->attribs.Transform->ClipPlanesEnabled) { + unsigned nr_planes = 6 + brw_count_bits(brw->attribs.Transform->ClipPlanesEnabled); + nr_clip_regs = align(nr_planes * 4, 16); + } +#endif + + + total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs; + + /* This can happen - what to do? Probably rather than falling + * back, the best thing to do is emit programs which code the + * constants as immediate values. Could do this either as a static + * cap on WM and VS, or adaptively. + * + * Unfortunately, this is currently dependent on the results of the + * program generation process (in the case of wm), so this would + * introduce the need to re-generate programs in the event of a + * curbe allocation failure. + */ + /* Max size is 32 - just large enough to + * hold the 128 parameters allowed by + * the fragment and vertex program + * api's. It's not clear what happens + * when both VP and FP want to use 128 + * parameters, though. 
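+ *
+ * The arithmetic behind that 32 (assuming, as the "* 16" scaling at
+ * the points of use suggests, that these sizes are counted in blocks
+ * of 16 floats): 128 vec4 parameters = 512 floats = 32 blocks, which
+ * also matches BRW_MAX_CURBE (32*16) in brw_context.h.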
+ */ + assert(total_regs <= 32); + + /* Lazy resize: + */ + if (nr_fp_regs > brw->curbe.wm_size || + nr_vp_regs > brw->curbe.vs_size || + nr_clip_regs != brw->curbe.clip_size || + (total_regs < brw->curbe.total_size / 4 && + brw->curbe.total_size > 16)) { + + unsigned reg = 0; + + /* Calculate a new layout: + */ + reg = 0; + brw->curbe.wm_start = reg; + brw->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs; + brw->curbe.clip_start = reg; + brw->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs; + brw->curbe.vs_start = reg; + brw->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs; + brw->curbe.total_size = reg; + +#if 0 + if (0) + DBG("curbe wm %d+%d clip %d+%d vs %d+%d\n", + brw->curbe.wm_start, + brw->curbe.wm_size, + brw->curbe.clip_start, + brw->curbe.clip_size, + brw->curbe.vs_start, + brw->curbe.vs_size ); +#endif + + brw->state.dirty.brw |= BRW_NEW_CURBE_OFFSETS; + } +} + + +const struct brw_tracked_state brw_curbe_offsets = { + .dirty = { + .brw = (BRW_NEW_CLIP | + BRW_NEW_VS), + .cache = CACHE_NEW_WM_PROG + }, + .update = calculate_curbe_offsets +}; + + + +/* Define the number of curbes within CS's urb allocation. Multiple + * urb entries -> multiple curbes. These will be used by + * fixed-function hardware in a double-buffering scheme to avoid a + * pipeline stall each time the contents of the curbe is changed. + */ +void brw_upload_constant_buffer_state(struct brw_context *brw) +{ + struct brw_constant_buffer_state cbs; + memset(&cbs, 0, sizeof(cbs)); + + /* It appears that this is the state packet for the CS unit, ie. the + * urb entries detailed here are housed in the CS range from the + * URB_FENCE command. + */ + cbs.header.opcode = CMD_CONST_BUFFER_STATE; + cbs.header.length = sizeof(cbs)/4 - 2; + + /* BRW_NEW_URB_FENCE */ + cbs.bits0.nr_urb_entries = brw->urb.nr_cs_entries; + cbs.bits0.urb_entry_size = brw->urb.csize - 1; + + assert(brw->urb.nr_cs_entries); + BRW_CACHED_BATCH_STRUCT(brw, &cbs); +} + + +static float fixed_plane[6][4] = { + { 0, 0, -1, 1 }, + { 0, 0, 1, 1 }, + { 0, -1, 0, 1 }, + { 0, 1, 0, 1 }, + {-1, 0, 0, 1 }, + { 1, 0, 0, 1 } +}; + +/* Upload a new set of constants. Too much variability to go into the + * cache mechanism, but maybe would benefit from a comparison against + * the current uploaded set of constants. + */ +static void upload_constant_buffer(struct brw_context *brw) +{ + struct brw_mem_pool *pool = &brw->pool[BRW_GS_POOL]; + unsigned sz = brw->curbe.total_size; + unsigned bufsz = sz * sizeof(float); + float *buf; + unsigned i; + + + if (sz == 0) { + struct brw_constant_buffer cb; + cb.header.opcode = CMD_CONST_BUFFER; + cb.header.length = sizeof(cb)/4 - 2; + cb.header.valid = 0; + cb.bits0.buffer_length = 0; + cb.bits0.buffer_address = 0; + BRW_BATCH_STRUCT(brw, &cb); + + if (brw->curbe.last_buf) { + free(brw->curbe.last_buf); + brw->curbe.last_buf = NULL; + brw->curbe.last_bufsz = 0; + } + + return; + } + + buf = (float *)malloc(bufsz); + + memset(buf, 0, bufsz); + + if (brw->curbe.wm_size) { + unsigned offset = brw->curbe.wm_start * 16; + + /* First the constant buffer constants: + */ + + /* Then any internally generated constants: + */ + for (i = 0; i < brw->wm.prog_data->nr_internal_consts; i++) + buf[offset + i] = brw->wm.prog_data->internal_const[i]; + + assert(brw->wm.prog_data->max_const == + brw->wm.prog_data->nr_internal_consts); + } + + + /* The clipplanes are actually delivered to both CLIP and VS units. + * VS uses them to calculate the outcode bitmasks. 
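+ *
+ * Layout of this block of the curbe, in floats starting at
+ * clip_start * 16: the six fixed planes first (4 floats each),
+ * followed by any user clip planes appended after them -- which is
+ * what the two loops below do.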
+ */ + if (brw->curbe.clip_size) { + unsigned offset = brw->curbe.clip_start * 16; + unsigned j; + + /* If any planes are going this way, send them all this way: + */ + for (i = 0; i < 6; i++) { + buf[offset + i * 4 + 0] = fixed_plane[i][0]; + buf[offset + i * 4 + 1] = fixed_plane[i][1]; + buf[offset + i * 4 + 2] = fixed_plane[i][2]; + buf[offset + i * 4 + 3] = fixed_plane[i][3]; + } + + /* Clip planes: BRW_NEW_CLIP: + */ + for (j = 0; j < brw->attribs.Clip.nr; j++) { + buf[offset + i * 4 + 0] = brw->attribs.Clip.ucp[j][0]; + buf[offset + i * 4 + 1] = brw->attribs.Clip.ucp[j][1]; + buf[offset + i * 4 + 2] = brw->attribs.Clip.ucp[j][2]; + buf[offset + i * 4 + 3] = brw->attribs.Clip.ucp[j][3]; + i++; + } + } + + + if (brw->curbe.vs_size) { + unsigned offset = brw->curbe.vs_start * 16; + /*unsigned nr = vp->max_const;*/ + const struct pipe_constant_buffer *cbuffer = brw->attribs.Constants[0]; + struct pipe_winsys *ws = brw->pipe.winsys; + /* FIXME: buffer size is num_consts + num_immediates */ + if (brw->vs.prog_data->num_consts) { + /* map the vertex constant buffer and copy to curbe: */ + void *data = ws->buffer_map(ws, cbuffer->buffer, 0); + /* FIXME: this is wrong. the cbuffer->size currently + * represents size of consts + immediates. so if we'll + * have both we'll copy over the end of the buffer + * with the subsequent memcpy */ + memcpy(&buf[offset], data, cbuffer->size); + ws->buffer_unmap(ws, cbuffer->buffer); + offset += cbuffer->size; + } + /*immediates*/ + if (brw->vs.prog_data->num_imm) { + memcpy(&buf[offset], brw->vs.prog_data->imm_buf, + brw->vs.prog_data->num_imm * 4 * sizeof(float)); + } + } + + if (1) { + for (i = 0; i < sz; i+=4) + debug_printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4, + buf[i+0], buf[i+1], buf[i+2], buf[i+3]); + + debug_printf("last_buf %p buf %p sz %d/%d cmp %d\n", + brw->curbe.last_buf, buf, + bufsz, brw->curbe.last_bufsz, + brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1); + } + + if (brw->curbe.last_buf && + bufsz == brw->curbe.last_bufsz && + memcmp(buf, brw->curbe.last_buf, bufsz) == 0) { + free(buf); +/* return; */ + } + else { + if (brw->curbe.last_buf) + free(brw->curbe.last_buf); + brw->curbe.last_buf = buf; + brw->curbe.last_bufsz = bufsz; + + + if (!brw_pool_alloc(pool, + bufsz, + 1 << 6, + &brw->curbe.gs_offset)) { + debug_printf("out of GS memory for curbe\n"); + assert(0); + return; + } + + + /* Copy data to the buffer: + */ + brw->winsys->buffer_subdata_typed(brw->winsys, + pool->buffer, + brw->curbe.gs_offset, + bufsz, + buf, + BRW_CONSTANT_BUFFER ); + } + + /* TODO: only emit the constant_buffer packet when necessary, ie: + - contents have changed + - offset has changed + - hw requirements due to other packets emitted. + */ + { + struct brw_constant_buffer cb; + + memset(&cb, 0, sizeof(cb)); + + cb.header.opcode = CMD_CONST_BUFFER; + cb.header.length = sizeof(cb)/4 - 2; + cb.header.valid = 1; + cb.bits0.buffer_length = sz - 1; + cb.bits0.buffer_address = brw->curbe.gs_offset >> 6; + + /* Because this provokes an action (ie copy the constants into the + * URB), it shouldn't be shortcircuited if identical to the + * previous time - because eg. the urb destination may have + * changed, or the urb contents different to last time. + * + * Note that the data referred to is actually copied internally, + * not just used in place according to passed pointer. 
+ * + * It appears that the CS unit takes care of using each available + * URB entry (Const URB Entry == CURBE) in turn, and issuing + * flushes as necessary when doublebuffering of CURBEs isn't + * possible. + */ + BRW_BATCH_STRUCT(brw, &cb); + } +} + +/* This tracked state is unique in that the state it monitors varies + * dynamically depending on the parameters tracked by the fragment and + * vertex programs. This is the template used as a starting point, + * each context will maintain a copy of this internally and update as + * required. + */ +const struct brw_tracked_state brw_constant_buffer = { + .dirty = { + .brw = (BRW_NEW_CLIP | + BRW_NEW_CONSTANTS | + BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */ + BRW_NEW_PSP | /* Implicit - hardware requires this, not used above */ + BRW_NEW_CURBE_OFFSETS), + .cache = (CACHE_NEW_WM_PROG) + }, + .update = upload_constant_buffer +}; + diff --git a/src/gallium/drivers/i965simple/brw_defines.h b/src/gallium/drivers/i965simple/brw_defines.h new file mode 100644 index 0000000000..9379a397f6 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_defines.h @@ -0,0 +1,852 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#ifndef BRW_DEFINES_H +#define BRW_DEFINES_H + +/* + */ +#define MI_NOOP 0x00 +#define MI_USER_INTERRUPT 0x02 +#define MI_WAIT_FOR_EVENT 0x03 +#define MI_FLUSH 0x04 +#define MI_REPORT_HEAD 0x07 +#define MI_ARB_ON_OFF 0x08 +#define MI_BATCH_BUFFER_END 0x0A +#define MI_OVERLAY_FLIP 0x11 +#define MI_LOAD_SCAN_LINES_INCL 0x12 +#define MI_LOAD_SCAN_LINES_EXCL 0x13 +#define MI_DISPLAY_BUFFER_INFO 0x14 +#define MI_SET_CONTEXT 0x18 +#define MI_STORE_DATA_IMM 0x20 +#define MI_STORE_DATA_INDEX 0x21 +#define MI_LOAD_REGISTER_IMM 0x22 +#define MI_STORE_REGISTER_MEM 0x24 +#define MI_BATCH_BUFFER_START 0x31 + +#define MI_SYNCHRONOUS_FLIP 0x0 +#define MI_ASYNCHRONOUS_FLIP 0x1 + +#define MI_BUFFER_SECURE 0x0 +#define MI_BUFFER_NONSECURE 0x1 + +#define MI_ARBITRATE_AT_CHAIN_POINTS 0x0 +#define MI_ARBITRATE_BETWEEN_INSTS 0x1 +#define MI_NO_ARBITRATION 0x3 + +#define MI_CONDITION_CODE_WAIT_DISABLED 0x0 +#define MI_CONDITION_CODE_WAIT_0 0x1 +#define MI_CONDITION_CODE_WAIT_1 0x2 +#define MI_CONDITION_CODE_WAIT_2 0x3 +#define MI_CONDITION_CODE_WAIT_3 0x4 +#define MI_CONDITION_CODE_WAIT_4 0x5 + +#define MI_DISPLAY_PIPE_A 0x0 +#define MI_DISPLAY_PIPE_B 0x1 + +#define MI_DISPLAY_PLANE_A 0x0 +#define MI_DISPLAY_PLANE_B 0x1 +#define MI_DISPLAY_PLANE_C 0x2 + +#define MI_STANDARD_FLIP 0x0 +#define MI_ENQUEUE_FLIP_PERFORM_BASE_FRAME_NUMBER_LOAD 0x1 +#define MI_ENQUEUE_FLIP_TARGET_FRAME_NUMBER_RELATIVE 0x2 +#define MI_ENQUEUE_FLIP_ABSOLUTE_TARGET_FRAME_NUMBER 0x3 + +#define MI_PHYSICAL_ADDRESS 0x0 +#define MI_VIRTUAL_ADDRESS 0x1 + +#define MI_BUFFER_MEMORY_MAIN 0x0 +#define MI_BUFFER_MEMORY_GTT 0x2 +#define MI_BUFFER_MEMORY_PER_PROCESS_GTT 0x3 + +#define MI_FLIP_CONTINUE 0x0 +#define MI_FLIP_ON 0x1 +#define MI_FLIP_OFF 0x2 + +#define MI_UNTRUSTED_REGISTER_SPACE 0x0 +#define MI_TRUSTED_REGISTER_SPACE 0x1 + +/* 3D state: + */ +#define _3DOP_3DSTATE_PIPELINED 0x0 +#define _3DOP_3DSTATE_NONPIPELINED 0x1 +#define _3DOP_3DCONTROL 0x2 +#define _3DOP_3DPRIMITIVE 0x3 + +#define _3DSTATE_PIPELINED_POINTERS 0x00 +#define _3DSTATE_BINDING_TABLE_POINTERS 0x01 +#define _3DSTATE_VERTEX_BUFFERS 0x08 +#define _3DSTATE_VERTEX_ELEMENTS 0x09 +#define _3DSTATE_INDEX_BUFFER 0x0A +#define _3DSTATE_VF_STATISTICS 0x0B +#define _3DSTATE_DRAWING_RECTANGLE 0x00 +#define _3DSTATE_CONSTANT_COLOR 0x01 +#define _3DSTATE_SAMPLER_PALETTE_LOAD 0x02 +#define _3DSTATE_CHROMA_KEY 0x04 +#define _3DSTATE_DEPTH_BUFFER 0x05 +#define _3DSTATE_POLY_STIPPLE_OFFSET 0x06 +#define _3DSTATE_POLY_STIPPLE_PATTERN 0x07 +#define _3DSTATE_LINE_STIPPLE 0x08 +#define _3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP 0x09 +#define _3DCONTROL 0x00 +#define _3DPRIMITIVE 0x00 + +#define PIPE_CONTROL_NOWRITE 0x00 +#define PIPE_CONTROL_WRITEIMMEDIATE 0x01 +#define PIPE_CONTROL_WRITEDEPTH 0x02 +#define PIPE_CONTROL_WRITETIMESTAMP 0x03 + +#define PIPE_CONTROL_GTTWRITE_PROCESS_LOCAL 0x00 +#define PIPE_CONTROL_GTTWRITE_GLOBAL 0x01 + +#define _3DPRIM_POINTLIST 0x01 +#define _3DPRIM_LINELIST 0x02 +#define _3DPRIM_LINESTRIP 0x03 +#define _3DPRIM_TRILIST 0x04 +#define _3DPRIM_TRISTRIP 0x05 +#define _3DPRIM_TRIFAN 0x06 +#define _3DPRIM_QUADLIST 0x07 +#define _3DPRIM_QUADSTRIP 0x08 +#define _3DPRIM_LINELIST_ADJ 0x09 +#define _3DPRIM_LINESTRIP_ADJ 0x0A +#define _3DPRIM_TRILIST_ADJ 0x0B +#define _3DPRIM_TRISTRIP_ADJ 0x0C +#define _3DPRIM_TRISTRIP_REVERSE 0x0D +#define _3DPRIM_POLYGON 0x0E +#define _3DPRIM_RECTLIST 0x0F +#define _3DPRIM_LINELOOP 0x10 +#define 
_3DPRIM_POINTLIST_BF 0x11 +#define _3DPRIM_LINESTRIP_CONT 0x12 +#define _3DPRIM_LINESTRIP_BF 0x13 +#define _3DPRIM_LINESTRIP_CONT_BF 0x14 +#define _3DPRIM_TRIFAN_NOSTIPPLE 0x15 + +#define _3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL 0 +#define _3DPRIM_VERTEXBUFFER_ACCESS_RANDOM 1 + +#define BRW_ANISORATIO_2 0 +#define BRW_ANISORATIO_4 1 +#define BRW_ANISORATIO_6 2 +#define BRW_ANISORATIO_8 3 +#define BRW_ANISORATIO_10 4 +#define BRW_ANISORATIO_12 5 +#define BRW_ANISORATIO_14 6 +#define BRW_ANISORATIO_16 7 + +#define BRW_BLENDFACTOR_ONE 0x1 +#define BRW_BLENDFACTOR_SRC_COLOR 0x2 +#define BRW_BLENDFACTOR_SRC_ALPHA 0x3 +#define BRW_BLENDFACTOR_DST_ALPHA 0x4 +#define BRW_BLENDFACTOR_DST_COLOR 0x5 +#define BRW_BLENDFACTOR_SRC_ALPHA_SATURATE 0x6 +#define BRW_BLENDFACTOR_CONST_COLOR 0x7 +#define BRW_BLENDFACTOR_CONST_ALPHA 0x8 +#define BRW_BLENDFACTOR_SRC1_COLOR 0x9 +#define BRW_BLENDFACTOR_SRC1_ALPHA 0x0A +#define BRW_BLENDFACTOR_ZERO 0x11 +#define BRW_BLENDFACTOR_INV_SRC_COLOR 0x12 +#define BRW_BLENDFACTOR_INV_SRC_ALPHA 0x13 +#define BRW_BLENDFACTOR_INV_DST_ALPHA 0x14 +#define BRW_BLENDFACTOR_INV_DST_COLOR 0x15 +#define BRW_BLENDFACTOR_INV_CONST_COLOR 0x17 +#define BRW_BLENDFACTOR_INV_CONST_ALPHA 0x18 +#define BRW_BLENDFACTOR_INV_SRC1_COLOR 0x19 +#define BRW_BLENDFACTOR_INV_SRC1_ALPHA 0x1A + +#define BRW_BLENDFUNCTION_ADD 0 +#define BRW_BLENDFUNCTION_SUBTRACT 1 +#define BRW_BLENDFUNCTION_REVERSE_SUBTRACT 2 +#define BRW_BLENDFUNCTION_MIN 3 +#define BRW_BLENDFUNCTION_MAX 4 + +#define BRW_ALPHATEST_FORMAT_UNORM8 0 +#define BRW_ALPHATEST_FORMAT_FLOAT32 1 + +#define BRW_CHROMAKEY_KILL_ON_ANY_MATCH 0 +#define BRW_CHROMAKEY_REPLACE_BLACK 1 + +#define BRW_CLIP_API_OGL 0 +#define BRW_CLIP_API_DX 1 + +#define BRW_CLIPMODE_NORMAL 0 +#define BRW_CLIPMODE_CLIP_ALL 1 +#define BRW_CLIPMODE_CLIP_NON_REJECTED 2 +#define BRW_CLIPMODE_REJECT_ALL 3 +#define BRW_CLIPMODE_ACCEPT_ALL 4 + +#define BRW_CLIP_NDCSPACE 0 +#define BRW_CLIP_SCREENSPACE 1 + +#define BRW_COMPAREFUNCTION_ALWAYS 0 +#define BRW_COMPAREFUNCTION_NEVER 1 +#define BRW_COMPAREFUNCTION_LESS 2 +#define BRW_COMPAREFUNCTION_EQUAL 3 +#define BRW_COMPAREFUNCTION_LEQUAL 4 +#define BRW_COMPAREFUNCTION_GREATER 5 +#define BRW_COMPAREFUNCTION_NOTEQUAL 6 +#define BRW_COMPAREFUNCTION_GEQUAL 7 + +#define BRW_COVERAGE_PIXELS_HALF 0 +#define BRW_COVERAGE_PIXELS_1 1 +#define BRW_COVERAGE_PIXELS_2 2 +#define BRW_COVERAGE_PIXELS_4 3 + +#define BRW_CULLMODE_BOTH 0 +#define BRW_CULLMODE_NONE 1 +#define BRW_CULLMODE_FRONT 2 +#define BRW_CULLMODE_BACK 3 + +#define BRW_DEFAULTCOLOR_R8G8B8A8_UNORM 0 +#define BRW_DEFAULTCOLOR_R32G32B32A32_FLOAT 1 + +#define BRW_DEPTHFORMAT_D32_FLOAT_S8X24_UINT 0 +#define BRW_DEPTHFORMAT_D32_FLOAT 1 +#define BRW_DEPTHFORMAT_D24_UNORM_S8_UINT 2 +#define BRW_DEPTHFORMAT_D16_UNORM 5 + +#define BRW_FLOATING_POINT_IEEE_754 0 +#define BRW_FLOATING_POINT_NON_IEEE_754 1 + +#define BRW_FRONTWINDING_CW 0 +#define BRW_FRONTWINDING_CCW 1 + +#define BRW_SPRITE_POINT_ENABLE 16 + +#define BRW_INDEX_BYTE 0 +#define BRW_INDEX_WORD 1 +#define BRW_INDEX_DWORD 2 + +#define BRW_LOGICOPFUNCTION_CLEAR 0 +#define BRW_LOGICOPFUNCTION_NOR 1 +#define BRW_LOGICOPFUNCTION_AND_INVERTED 2 +#define BRW_LOGICOPFUNCTION_COPY_INVERTED 3 +#define BRW_LOGICOPFUNCTION_AND_REVERSE 4 +#define BRW_LOGICOPFUNCTION_INVERT 5 +#define BRW_LOGICOPFUNCTION_XOR 6 +#define BRW_LOGICOPFUNCTION_NAND 7 +#define BRW_LOGICOPFUNCTION_AND 8 +#define BRW_LOGICOPFUNCTION_EQUIV 9 +#define BRW_LOGICOPFUNCTION_NOOP 10 +#define BRW_LOGICOPFUNCTION_OR_INVERTED 11 +#define BRW_LOGICOPFUNCTION_COPY 12 +#define 
BRW_LOGICOPFUNCTION_OR_REVERSE 13 +#define BRW_LOGICOPFUNCTION_OR 14 +#define BRW_LOGICOPFUNCTION_SET 15 + +#define BRW_MAPFILTER_NEAREST 0x0 +#define BRW_MAPFILTER_LINEAR 0x1 +#define BRW_MAPFILTER_ANISOTROPIC 0x2 + +#define BRW_MIPFILTER_NONE 0 +#define BRW_MIPFILTER_NEAREST 1 +#define BRW_MIPFILTER_LINEAR 3 + +#define BRW_POLYGON_FRONT_FACING 0 +#define BRW_POLYGON_BACK_FACING 1 + +#define BRW_PREFILTER_ALWAYS 0x0 +#define BRW_PREFILTER_NEVER 0x1 +#define BRW_PREFILTER_LESS 0x2 +#define BRW_PREFILTER_EQUAL 0x3 +#define BRW_PREFILTER_LEQUAL 0x4 +#define BRW_PREFILTER_GREATER 0x5 +#define BRW_PREFILTER_NOTEQUAL 0x6 +#define BRW_PREFILTER_GEQUAL 0x7 + +#define BRW_PROVOKING_VERTEX_0 0 +#define BRW_PROVOKING_VERTEX_1 1 +#define BRW_PROVOKING_VERTEX_2 2 + +#define BRW_RASTRULE_UPPER_LEFT 0 +#define BRW_RASTRULE_UPPER_RIGHT 1 + +#define BRW_RENDERTARGET_CLAMPRANGE_UNORM 0 +#define BRW_RENDERTARGET_CLAMPRANGE_SNORM 1 +#define BRW_RENDERTARGET_CLAMPRANGE_FORMAT 2 + +#define BRW_STENCILOP_KEEP 0 +#define BRW_STENCILOP_ZERO 1 +#define BRW_STENCILOP_REPLACE 2 +#define BRW_STENCILOP_INCRSAT 3 +#define BRW_STENCILOP_DECRSAT 4 +#define BRW_STENCILOP_INCR 5 +#define BRW_STENCILOP_DECR 6 +#define BRW_STENCILOP_INVERT 7 + +#define BRW_SURFACE_MIPMAPLAYOUT_BELOW 0 +#define BRW_SURFACE_MIPMAPLAYOUT_RIGHT 1 + +#define BRW_SURFACEFORMAT_R32G32B32A32_FLOAT 0x000 +#define BRW_SURFACEFORMAT_R32G32B32A32_SINT 0x001 +#define BRW_SURFACEFORMAT_R32G32B32A32_UINT 0x002 +#define BRW_SURFACEFORMAT_R32G32B32A32_UNORM 0x003 +#define BRW_SURFACEFORMAT_R32G32B32A32_SNORM 0x004 +#define BRW_SURFACEFORMAT_R64G64_FLOAT 0x005 +#define BRW_SURFACEFORMAT_R32G32B32X32_FLOAT 0x006 +#define BRW_SURFACEFORMAT_R32G32B32A32_SSCALED 0x007 +#define BRW_SURFACEFORMAT_R32G32B32A32_USCALED 0x008 +#define BRW_SURFACEFORMAT_R32G32B32_FLOAT 0x040 +#define BRW_SURFACEFORMAT_R32G32B32_SINT 0x041 +#define BRW_SURFACEFORMAT_R32G32B32_UINT 0x042 +#define BRW_SURFACEFORMAT_R32G32B32_UNORM 0x043 +#define BRW_SURFACEFORMAT_R32G32B32_SNORM 0x044 +#define BRW_SURFACEFORMAT_R32G32B32_SSCALED 0x045 +#define BRW_SURFACEFORMAT_R32G32B32_USCALED 0x046 +#define BRW_SURFACEFORMAT_R16G16B16A16_UNORM 0x080 +#define BRW_SURFACEFORMAT_R16G16B16A16_SNORM 0x081 +#define BRW_SURFACEFORMAT_R16G16B16A16_SINT 0x082 +#define BRW_SURFACEFORMAT_R16G16B16A16_UINT 0x083 +#define BRW_SURFACEFORMAT_R16G16B16A16_FLOAT 0x084 +#define BRW_SURFACEFORMAT_R32G32_FLOAT 0x085 +#define BRW_SURFACEFORMAT_R32G32_SINT 0x086 +#define BRW_SURFACEFORMAT_R32G32_UINT 0x087 +#define BRW_SURFACEFORMAT_R32_FLOAT_X8X24_TYPELESS 0x088 +#define BRW_SURFACEFORMAT_X32_TYPELESS_G8X24_UINT 0x089 +#define BRW_SURFACEFORMAT_L32A32_FLOAT 0x08A +#define BRW_SURFACEFORMAT_R32G32_UNORM 0x08B +#define BRW_SURFACEFORMAT_R32G32_SNORM 0x08C +#define BRW_SURFACEFORMAT_R64_FLOAT 0x08D +#define BRW_SURFACEFORMAT_R16G16B16X16_UNORM 0x08E +#define BRW_SURFACEFORMAT_R16G16B16X16_FLOAT 0x08F +#define BRW_SURFACEFORMAT_A32X32_FLOAT 0x090 +#define BRW_SURFACEFORMAT_L32X32_FLOAT 0x091 +#define BRW_SURFACEFORMAT_I32X32_FLOAT 0x092 +#define BRW_SURFACEFORMAT_R16G16B16A16_SSCALED 0x093 +#define BRW_SURFACEFORMAT_R16G16B16A16_USCALED 0x094 +#define BRW_SURFACEFORMAT_R32G32_SSCALED 0x095 +#define BRW_SURFACEFORMAT_R32G32_USCALED 0x096 +#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM 0x0C0 +#define BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB 0x0C1 +#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM 0x0C2 +#define BRW_SURFACEFORMAT_R10G10B10A2_UNORM_SRGB 0x0C3 +#define BRW_SURFACEFORMAT_R10G10B10A2_UINT 0x0C4 +#define 
BRW_SURFACEFORMAT_R10G10B10_SNORM_A2_UNORM 0x0C5 +#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM 0x0C7 +#define BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB 0x0C8 +#define BRW_SURFACEFORMAT_R8G8B8A8_SNORM 0x0C9 +#define BRW_SURFACEFORMAT_R8G8B8A8_SINT 0x0CA +#define BRW_SURFACEFORMAT_R8G8B8A8_UINT 0x0CB +#define BRW_SURFACEFORMAT_R16G16_UNORM 0x0CC +#define BRW_SURFACEFORMAT_R16G16_SNORM 0x0CD +#define BRW_SURFACEFORMAT_R16G16_SINT 0x0CE +#define BRW_SURFACEFORMAT_R16G16_UINT 0x0CF +#define BRW_SURFACEFORMAT_R16G16_FLOAT 0x0D0 +#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM 0x0D1 +#define BRW_SURFACEFORMAT_B10G10R10A2_UNORM_SRGB 0x0D2 +#define BRW_SURFACEFORMAT_R11G11B10_FLOAT 0x0D3 +#define BRW_SURFACEFORMAT_R32_SINT 0x0D6 +#define BRW_SURFACEFORMAT_R32_UINT 0x0D7 +#define BRW_SURFACEFORMAT_R32_FLOAT 0x0D8 +#define BRW_SURFACEFORMAT_R24_UNORM_X8_TYPELESS 0x0D9 +#define BRW_SURFACEFORMAT_X24_TYPELESS_G8_UINT 0x0DA +#define BRW_SURFACEFORMAT_L16A16_UNORM 0x0DF +#define BRW_SURFACEFORMAT_I24X8_UNORM 0x0E0 +#define BRW_SURFACEFORMAT_L24X8_UNORM 0x0E1 +#define BRW_SURFACEFORMAT_A24X8_UNORM 0x0E2 +#define BRW_SURFACEFORMAT_I32_FLOAT 0x0E3 +#define BRW_SURFACEFORMAT_L32_FLOAT 0x0E4 +#define BRW_SURFACEFORMAT_A32_FLOAT 0x0E5 +#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM 0x0E9 +#define BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB 0x0EA +#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM 0x0EB +#define BRW_SURFACEFORMAT_R8G8B8X8_UNORM_SRGB 0x0EC +#define BRW_SURFACEFORMAT_R9G9B9E5_SHAREDEXP 0x0ED +#define BRW_SURFACEFORMAT_B10G10R10X2_UNORM 0x0EE +#define BRW_SURFACEFORMAT_L16A16_FLOAT 0x0F0 +#define BRW_SURFACEFORMAT_R32_UNORM 0x0F1 +#define BRW_SURFACEFORMAT_R32_SNORM 0x0F2 +#define BRW_SURFACEFORMAT_R10G10B10X2_USCALED 0x0F3 +#define BRW_SURFACEFORMAT_R8G8B8A8_SSCALED 0x0F4 +#define BRW_SURFACEFORMAT_R8G8B8A8_USCALED 0x0F5 +#define BRW_SURFACEFORMAT_R16G16_SSCALED 0x0F6 +#define BRW_SURFACEFORMAT_R16G16_USCALED 0x0F7 +#define BRW_SURFACEFORMAT_R32_SSCALED 0x0F8 +#define BRW_SURFACEFORMAT_R32_USCALED 0x0F9 +#define BRW_SURFACEFORMAT_B5G6R5_UNORM 0x100 +#define BRW_SURFACEFORMAT_B5G6R5_UNORM_SRGB 0x101 +#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM 0x102 +#define BRW_SURFACEFORMAT_B5G5R5A1_UNORM_SRGB 0x103 +#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM 0x104 +#define BRW_SURFACEFORMAT_B4G4R4A4_UNORM_SRGB 0x105 +#define BRW_SURFACEFORMAT_R8G8_UNORM 0x106 +#define BRW_SURFACEFORMAT_R8G8_SNORM 0x107 +#define BRW_SURFACEFORMAT_R8G8_SINT 0x108 +#define BRW_SURFACEFORMAT_R8G8_UINT 0x109 +#define BRW_SURFACEFORMAT_R16_UNORM 0x10A +#define BRW_SURFACEFORMAT_R16_SNORM 0x10B +#define BRW_SURFACEFORMAT_R16_SINT 0x10C +#define BRW_SURFACEFORMAT_R16_UINT 0x10D +#define BRW_SURFACEFORMAT_R16_FLOAT 0x10E +#define BRW_SURFACEFORMAT_I16_UNORM 0x111 +#define BRW_SURFACEFORMAT_L16_UNORM 0x112 +#define BRW_SURFACEFORMAT_A16_UNORM 0x113 +#define BRW_SURFACEFORMAT_L8A8_UNORM 0x114 +#define BRW_SURFACEFORMAT_I16_FLOAT 0x115 +#define BRW_SURFACEFORMAT_L16_FLOAT 0x116 +#define BRW_SURFACEFORMAT_A16_FLOAT 0x117 +#define BRW_SURFACEFORMAT_R5G5_SNORM_B6_UNORM 0x119 +#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM 0x11A +#define BRW_SURFACEFORMAT_B5G5R5X1_UNORM_SRGB 0x11B +#define BRW_SURFACEFORMAT_R8G8_SSCALED 0x11C +#define BRW_SURFACEFORMAT_R8G8_USCALED 0x11D +#define BRW_SURFACEFORMAT_R16_SSCALED 0x11E +#define BRW_SURFACEFORMAT_R16_USCALED 0x11F +#define BRW_SURFACEFORMAT_R8_UNORM 0x140 +#define BRW_SURFACEFORMAT_R8_SNORM 0x141 +#define BRW_SURFACEFORMAT_R8_SINT 0x142 +#define BRW_SURFACEFORMAT_R8_UINT 0x143 +#define BRW_SURFACEFORMAT_A8_UNORM 0x144 +#define 
BRW_SURFACEFORMAT_I8_UNORM 0x145 +#define BRW_SURFACEFORMAT_L8_UNORM 0x146 +#define BRW_SURFACEFORMAT_P4A4_UNORM 0x147 +#define BRW_SURFACEFORMAT_A4P4_UNORM 0x148 +#define BRW_SURFACEFORMAT_R8_SSCALED 0x149 +#define BRW_SURFACEFORMAT_R8_USCALED 0x14A +#define BRW_SURFACEFORMAT_R1_UINT 0x181 +#define BRW_SURFACEFORMAT_YCRCB_NORMAL 0x182 +#define BRW_SURFACEFORMAT_YCRCB_SWAPUVY 0x183 +#define BRW_SURFACEFORMAT_BC1_UNORM 0x186 +#define BRW_SURFACEFORMAT_BC2_UNORM 0x187 +#define BRW_SURFACEFORMAT_BC3_UNORM 0x188 +#define BRW_SURFACEFORMAT_BC4_UNORM 0x189 +#define BRW_SURFACEFORMAT_BC5_UNORM 0x18A +#define BRW_SURFACEFORMAT_BC1_UNORM_SRGB 0x18B +#define BRW_SURFACEFORMAT_BC2_UNORM_SRGB 0x18C +#define BRW_SURFACEFORMAT_BC3_UNORM_SRGB 0x18D +#define BRW_SURFACEFORMAT_MONO8 0x18E +#define BRW_SURFACEFORMAT_YCRCB_SWAPUV 0x18F +#define BRW_SURFACEFORMAT_YCRCB_SWAPY 0x190 +#define BRW_SURFACEFORMAT_DXT1_RGB 0x191 +#define BRW_SURFACEFORMAT_FXT1 0x192 +#define BRW_SURFACEFORMAT_R8G8B8_UNORM 0x193 +#define BRW_SURFACEFORMAT_R8G8B8_SNORM 0x194 +#define BRW_SURFACEFORMAT_R8G8B8_SSCALED 0x195 +#define BRW_SURFACEFORMAT_R8G8B8_USCALED 0x196 +#define BRW_SURFACEFORMAT_R64G64B64A64_FLOAT 0x197 +#define BRW_SURFACEFORMAT_R64G64B64_FLOAT 0x198 +#define BRW_SURFACEFORMAT_BC4_SNORM 0x199 +#define BRW_SURFACEFORMAT_BC5_SNORM 0x19A +#define BRW_SURFACEFORMAT_R16G16B16_UNORM 0x19C +#define BRW_SURFACEFORMAT_R16G16B16_SNORM 0x19D +#define BRW_SURFACEFORMAT_R16G16B16_SSCALED 0x19E +#define BRW_SURFACEFORMAT_R16G16B16_USCALED 0x19F + +#define BRW_SURFACERETURNFORMAT_FLOAT32 0 +#define BRW_SURFACERETURNFORMAT_S1 1 + +#define BRW_SURFACE_1D 0 +#define BRW_SURFACE_2D 1 +#define BRW_SURFACE_3D 2 +#define BRW_SURFACE_CUBE 3 +#define BRW_SURFACE_BUFFER 4 +#define BRW_SURFACE_NULL 7 + +#define BRW_TEXCOORDMODE_WRAP 0 +#define BRW_TEXCOORDMODE_MIRROR 1 +#define BRW_TEXCOORDMODE_CLAMP 2 +#define BRW_TEXCOORDMODE_CUBE 3 +#define BRW_TEXCOORDMODE_CLAMP_BORDER 4 +#define BRW_TEXCOORDMODE_MIRROR_ONCE 5 + +#define BRW_THREAD_PRIORITY_NORMAL 0 +#define BRW_THREAD_PRIORITY_HIGH 1 + +#define BRW_TILEWALK_XMAJOR 0 +#define BRW_TILEWALK_YMAJOR 1 + +#define BRW_VERTEX_SUBPIXEL_PRECISION_8BITS 0 +#define BRW_VERTEX_SUBPIXEL_PRECISION_4BITS 1 + +#define BRW_VERTEXBUFFER_ACCESS_VERTEXDATA 0 +#define BRW_VERTEXBUFFER_ACCESS_INSTANCEDATA 1 + +#define BRW_VFCOMPONENT_NOSTORE 0 +#define BRW_VFCOMPONENT_STORE_SRC 1 +#define BRW_VFCOMPONENT_STORE_0 2 +#define BRW_VFCOMPONENT_STORE_1_FLT 3 +#define BRW_VFCOMPONENT_STORE_1_INT 4 +#define BRW_VFCOMPONENT_STORE_VID 5 +#define BRW_VFCOMPONENT_STORE_IID 6 +#define BRW_VFCOMPONENT_STORE_PID 7 + + + +/* Execution Unit (EU) defines + */ + +#define BRW_ALIGN_1 0 +#define BRW_ALIGN_16 1 + +#define BRW_ADDRESS_DIRECT 0 +#define BRW_ADDRESS_REGISTER_INDIRECT_REGISTER 1 + +#define BRW_CHANNEL_X 0 +#define BRW_CHANNEL_Y 1 +#define BRW_CHANNEL_Z 2 +#define BRW_CHANNEL_W 3 + +#define BRW_COMPRESSION_NONE 0 +#define BRW_COMPRESSION_2NDHALF 1 +#define BRW_COMPRESSION_COMPRESSED 2 + +#define BRW_CONDITIONAL_NONE 0 +#define BRW_CONDITIONAL_Z 1 +#define BRW_CONDITIONAL_NZ 2 +#define BRW_CONDITIONAL_EQ 1 /* Z */ +#define BRW_CONDITIONAL_NEQ 2 /* NZ */ +#define BRW_CONDITIONAL_G 3 +#define BRW_CONDITIONAL_GE 4 +#define BRW_CONDITIONAL_L 5 +#define BRW_CONDITIONAL_LE 6 +#define BRW_CONDITIONAL_C 7 +#define BRW_CONDITIONAL_O 8 + +#define BRW_DEBUG_NONE 0 +#define BRW_DEBUG_BREAKPOINT 1 + +#define BRW_DEPENDENCY_NORMAL 0 +#define BRW_DEPENDENCY_NOTCLEARED 1 +#define BRW_DEPENDENCY_NOTCHECKED 2 +#define 
BRW_DEPENDENCY_DISABLE 3 + +#define BRW_EXECUTE_1 0 +#define BRW_EXECUTE_2 1 +#define BRW_EXECUTE_4 2 +#define BRW_EXECUTE_8 3 +#define BRW_EXECUTE_16 4 +#define BRW_EXECUTE_32 5 + +#define BRW_HORIZONTAL_STRIDE_0 0 +#define BRW_HORIZONTAL_STRIDE_1 1 +#define BRW_HORIZONTAL_STRIDE_2 2 +#define BRW_HORIZONTAL_STRIDE_4 3 + +#define BRW_INSTRUCTION_NORMAL 0 +#define BRW_INSTRUCTION_SATURATE 1 + +#define BRW_MASK_ENABLE 0 +#define BRW_MASK_DISABLE 1 + +#define BRW_OPCODE_MOV 1 +#define BRW_OPCODE_SEL 2 +#define BRW_OPCODE_NOT 4 +#define BRW_OPCODE_AND 5 +#define BRW_OPCODE_OR 6 +#define BRW_OPCODE_XOR 7 +#define BRW_OPCODE_SHR 8 +#define BRW_OPCODE_SHL 9 +#define BRW_OPCODE_RSR 10 +#define BRW_OPCODE_RSL 11 +#define BRW_OPCODE_ASR 12 +#define BRW_OPCODE_CMP 16 +#define BRW_OPCODE_JMPI 32 +#define BRW_OPCODE_IF 34 +#define BRW_OPCODE_IFF 35 +#define BRW_OPCODE_ELSE 36 +#define BRW_OPCODE_ENDIF 37 +#define BRW_OPCODE_DO 38 +#define BRW_OPCODE_WHILE 39 +#define BRW_OPCODE_BREAK 40 +#define BRW_OPCODE_CONTINUE 41 +#define BRW_OPCODE_HALT 42 +#define BRW_OPCODE_MSAVE 44 +#define BRW_OPCODE_MRESTORE 45 +#define BRW_OPCODE_PUSH 46 +#define BRW_OPCODE_POP 47 +#define BRW_OPCODE_WAIT 48 +#define BRW_OPCODE_SEND 49 +#define BRW_OPCODE_ADD 64 +#define BRW_OPCODE_MUL 65 +#define BRW_OPCODE_AVG 66 +#define BRW_OPCODE_FRC 67 +#define BRW_OPCODE_RNDU 68 +#define BRW_OPCODE_RNDD 69 +#define BRW_OPCODE_RNDE 70 +#define BRW_OPCODE_RNDZ 71 +#define BRW_OPCODE_MAC 72 +#define BRW_OPCODE_MACH 73 +#define BRW_OPCODE_LZD 74 +#define BRW_OPCODE_SAD2 80 +#define BRW_OPCODE_SADA2 81 +#define BRW_OPCODE_DP4 84 +#define BRW_OPCODE_DPH 85 +#define BRW_OPCODE_DP3 86 +#define BRW_OPCODE_DP2 87 +#define BRW_OPCODE_DPA2 88 +#define BRW_OPCODE_LINE 89 +#define BRW_OPCODE_NOP 126 + +#define BRW_PREDICATE_NONE 0 +#define BRW_PREDICATE_NORMAL 1 +#define BRW_PREDICATE_ALIGN1_ANYV 2 +#define BRW_PREDICATE_ALIGN1_ALLV 3 +#define BRW_PREDICATE_ALIGN1_ANY2H 4 +#define BRW_PREDICATE_ALIGN1_ALL2H 5 +#define BRW_PREDICATE_ALIGN1_ANY4H 6 +#define BRW_PREDICATE_ALIGN1_ALL4H 7 +#define BRW_PREDICATE_ALIGN1_ANY8H 8 +#define BRW_PREDICATE_ALIGN1_ALL8H 9 +#define BRW_PREDICATE_ALIGN1_ANY16H 10 +#define BRW_PREDICATE_ALIGN1_ALL16H 11 +#define BRW_PREDICATE_ALIGN16_REPLICATE_X 2 +#define BRW_PREDICATE_ALIGN16_REPLICATE_Y 3 +#define BRW_PREDICATE_ALIGN16_REPLICATE_Z 4 +#define BRW_PREDICATE_ALIGN16_REPLICATE_W 5 +#define BRW_PREDICATE_ALIGN16_ANY4H 6 +#define BRW_PREDICATE_ALIGN16_ALL4H 7 + +#define BRW_ARCHITECTURE_REGISTER_FILE 0 +#define BRW_GENERAL_REGISTER_FILE 1 +#define BRW_MESSAGE_REGISTER_FILE 2 +#define BRW_IMMEDIATE_VALUE 3 + +#define BRW_REGISTER_TYPE_UD 0 +#define BRW_REGISTER_TYPE_D 1 +#define BRW_REGISTER_TYPE_UW 2 +#define BRW_REGISTER_TYPE_W 3 +#define BRW_REGISTER_TYPE_UB 4 +#define BRW_REGISTER_TYPE_B 5 +#define BRW_REGISTER_TYPE_VF 5 /* packed float vector, immediates only? 
*/ +#define BRW_REGISTER_TYPE_HF 6 +#define BRW_REGISTER_TYPE_V 6 /* packed int vector, immediates only, uword dest only */ +#define BRW_REGISTER_TYPE_F 7 + +#define BRW_ARF_NULL 0x00 +#define BRW_ARF_ADDRESS 0x10 +#define BRW_ARF_ACCUMULATOR 0x20 +#define BRW_ARF_FLAG 0x30 +#define BRW_ARF_MASK 0x40 +#define BRW_ARF_MASK_STACK 0x50 +#define BRW_ARF_MASK_STACK_DEPTH 0x60 +#define BRW_ARF_STATE 0x70 +#define BRW_ARF_CONTROL 0x80 +#define BRW_ARF_NOTIFICATION_COUNT 0x90 +#define BRW_ARF_IP 0xA0 + +#define BRW_AMASK 0 +#define BRW_IMASK 1 +#define BRW_LMASK 2 +#define BRW_CMASK 3 + + + +#define BRW_THREAD_NORMAL 0 +#define BRW_THREAD_ATOMIC 1 +#define BRW_THREAD_SWITCH 2 + +#define BRW_VERTICAL_STRIDE_0 0 +#define BRW_VERTICAL_STRIDE_1 1 +#define BRW_VERTICAL_STRIDE_2 2 +#define BRW_VERTICAL_STRIDE_4 3 +#define BRW_VERTICAL_STRIDE_8 4 +#define BRW_VERTICAL_STRIDE_16 5 +#define BRW_VERTICAL_STRIDE_32 6 +#define BRW_VERTICAL_STRIDE_64 7 +#define BRW_VERTICAL_STRIDE_128 8 +#define BRW_VERTICAL_STRIDE_256 9 +#define BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL 0xF + +#define BRW_WIDTH_1 0 +#define BRW_WIDTH_2 1 +#define BRW_WIDTH_4 2 +#define BRW_WIDTH_8 3 +#define BRW_WIDTH_16 4 + +#define BRW_STATELESS_BUFFER_BOUNDARY_1K 0 +#define BRW_STATELESS_BUFFER_BOUNDARY_2K 1 +#define BRW_STATELESS_BUFFER_BOUNDARY_4K 2 +#define BRW_STATELESS_BUFFER_BOUNDARY_8K 3 +#define BRW_STATELESS_BUFFER_BOUNDARY_16K 4 +#define BRW_STATELESS_BUFFER_BOUNDARY_32K 5 +#define BRW_STATELESS_BUFFER_BOUNDARY_64K 6 +#define BRW_STATELESS_BUFFER_BOUNDARY_128K 7 +#define BRW_STATELESS_BUFFER_BOUNDARY_256K 8 +#define BRW_STATELESS_BUFFER_BOUNDARY_512K 9 +#define BRW_STATELESS_BUFFER_BOUNDARY_1M 10 +#define BRW_STATELESS_BUFFER_BOUNDARY_2M 11 + +#define BRW_POLYGON_FACING_FRONT 0 +#define BRW_POLYGON_FACING_BACK 1 + +#define BRW_MESSAGE_TARGET_NULL 0 +#define BRW_MESSAGE_TARGET_MATH 1 +#define BRW_MESSAGE_TARGET_SAMPLER 2 +#define BRW_MESSAGE_TARGET_GATEWAY 3 +#define BRW_MESSAGE_TARGET_DATAPORT_READ 4 +#define BRW_MESSAGE_TARGET_DATAPORT_WRITE 5 +#define BRW_MESSAGE_TARGET_URB 6 +#define BRW_MESSAGE_TARGET_THREAD_SPAWNER 7 + +#define BRW_SAMPLER_RETURN_FORMAT_FLOAT32 0 +#define BRW_SAMPLER_RETURN_FORMAT_UINT32 2 +#define BRW_SAMPLER_RETURN_FORMAT_SINT32 3 + +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE 0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE 0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS 0 +#define BRW_SAMPLER_MESSAGE_SIMD8_KILLPIX 1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD 1 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD 1 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS 2 +#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS 2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE 0 +#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE 2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO 2 +#define BRW_SAMPLER_MESSAGE_SIMD8_RESINFO 2 +#define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO 2 +#define BRW_SAMPLER_MESSAGE_SIMD4X2_LD 3 +#define BRW_SAMPLER_MESSAGE_SIMD8_LD 3 +#define BRW_SAMPLER_MESSAGE_SIMD16_LD 3 + +#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW 0 +#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH 1 +#define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS 2 +#define BRW_DATAPORT_OWORD_BLOCK_4_OWORDS 3 +#define BRW_DATAPORT_OWORD_BLOCK_8_OWORDS 4 + +#define BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD 0 +#define BRW_DATAPORT_OWORD_DUAL_BLOCK_4OWORDS 2 + +#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS 2 +#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS 3 + +#define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ 0 +#define 
BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ 1 +#define BRW_DATAPORT_READ_MESSAGE_DWORD_BLOCK_READ 2 +#define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ 3 + +#define BRW_DATAPORT_READ_TARGET_DATA_CACHE 0 +#define BRW_DATAPORT_READ_TARGET_RENDER_CACHE 1 +#define BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE 2 + +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE 0 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED 1 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01 2 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23 3 +#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01 4 + +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE 0 +#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 1 +#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_BLOCK_WRITE 2 +#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE 3 +#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE 4 +#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE 5 +#define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE 7 + +#define BRW_MATH_FUNCTION_INV 1 +#define BRW_MATH_FUNCTION_LOG 2 +#define BRW_MATH_FUNCTION_EXP 3 +#define BRW_MATH_FUNCTION_SQRT 4 +#define BRW_MATH_FUNCTION_RSQ 5 +#define BRW_MATH_FUNCTION_SIN 6 /* was 7 */ +#define BRW_MATH_FUNCTION_COS 7 /* was 8 */ +#define BRW_MATH_FUNCTION_SINCOS 8 /* was 6 */ +#define BRW_MATH_FUNCTION_TAN 9 +#define BRW_MATH_FUNCTION_POW 10 +#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER 11 +#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT 12 +#define BRW_MATH_FUNCTION_INT_DIV_REMAINDER 13 + +#define BRW_MATH_INTEGER_UNSIGNED 0 +#define BRW_MATH_INTEGER_SIGNED 1 + +#define BRW_MATH_PRECISION_FULL 0 +#define BRW_MATH_PRECISION_PARTIAL 1 + +#define BRW_MATH_SATURATE_NONE 0 +#define BRW_MATH_SATURATE_SATURATE 1 + +#define BRW_MATH_DATA_VECTOR 0 +#define BRW_MATH_DATA_SCALAR 1 + +#define BRW_URB_OPCODE_WRITE 0 + +#define BRW_URB_SWIZZLE_NONE 0 +#define BRW_URB_SWIZZLE_INTERLEAVE 1 +#define BRW_URB_SWIZZLE_TRANSPOSE 2 + +#define BRW_SCRATCH_SPACE_SIZE_1K 0 +#define BRW_SCRATCH_SPACE_SIZE_2K 1 +#define BRW_SCRATCH_SPACE_SIZE_4K 2 +#define BRW_SCRATCH_SPACE_SIZE_8K 3 +#define BRW_SCRATCH_SPACE_SIZE_16K 4 +#define BRW_SCRATCH_SPACE_SIZE_32K 5 +#define BRW_SCRATCH_SPACE_SIZE_64K 6 +#define BRW_SCRATCH_SPACE_SIZE_128K 7 +#define BRW_SCRATCH_SPACE_SIZE_256K 8 +#define BRW_SCRATCH_SPACE_SIZE_512K 9 +#define BRW_SCRATCH_SPACE_SIZE_1M 10 +#define BRW_SCRATCH_SPACE_SIZE_2M 11 + + + + +#define CMD_URB_FENCE 0x6000 +#define CMD_CONST_BUFFER_STATE 0x6001 +#define CMD_CONST_BUFFER 0x6002 + +#define CMD_STATE_BASE_ADDRESS 0x6101 +#define CMD_STATE_INSN_POINTER 0x6102 +#define CMD_PIPELINE_SELECT 0x6104 + +#define CMD_PIPELINED_STATE_POINTERS 0x7800 +#define CMD_BINDING_TABLE_PTRS 0x7801 +#define CMD_VERTEX_BUFFER 0x7808 +#define CMD_VERTEX_ELEMENT 0x7809 +#define CMD_INDEX_BUFFER 0x780a +#define CMD_VF_STATISTICS 0x780b + +#define CMD_DRAW_RECT 0x7900 +#define CMD_BLEND_CONSTANT_COLOR 0x7901 +#define CMD_CHROMA_KEY 0x7904 +#define CMD_DEPTH_BUFFER 0x7905 +#define CMD_POLY_STIPPLE_OFFSET 0x7906 +#define CMD_POLY_STIPPLE_PATTERN 0x7907 +#define CMD_LINE_STIPPLE_PATTERN 0x7908 +#define CMD_GLOBAL_DEPTH_OFFSET_CLAMP 0x7909 + +#define CMD_PIPE_CONTROL 0x7a00 + +#define CMD_3D_PRIM 0x7b00 + +#define CMD_MI_FLUSH 0x0200 + + +/* Various values from the R0 vertex header: + */ +#define R02_PRIM_END 0x1 +#define R02_PRIM_START 0x2 + + + +#endif diff --git a/src/gallium/drivers/i965simple/brw_draw.c 
b/src/gallium/drivers/i965simple/brw_draw.c new file mode 100644 index 0000000000..7598e3dc8a --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_draw.c @@ -0,0 +1,239 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include <stdlib.h> + +#include "brw_batch.h" +#include "brw_draw.h" +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_state.h" + +#include "pipe/p_context.h" +#include "pipe/p_winsys.h" + +static unsigned hw_prim[PIPE_PRIM_POLYGON+1] = { + _3DPRIM_POINTLIST, + _3DPRIM_LINELIST, + _3DPRIM_LINELOOP, + _3DPRIM_LINESTRIP, + _3DPRIM_TRILIST, + _3DPRIM_TRISTRIP, + _3DPRIM_TRIFAN, + _3DPRIM_QUADLIST, + _3DPRIM_QUADSTRIP, + _3DPRIM_POLYGON +}; + + +static const int reduced_prim[PIPE_PRIM_POLYGON+1] = { + PIPE_PRIM_POINTS, + PIPE_PRIM_LINES, + PIPE_PRIM_LINES, + PIPE_PRIM_LINES, + PIPE_PRIM_TRIANGLES, + PIPE_PRIM_TRIANGLES, + PIPE_PRIM_TRIANGLES, + PIPE_PRIM_TRIANGLES, + PIPE_PRIM_TRIANGLES, + PIPE_PRIM_TRIANGLES +}; + + +/* When the primitive changes, set a state bit and re-validate. Not + * the nicest and would rather deal with this by having all the + * programs be immune to the active primitive (ie. cope with all + * possibilities). That may not be realistic however. + */ +static void brw_set_prim(struct brw_context *brw, int prim) +{ + PRINT("PRIM: %d\n", prim); + + /* Slight optimization to avoid the GS program when not needed: + */ + if (prim == PIPE_PRIM_QUAD_STRIP && + brw->attribs.Raster->flatshade && + brw->attribs.Raster->fill_cw == PIPE_POLYGON_MODE_FILL && + brw->attribs.Raster->fill_ccw == PIPE_POLYGON_MODE_FILL) + prim = PIPE_PRIM_TRIANGLE_STRIP; + + if (prim != brw->primitive) { + brw->primitive = prim; + brw->state.dirty.brw |= BRW_NEW_PRIMITIVE; + + if (reduced_prim[prim] != brw->reduced_primitive) { + brw->reduced_primitive = reduced_prim[prim]; + brw->state.dirty.brw |= BRW_NEW_REDUCED_PRIMITIVE; + } + + brw_validate_state(brw); + } + +} + + +static unsigned trim(int prim, unsigned length) +{ + if (prim == PIPE_PRIM_QUAD_STRIP) + return length > 3 ? 
(length - length % 2) : 0; + else if (prim == PIPE_PRIM_QUADS) + return length - length % 4; + else + return length; +} + + + +static boolean brw_emit_prim( struct brw_context *brw, + boolean indexed, + unsigned start, + unsigned count ) + +{ + struct brw_3d_primitive prim_packet; + + if (BRW_DEBUG & DEBUG_PRIMS) + PRINT("PRIM: %d %d %d\n", brw->primitive, start, count); + + prim_packet.header.opcode = CMD_3D_PRIM; + prim_packet.header.length = sizeof(prim_packet)/4 - 2; + prim_packet.header.pad = 0; + prim_packet.header.topology = hw_prim[brw->primitive]; + prim_packet.header.indexed = indexed; + + prim_packet.verts_per_instance = trim(brw->primitive, count); + prim_packet.start_vert_location = start; + prim_packet.instance_count = 1; + prim_packet.start_instance_location = 0; + prim_packet.base_vert_location = 0; + + if (prim_packet.verts_per_instance == 0) + return TRUE; + + return brw_batchbuffer_data( brw->winsys, + &prim_packet, + sizeof(prim_packet) ); +} + + +/* May fail if out of video memory for texture or vbo upload, or on + * fallback conditions. + */ +static boolean brw_try_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *index_buffer, + unsigned index_size, + unsigned mode, + unsigned start, + unsigned count ) +{ + struct brw_context *brw = brw_context(pipe); + + /* Set the first primitive ahead of validate_state: + */ + brw_set_prim(brw, mode); + + /* Upload index, vertex data: + */ + if (index_buffer && + !brw_upload_indices( brw, index_buffer, index_size, start, count )) + return FALSE; + + if (!brw_upload_vertex_buffers(brw)) + return FALSE; + + if (!brw_upload_vertex_elements( brw )) + return FALSE; + + /* XXX: Need to separate validate and upload of state. + */ + if (brw->state.dirty.brw) + brw_validate_state( brw ); + + if (!brw_emit_prim(brw, index_buffer != NULL, + start, count)) + return FALSE; + + return TRUE; +} + + + +static boolean brw_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned mode, + unsigned start, + unsigned count ) +{ + if (!brw_try_draw_elements( pipe, + indexBuffer, + indexSize, + mode, start, count )) + { + /* flush ? */ + + if (!brw_try_draw_elements( pipe, + indexBuffer, + indexSize, + mode, start, + count )) { + assert(0); + return FALSE; + } + } + + return TRUE; +} + + + +static boolean brw_draw_arrays( struct pipe_context *pipe, + unsigned mode, + unsigned start, + unsigned count ) +{ + if (!brw_try_draw_elements( pipe, NULL, 0, mode, start, count )) { + /* flush ? */ + + if (!brw_try_draw_elements( pipe, NULL, 0, mode, start, count )) { + assert(0); + return FALSE; + } + } + + return TRUE; +} + + + +void brw_init_draw_functions( struct brw_context *brw ) +{ + brw->pipe.draw_arrays = brw_draw_arrays; + brw->pipe.draw_elements = brw_draw_elements; +} + + diff --git a/src/gallium/drivers/i965simple/brw_draw.h b/src/gallium/drivers/i965simple/brw_draw.h new file mode 100644 index 0000000000..62fe0d5d0e --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_draw.h @@ -0,0 +1,55 @@ + /************************************************************************** + * + * Copyright 2005 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef BRW_DRAW_H +#define BRW_DRAW_H + +#include "pipe/p_context.h" + +struct brw_context; + + + +void brw_init_draw_functions( struct brw_context *brw ); + + +boolean brw_upload_vertices( struct brw_context *brw, + unsigned min_index, + unsigned max_index ); + +boolean brw_upload_indices(struct brw_context *brw, + const struct pipe_buffer *index_buffer, + int ib_size, int start, int count); + +boolean brw_upload_vertex_buffers( struct brw_context *brw ); +boolean brw_upload_vertex_elements( struct brw_context *brw ); + +unsigned brw_translate_surface_format( unsigned id ); + + + +#endif diff --git a/src/gallium/drivers/i965simple/brw_draw_upload.c b/src/gallium/drivers/i965simple/brw_draw_upload.c new file mode 100644 index 0000000000..7c20ea52af --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_draw_upload.c @@ -0,0 +1,300 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include <stdlib.h> + +#include "brw_batch.h" +#include "brw_draw.h" +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_state.h" + + +struct brw_array_state { + union header_union header; + + struct { + union { + struct { + unsigned pitch:11; + unsigned pad:15; + unsigned access_type:1; + unsigned vb_index:5; + } bits; + unsigned dword; + } vb0; + + struct pipe_buffer *buffer; + unsigned offset; + + unsigned max_index; + unsigned instance_data_step_rate; + + } vb[BRW_VBP_MAX]; +}; + + + +unsigned brw_translate_surface_format( unsigned id ) +{ + switch (id) { + case PIPE_FORMAT_R64_FLOAT: + return BRW_SURFACEFORMAT_R64_FLOAT; + case PIPE_FORMAT_R64G64_FLOAT: + return BRW_SURFACEFORMAT_R64G64_FLOAT; + case PIPE_FORMAT_R64G64B64_FLOAT: + return BRW_SURFACEFORMAT_R64G64B64_FLOAT; + case PIPE_FORMAT_R64G64B64A64_FLOAT: + return BRW_SURFACEFORMAT_R64G64B64A64_FLOAT; + + case PIPE_FORMAT_R32_FLOAT: + return BRW_SURFACEFORMAT_R32_FLOAT; + case PIPE_FORMAT_R32G32_FLOAT: + return BRW_SURFACEFORMAT_R32G32_FLOAT; + case PIPE_FORMAT_R32G32B32_FLOAT: + return BRW_SURFACEFORMAT_R32G32B32_FLOAT; + case PIPE_FORMAT_R32G32B32A32_FLOAT: + return BRW_SURFACEFORMAT_R32G32B32A32_FLOAT; + + case PIPE_FORMAT_R32_UNORM: + return BRW_SURFACEFORMAT_R32_UNORM; + case PIPE_FORMAT_R32G32_UNORM: + return BRW_SURFACEFORMAT_R32G32_UNORM; + case PIPE_FORMAT_R32G32B32_UNORM: + return BRW_SURFACEFORMAT_R32G32B32_UNORM; + case PIPE_FORMAT_R32G32B32A32_UNORM: + return BRW_SURFACEFORMAT_R32G32B32A32_UNORM; + + case PIPE_FORMAT_R32_USCALED: + return BRW_SURFACEFORMAT_R32_USCALED; + case PIPE_FORMAT_R32G32_USCALED: + return BRW_SURFACEFORMAT_R32G32_USCALED; + case PIPE_FORMAT_R32G32B32_USCALED: + return BRW_SURFACEFORMAT_R32G32B32_USCALED; + case PIPE_FORMAT_R32G32B32A32_USCALED: + return BRW_SURFACEFORMAT_R32G32B32A32_USCALED; + + case PIPE_FORMAT_R32_SNORM: + return BRW_SURFACEFORMAT_R32_SNORM; + case PIPE_FORMAT_R32G32_SNORM: + return BRW_SURFACEFORMAT_R32G32_SNORM; + case PIPE_FORMAT_R32G32B32_SNORM: + return BRW_SURFACEFORMAT_R32G32B32_SNORM; + case PIPE_FORMAT_R32G32B32A32_SNORM: + return BRW_SURFACEFORMAT_R32G32B32A32_SNORM; + + case PIPE_FORMAT_R32_SSCALED: + return BRW_SURFACEFORMAT_R32_SSCALED; + case PIPE_FORMAT_R32G32_SSCALED: + return BRW_SURFACEFORMAT_R32G32_SSCALED; + case PIPE_FORMAT_R32G32B32_SSCALED: + return BRW_SURFACEFORMAT_R32G32B32_SSCALED; + case PIPE_FORMAT_R32G32B32A32_SSCALED: + return BRW_SURFACEFORMAT_R32G32B32A32_SSCALED; + + case PIPE_FORMAT_R16_UNORM: + return BRW_SURFACEFORMAT_R16_UNORM; + case PIPE_FORMAT_R16G16_UNORM: + return BRW_SURFACEFORMAT_R16G16_UNORM; + case PIPE_FORMAT_R16G16B16_UNORM: + return BRW_SURFACEFORMAT_R16G16B16_UNORM; + case PIPE_FORMAT_R16G16B16A16_UNORM: + return BRW_SURFACEFORMAT_R16G16B16A16_UNORM; + + case PIPE_FORMAT_R16_USCALED: + return BRW_SURFACEFORMAT_R16_USCALED; + case PIPE_FORMAT_R16G16_USCALED: + return BRW_SURFACEFORMAT_R16G16_USCALED; + case PIPE_FORMAT_R16G16B16_USCALED: + return BRW_SURFACEFORMAT_R16G16B16_USCALED; + case PIPE_FORMAT_R16G16B16A16_USCALED: + return BRW_SURFACEFORMAT_R16G16B16A16_USCALED; + + case PIPE_FORMAT_R16_SNORM: + return BRW_SURFACEFORMAT_R16_SNORM; + case PIPE_FORMAT_R16G16_SNORM: + return BRW_SURFACEFORMAT_R16G16_SNORM; + case PIPE_FORMAT_R16G16B16_SNORM: + return BRW_SURFACEFORMAT_R16G16B16_SNORM; + case PIPE_FORMAT_R16G16B16A16_SNORM: + return BRW_SURFACEFORMAT_R16G16B16A16_SNORM; + + case PIPE_FORMAT_R16_SSCALED: + return 
BRW_SURFACEFORMAT_R16_SSCALED; + case PIPE_FORMAT_R16G16_SSCALED: + return BRW_SURFACEFORMAT_R16G16_SSCALED; + case PIPE_FORMAT_R16G16B16_SSCALED: + return BRW_SURFACEFORMAT_R16G16B16_SSCALED; + case PIPE_FORMAT_R16G16B16A16_SSCALED: + return BRW_SURFACEFORMAT_R16G16B16A16_SSCALED; + + case PIPE_FORMAT_R8_UNORM: + return BRW_SURFACEFORMAT_R8_UNORM; + case PIPE_FORMAT_R8G8_UNORM: + return BRW_SURFACEFORMAT_R8G8_UNORM; + case PIPE_FORMAT_R8G8B8_UNORM: + return BRW_SURFACEFORMAT_R8G8B8_UNORM; + case PIPE_FORMAT_R8G8B8A8_UNORM: + return BRW_SURFACEFORMAT_R8G8B8A8_UNORM; + + case PIPE_FORMAT_R8_USCALED: + return BRW_SURFACEFORMAT_R8_USCALED; + case PIPE_FORMAT_R8G8_USCALED: + return BRW_SURFACEFORMAT_R8G8_USCALED; + case PIPE_FORMAT_R8G8B8_USCALED: + return BRW_SURFACEFORMAT_R8G8B8_USCALED; + case PIPE_FORMAT_R8G8B8A8_USCALED: + return BRW_SURFACEFORMAT_R8G8B8A8_USCALED; + + case PIPE_FORMAT_R8_SNORM: + return BRW_SURFACEFORMAT_R8_SNORM; + case PIPE_FORMAT_R8G8_SNORM: + return BRW_SURFACEFORMAT_R8G8_SNORM; + case PIPE_FORMAT_R8G8B8_SNORM: + return BRW_SURFACEFORMAT_R8G8B8_SNORM; + case PIPE_FORMAT_R8G8B8A8_SNORM: + return BRW_SURFACEFORMAT_R8G8B8A8_SNORM; + + case PIPE_FORMAT_R8_SSCALED: + return BRW_SURFACEFORMAT_R8_SSCALED; + case PIPE_FORMAT_R8G8_SSCALED: + return BRW_SURFACEFORMAT_R8G8_SSCALED; + case PIPE_FORMAT_R8G8B8_SSCALED: + return BRW_SURFACEFORMAT_R8G8B8_SSCALED; + case PIPE_FORMAT_R8G8B8A8_SSCALED: + return BRW_SURFACEFORMAT_R8G8B8A8_SSCALED; + + default: + assert(0); + return 0; + } +} + +static unsigned get_index_type(int type) +{ + switch (type) { + case 1: return BRW_INDEX_BYTE; + case 2: return BRW_INDEX_WORD; + case 4: return BRW_INDEX_DWORD; + default: assert(0); return 0; + } +} + + +boolean brw_upload_vertex_buffers( struct brw_context *brw ) +{ + struct brw_array_state vbp; + unsigned nr_enabled = 0; + unsigned i; + + memset(&vbp, 0, sizeof(vbp)); + + /* This is a hardware limit: + */ + + for (i = 0; i < BRW_VEP_MAX; i++) + { + if (brw->vb.vbo_array[i] == NULL) { + nr_enabled = i; + break; + } + + vbp.vb[i].vb0.bits.pitch = brw->vb.vbo_array[i]->pitch; + vbp.vb[i].vb0.bits.pad = 0; + vbp.vb[i].vb0.bits.access_type = BRW_VERTEXBUFFER_ACCESS_VERTEXDATA; + vbp.vb[i].vb0.bits.vb_index = i; + vbp.vb[i].offset = brw->vb.vbo_array[i]->buffer_offset; + vbp.vb[i].buffer = brw->vb.vbo_array[i]->buffer; + vbp.vb[i].max_index = brw->vb.vbo_array[i]->max_index; + } + + + vbp.header.bits.length = (1 + nr_enabled * 4) - 2; + vbp.header.bits.opcode = CMD_VERTEX_BUFFER; + + BEGIN_BATCH(vbp.header.bits.length+2, 0); + OUT_BATCH( vbp.header.dword ); + + for (i = 0; i < nr_enabled; i++) { + OUT_BATCH( vbp.vb[i].vb0.dword ); + OUT_RELOC( vbp.vb[i].buffer, PIPE_BUFFER_USAGE_GPU_READ, + vbp.vb[i].offset); + OUT_BATCH( vbp.vb[i].max_index ); + OUT_BATCH( vbp.vb[i].instance_data_step_rate ); + } + ADVANCE_BATCH(); + return TRUE; +} + + + +boolean brw_upload_vertex_elements( struct brw_context *brw ) +{ + struct brw_vertex_element_packet vep; + + unsigned i; + unsigned nr_enabled = brw->attribs.VertexProgram->info.num_inputs; + + memset(&vep, 0, sizeof(vep)); + + for (i = 0; i < nr_enabled; i++) + vep.ve[i] = brw->vb.inputs[i]; + + + vep.header.length = (1 + nr_enabled * sizeof(vep.ve[0])/4) - 2; + vep.header.opcode = CMD_VERTEX_ELEMENT; + brw_cached_batch_struct(brw, &vep, 4 + nr_enabled * sizeof(vep.ve[0])); + + return TRUE; +} + +boolean brw_upload_indices( struct brw_context *brw, + const struct pipe_buffer *index_buffer, + int ib_size, int start, int count) +{ + /* Emit the indexbuffer 
packet: + */ + { + struct brw_indexbuffer ib; + + memset(&ib, 0, sizeof(ib)); + + ib.header.bits.opcode = CMD_INDEX_BUFFER; + ib.header.bits.length = sizeof(ib)/4 - 2; + ib.header.bits.index_format = get_index_type(ib_size); + ib.header.bits.cut_index_enable = 0; + + + BEGIN_BATCH(4, 0); + OUT_BATCH( ib.header.dword ); + OUT_RELOC( index_buffer, PIPE_BUFFER_USAGE_GPU_READ, start); + OUT_RELOC( index_buffer, PIPE_BUFFER_USAGE_GPU_READ, start + count); + OUT_BATCH( 0 ); + ADVANCE_BATCH(); + } + return TRUE; +} diff --git a/src/gallium/drivers/i965simple/brw_eu.c b/src/gallium/drivers/i965simple/brw_eu.c new file mode 100644 index 0000000000..e2002d1821 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_eu.c @@ -0,0 +1,130 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_eu.h" + + + +/* How does predicate control work when execution_size != 8? Do I + * need to test/set for 0xffff when execution_size is 16? 
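+ * (Note: the code below only special-cases 0xff as the all-channels-enabled
+ * value and switches to BRW_PREDICATE_NORMAL for anything else, so the
+ * 16-wide question above looks like it is still open.)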
+ */ +void brw_set_predicate_control_flag_value( struct brw_compile *p, unsigned value ) +{ + p->current->header.predicate_control = BRW_PREDICATE_NONE; + + if (value != 0xff) { + if (value != p->flag_value) { + brw_push_insn_state(p); + brw_MOV(p, brw_flag_reg(), brw_imm_uw(value)); + p->flag_value = value; + brw_pop_insn_state(p); + } + + p->current->header.predicate_control = BRW_PREDICATE_NORMAL; + } +} + +void brw_set_predicate_control( struct brw_compile *p, unsigned pc ) +{ + p->current->header.predicate_control = pc; +} + +void brw_set_conditionalmod( struct brw_compile *p, unsigned conditional ) +{ + p->current->header.destreg__conditonalmod = conditional; +} + +void brw_set_access_mode( struct brw_compile *p, unsigned access_mode ) +{ + p->current->header.access_mode = access_mode; +} + +void brw_set_compression_control( struct brw_compile *p, boolean compression_control ) +{ + p->current->header.compression_control = compression_control; +} + +void brw_set_mask_control( struct brw_compile *p, unsigned value ) +{ + p->current->header.mask_control = value; +} + +void brw_set_saturate( struct brw_compile *p, unsigned value ) +{ + p->current->header.saturate = value; +} + +void brw_push_insn_state( struct brw_compile *p ) +{ + assert(p->current != &p->stack[BRW_EU_MAX_INSN_STACK-1]); + memcpy(p->current+1, p->current, sizeof(struct brw_instruction)); + p->current++; +} + +void brw_pop_insn_state( struct brw_compile *p ) +{ + assert(p->current != p->stack); + p->current--; +} + + +/*********************************************************************** + */ +void brw_init_compile( struct brw_compile *p ) +{ + p->nr_insn = 0; + p->current = p->stack; + memset(p->current, 0, sizeof(p->current[0])); + + /* Some defaults? + */ + brw_set_mask_control(p, BRW_MASK_ENABLE); /* what does this do? */ + brw_set_saturate(p, 0); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_predicate_control_flag_value(p, 0xff); +} + + +const unsigned *brw_get_program( struct brw_compile *p, + unsigned *sz ) +{ + unsigned i; + + for (i = 0; i < 8; i++) + brw_NOP(p); + + *sz = p->nr_insn * sizeof(struct brw_instruction); + return (const unsigned *)p->store; +} + diff --git a/src/gallium/drivers/i965simple/brw_eu.h b/src/gallium/drivers/i965simple/brw_eu.h new file mode 100644 index 0000000000..23151ae9ed --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_eu.h @@ -0,0 +1,888 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#ifndef BRW_EU_H +#define BRW_EU_H + +#include "brw_structs.h" +#include "brw_defines.h" + +#include "pipe/p_compiler.h" +#include "pipe/p_shader_tokens.h" + +#define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6)) +#define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3) + +#define BRW_SWIZZLE_NOOP BRW_SWIZZLE4(0,1,2,3) +#define BRW_SWIZZLE_XYZW BRW_SWIZZLE4(0,1,2,3) +#define BRW_SWIZZLE_XXXX BRW_SWIZZLE4(0,0,0,0) +#define BRW_SWIZZLE_XYXY BRW_SWIZZLE4(0,1,0,1) + + +#define REG_SIZE (8*4) + + +/* These aren't hardware structs, just something useful for us to pass around: + * + * Align1 operation has a lot of control over input ranges. Used in + * WM programs to implement shaders decomposed into "channel serial" + * or "structure of array" form: + */ +struct brw_reg +{ + unsigned type:4; + unsigned file:2; + unsigned nr:8; + unsigned subnr:5; /* :1 in align16 */ + unsigned negate:1; /* source only */ + unsigned abs:1; /* source only */ + unsigned vstride:4; /* source only */ + unsigned width:3; /* src only, align1 only */ + unsigned hstride:2; /* src only, align1 only */ + unsigned address_mode:1; /* relative addressing, hopefully! */ + unsigned pad0:1; + + union { + struct { + unsigned swizzle:8; /* src only, align16 only */ + unsigned writemask:4; /* dest only, align16 only */ + int indirect_offset:10; /* relative addressing offset */ + unsigned pad1:10; /* two dwords total */ + } bits; + + float f; + int d; + unsigned ud; + } dw1; +}; + + +struct brw_indirect { + unsigned addr_subnr:4; + int addr_offset:10; + unsigned pad:18; +}; + + +#define BRW_EU_MAX_INSN_STACK 5 +#define BRW_EU_MAX_INSN 1200 + +struct brw_compile { + struct brw_instruction store[BRW_EU_MAX_INSN]; + unsigned nr_insn; + + /* Allow clients to push/pop instruction state: + */ + struct brw_instruction stack[BRW_EU_MAX_INSN_STACK]; + struct brw_instruction *current; + + unsigned flag_value; + boolean single_program_flow; +}; + + + +static __inline int type_sz( unsigned type ) +{ + switch( type ) { + case BRW_REGISTER_TYPE_UD: + case BRW_REGISTER_TYPE_D: + case BRW_REGISTER_TYPE_F: + return 4; + case BRW_REGISTER_TYPE_HF: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_W: + return 2; + case BRW_REGISTER_TYPE_UB: + case BRW_REGISTER_TYPE_B: + return 1; + default: + return 0; + } +} + +static __inline struct brw_reg brw_reg( unsigned file, + unsigned nr, + unsigned subnr, + unsigned type, + unsigned vstride, + unsigned width, + unsigned hstride, + unsigned swizzle, + unsigned writemask) +{ + + struct brw_reg reg; + reg.type = type; + reg.file = file; + reg.nr = nr; + reg.subnr = subnr * type_sz(type); + reg.negate = 0; + reg.abs = 0; + reg.vstride = vstride; + reg.width = width; + reg.hstride = hstride; + reg.address_mode = BRW_ADDRESS_DIRECT; + reg.pad0 = 0; + + /* Could do better: If the reg is r5.3<0;1,0>, we probably want to + * set swizzle and writemask to W, as the lower bits of subnr will + * be lost when converted to align16. This is probably too much to + * keep track of as you'd want it adjusted by suboffset(), etc. + * Perhaps fix up when converting to align16? 
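+ * (For reference, the <V;W,H> notation above and in brw_print_reg() is the
+ * region description nr.subnr<vstride;width,hstride>:type, so r5.3<0;1,0>
+ * selects a single element and replicates it across the execution channels.)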
+ */ + reg.dw1.bits.swizzle = swizzle; + reg.dw1.bits.writemask = writemask; + reg.dw1.bits.indirect_offset = 0; + reg.dw1.bits.pad1 = 0; + return reg; +} + +static __inline struct brw_reg brw_vec16_reg( unsigned file, + unsigned nr, + unsigned subnr ) +{ + return brw_reg(file, + nr, + subnr, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_16, + BRW_WIDTH_16, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYZW, + TGSI_WRITEMASK_XYZW); +} + +static __inline struct brw_reg brw_vec8_reg( unsigned file, + unsigned nr, + unsigned subnr ) +{ + return brw_reg(file, + nr, + subnr, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_8, + BRW_WIDTH_8, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYZW, + TGSI_WRITEMASK_XYZW); +} + + +static __inline struct brw_reg brw_vec4_reg( unsigned file, + unsigned nr, + unsigned subnr ) +{ + return brw_reg(file, + nr, + subnr, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_4, + BRW_WIDTH_4, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYZW, + TGSI_WRITEMASK_XYZW); +} + + +static __inline struct brw_reg brw_vec2_reg( unsigned file, + unsigned nr, + unsigned subnr ) +{ + return brw_reg(file, + nr, + subnr, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_2, + BRW_WIDTH_2, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYXY, + TGSI_WRITEMASK_XY); +} + +static __inline struct brw_reg brw_vec1_reg( unsigned file, + unsigned nr, + unsigned subnr ) +{ + return brw_reg(file, + nr, + subnr, + BRW_REGISTER_TYPE_F, + BRW_VERTICAL_STRIDE_0, + BRW_WIDTH_1, + BRW_HORIZONTAL_STRIDE_0, + BRW_SWIZZLE_XXXX, + TGSI_WRITEMASK_X); +} + + +static __inline struct brw_reg retype( struct brw_reg reg, + unsigned type ) +{ + reg.type = type; + return reg; +} + +static __inline struct brw_reg suboffset( struct brw_reg reg, + unsigned delta ) +{ + reg.subnr += delta * type_sz(reg.type); + return reg; +} + + +static __inline struct brw_reg offset( struct brw_reg reg, + unsigned delta ) +{ + reg.nr += delta; + return reg; +} + + +static __inline struct brw_reg byte_offset( struct brw_reg reg, + unsigned bytes ) +{ + unsigned newoffset = reg.nr * REG_SIZE + reg.subnr + bytes; + reg.nr = newoffset / REG_SIZE; + reg.subnr = newoffset % REG_SIZE; + return reg; +} + + +static __inline struct brw_reg brw_uw16_reg( unsigned file, + unsigned nr, + unsigned subnr ) +{ + return suboffset(retype(brw_vec16_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); +} + +static __inline struct brw_reg brw_uw8_reg( unsigned file, + unsigned nr, + unsigned subnr ) +{ + return suboffset(retype(brw_vec8_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); +} + +static __inline struct brw_reg brw_uw1_reg( unsigned file, + unsigned nr, + unsigned subnr ) +{ + return suboffset(retype(brw_vec1_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); +} + +static __inline struct brw_reg brw_imm_reg( unsigned type ) +{ + return brw_reg( BRW_IMMEDIATE_VALUE, + 0, + 0, + type, + BRW_VERTICAL_STRIDE_0, + BRW_WIDTH_1, + BRW_HORIZONTAL_STRIDE_0, + 0, + 0); +} + +static __inline struct brw_reg brw_imm_f( float f ) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_F); + imm.dw1.f = f; + return imm; +} + +static __inline struct brw_reg brw_imm_d( int d ) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_D); + imm.dw1.d = d; + return imm; +} + +static __inline struct brw_reg brw_imm_ud( unsigned ud ) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UD); + imm.dw1.ud = ud; + return imm; +} + +static __inline struct brw_reg brw_imm_uw( ushort uw ) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW); + imm.dw1.ud = uw; + return imm; +} + +static __inline 
struct brw_reg brw_imm_w( short w ) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W); + imm.dw1.d = w; + return imm; +} + +/* brw_imm_b and brw_imm_ub aren't supported by hardware - the type + * numbers alias with _V and _VF below: + */ + +/* Vector of eight signed half-byte values: + */ +static __inline struct brw_reg brw_imm_v( unsigned v ) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_V); + imm.vstride = BRW_VERTICAL_STRIDE_0; + imm.width = BRW_WIDTH_8; + imm.hstride = BRW_HORIZONTAL_STRIDE_1; + imm.dw1.ud = v; + return imm; +} + +/* Vector of four 8-bit float values: + */ +static __inline struct brw_reg brw_imm_vf( unsigned v ) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF); + imm.vstride = BRW_VERTICAL_STRIDE_0; + imm.width = BRW_WIDTH_4; + imm.hstride = BRW_HORIZONTAL_STRIDE_1; + imm.dw1.ud = v; + return imm; +} + +#define VF_ZERO 0x0 +#define VF_ONE 0x30 +#define VF_NEG (1<<7) + +static __inline struct brw_reg brw_imm_vf4( unsigned v0, + unsigned v1, + unsigned v2, + unsigned v3) +{ + struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_VF); + imm.vstride = BRW_VERTICAL_STRIDE_0; + imm.width = BRW_WIDTH_4; + imm.hstride = BRW_HORIZONTAL_STRIDE_1; + imm.dw1.ud = ((v0 << 0) | + (v1 << 8) | + (v2 << 16) | + (v3 << 24)); + return imm; +} + + +static __inline struct brw_reg brw_address( struct brw_reg reg ) +{ + return brw_imm_uw(reg.nr * REG_SIZE + reg.subnr); +} + + +static __inline struct brw_reg brw_vec1_grf( unsigned nr, + unsigned subnr ) +{ + return brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +static __inline struct brw_reg brw_vec8_grf( unsigned nr, + unsigned subnr ) +{ + return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +static __inline struct brw_reg brw_vec4_grf( unsigned nr, + unsigned subnr ) +{ + return brw_vec4_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + + +static __inline struct brw_reg brw_vec2_grf( unsigned nr, + unsigned subnr ) +{ + return brw_vec2_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +static __inline struct brw_reg brw_uw8_grf( unsigned nr, + unsigned subnr ) +{ + return brw_uw8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); +} + +static __inline struct brw_reg brw_null_reg( void ) +{ + return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_NULL, + 0); +} + +static __inline struct brw_reg brw_address_reg( unsigned subnr ) +{ + return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_ADDRESS, + subnr); +} + +/* If/else instructions break in align16 mode if writemask & swizzle + * aren't xyzw. This goes against the convention for other scalar + * regs: + */ +static __inline struct brw_reg brw_ip_reg( void ) +{ + return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_IP, + 0, + BRW_REGISTER_TYPE_UD, + BRW_VERTICAL_STRIDE_4, /* ? */ + BRW_WIDTH_1, + BRW_HORIZONTAL_STRIDE_0, + BRW_SWIZZLE_XYZW, /* NOTE! */ + TGSI_WRITEMASK_XYZW); /* NOTE! 
*/ +} + +static __inline struct brw_reg brw_acc_reg( void ) +{ + return brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_ACCUMULATOR, + 0); +} + + +static __inline struct brw_reg brw_flag_reg( void ) +{ + return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_FLAG, + 0); +} + + +static __inline struct brw_reg brw_mask_reg( unsigned subnr ) +{ + return brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_MASK, + subnr); +} + +static __inline struct brw_reg brw_message_reg( unsigned nr ) +{ + return brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, + nr, + 0); +} + + + + +/* This is almost always called with a numeric constant argument, so + * make things easy to evaluate at compile time: + */ +static __inline unsigned cvt( unsigned val ) +{ + switch (val) { + case 0: return 0; + case 1: return 1; + case 2: return 2; + case 4: return 3; + case 8: return 4; + case 16: return 5; + case 32: return 6; + } + return 0; +} + +static __inline struct brw_reg stride( struct brw_reg reg, + unsigned vstride, + unsigned width, + unsigned hstride ) +{ + + reg.vstride = cvt(vstride); + reg.width = cvt(width) - 1; + reg.hstride = cvt(hstride); + return reg; +} + +static __inline struct brw_reg vec16( struct brw_reg reg ) +{ + return stride(reg, 16,16,1); +} + +static __inline struct brw_reg vec8( struct brw_reg reg ) +{ + return stride(reg, 8,8,1); +} + +static __inline struct brw_reg vec4( struct brw_reg reg ) +{ + return stride(reg, 4,4,1); +} + +static __inline struct brw_reg vec2( struct brw_reg reg ) +{ + return stride(reg, 2,2,1); +} + +static __inline struct brw_reg vec1( struct brw_reg reg ) +{ + return stride(reg, 0,1,0); +} + +static __inline struct brw_reg get_element( struct brw_reg reg, unsigned elt ) +{ + return vec1(suboffset(reg, elt)); +} + +static __inline struct brw_reg get_element_ud( struct brw_reg reg, unsigned elt ) +{ + return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_UD), elt)); +} + + +static __inline struct brw_reg brw_swizzle( struct brw_reg reg, + unsigned x, + unsigned y, + unsigned z, + unsigned w) +{ + reg.dw1.bits.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(reg.dw1.bits.swizzle, x), + BRW_GET_SWZ(reg.dw1.bits.swizzle, y), + BRW_GET_SWZ(reg.dw1.bits.swizzle, z), + BRW_GET_SWZ(reg.dw1.bits.swizzle, w)); + return reg; +} + + +static __inline struct brw_reg brw_swizzle1( struct brw_reg reg, + unsigned x ) +{ + return brw_swizzle(reg, x, x, x, x); +} + +static __inline struct brw_reg brw_writemask( struct brw_reg reg, + unsigned mask ) +{ + reg.dw1.bits.writemask &= mask; + return reg; +} + +static __inline struct brw_reg brw_set_writemask( struct brw_reg reg, + unsigned mask ) +{ + reg.dw1.bits.writemask = mask; + return reg; +} + +static __inline struct brw_reg negate( struct brw_reg reg ) +{ + reg.negate ^= 1; + return reg; +} + +static __inline struct brw_reg brw_abs( struct brw_reg reg ) +{ + reg.abs = 1; + return reg; +} + +/*********************************************************************** + */ +static __inline struct brw_reg brw_vec4_indirect( unsigned subnr, + int offset ) +{ + struct brw_reg reg = brw_vec4_grf(0, 0); + reg.subnr = subnr; + reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + reg.dw1.bits.indirect_offset = offset; + return reg; +} + +static __inline struct brw_reg brw_vec1_indirect( unsigned subnr, + int offset ) +{ + struct brw_reg reg = brw_vec1_grf(0, 0); + reg.subnr = subnr; + reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + reg.dw1.bits.indirect_offset = offset; + return reg; +} + +static __inline struct brw_reg deref_4f(struct 
brw_indirect ptr, int offset) +{ + return brw_vec4_indirect(ptr.addr_subnr, ptr.addr_offset + offset); +} + +static __inline struct brw_reg deref_1f(struct brw_indirect ptr, int offset) +{ + return brw_vec1_indirect(ptr.addr_subnr, ptr.addr_offset + offset); +} + +static __inline struct brw_reg deref_4b(struct brw_indirect ptr, int offset) +{ + return retype(deref_4f(ptr, offset), BRW_REGISTER_TYPE_B); +} + +static __inline struct brw_reg deref_1uw(struct brw_indirect ptr, int offset) +{ + return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UW); +} + +static __inline struct brw_reg deref_1ud(struct brw_indirect ptr, int offset) +{ + return retype(deref_1f(ptr, offset), BRW_REGISTER_TYPE_UD); +} + +static __inline struct brw_reg get_addr_reg(struct brw_indirect ptr) +{ + return brw_address_reg(ptr.addr_subnr); +} + +static __inline struct brw_indirect brw_indirect_offset( struct brw_indirect ptr, int offset ) +{ + ptr.addr_offset += offset; + return ptr; +} + +static __inline struct brw_indirect brw_indirect( unsigned addr_subnr, int offset ) +{ + struct brw_indirect ptr; + ptr.addr_subnr = addr_subnr; + ptr.addr_offset = offset; + ptr.pad = 0; + return ptr; +} + +static __inline struct brw_instruction *current_insn( struct brw_compile *p) +{ + return &p->store[p->nr_insn]; +} + +void brw_pop_insn_state( struct brw_compile *p ); +void brw_push_insn_state( struct brw_compile *p ); +void brw_set_mask_control( struct brw_compile *p, unsigned value ); +void brw_set_saturate( struct brw_compile *p, unsigned value ); +void brw_set_access_mode( struct brw_compile *p, unsigned access_mode ); +void brw_set_compression_control( struct brw_compile *p, boolean control ); +void brw_set_predicate_control_flag_value( struct brw_compile *p, unsigned value ); +void brw_set_predicate_control( struct brw_compile *p, unsigned pc ); +void brw_set_conditionalmod( struct brw_compile *p, unsigned conditional ); + +void brw_init_compile( struct brw_compile *p ); +const unsigned *brw_get_program( struct brw_compile *p, unsigned *sz ); + + +struct brw_instruction *brw_alu1( struct brw_compile *p, + unsigned opcode, + struct brw_reg dest, + struct brw_reg src ); + +struct brw_instruction *brw_alu2(struct brw_compile *p, + unsigned opcode, + struct brw_reg dest, + struct brw_reg src0, + struct brw_reg src1 ); + +/* Helpers for regular instructions: + */ +#define ALU1(OP) \ +struct brw_instruction *brw_##OP(struct brw_compile *p, \ + struct brw_reg dest, \ + struct brw_reg src0); + +#define ALU2(OP) \ +struct brw_instruction *brw_##OP(struct brw_compile *p, \ + struct brw_reg dest, \ + struct brw_reg src0, \ + struct brw_reg src1); + +ALU1(MOV) +ALU2(SEL) +ALU1(NOT) +ALU2(AND) +ALU2(OR) +ALU2(XOR) +ALU2(SHR) +ALU2(SHL) +ALU2(RSR) +ALU2(RSL) +ALU2(ASR) +ALU2(JMPI) +ALU2(ADD) +ALU2(MUL) +ALU1(FRC) +ALU1(RNDD) +ALU2(MAC) +ALU2(MACH) +ALU1(LZD) +ALU2(DP4) +ALU2(DPH) +ALU2(DP3) +ALU2(DP2) +ALU2(LINE) + +#undef ALU1 +#undef ALU2 + + + +/* Helpers for SEND instruction: + */ +void brw_urb_WRITE(struct brw_compile *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + boolean allocate, + boolean used, + unsigned msg_length, + unsigned response_length, + boolean eot, + boolean writes_complete, + unsigned offset, + unsigned swizzle); + +void brw_fb_WRITE(struct brw_compile *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + unsigned msg_length, + unsigned response_length, + boolean eot); + +void brw_SAMPLE(struct brw_compile *p, + struct brw_reg dest, + 
unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + unsigned sampler, + unsigned writemask, + unsigned msg_type, + unsigned response_length, + unsigned msg_length, + boolean eot); + +void brw_math_16( struct brw_compile *p, + struct brw_reg dest, + unsigned function, + unsigned saturate, + unsigned msg_reg_nr, + struct brw_reg src, + unsigned precision ); + +void brw_math( struct brw_compile *p, + struct brw_reg dest, + unsigned function, + unsigned saturate, + unsigned msg_reg_nr, + struct brw_reg src, + unsigned data_type, + unsigned precision ); + +void brw_dp_READ_16( struct brw_compile *p, + struct brw_reg dest, + unsigned msg_reg_nr, + unsigned scratch_offset ); + +void brw_dp_WRITE_16( struct brw_compile *p, + struct brw_reg src, + unsigned msg_reg_nr, + unsigned scratch_offset ); + +/* If/else/endif. Works by manipulating the execution flags on each + * channel. + */ +struct brw_instruction *brw_IF(struct brw_compile *p, + unsigned execute_size); + +struct brw_instruction *brw_ELSE(struct brw_compile *p, + struct brw_instruction *if_insn); + +void brw_ENDIF(struct brw_compile *p, + struct brw_instruction *if_or_else_insn); + + +/* DO/WHILE loops: + */ +struct brw_instruction *brw_DO(struct brw_compile *p, + unsigned execute_size); + +struct brw_instruction *brw_WHILE(struct brw_compile *p, + struct brw_instruction *patch_insn); + +struct brw_instruction *brw_BREAK(struct brw_compile *p); +struct brw_instruction *brw_CONT(struct brw_compile *p); +/* Forward jumps: + */ +void brw_land_fwd_jump(struct brw_compile *p, + struct brw_instruction *jmp_insn); + + + +void brw_NOP(struct brw_compile *p); + +/* Special case: there is never a destination, execution size will be + * taken from src0: + */ +void brw_CMP(struct brw_compile *p, + struct brw_reg dest, + unsigned conditional, + struct brw_reg src0, + struct brw_reg src1); + +void brw_print_reg( struct brw_reg reg ); + + +/*********************************************************************** + * brw_eu_util.c: + */ + +void brw_copy_indirect_to_indirect(struct brw_compile *p, + struct brw_indirect dst_ptr, + struct brw_indirect src_ptr, + unsigned count); + +void brw_copy_from_indirect(struct brw_compile *p, + struct brw_reg dst, + struct brw_indirect ptr, + unsigned count); + +void brw_copy4(struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count); + +void brw_copy8(struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count); + +void brw_math_invert( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src); + +void brw_set_src1( struct brw_instruction *insn, + struct brw_reg reg ); +#endif diff --git a/src/gallium/drivers/i965simple/brw_eu_debug.c b/src/gallium/drivers/i965simple/brw_eu_debug.c new file mode 100644 index 0000000000..4a94ddefa6 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_eu_debug.c @@ -0,0 +1,90 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "pipe/p_debug.h" + +#include "brw_eu.h" + +void brw_print_reg( struct brw_reg hwreg ) +{ + static const char *file[] = { + "arf", + "grf", + "msg", + "imm" + }; + + static const char *type[] = { + "ud", + "d", + "uw", + "w", + "ub", + "vf", + "hf", + "f" + }; + + debug_printf("%s%s", + hwreg.abs ? "abs/" : "", + hwreg.negate ? "-" : ""); + + if (hwreg.file == BRW_GENERAL_REGISTER_FILE && + hwreg.nr % 2 == 0 && + hwreg.subnr == 0 && + hwreg.vstride == BRW_VERTICAL_STRIDE_8 && + hwreg.width == BRW_WIDTH_8 && + hwreg.hstride == BRW_HORIZONTAL_STRIDE_1 && + hwreg.type == BRW_REGISTER_TYPE_F) { + debug_printf("vec%d", hwreg.nr); + } + else if (hwreg.file == BRW_GENERAL_REGISTER_FILE && + hwreg.vstride == BRW_VERTICAL_STRIDE_0 && + hwreg.width == BRW_WIDTH_1 && + hwreg.hstride == BRW_HORIZONTAL_STRIDE_0 && + hwreg.type == BRW_REGISTER_TYPE_F) { + debug_printf("scl%d.%d", hwreg.nr, hwreg.subnr / 4); + } + else { + debug_printf("%s%d.%d<%d;%d,%d>:%s", + file[hwreg.file], + hwreg.nr, + hwreg.subnr / type_sz(hwreg.type), + hwreg.vstride ? (1<<(hwreg.vstride-1)) : 0, + 1<<hwreg.width, + hwreg.hstride ? (1<<(hwreg.hstride-1)) : 0, + type[hwreg.type]); + } +} + + + diff --git a/src/gallium/drivers/i965simple/brw_eu_emit.c b/src/gallium/drivers/i965simple/brw_eu_emit.c new file mode 100644 index 0000000000..400a80b6fb --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_eu_emit.c @@ -0,0 +1,1080 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_eu.h" + + + + +/*********************************************************************** + * Internal helper for constructing instructions + */ + +static void guess_execution_size( struct brw_instruction *insn, + struct brw_reg reg ) +{ + if (reg.width == BRW_WIDTH_8 && + insn->header.compression_control == BRW_COMPRESSION_COMPRESSED) + insn->header.execution_size = BRW_EXECUTE_16; + else + insn->header.execution_size = reg.width; /* note - definitions are compatible */ +} + + +static void brw_set_dest( struct brw_instruction *insn, + struct brw_reg dest ) +{ + insn->bits1.da1.dest_reg_file = dest.file; + insn->bits1.da1.dest_reg_type = dest.type; + insn->bits1.da1.dest_address_mode = dest.address_mode; + + if (dest.address_mode == BRW_ADDRESS_DIRECT) { + insn->bits1.da1.dest_reg_nr = dest.nr; + + if (insn->header.access_mode == BRW_ALIGN_1) { + insn->bits1.da1.dest_subreg_nr = dest.subnr; + insn->bits1.da1.dest_horiz_stride = BRW_HORIZONTAL_STRIDE_1; + } + else { + insn->bits1.da16.dest_subreg_nr = dest.subnr / 16; + insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask; + } + } + else { + insn->bits1.ia1.dest_subreg_nr = dest.subnr; + + /* These are different sizes in align1 vs align16: + */ + if (insn->header.access_mode == BRW_ALIGN_1) { + insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset; + insn->bits1.ia1.dest_horiz_stride = BRW_HORIZONTAL_STRIDE_1; + } + else { + insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset; + } + } + + /* NEW: Set the execution size based on dest.width and + * insn->compression_control: + */ + guess_execution_size(insn, dest); +} + +static void brw_set_src0( struct brw_instruction *insn, + struct brw_reg reg ) +{ + assert(reg.file != BRW_MESSAGE_REGISTER_FILE); + + insn->bits1.da1.src0_reg_file = reg.file; + insn->bits1.da1.src0_reg_type = reg.type; + insn->bits2.da1.src0_abs = reg.abs; + insn->bits2.da1.src0_negate = reg.negate; + insn->bits2.da1.src0_address_mode = reg.address_mode; + + if (reg.file == BRW_IMMEDIATE_VALUE) { + insn->bits3.ud = reg.dw1.ud; + + /* Required to set some fields in src1 as well: + */ + insn->bits1.da1.src1_reg_file = 0; /* arf */ + insn->bits1.da1.src1_reg_type = reg.type; + } + else + { + if (reg.address_mode == BRW_ADDRESS_DIRECT) { + if (insn->header.access_mode == BRW_ALIGN_1) { + insn->bits2.da1.src0_subreg_nr = reg.subnr; + insn->bits2.da1.src0_reg_nr = reg.nr; + } + else { + insn->bits2.da16.src0_subreg_nr = reg.subnr / 16; + insn->bits2.da16.src0_reg_nr = reg.nr; + } + } + else { + insn->bits2.ia1.src0_subreg_nr = reg.subnr; + + if (insn->header.access_mode == BRW_ALIGN_1) { + insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset; + } + else { + insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset; + } + } + + if (insn->header.access_mode == BRW_ALIGN_1) { + if 
(reg.width == BRW_WIDTH_1 && + insn->header.execution_size == BRW_EXECUTE_1) { + insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0; + insn->bits2.da1.src0_width = BRW_WIDTH_1; + insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0; + } + else { + insn->bits2.da1.src0_horiz_stride = reg.hstride; + insn->bits2.da1.src0_width = reg.width; + insn->bits2.da1.src0_vert_stride = reg.vstride; + } + } + else { + insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X); + insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y); + insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z); + insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W); + + /* This is an oddity of the fact we're using the same + * descriptions for registers in align_16 as align_1: + */ + if (reg.vstride == BRW_VERTICAL_STRIDE_8) + insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4; + else + insn->bits2.da16.src0_vert_stride = reg.vstride; + } + } +} + + +void brw_set_src1( struct brw_instruction *insn, + struct brw_reg reg ) +{ + assert(reg.file != BRW_MESSAGE_REGISTER_FILE); + + insn->bits1.da1.src1_reg_file = reg.file; + insn->bits1.da1.src1_reg_type = reg.type; + insn->bits3.da1.src1_abs = reg.abs; + insn->bits3.da1.src1_negate = reg.negate; + + /* Only src1 can be immediate in two-argument instructions. + */ + assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE); + + if (reg.file == BRW_IMMEDIATE_VALUE) { + insn->bits3.ud = reg.dw1.ud; + } + else { + /* This is a hardware restriction, which may or may not be lifted + * in the future: + */ + assert (reg.address_mode == BRW_ADDRESS_DIRECT); + //assert (reg.file == BRW_GENERAL_REGISTER_FILE); + + if (insn->header.access_mode == BRW_ALIGN_1) { + insn->bits3.da1.src1_subreg_nr = reg.subnr; + insn->bits3.da1.src1_reg_nr = reg.nr; + } + else { + insn->bits3.da16.src1_subreg_nr = reg.subnr / 16; + insn->bits3.da16.src1_reg_nr = reg.nr; + } + + if (insn->header.access_mode == BRW_ALIGN_1) { + if (reg.width == BRW_WIDTH_1 && + insn->header.execution_size == BRW_EXECUTE_1) { + insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0; + insn->bits3.da1.src1_width = BRW_WIDTH_1; + insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0; + } + else { + insn->bits3.da1.src1_horiz_stride = reg.hstride; + insn->bits3.da1.src1_width = reg.width; + insn->bits3.da1.src1_vert_stride = reg.vstride; + } + } + else { + insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X); + insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y); + insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z); + insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W); + + /* This is an oddity of the fact we're using the same + * descriptions for registers in align_16 as align_1: + */ + if (reg.vstride == BRW_VERTICAL_STRIDE_8) + insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4; + else + insn->bits3.da16.src1_vert_stride = reg.vstride; + } + } +} + + + +static void brw_set_math_message( struct brw_instruction *insn, + unsigned msg_length, + unsigned response_length, + unsigned function, + unsigned integer_type, + boolean low_precision, + boolean saturate, + unsigned dataType ) +{ + brw_set_src1(insn, brw_imm_d(0)); + + insn->bits3.math.function = function; + insn->bits3.math.int_type = integer_type; + insn->bits3.math.precision = low_precision; + insn->bits3.math.saturate = saturate; + 
insn->bits3.math.data_type = dataType; + insn->bits3.math.response_length = response_length; + insn->bits3.math.msg_length = msg_length; + insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH; + insn->bits3.math.end_of_thread = 0; +} + +static void brw_set_urb_message( struct brw_instruction *insn, + boolean allocate, + boolean used, + unsigned msg_length, + unsigned response_length, + boolean end_of_thread, + boolean complete, + unsigned offset, + unsigned swizzle_control ) +{ + brw_set_src1(insn, brw_imm_d(0)); + + insn->bits3.urb.opcode = 0; /* ? */ + insn->bits3.urb.offset = offset; + insn->bits3.urb.swizzle_control = swizzle_control; + insn->bits3.urb.allocate = allocate; + insn->bits3.urb.used = used; /* ? */ + insn->bits3.urb.complete = complete; + insn->bits3.urb.response_length = response_length; + insn->bits3.urb.msg_length = msg_length; + insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB; + insn->bits3.urb.end_of_thread = end_of_thread; +} + +static void brw_set_dp_write_message( struct brw_instruction *insn, + unsigned binding_table_index, + unsigned msg_control, + unsigned msg_type, + unsigned msg_length, + unsigned pixel_scoreboard_clear, + unsigned response_length, + unsigned end_of_thread ) +{ + brw_set_src1(insn, brw_imm_d(0)); + + insn->bits3.dp_write.binding_table_index = binding_table_index; + insn->bits3.dp_write.msg_control = msg_control; + insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear; + insn->bits3.dp_write.msg_type = msg_type; + insn->bits3.dp_write.send_commit_msg = 0; + insn->bits3.dp_write.response_length = response_length; + insn->bits3.dp_write.msg_length = msg_length; + insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE; + insn->bits3.urb.end_of_thread = end_of_thread; +} + +static void brw_set_dp_read_message( struct brw_instruction *insn, + unsigned binding_table_index, + unsigned msg_control, + unsigned msg_type, + unsigned target_cache, + unsigned msg_length, + unsigned response_length, + unsigned end_of_thread ) +{ + brw_set_src1(insn, brw_imm_d(0)); + + insn->bits3.dp_read.binding_table_index = binding_table_index; + insn->bits3.dp_read.msg_control = msg_control; + insn->bits3.dp_read.msg_type = msg_type; + insn->bits3.dp_read.target_cache = target_cache; + insn->bits3.dp_read.response_length = response_length; + insn->bits3.dp_read.msg_length = msg_length; + insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; + insn->bits3.dp_read.end_of_thread = end_of_thread; +} + +static void brw_set_sampler_message( struct brw_instruction *insn, + unsigned binding_table_index, + unsigned sampler, + unsigned msg_type, + unsigned response_length, + unsigned msg_length, + boolean eot) +{ + brw_set_src1(insn, brw_imm_d(0)); + + insn->bits3.sampler.binding_table_index = binding_table_index; + insn->bits3.sampler.sampler = sampler; + insn->bits3.sampler.msg_type = msg_type; + insn->bits3.sampler.return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; + insn->bits3.sampler.response_length = response_length; + insn->bits3.sampler.msg_length = msg_length; + insn->bits3.sampler.end_of_thread = eot; + insn->bits3.sampler.msg_target = BRW_MESSAGE_TARGET_SAMPLER; +} + + + +static struct brw_instruction *next_insn( struct brw_compile *p, + unsigned opcode ) +{ + struct brw_instruction *insn; + + assert(p->nr_insn + 1 < BRW_EU_MAX_INSN); + + insn = &p->store[p->nr_insn++]; + memcpy(insn, p->current, sizeof(*insn)); + + /* Reset this one-shot flag: + */ + + if (p->current->header.destreg__conditonalmod) { + 
p->current->header.destreg__conditonalmod = 0;
+      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
+   }
+
+   insn->header.opcode = opcode;
+   return insn;
+}
+
+
+struct brw_instruction *brw_alu1( struct brw_compile *p,
+                                  unsigned opcode,
+                                  struct brw_reg dest,
+                                  struct brw_reg src )
+{
+   struct brw_instruction *insn = next_insn(p, opcode);
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src);
+   return insn;
+}
+
+struct brw_instruction *brw_alu2(struct brw_compile *p,
+                                 unsigned opcode,
+                                 struct brw_reg dest,
+                                 struct brw_reg src0,
+                                 struct brw_reg src1 )
+{
+   struct brw_instruction *insn = next_insn(p, opcode);
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src0);
+   brw_set_src1(insn, src1);
+   return insn;
+}
+
+
+/***********************************************************************
+ * Convenience routines.
+ */
+#define ALU1(OP)                                         \
+struct brw_instruction *brw_##OP(struct brw_compile *p,  \
+              struct brw_reg dest,                       \
+              struct brw_reg src0)                       \
+{                                                        \
+   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);      \
+}
+
+#define ALU2(OP)                                         \
+struct brw_instruction *brw_##OP(struct brw_compile *p,  \
+              struct brw_reg dest,                       \
+              struct brw_reg src0,                       \
+              struct brw_reg src1)                       \
+{                                                        \
+   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1); \
+}
+
+
+ALU1(MOV)
+ALU2(SEL)
+ALU1(NOT)
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(SHR)
+ALU2(SHL)
+ALU2(RSR)
+ALU2(RSL)
+ALU2(ASR)
+ALU2(ADD)
+ALU2(MUL)
+ALU1(FRC)
+ALU1(RNDD)
+ALU2(MAC)
+ALU2(MACH)
+ALU1(LZD)
+ALU2(DP4)
+ALU2(DPH)
+ALU2(DP3)
+ALU2(DP2)
+ALU2(LINE)
+
+
+
+
+void brw_NOP(struct brw_compile *p)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
+   brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+   brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+   brw_set_src1(insn, brw_imm_ud(0x0));
+}
+
+
+
+
+
+/***********************************************************************
+ * Comparisons, if/else/endif
+ */
+
+struct brw_instruction *brw_JMPI(struct brw_compile *p,
+                                 struct brw_reg dest,
+                                 struct brw_reg src0,
+                                 struct brw_reg src1)
+{
+   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
+
+   p->current->header.predicate_control = BRW_PREDICATE_NONE;
+
+   return insn;
+}
+
+/* EU takes the value from the flag register and pushes it onto some
+ * sort of a stack (presumably merging with any flag value already on
+ * the stack).  Within an if block, the flags at the top of the stack
+ * control execution on each channel of the unit, e.g. on each of the
+ * 16 pixel values in our wm programs.
+ *
+ * When the matching 'else' instruction is reached (presumably by
+ * countdown of the instruction count patched in by our ELSE/ENDIF
+ * functions), the relevant flags are inverted.
+ *
+ * When the matching 'endif' instruction is reached, the flags are
+ * popped off.  If the stack is now empty, normal execution resumes.
+ *
+ * No attempt is made to deal with stack overflow (14 elements?).
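+ *
+ * (Inferred from brw_ELSE() and brw_ENDIF() below, rather than from
+ * hardware documentation: for an IF/ELSE/ENDIF sequence the IF ends up
+ * with jump_count = (else - if) and pop_count = 1, the ELSE with
+ * jump_count = (endif - else + 1) and pop_count = 1, and the ENDIF
+ * itself with jump_count = 0 and pop_count = 1.  An IF with no ELSE is
+ * instead rewritten as an IFF with jump_count = (endif - if + 1) and
+ * pop_count = 0.)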
+ */ +struct brw_instruction *brw_IF(struct brw_compile *p, unsigned execute_size) +{ + struct brw_instruction *insn; + + if (p->single_program_flow) { + assert(execute_size == BRW_EXECUTE_1); + + insn = next_insn(p, BRW_OPCODE_ADD); + insn->header.predicate_inverse = 1; + } else { + insn = next_insn(p, BRW_OPCODE_IF); + } + + /* Override the defaults for this instruction: + */ + brw_set_dest(insn, brw_ip_reg()); + brw_set_src0(insn, brw_ip_reg()); + brw_set_src1(insn, brw_imm_d(0x0)); + + insn->header.execution_size = execute_size; + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.predicate_control = BRW_PREDICATE_NORMAL; + insn->header.mask_control = BRW_MASK_ENABLE; + + p->current->header.predicate_control = BRW_PREDICATE_NONE; + + return insn; +} + + +struct brw_instruction *brw_ELSE(struct brw_compile *p, + struct brw_instruction *if_insn) +{ + struct brw_instruction *insn; + + if (p->single_program_flow) { + insn = next_insn(p, BRW_OPCODE_ADD); + } else { + insn = next_insn(p, BRW_OPCODE_ELSE); + } + + brw_set_dest(insn, brw_ip_reg()); + brw_set_src0(insn, brw_ip_reg()); + brw_set_src1(insn, brw_imm_d(0x0)); + + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.execution_size = if_insn->header.execution_size; + insn->header.mask_control = BRW_MASK_ENABLE; + + /* Patch the if instruction to point at this instruction. + */ + if (p->single_program_flow) { + assert(if_insn->header.opcode == BRW_OPCODE_ADD); + + if_insn->bits3.ud = (insn - if_insn + 1) * 16; + } else { + assert(if_insn->header.opcode == BRW_OPCODE_IF); + + if_insn->bits3.if_else.jump_count = insn - if_insn; + if_insn->bits3.if_else.pop_count = 1; + if_insn->bits3.if_else.pad0 = 0; + } + + return insn; +} + +void brw_ENDIF(struct brw_compile *p, + struct brw_instruction *patch_insn) +{ + if (p->single_program_flow) { + /* In single program flow mode, there's no need to execute an ENDIF, + * since we don't need to do any stack operations, and if we're executing + * currently, we want to just continue executing. + */ + struct brw_instruction *next = &p->store[p->nr_insn]; + + assert(patch_insn->header.opcode == BRW_OPCODE_ADD); + + patch_insn->bits3.ud = (next - patch_insn) * 16; + } else { + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_ENDIF); + + brw_set_dest(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); + brw_set_src0(insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD)); + brw_set_src1(insn, brw_imm_d(0x0)); + + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.execution_size = patch_insn->header.execution_size; + insn->header.mask_control = BRW_MASK_ENABLE; + + assert(patch_insn->bits3.if_else.jump_count == 0); + + /* Patch the if or else instructions to point at this or the next + * instruction respectively. 
+ */ + if (patch_insn->header.opcode == BRW_OPCODE_IF) { + /* Automagically turn it into an IFF: + */ + patch_insn->header.opcode = BRW_OPCODE_IFF; + patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1; + patch_insn->bits3.if_else.pop_count = 0; + patch_insn->bits3.if_else.pad0 = 0; + } else if (patch_insn->header.opcode == BRW_OPCODE_ELSE) { + patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1; + patch_insn->bits3.if_else.pop_count = 1; + patch_insn->bits3.if_else.pad0 = 0; + } else { + assert(0); + } + + /* Also pop item off the stack in the endif instruction: + */ + insn->bits3.if_else.jump_count = 0; + insn->bits3.if_else.pop_count = 1; + insn->bits3.if_else.pad0 = 0; + } +} + +struct brw_instruction *brw_BREAK(struct brw_compile *p) +{ + struct brw_instruction *insn; + insn = next_insn(p, BRW_OPCODE_BREAK); + brw_set_dest(insn, brw_ip_reg()); + brw_set_src0(insn, brw_ip_reg()); + brw_set_src1(insn, brw_imm_d(0x0)); + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.execution_size = BRW_EXECUTE_8; + insn->header.mask_control = BRW_MASK_DISABLE; + insn->bits3.if_else.pad0 = 0; + return insn; +} + +struct brw_instruction *brw_CONT(struct brw_compile *p) +{ + struct brw_instruction *insn; + insn = next_insn(p, BRW_OPCODE_CONTINUE); + brw_set_dest(insn, brw_ip_reg()); + brw_set_src0(insn, brw_ip_reg()); + brw_set_src1(insn, brw_imm_d(0x0)); + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.execution_size = BRW_EXECUTE_8; + insn->header.mask_control = BRW_MASK_DISABLE; + insn->bits3.if_else.pad0 = 0; + return insn; +} + +/* DO/WHILE loop: + */ +struct brw_instruction *brw_DO(struct brw_compile *p, unsigned execute_size) +{ + if (p->single_program_flow) { + return &p->store[p->nr_insn]; + } else { + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO); + + /* Override the defaults for this instruction: + */ + brw_set_dest(insn, brw_null_reg()); + brw_set_src0(insn, brw_null_reg()); + brw_set_src1(insn, brw_null_reg()); + + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.execution_size = execute_size; + insn->header.predicate_control = BRW_PREDICATE_NONE; + /* insn->header.mask_control = BRW_MASK_ENABLE; */ + insn->header.mask_control = BRW_MASK_DISABLE; + + return insn; + } +} + + + +struct brw_instruction *brw_WHILE(struct brw_compile *p, + struct brw_instruction *do_insn) +{ + struct brw_instruction *insn; + + if (p->single_program_flow) + insn = next_insn(p, BRW_OPCODE_ADD); + else + insn = next_insn(p, BRW_OPCODE_WHILE); + + brw_set_dest(insn, brw_ip_reg()); + brw_set_src0(insn, brw_ip_reg()); + brw_set_src1(insn, brw_imm_d(0x0)); + + insn->header.compression_control = BRW_COMPRESSION_NONE; + + if (p->single_program_flow) { + insn->header.execution_size = BRW_EXECUTE_1; + + insn->bits3.d = (do_insn - insn) * 16; + } else { + insn->header.execution_size = do_insn->header.execution_size; + + assert(do_insn->header.opcode == BRW_OPCODE_DO); + insn->bits3.if_else.jump_count = do_insn - insn; + insn->bits3.if_else.pop_count = 0; + insn->bits3.if_else.pad0 = 0; + } + +/* insn->header.mask_control = BRW_MASK_ENABLE; */ + + insn->header.mask_control = BRW_MASK_DISABLE; + p->current->header.predicate_control = BRW_PREDICATE_NONE; + return insn; +} + + +/* FORWARD JUMPS: + */ +void brw_land_fwd_jump(struct brw_compile *p, + struct brw_instruction *jmp_insn) +{ + struct brw_instruction *landing = &p->store[p->nr_insn]; + + assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI); + 
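+   /* (Sketch of the encoding, inferred from the patch-up below rather
+    * than from hardware documentation: the forward jump distance is
+    * stored as the number of instructions between the JMPI and the
+    * landing point, minus one, presumably because the instruction
+    * pointer has already stepped past the JMPI itself by the time the
+    * jump is taken.)
+    */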
assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
+
+   jmp_insn->bits3.ud = (landing - jmp_insn) - 1;
+}
+
+
+
+/* To integrate with the above, it makes sense that the comparison
+ * instruction should populate the flag register.  It might be simpler
+ * just to use the flag reg for most WM tasks?
+ */
+void brw_CMP(struct brw_compile *p,
+             struct brw_reg dest,
+             unsigned conditional,
+             struct brw_reg src0,
+             struct brw_reg src1)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
+
+   insn->header.destreg__conditonalmod = conditional;
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src0);
+   brw_set_src1(insn, src1);
+
+/*    guess_execution_size(insn, src0); */
+
+
+   /* Make it so that future instructions will use the computed flag
+    * value until brw_set_predicate_control_flag_value() is called
+    * again.
+    */
+   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+       dest.nr == 0) {
+      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
+      p->flag_value = 0xff;
+   }
+}
+
+
+
+/***********************************************************************
+ * Helpers for the various SEND message types:
+ */
+
+/* Invert 8 values
+ */
+void brw_math( struct brw_compile *p,
+               struct brw_reg dest,
+               unsigned function,
+               unsigned saturate,
+               unsigned msg_reg_nr,
+               struct brw_reg src,
+               unsigned data_type,
+               unsigned precision )
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+   unsigned msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
+   unsigned response_length = (function == BRW_MATH_FUNCTION_SINCOS) ? 2 : 1;
+
+   /* Example code doesn't set predicate_control for send
+    * instructions.
+    */
+   insn->header.predicate_control = 0;
+   insn->header.destreg__conditonalmod = msg_reg_nr;
+
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src);
+   brw_set_math_message(insn,
+                        msg_length, response_length,
+                        function,
+                        BRW_MATH_INTEGER_UNSIGNED,
+                        precision,
+                        saturate,
+                        data_type);
+}
+
+/* Use 2 send instructions to invert 16 elements
+ */
+void brw_math_16( struct brw_compile *p,
+                  struct brw_reg dest,
+                  unsigned function,
+                  unsigned saturate,
+                  unsigned msg_reg_nr,
+                  struct brw_reg src,
+                  unsigned precision )
+{
+   struct brw_instruction *insn;
+   unsigned msg_length = (function == BRW_MATH_FUNCTION_POW) ? 2 : 1;
+   unsigned response_length = (function == BRW_MATH_FUNCTION_SINCOS) ?
2 : 1; + + /* First instruction: + */ + brw_push_insn_state(p); + brw_set_predicate_control_flag_value(p, 0xff); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + + insn = next_insn(p, BRW_OPCODE_SEND); + insn->header.destreg__conditonalmod = msg_reg_nr; + + brw_set_dest(insn, dest); + brw_set_src0(insn, src); + brw_set_math_message(insn, + msg_length, response_length, + function, + BRW_MATH_INTEGER_UNSIGNED, + precision, + saturate, + BRW_MATH_DATA_VECTOR); + + /* Second instruction: + */ + insn = next_insn(p, BRW_OPCODE_SEND); + insn->header.compression_control = BRW_COMPRESSION_2NDHALF; + insn->header.destreg__conditonalmod = msg_reg_nr+1; + + brw_set_dest(insn, offset(dest,1)); + brw_set_src0(insn, src); + brw_set_math_message(insn, + msg_length, response_length, + function, + BRW_MATH_INTEGER_UNSIGNED, + precision, + saturate, + BRW_MATH_DATA_VECTOR); + + brw_pop_insn_state(p); +} + + + + +void brw_dp_WRITE_16( struct brw_compile *p, + struct brw_reg src, + unsigned msg_reg_nr, + unsigned scratch_offset ) +{ + { + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + + brw_MOV(p, + retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D), + brw_imm_d(scratch_offset)); + + brw_pop_insn_state(p); + } + + { + unsigned msg_length = 3; + struct brw_reg dest = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW); + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + + insn->header.predicate_control = 0; /* XXX */ + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditonalmod = msg_reg_nr; + + brw_set_dest(insn, dest); + brw_set_src0(insn, src); + + brw_set_dp_write_message(insn, + 255, /* bti */ + BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */ + BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */ + msg_length, + 0, /* pixel scoreboard */ + 0, /* response_length */ + 0); /* eot */ + } + +} + + +void brw_dp_READ_16( struct brw_compile *p, + struct brw_reg dest, + unsigned msg_reg_nr, + unsigned scratch_offset ) +{ + { + brw_push_insn_state(p); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_mask_control(p, BRW_MASK_DISABLE); + + brw_MOV(p, + retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_D), + brw_imm_d(scratch_offset)); + + brw_pop_insn_state(p); + } + + { + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + + insn->header.predicate_control = 0; /* XXX */ + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditonalmod = msg_reg_nr; + + brw_set_dest(insn, dest); /* UW? 
*/ + brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW)); + + brw_set_dp_read_message(insn, + 255, /* bti */ + 3, /* msg_control */ + BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */ + 1, /* target cache */ + 1, /* msg_length */ + 2, /* response_length */ + 0); /* eot */ + } +} + + +void brw_fb_WRITE(struct brw_compile *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + unsigned msg_length, + unsigned response_length, + boolean eot) +{ + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + + insn->header.predicate_control = 0; /* XXX */ + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditonalmod = msg_reg_nr; + + brw_set_dest(insn, dest); + brw_set_src0(insn, src0); + brw_set_dp_write_message(insn, + binding_table_index, + BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, /* msg_control */ + BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE, /* msg_type */ + msg_length, + 1, /* pixel scoreboard */ + response_length, + eot); +} + + + +void brw_SAMPLE(struct brw_compile *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + unsigned binding_table_index, + unsigned sampler, + unsigned writemask, + unsigned msg_type, + unsigned response_length, + unsigned msg_length, + boolean eot) +{ + boolean need_stall = 0; + + if(writemask == 0) { +/* debug_printf("%s: zero writemask??\n", __FUNCTION__); */ + return; + } + + /* Hardware doesn't do destination dependency checking on send + * instructions properly. Add a workaround which generates the + * dependency by other means. In practice it seems like this bug + * only crops up for texture samples, and only where registers are + * written by the send and then written again later without being + * read in between. Luckily for us, we already track that + * information and use it to modify the writemask for the + * instruction, so that is a guide for whether a workaround is + * needed. 
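+ *
+ * (A worked example, inferred from the code below rather than from
+ * hardware documentation: a contiguous writemask such as .yz is
+ * handled by advancing the destination past the unwritten leading
+ * channels, shrinking the response length to cover only the written
+ * channels, and masking off the unwritten channels in the message
+ * header; a sparse mask such as .xz cannot be expressed that way, so
+ * the MOV-to-self stall emitted at the end is used instead.)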
+ */ + if (writemask != TGSI_WRITEMASK_XYZW) { + unsigned dst_offset = 0; + unsigned i, newmask = 0, len = 0; + + for (i = 0; i < 4; i++) { + if (writemask & (1<<i)) + break; + dst_offset += 2; + } + for (; i < 4; i++) { + if (!(writemask & (1<<i))) + break; + newmask |= 1<<i; + len++; + } + + if (newmask != writemask) { + need_stall = 1; +/* debug_printf("need stall %x %x\n", newmask , writemask); */ + } + else { + struct brw_reg m1 = brw_message_reg(msg_reg_nr); + + newmask = ~newmask & TGSI_WRITEMASK_XYZW; + + brw_push_insn_state(p); + + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_mask_control(p, BRW_MASK_DISABLE); + + brw_MOV(p, m1, brw_vec8_grf(0,0)); + brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12)); + + brw_pop_insn_state(p); + + src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW); + dest = offset(dest, dst_offset); + response_length = len * 2; + } + } + + { + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + + insn->header.predicate_control = 0; /* XXX */ + insn->header.compression_control = BRW_COMPRESSION_NONE; + insn->header.destreg__conditonalmod = msg_reg_nr; + + brw_set_dest(insn, dest); + brw_set_src0(insn, src0); + brw_set_sampler_message(insn, + binding_table_index, + sampler, + msg_type, + response_length, + msg_length, + eot); + } + + if (need_stall) + { + struct brw_reg reg = vec8(offset(dest, response_length-1)); + + /* mov (8) r9.0<1>:f r9.0<8;8,1>:f { Align1 } + */ + brw_push_insn_state(p); + brw_set_compression_control(p, FALSE); + brw_MOV(p, reg, reg); + brw_pop_insn_state(p); + } + +} + +/* All these variables are pretty confusing - we might be better off + * using bitmasks and macros for this, in the old style. Or perhaps + * just having the caller instantiate the fields in dword3 itself. + */ +void brw_urb_WRITE(struct brw_compile *p, + struct brw_reg dest, + unsigned msg_reg_nr, + struct brw_reg src0, + boolean allocate, + boolean used, + unsigned msg_length, + unsigned response_length, + boolean eot, + boolean writes_complete, + unsigned offset, + unsigned swizzle) +{ + struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND); + + assert(msg_length < 16); + + brw_set_dest(insn, dest); + brw_set_src0(insn, src0); + brw_set_src1(insn, brw_imm_d(0)); + + insn->header.destreg__conditonalmod = msg_reg_nr; + + brw_set_urb_message(insn, + allocate, + used, + msg_length, + response_length, + eot, + writes_complete, + offset, + swizzle); +} + diff --git a/src/gallium/drivers/i965simple/brw_eu_util.c b/src/gallium/drivers/i965simple/brw_eu_util.c new file mode 100644 index 0000000000..3a65b141f0 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_eu_util.c @@ -0,0 +1,126 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_eu.h" + + +void brw_math_invert( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src) +{ + brw_math( p, + dst, + BRW_MATH_FUNCTION_INV, + BRW_MATH_SATURATE_NONE, + 0, + src, + BRW_MATH_PRECISION_FULL, + BRW_MATH_DATA_VECTOR ); +} + + + +void brw_copy4(struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count) +{ + unsigned i; + + dst = vec4(dst); + src = vec4(src); + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta)); + brw_MOV(p, byte_offset(dst, delta+16), byte_offset(src, delta+16)); + } +} + + +void brw_copy8(struct brw_compile *p, + struct brw_reg dst, + struct brw_reg src, + unsigned count) +{ + unsigned i; + + dst = vec8(dst); + src = vec8(src); + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, byte_offset(dst, delta), byte_offset(src, delta)); + } +} + + +void brw_copy_indirect_to_indirect(struct brw_compile *p, + struct brw_indirect dst_ptr, + struct brw_indirect src_ptr, + unsigned count) +{ + unsigned i; + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, deref_4f(dst_ptr, delta), deref_4f(src_ptr, delta)); + brw_MOV(p, deref_4f(dst_ptr, delta+16), deref_4f(src_ptr, delta+16)); + } +} + + +void brw_copy_from_indirect(struct brw_compile *p, + struct brw_reg dst, + struct brw_indirect ptr, + unsigned count) +{ + unsigned i; + + dst = vec4(dst); + + for (i = 0; i < count; i++) + { + unsigned delta = i*32; + brw_MOV(p, byte_offset(dst, delta), deref_4f(ptr, delta)); + brw_MOV(p, byte_offset(dst, delta+16), deref_4f(ptr, delta+16)); + } +} + + + + diff --git a/src/gallium/drivers/i965simple/brw_flush.c b/src/gallium/drivers/i965simple/brw_flush.c new file mode 100644 index 0000000000..e6001c30d9 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_flush.c @@ -0,0 +1,73 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Author: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "pipe/p_defines.h" +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_batch.h" + + +static void brw_flush( struct pipe_context *pipe, + unsigned flags, + struct pipe_fence_handle **fence ) +{ + struct brw_context *brw = brw_context(pipe); + + /* Do we need to emit an MI_FLUSH command to flush the hardware + * caches? + */ + if (flags & (PIPE_FLUSH_RENDER_CACHE | PIPE_FLUSH_TEXTURE_CACHE)) { + struct brw_mi_flush flush; + + memset(&flush, 0, sizeof(flush)); + flush.opcode = CMD_MI_FLUSH; + + if (!(flags & PIPE_FLUSH_RENDER_CACHE)) + flush.flags |= BRW_INHIBIT_FLUSH_RENDER_CACHE; + + if (flags & PIPE_FLUSH_TEXTURE_CACHE) + flush.flags |= BRW_FLUSH_READ_CACHE; + + BRW_BATCH_STRUCT(brw, &flush); + } + + /* If there are no flags, just flush pending commands to hardware: + */ + FLUSH_BATCH( fence ); +} + + + +void brw_init_flush_functions( struct brw_context *brw ) +{ + brw->pipe.flush = brw_flush; +} diff --git a/src/gallium/drivers/i965simple/brw_gs.c b/src/gallium/drivers/i965simple/brw_gs.c new file mode 100644 index 0000000000..de60868ccc --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_gs.c @@ -0,0 +1,196 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_state.h" +#include "brw_gs.h" + + + +static void compile_gs_prog( struct brw_context *brw, + struct brw_gs_prog_key *key ) +{ + struct brw_gs_compile c; + const unsigned *program; + unsigned program_size; + + memset(&c, 0, sizeof(c)); + + c.key = *key; + + /* Need to locate the two positions present in vertex + header. + * These are currently hardcoded: + */ + c.nr_attrs = brw_count_bits(c.key.attrs); + c.nr_regs = (c.nr_attrs + 1) / 2 + 1; /* are vertices packed, or reg-aligned? */ + c.nr_bytes = c.nr_regs * REG_SIZE; + + + /* Begin the compilation: + */ + brw_init_compile(&c.func); + + c.func.single_program_flow = 1; + + /* For some reason the thread is spawned with only 4 channels + * unmasked. + */ + brw_set_mask_control(&c.func, BRW_MASK_DISABLE); + + + /* Note that primitives which don't require a GS program have + * already been weeded out by this stage: + */ + switch (key->primitive) { + case PIPE_PRIM_QUADS: + brw_gs_quads( &c ); + break; + case PIPE_PRIM_QUAD_STRIP: + brw_gs_quad_strip( &c ); + break; + case PIPE_PRIM_LINE_LOOP: + brw_gs_lines( &c ); + break; + case PIPE_PRIM_LINES: + if (key->hint_gs_always) + brw_gs_lines( &c ); + else { + return; + } + break; + case PIPE_PRIM_TRIANGLES: + if (key->hint_gs_always) + brw_gs_tris( &c ); + else { + return; + } + break; + case PIPE_PRIM_POINTS: + if (key->hint_gs_always) + brw_gs_points( &c ); + else { + return; + } + break; + default: + return; + } + + /* get the program + */ + program = brw_get_program(&c.func, &program_size); + + /* Upload + */ + brw->gs.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_GS_PROG], + &c.key, + sizeof(c.key), + program, + program_size, + &c.prog_data, + &brw->gs.prog_data ); +} + + +static boolean search_cache( struct brw_context *brw, + struct brw_gs_prog_key *key ) +{ + return brw_search_cache(&brw->cache[BRW_GS_PROG], + key, sizeof(*key), + &brw->gs.prog_data, + &brw->gs.prog_gs_offset); +} + + +static const int gs_prim[PIPE_PRIM_POLYGON+1] = { + PIPE_PRIM_POINTS, + PIPE_PRIM_LINES, + PIPE_PRIM_LINE_LOOP, + PIPE_PRIM_LINES, + PIPE_PRIM_TRIANGLES, + PIPE_PRIM_TRIANGLES, + PIPE_PRIM_TRIANGLES, + PIPE_PRIM_QUADS, + PIPE_PRIM_QUAD_STRIP, + PIPE_PRIM_TRIANGLES +}; + +static void populate_key( struct brw_context *brw, + struct brw_gs_prog_key *key ) +{ + memset(key, 0, sizeof(*key)); + + /* CACHE_NEW_VS_PROG */ + key->attrs = brw->vs.prog_data->outputs_written; + + /* BRW_NEW_PRIMITIVE */ + key->primitive = gs_prim[brw->primitive]; + + key->hint_gs_always = 0; /* debug code? */ + + key->need_gs_prog = (key->hint_gs_always || + brw->primitive == PIPE_PRIM_QUADS || + brw->primitive == PIPE_PRIM_QUAD_STRIP || + brw->primitive == PIPE_PRIM_LINE_LOOP); +} + +/* Calculate interpolants for triangle and line rasterization. 
+ */ +static void upload_gs_prog( struct brw_context *brw ) +{ + struct brw_gs_prog_key key; + + /* Populate the key: + */ + populate_key(brw, &key); + + if (brw->gs.prog_active != key.need_gs_prog) { + brw->state.dirty.cache |= CACHE_NEW_GS_PROG; + brw->gs.prog_active = key.need_gs_prog; + } + + if (brw->gs.prog_active) { + if (!search_cache(brw, &key)) + compile_gs_prog( brw, &key ); + } +} + + +const struct brw_tracked_state brw_gs_prog = { + .dirty = { + .brw = BRW_NEW_PRIMITIVE, + .cache = CACHE_NEW_VS_PROG + }, + .update = upload_gs_prog +}; diff --git a/src/gallium/drivers/i965simple/brw_gs.h b/src/gallium/drivers/i965simple/brw_gs.h new file mode 100644 index 0000000000..f09141c6aa --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_gs.h @@ -0,0 +1,75 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#ifndef BRW_GS_H +#define BRW_GS_H + + +#include "brw_context.h" +#include "brw_eu.h" + +#define MAX_GS_VERTS (4) + +struct brw_gs_prog_key { + unsigned attrs:32; + unsigned primitive:4; + unsigned hint_gs_always:1; + unsigned need_gs_prog:1; + unsigned pad:26; +}; + +struct brw_gs_compile { + struct brw_compile func; + struct brw_gs_prog_key key; + struct brw_gs_prog_data prog_data; + + struct { + struct brw_reg R0; + struct brw_reg vertex[MAX_GS_VERTS]; + } reg; + + /* 3 different ways of expressing vertex size: + */ + unsigned nr_attrs; + unsigned nr_regs; + unsigned nr_bytes; +}; + +#define ATTR_SIZE (4*4) + +void brw_gs_quads( struct brw_gs_compile *c ); +void brw_gs_quad_strip( struct brw_gs_compile *c ); +void brw_gs_tris( struct brw_gs_compile *c ); +void brw_gs_lines( struct brw_gs_compile *c ); +void brw_gs_points( struct brw_gs_compile *c ); + +#endif diff --git a/src/gallium/drivers/i965simple/brw_gs_emit.c b/src/gallium/drivers/i965simple/brw_gs_emit.c new file mode 100644 index 0000000000..c3cc90b10f --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_gs_emit.c @@ -0,0 +1,148 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "brw_defines.h"
+#include "brw_context.h"
+#include "brw_eu.h"
+#include "brw_util.h"
+#include "brw_gs.h"
+
+static void brw_gs_alloc_regs( struct brw_gs_compile *c,
+                               unsigned nr_verts )
+{
+   unsigned i = 0,j;
+
+   /* Register usage is static, precompute here:
+    */
+   c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;
+
+   /* Payload vertices plus space for more generated vertices:
+    */
+   for (j = 0; j < nr_verts; j++) {
+      c->reg.vertex[j] = brw_vec4_grf(i, 0);
+      i += c->nr_regs;
+   }
+
+   c->prog_data.urb_read_length = c->nr_regs;
+   c->prog_data.total_grf = i;
+}
+
+
+static void brw_gs_emit_vue(struct brw_gs_compile *c,
+                            struct brw_reg vert,
+                            boolean last,
+                            unsigned header)
+{
+   struct brw_compile *p = &c->func;
+   boolean allocate = !last;
+
+   /* Overwrite PrimType and PrimStart in the message header, for
+    * each vertex in turn:
+    */
+   brw_MOV(p, get_element_ud(c->reg.R0, 2), brw_imm_ud(header));
+
+   /* Copy the vertex from vertn into m1..mN+1:
+    */
+   brw_copy8(p, brw_message_reg(1), vert, c->nr_regs);
+
+   /* Send each vertex as a separate write to the urb.  This is
+    * different to the concept in brw_sf_emit.c, where subsequent
+    * writes are used to build up a single urb entry.  Each of these
+    * writes instantiates a separate urb entry, and a new one must be
+    * allocated each time.
+    */
+   brw_urb_WRITE(p,
+                 allocate ? c->reg.R0 : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
+                 0,
+                 c->reg.R0,
+                 allocate,
+                 1,                /* used */
+                 c->nr_regs + 1,   /* msg length */
+                 allocate ? 1 : 0, /* response length */
+                 allocate ? 0 : 1, /* eot */
+                 1,                /* writes_complete */
+                 0,                /* urb offset */
+                 BRW_URB_SWIZZLE_NONE);
+}
+
+
+
+void brw_gs_quads( struct brw_gs_compile *c )
+{
+   brw_gs_alloc_regs(c, 4);
+
+   /* Use polygons for correct edgeflag behaviour.
Note that vertex 3 + * is the PV for quads, but vertex 0 for polygons: + */ + brw_gs_emit_vue(c, c->reg.vertex[3], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START)); + brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2)); + brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2)); + brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END)); +} + +void brw_gs_quad_strip( struct brw_gs_compile *c ) +{ + brw_gs_alloc_regs(c, 4); + + brw_gs_emit_vue(c, c->reg.vertex[2], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START)); + brw_gs_emit_vue(c, c->reg.vertex[3], 0, (_3DPRIM_POLYGON << 2)); + brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2)); + brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_POLYGON << 2) | R02_PRIM_END)); +} + +void brw_gs_tris( struct brw_gs_compile *c ) +{ + brw_gs_alloc_regs(c, 3); + brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_TRILIST << 2) | R02_PRIM_START)); + brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_TRILIST << 2)); + brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_TRILIST << 2) | R02_PRIM_END)); +} + +void brw_gs_lines( struct brw_gs_compile *c ) +{ + brw_gs_alloc_regs(c, 2); + brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_START)); + brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_END)); +} + +void brw_gs_points( struct brw_gs_compile *c ) +{ + brw_gs_alloc_regs(c, 1); + brw_gs_emit_vue(c, c->reg.vertex[0], 1, ((_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END)); +} + + + + + + + + diff --git a/src/gallium/drivers/i965simple/brw_gs_state.c b/src/gallium/drivers/i965simple/brw_gs_state.c new file mode 100644 index 0000000000..5b8016b2e9 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_gs_state.c @@ -0,0 +1,90 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "util/u_math.h" +#include "util/u_memory.h" + + + +static void upload_gs_unit( struct brw_context *brw ) +{ + struct brw_gs_unit_state gs; + + memset(&gs, 0, sizeof(gs)); + + /* CACHE_NEW_GS_PROG */ + if (brw->gs.prog_active) { + gs.thread0.grf_reg_count = + align(brw->gs.prog_data->total_grf, 16) / 16 - 1; + gs.thread0.kernel_start_pointer = brw->gs.prog_gs_offset >> 6; + gs.thread3.urb_entry_read_length = brw->gs.prog_data->urb_read_length; + } + else { + gs.thread0.grf_reg_count = 0; + gs.thread0.kernel_start_pointer = 0; + gs.thread3.urb_entry_read_length = 1; + } + + /* BRW_NEW_URB_FENCE */ + gs.thread4.nr_urb_entries = brw->urb.nr_gs_entries; + gs.thread4.urb_entry_allocation_size = brw->urb.vsize - 1; + + gs.thread4.max_threads = 0; /* Hardware requirement */ + + if (BRW_DEBUG & DEBUG_STATS) + gs.thread4.stats_enable = 1; + + /* CONSTANT */ + gs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; + gs.thread1.single_program_flow = 1; + gs.thread3.dispatch_grf_start_reg = 1; + gs.thread3.const_urb_entry_read_offset = 0; + gs.thread3.const_urb_entry_read_length = 0; + gs.thread3.urb_entry_read_offset = 0; + + + brw->gs.state_gs_offset = brw_cache_data( &brw->cache[BRW_GS_UNIT], &gs ); +} + + +const struct brw_tracked_state brw_gs_unit = { + .dirty = { + .brw = (BRW_NEW_CURBE_OFFSETS | + BRW_NEW_URB_FENCE), + .cache = CACHE_NEW_GS_PROG + }, + .update = upload_gs_unit +}; diff --git a/src/gallium/drivers/i965simple/brw_misc_state.c b/src/gallium/drivers/i965simple/brw_misc_state.c new file mode 100644 index 0000000000..be812c5da9 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_misc_state.c @@ -0,0 +1,488 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "brw_batch.h" +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" + + + + + +/*********************************************************************** + * Blend color + */ + +static void upload_blend_constant_color(struct brw_context *brw) +{ + struct brw_blend_constant_color bcc; + + memset(&bcc, 0, sizeof(bcc)); + bcc.header.opcode = CMD_BLEND_CONSTANT_COLOR; + bcc.header.length = sizeof(bcc)/4-2; + bcc.blend_constant_color[0] = brw->attribs.BlendColor.color[0]; + bcc.blend_constant_color[1] = brw->attribs.BlendColor.color[1]; + bcc.blend_constant_color[2] = brw->attribs.BlendColor.color[2]; + bcc.blend_constant_color[3] = brw->attribs.BlendColor.color[3]; + + BRW_CACHED_BATCH_STRUCT(brw, &bcc); +} + + +const struct brw_tracked_state brw_blend_constant_color = { + .dirty = { + .brw = BRW_NEW_BLEND, + .cache = 0 + }, + .update = upload_blend_constant_color +}; + + +/*********************************************************************** + * Drawing rectangle + */ +static void upload_drawing_rect(struct brw_context *brw) +{ + struct brw_drawrect bdr; + + memset(&bdr, 0, sizeof(bdr)); + bdr.header.opcode = CMD_DRAW_RECT; + bdr.header.length = sizeof(bdr)/4 - 2; + bdr.xmin = 0; + bdr.ymin = 0; + bdr.xmax = brw->attribs.FrameBuffer.cbufs[0]->width; + bdr.ymax = brw->attribs.FrameBuffer.cbufs[0]->height; + bdr.xorg = 0; + bdr.yorg = 0; + + /* Can't use BRW_CACHED_BATCH_STRUCT because this is also emitted + * uncached in brw_draw.c: + */ + BRW_BATCH_STRUCT(brw, &bdr); +} + +const struct brw_tracked_state brw_drawing_rect = { + .dirty = { + .brw = BRW_NEW_SCENE, + .cache = 0 + }, + .update = upload_drawing_rect +}; + +/** + * Upload the binding table pointers, which point each stage's array of surface + * state pointers. + * + * The binding table pointers are relative to the surface state base address, + * which is the BRW_SS_POOL cache buffer. + */ +static void upload_binding_table_pointers(struct brw_context *brw) +{ + struct brw_binding_table_pointers btp; + memset(&btp, 0, sizeof(btp)); + + btp.header.opcode = CMD_BINDING_TABLE_PTRS; + btp.header.length = sizeof(btp)/4 - 2; + btp.vs = 0; + btp.gs = 0; + btp.clp = 0; + btp.sf = 0; + btp.wm = brw->wm.bind_ss_offset; + + BRW_CACHED_BATCH_STRUCT(brw, &btp); +} + +const struct brw_tracked_state brw_binding_table_pointers = { + .dirty = { + .brw = 0, + .cache = CACHE_NEW_SURF_BIND + }, + .update = upload_binding_table_pointers, +}; + + +/** + * Upload pointers to the per-stage state. + * + * The state pointers in this packet are all relative to the general state + * base address set by CMD_STATE_BASE_ADDRESS, which is the BRW_GS_POOL buffer. + */ +static void upload_pipelined_state_pointers(struct brw_context *brw ) +{ + struct brw_pipelined_state_pointers psp; + memset(&psp, 0, sizeof(psp)); + + psp.header.opcode = CMD_PIPELINED_STATE_POINTERS; + psp.header.length = sizeof(psp)/4 - 2; + + psp.vs.offset = brw->vs.state_gs_offset >> 5; + psp.sf.offset = brw->sf.state_gs_offset >> 5; + psp.wm.offset = brw->wm.state_gs_offset >> 5; + psp.cc.offset = brw->cc.state_gs_offset >> 5; + + /* GS gets turned on and off regularly. Need to re-emit URB fence + * after this occurs. 
+ */ + if (brw->gs.prog_active) { + psp.gs.offset = brw->gs.state_gs_offset >> 5; + psp.gs.enable = 1; + } + + if (0) { + psp.clp.offset = brw->clip.state_gs_offset >> 5; + psp.clp.enable = 1; + } + + + if (BRW_CACHED_BATCH_STRUCT(brw, &psp)) + brw->state.dirty.brw |= BRW_NEW_PSP; +} + +const struct brw_tracked_state brw_pipelined_state_pointers = { + .dirty = { + .brw = 0, + .cache = (CACHE_NEW_VS_UNIT | + CACHE_NEW_GS_UNIT | + CACHE_NEW_GS_PROG | + CACHE_NEW_CLIP_UNIT | + CACHE_NEW_SF_UNIT | + CACHE_NEW_WM_UNIT | + CACHE_NEW_CC_UNIT) + }, + .update = upload_pipelined_state_pointers +}; + +static void upload_psp_urb_cbs(struct brw_context *brw ) +{ + upload_pipelined_state_pointers(brw); + brw_upload_urb_fence(brw); + brw_upload_constant_buffer_state(brw); +} + + +const struct brw_tracked_state brw_psp_urb_cbs = { + .dirty = { + .brw = BRW_NEW_URB_FENCE, + .cache = (CACHE_NEW_VS_UNIT | + CACHE_NEW_GS_UNIT | + CACHE_NEW_GS_PROG | + CACHE_NEW_CLIP_UNIT | + CACHE_NEW_SF_UNIT | + CACHE_NEW_WM_UNIT | + CACHE_NEW_CC_UNIT) + }, + .update = upload_psp_urb_cbs +}; + +/** + * Upload the depthbuffer offset and format. + * + * We have to do this per state validation as we need to emit the relocation + * in the batch buffer. + */ +static void upload_depthbuffer(struct brw_context *brw) +{ + struct pipe_surface *depth_surface = brw->attribs.FrameBuffer.zsbuf; + + BEGIN_BATCH(5, INTEL_BATCH_NO_CLIPRECTS); + OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (5 - 2)); + if (depth_surface == NULL) { + OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) | + (BRW_SURFACE_NULL << 29)); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + } else { + unsigned int format; + + assert(depth_surface->block.width == 1); + assert(depth_surface->block.height == 1); + switch (depth_surface->block.size) { + case 2: + format = BRW_DEPTHFORMAT_D16_UNORM; + break; + case 4: + if (depth_surface->format == PIPE_FORMAT_Z32_FLOAT) + format = BRW_DEPTHFORMAT_D32_FLOAT; + else + format = BRW_DEPTHFORMAT_D24_UNORM_S8_UINT; + break; + default: + assert(0); + return; + } + + OUT_BATCH((depth_surface->stride - 1) | + (format << 18) | + (BRW_TILEWALK_YMAJOR << 26) | +// (depth_surface->region->tiled << 27) | + (BRW_SURFACE_2D << 29)); + OUT_RELOC(depth_surface->buffer, + PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE, 0); + OUT_BATCH((BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1) | + ((depth_surface->stride/depth_surface->block.size - 1) << 6) | + ((depth_surface->height - 1) << 19)); + OUT_BATCH(0); + } + ADVANCE_BATCH(); +} + +const struct brw_tracked_state brw_depthbuffer = { + .dirty = { + .brw = BRW_NEW_SCENE, + .cache = 0 + }, + .update = upload_depthbuffer, +}; + + + + +/*********************************************************************** + * Polygon stipple packet + */ + +static void upload_polygon_stipple(struct brw_context *brw) +{ + struct brw_polygon_stipple bps; + unsigned i; + + memset(&bps, 0, sizeof(bps)); + bps.header.opcode = CMD_POLY_STIPPLE_PATTERN; + bps.header.length = sizeof(bps)/4-2; + + /* XXX: state tracker should send *all* state down initially! 
+ */ + if (brw->attribs.PolygonStipple) + for (i = 0; i < 32; i++) + bps.stipple[i] = brw->attribs.PolygonStipple->stipple[31 - i]; /* invert */ + + BRW_CACHED_BATCH_STRUCT(brw, &bps); +} + +const struct brw_tracked_state brw_polygon_stipple = { + .dirty = { + .brw = BRW_NEW_STIPPLE, + .cache = 0 + }, + .update = upload_polygon_stipple +}; + + +/*********************************************************************** + * Line stipple packet + */ + +static void upload_line_stipple(struct brw_context *brw) +{ + struct brw_line_stipple bls; + float tmp; + int tmpi; + + memset(&bls, 0, sizeof(bls)); + bls.header.opcode = CMD_LINE_STIPPLE_PATTERN; + bls.header.length = sizeof(bls)/4 - 2; + + bls.bits0.pattern = brw->attribs.Raster->line_stipple_pattern; + bls.bits1.repeat_count = brw->attribs.Raster->line_stipple_factor; + + tmp = 1.0 / (float) brw->attribs.Raster->line_stipple_factor; + tmpi = tmp * (1<<13); + + + bls.bits1.inverse_repeat_count = tmpi; + + BRW_CACHED_BATCH_STRUCT(brw, &bls); +} + +const struct brw_tracked_state brw_line_stipple = { + .dirty = { + .brw = BRW_NEW_STIPPLE, + .cache = 0 + }, + .update = upload_line_stipple +}; + + +/*********************************************************************** + * Misc constant state packets + */ + +static void upload_pipe_control(struct brw_context *brw) +{ + struct brw_pipe_control pc; + + return; + + memset(&pc, 0, sizeof(pc)); + + pc.header.opcode = CMD_PIPE_CONTROL; + pc.header.length = sizeof(pc)/4 - 2; + pc.header.post_sync_operation = PIPE_CONTROL_NOWRITE; + + pc.header.instruction_state_cache_flush_enable = 1; + + pc.bits1.dest_addr_type = PIPE_CONTROL_GTTWRITE_GLOBAL; + + BRW_BATCH_STRUCT(brw, &pc); +} + +const struct brw_tracked_state brw_pipe_control = { + .dirty = { + .brw = BRW_NEW_SCENE, + .cache = 0 + }, + .update = upload_pipe_control +}; + + +/*********************************************************************** + * Misc invarient state packets + */ + +static void upload_invarient_state( struct brw_context *brw ) +{ + { + struct brw_mi_flush flush; + + memset(&flush, 0, sizeof(flush)); + flush.opcode = CMD_MI_FLUSH; + flush.flags = BRW_FLUSH_STATE_CACHE | BRW_FLUSH_READ_CACHE; + BRW_BATCH_STRUCT(brw, &flush); + } + + { + /* 0x61040000 Pipeline Select */ + /* PipelineSelect : 0 */ + struct brw_pipeline_select ps; + + memset(&ps, 0, sizeof(ps)); + ps.header.opcode = CMD_PIPELINE_SELECT; + ps.header.pipeline_select = 0; + BRW_BATCH_STRUCT(brw, &ps); + } + + { + struct brw_global_depth_offset_clamp gdo; + memset(&gdo, 0, sizeof(gdo)); + + /* Disable depth offset clamping. 
+ */ + gdo.header.opcode = CMD_GLOBAL_DEPTH_OFFSET_CLAMP; + gdo.header.length = sizeof(gdo)/4 - 2; + gdo.depth_offset_clamp = 0.0; + + BRW_BATCH_STRUCT(brw, &gdo); + } + + + /* 0x61020000 State Instruction Pointer */ + { + struct brw_system_instruction_pointer sip; + memset(&sip, 0, sizeof(sip)); + + sip.header.opcode = CMD_STATE_INSN_POINTER; + sip.header.length = 0; + sip.bits0.pad = 0; + sip.bits0.system_instruction_pointer = 0; + BRW_BATCH_STRUCT(brw, &sip); + } + + + { + struct brw_vf_statistics vfs; + memset(&vfs, 0, sizeof(vfs)); + + vfs.opcode = CMD_VF_STATISTICS; + if (BRW_DEBUG & DEBUG_STATS) + vfs.statistics_enable = 1; + + BRW_BATCH_STRUCT(brw, &vfs); + } + + + { + struct brw_polygon_stipple_offset bpso; + + memset(&bpso, 0, sizeof(bpso)); + bpso.header.opcode = CMD_POLY_STIPPLE_OFFSET; + bpso.header.length = sizeof(bpso)/4-2; + bpso.bits0.x_offset = 0; + bpso.bits0.y_offset = 0; + + BRW_BATCH_STRUCT(brw, &bpso); + } +} + +const struct brw_tracked_state brw_invarient_state = { + .dirty = { + .brw = BRW_NEW_SCENE, + .cache = 0 + }, + .update = upload_invarient_state +}; + +/** + * Define the base addresses which some state is referenced from. + * + * This allows us to avoid having to emit relocations in many places for + * cached state, and instead emit pointers inside of large, mostly-static + * state pools. This comes at the expense of memory, and more expensive cache + * misses. + */ +static void upload_state_base_address( struct brw_context *brw ) +{ + /* Output the structure (brw_state_base_address) directly to the + * batchbuffer, so we can emit relocations inline. + */ + BEGIN_BATCH(6, INTEL_BATCH_NO_CLIPRECTS); + OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2)); + OUT_RELOC(brw->pool[BRW_GS_POOL].buffer, + PIPE_BUFFER_USAGE_GPU_READ, + 1); /* General state base address */ + OUT_RELOC(brw->pool[BRW_SS_POOL].buffer, + PIPE_BUFFER_USAGE_GPU_READ, + 1); /* Surface state base address */ + OUT_BATCH(1); /* Indirect object base address */ + OUT_BATCH(1); /* General state upper bound */ + OUT_BATCH(1); /* Indirect object upper bound */ + ADVANCE_BATCH(); +} + + +const struct brw_tracked_state brw_state_base_address = { + .dirty = { + .brw = BRW_NEW_SCENE, + .cache = 0 + }, + .update = upload_state_base_address +}; diff --git a/src/gallium/drivers/i965simple/brw_reg.h b/src/gallium/drivers/i965simple/brw_reg.h new file mode 100644 index 0000000000..9e885c3b3b --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_reg.h @@ -0,0 +1,76 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#define CMD_MI (0x0 << 29) +#define CMD_2D (0x2 << 29) +#define CMD_3D (0x3 << 29) + +#define MI_BATCH_BUFFER_END (CMD_MI | 0xA << 23) + +/* Stalls command execution waiting for the given events to have occurred. */ +#define MI_WAIT_FOR_EVENT (CMD_MI | (0x3 << 23)) +#define MI_WAIT_FOR_PLANE_B_FLIP (1<<6) +#define MI_WAIT_FOR_PLANE_A_FLIP (1<<2) + +/* Primitive dispatch on 830-945 */ +#define _3DPRIMITIVE (CMD_3D | (0x1f << 24)) +#define PRIM_INDIRECT (1<<23) +#define PRIM_INLINE (0<<23) +#define PRIM_INDIRECT_SEQUENTIAL (0<<17) +#define PRIM_INDIRECT_ELTS (1<<17) + +#define PRIM3D_TRILIST (0x0<<18) +#define PRIM3D_TRISTRIP (0x1<<18) +#define PRIM3D_TRISTRIP_RVRSE (0x2<<18) +#define PRIM3D_TRIFAN (0x3<<18) +#define PRIM3D_POLY (0x4<<18) +#define PRIM3D_LINELIST (0x5<<18) +#define PRIM3D_LINESTRIP (0x6<<18) +#define PRIM3D_RECTLIST (0x7<<18) +#define PRIM3D_POINTLIST (0x8<<18) +#define PRIM3D_DIB (0x9<<18) +#define PRIM3D_MASK (0x1f<<18) + +#define XY_SETUP_BLT_CMD (CMD_2D | (0x01 << 22) | 6) + +#define XY_COLOR_BLT_CMD (CMD_2D | (0x50 << 22) | 4) + +#define XY_SRC_COPY_BLT_CMD (CMD_2D | (0x53 << 22) | 6) + +/* BR00 */ +#define XY_BLT_WRITE_ALPHA (1 << 21) +#define XY_BLT_WRITE_RGB (1 << 20) +#define XY_SRC_TILED (1 << 15) +#define XY_DST_TILED (1 << 11) + +/* BR13 */ +#define BR13_565 (0x1 << 24) +#define BR13_8888 (0x3 << 24) + +#define FENCE_LINEAR 0 +#define FENCE_XMAJOR 1 +#define FENCE_YMAJOR 2 diff --git a/src/gallium/drivers/i965simple/brw_screen.c b/src/gallium/drivers/i965simple/brw_screen.c new file mode 100644 index 0000000000..ab7cd624b2 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_screen.c @@ -0,0 +1,244 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#include "util/u_memory.h" +#include "pipe/p_winsys.h" +#include "util/u_string.h" + +#include "brw_context.h" +#include "brw_screen.h" +#include "brw_tex_layout.h" + + +static const char * +brw_get_vendor( struct pipe_screen *screen ) +{ + return "Tungsten Graphics, Inc."; +} + + +static const char * +brw_get_name( struct pipe_screen *screen ) +{ + static char buffer[128]; + const char *chipset; + + switch (brw_screen(screen)->pci_id) { + case PCI_CHIP_I965_Q: + chipset = "Intel(R) 965Q"; + break; + case PCI_CHIP_I965_G: + case PCI_CHIP_I965_G_1: + chipset = "Intel(R) 965G"; + break; + case PCI_CHIP_I965_GM: + chipset = "Intel(R) 965GM"; + break; + case PCI_CHIP_I965_GME: + chipset = "Intel(R) 965GME/GLE"; + break; + default: + chipset = "unknown"; + break; + } + + util_snprintf(buffer, sizeof(buffer), "i965 (chipset: %s)", chipset); + return buffer; +} + + +static int +brw_get_param(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return 8; + case PIPE_CAP_NPOT_TEXTURES: + return 1; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 1; + case PIPE_CAP_GLSL: + return 0; + case PIPE_CAP_S3TC: + return 0; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 0; + case PIPE_CAP_POINT_SPRITE: + return 0; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 1; + case PIPE_CAP_OCCLUSION_QUERY: + return 0; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 1; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return 11; /* max 1024x1024 */ + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 8; /* max 128x128x128 */ + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 11; /* max 1024x1024 */ + default: + return 0; + } +} + + +static float +brw_get_paramf(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + /* fall-through */ + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 7.5; + + case PIPE_CAP_MAX_POINT_WIDTH: + /* fall-through */ + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 255.0; + + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 4.0; + + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 16.0; + + default: + return 0; + } +} + + +static boolean +brw_is_format_supported( struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, + unsigned geom_flags ) +{ +#if 0 + /* XXX: This is broken -- rewrite if still needed. 
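+    * The format tables below belong to an older interface that
+    * returned lists of supported formats; the switch statement in the
+    * #else branch is the code path that is actually compiled.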
*/ + static const unsigned tex_supported[] = { + PIPE_FORMAT_R8G8B8A8_UNORM, + PIPE_FORMAT_A8R8G8B8_UNORM, + PIPE_FORMAT_R5G6B5_UNORM, + PIPE_FORMAT_L8_UNORM, + PIPE_FORMAT_A8_UNORM, + PIPE_FORMAT_I8_UNORM, + PIPE_FORMAT_L8A8_UNORM, + PIPE_FORMAT_YCBCR, + PIPE_FORMAT_YCBCR_REV, + PIPE_FORMAT_S8_Z24, + }; + + + /* Actually a lot more than this - add later: + */ + static const unsigned render_supported[] = { + PIPE_FORMAT_A8R8G8B8_UNORM, + PIPE_FORMAT_R5G6B5_UNORM, + }; + + /* + */ + static const unsigned z_stencil_supported[] = { + PIPE_FORMAT_Z16_UNORM, + PIPE_FORMAT_Z32_UNORM, + PIPE_FORMAT_S8Z24_UNORM, + }; + + switch (type) { + case PIPE_RENDER_FORMAT: + *numFormats = Elements(render_supported); + return render_supported; + + case PIPE_TEX_FORMAT: + *numFormats = Elements(tex_supported); + return render_supported; + + case PIPE_Z_STENCIL_FORMAT: + *numFormats = Elements(render_supported); + return render_supported; + + default: + *numFormats = 0; + return NULL; + } +#else + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_S8Z24_UNORM: + return TRUE; + default: + return FALSE; + }; + return FALSE; +#endif +} + + +static void +brw_destroy_screen( struct pipe_screen *screen ) +{ + struct pipe_winsys *winsys = screen->winsys; + + if(winsys->destroy) + winsys->destroy(winsys); + + FREE(screen); +} + + +/** + * Create a new brw_screen object + */ +struct pipe_screen * +brw_create_screen(struct pipe_winsys *winsys, uint pci_id) +{ + struct brw_screen *brwscreen = CALLOC_STRUCT(brw_screen); + + if (!brwscreen) + return NULL; + + brwscreen->pci_id = pci_id; + + brwscreen->screen.winsys = winsys; + + brwscreen->screen.destroy = brw_destroy_screen; + + brwscreen->screen.get_name = brw_get_name; + brwscreen->screen.get_vendor = brw_get_vendor; + brwscreen->screen.get_param = brw_get_param; + brwscreen->screen.get_paramf = brw_get_paramf; + brwscreen->screen.is_format_supported = brw_is_format_supported; + + brw_init_screen_texture_funcs(&brwscreen->screen); + + return &brwscreen->screen; +} diff --git a/src/gallium/drivers/i965simple/brw_screen.h b/src/gallium/drivers/i965simple/brw_screen.h new file mode 100644 index 0000000000..d3c70387e6 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_screen.h @@ -0,0 +1,68 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef BRW_SCREEN_H +#define BRW_SCREEN_H + + +#include "pipe/p_screen.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * Subclass of pipe_screen + */ +struct brw_screen +{ + struct pipe_screen screen; + + uint pci_id; +}; + + +/** cast wrapper */ +static INLINE struct brw_screen * +brw_screen(struct pipe_screen *pscreen) +{ + return (struct brw_screen *) pscreen; +} + + +extern struct pipe_screen * +brw_create_screen(struct pipe_winsys *winsys, uint pci_id); + + +#ifdef __cplusplus +} +#endif + +#endif /* BRW_SCREEN_H */ diff --git a/src/gallium/drivers/i965simple/brw_sf.c b/src/gallium/drivers/i965simple/brw_sf.c new file mode 100644 index 0000000000..b82a2e143b --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_sf.c @@ -0,0 +1,351 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_sf.h" +#include "brw_state.h" +#include "tgsi/tgsi_parse.h" + + +static void compile_sf_prog( struct brw_context *brw, + struct brw_sf_prog_key *key ) +{ + struct brw_sf_compile c; + const unsigned *program; + unsigned program_size; + + memset(&c, 0, sizeof(c)); + + /* Begin the compilation: + */ + brw_init_compile(&c.func); + + c.key = *key; + + + c.nr_attrs = c.key.vp_output_count; + c.nr_attr_regs = (c.nr_attrs+1)/2; + + c.nr_setup_attrs = c.key.fp_input_count + 1; /* +1 for position */ + c.nr_setup_regs = (c.nr_setup_attrs+1)/2; + + c.prog_data.urb_read_length = c.nr_attr_regs; + c.prog_data.urb_entry_size = c.nr_setup_regs * 2; + + + /* Which primitive? Or all three? 
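+    * Each primitive type gets its own setup kernel.  SF_UNFILLED_TRIS
+    * is not handled yet and just hits the assert below.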
+ */ + switch (key->primitive) { + case SF_TRIANGLES: + c.nr_verts = 3; + brw_emit_tri_setup( &c ); + break; + case SF_LINES: + c.nr_verts = 2; + brw_emit_line_setup( &c ); + break; + case SF_POINTS: + c.nr_verts = 1; + brw_emit_point_setup( &c ); + break; + + case SF_UNFILLED_TRIS: + default: + assert(0); + return; + } + + + + /* get the program + */ + program = brw_get_program(&c.func, &program_size); + + /* Upload + */ + brw->sf.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_SF_PROG], + &c.key, + sizeof(c.key), + program, + program_size, + &c.prog_data, + &brw->sf.prog_data ); +} + + +static boolean search_cache( struct brw_context *brw, + struct brw_sf_prog_key *key ) +{ + return brw_search_cache(&brw->cache[BRW_SF_PROG], + key, sizeof(*key), + &brw->sf.prog_data, + &brw->sf.prog_gs_offset); +} + + +/* Calculate interpolants for triangle and line rasterization. + */ +static void upload_sf_prog( struct brw_context *brw ) +{ + const struct brw_fragment_program *fs = brw->attribs.FragmentProgram; + struct brw_sf_prog_key key; + struct tgsi_parse_context parse; + int i, done = 0; + + + memset(&key, 0, sizeof(key)); + + /* Populate the key, noting state dependencies: + */ + /* CACHE_NEW_VS_PROG */ + key.vp_output_count = brw->vs.prog_data->outputs_written; + + /* BRW_NEW_FS */ + key.fp_input_count = brw->attribs.FragmentProgram->info.file_max[TGSI_FILE_INPUT] + 1; + + + /* BRW_NEW_REDUCED_PRIMITIVE */ + switch (brw->reduced_primitive) { + case PIPE_PRIM_TRIANGLES: +// if (key.attrs & (1<<VERT_RESULT_EDGE)) +// key.primitive = SF_UNFILLED_TRIS; +// else + key.primitive = SF_TRIANGLES; + break; + case PIPE_PRIM_LINES: + key.primitive = SF_LINES; + break; + case PIPE_PRIM_POINTS: + key.primitive = SF_POINTS; + break; + } + + + + /* Scan fp inputs to figure out what interpolation modes are + * required for each incoming vp output. There is an assumption + * that the state tracker makes sure there is a 1:1 linkage between + * these sets of attributes (XXX: position??) + */ + tgsi_parse_init( &parse, fs->program.tokens ); + while( !done && + !tgsi_parse_end_of_tokens( &parse ) ) + { + tgsi_parse_token( &parse ); + + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_INPUT) + { + int first = parse.FullToken.FullDeclaration.DeclarationRange.First; + int last = parse.FullToken.FullDeclaration.DeclarationRange.Last; + int interp_mode = parse.FullToken.FullDeclaration.Declaration.Interpolate; + //int semantic = parse.FullToken.FullDeclaration.Semantic.SemanticName; + //int semantic_index = parse.FullToken.FullDeclaration.Semantic.SemanticIndex; + + debug_printf("fs input %d..%d interp mode %d\n", first, last, interp_mode); + + switch (interp_mode) { + case TGSI_INTERPOLATE_CONSTANT: + for (i = first; i <= last; i++) + key.const_mask |= (1 << i); + break; + case TGSI_INTERPOLATE_LINEAR: + for (i = first; i <= last; i++) + key.linear_mask |= (1 << i); + break; + case TGSI_INTERPOLATE_PERSPECTIVE: + for (i = first; i <= last; i++) + key.persp_mask |= (1 << i); + break; + default: + break; + } + + /* Also need stuff for flat shading, twosided color. + */ + + } + break; + default: + done = 1; + break; + } + } + + /* Hack: Adjust for position. Optimize away when not required (ie + * for perspective interpolation). 
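+    * Attribute 0 is position, so shift every mask up by one slot and
+    * mark position itself as linearly interpolated rather than
+    * perspective correct.  For example, persp_mask = 0x7 for three fp
+    * inputs becomes 0xe, and linear_mask gains bit 0 for position.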
+ */ + key.persp_mask <<= 1; + key.linear_mask <<= 1; + key.linear_mask |= 1; + key.const_mask <<= 1; + + debug_printf("key.persp_mask: %x\n", key.persp_mask); + debug_printf("key.linear_mask: %x\n", key.linear_mask); + debug_printf("key.const_mask: %x\n", key.const_mask); + + +// key.do_point_sprite = brw->attribs.Point->PointSprite; +// key.SpriteOrigin = brw->attribs.Point->SpriteOrigin; + +// key.do_flat_shading = (brw->attribs.Raster->flatshade); +// key.do_twoside_color = (brw->attribs.Light->Enabled && brw->attribs.Light->Model.TwoSide); + +// if (key.do_twoside_color) +// key.frontface_ccw = (brw->attribs.Polygon->FrontFace == GL_CCW); + + + if (!search_cache(brw, &key)) + compile_sf_prog( brw, &key ); +} + + +const struct brw_tracked_state brw_sf_prog = { + .dirty = { + .brw = (BRW_NEW_RASTERIZER | + BRW_NEW_REDUCED_PRIMITIVE | + BRW_NEW_VS | + BRW_NEW_FS), + .cache = 0, + }, + .update = upload_sf_prog +}; + + + +#if 0 +/* Build a struct like the one we'd like the state tracker to pass to + * us. + */ +static void update_sf_linkage( struct brw_context *brw ) +{ + const struct brw_vertex_program *vs = brw->attribs.VertexProgram; + const struct brw_fragment_program *fs = brw->attribs.FragmentProgram; + struct pipe_setup_linkage state; + struct tgsi_parse_context parse; + + int i, j; + int nr_vp_outputs = 0; + int done = 0; + + struct { + unsigned semantic:8; + unsigned semantic_index:16; + } fp_semantic[32], vp_semantic[32]; + + memset(&state, 0, sizeof(state)); + + state.fp_input_count = 0; + + + + + + + assert(state.fp_input_count == fs->program.num_inputs); + + + /* Then scan vp outputs + */ + done = 0; + tgsi_parse_init( &parse, vs->program.tokens ); + while( !done && + !tgsi_parse_end_of_tokens( &parse ) ) + { + tgsi_parse_token( &parse ); + + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + if (parse.FullToken.FullDeclaration.Declaration.File == TGSI_FILE_INPUT) + { + int first = parse.FullToken.FullDeclaration.DeclarationRange.First; + int last = parse.FullToken.FullDeclaration.DeclarationRange.Last; + + for (i = first; i < last; i++) { + vp_semantic[i].semantic = + parse.FullToken.FullDeclaration.Semantic.SemanticName; + vp_semantic[i].semantic_index = + parse.FullToken.FullDeclaration.Semantic.SemanticIndex; + } + + assert(last > nr_vp_outputs); + nr_vp_outputs = last; + } + break; + default: + done = 1; + break; + } + } + + + /* Now match based on semantic information. 
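+    * For each fp input, find the vp output with the same semantic
+    * name and index; COLOR inputs also record the matching BCOLOR
+    * output (bf_vp_output) for two-sided lighting.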
+ */ + for (i = 0; i< state.fp_input_count; i++) { + for (j = 0; j < nr_vp_outputs; j++) { + if (fp_semantic[i].semantic == vp_semantic[j].semantic && + fp_semantic[i].semantic_index == vp_semantic[j].semantic_index) { + state.fp_input[i].vp_output = j; + } + } + if (fp_semantic[i].semantic == TGSI_SEMANTIC_COLOR) { + for (j = 0; j < nr_vp_outputs; j++) { + if (TGSI_SEMANTIC_BCOLOR == vp_semantic[j].semantic && + fp_semantic[i].semantic_index == vp_semantic[j].semantic_index) { + state.fp_input[i].bf_vp_output = j; + } + } + } + } + + if (memcmp(&brw->sf.linkage, &state, sizeof(state)) != 0) { + brw->sf.linkage = state; + brw->state.dirty.brw |= BRW_NEW_SF_LINKAGE; + } +} + + +const struct brw_tracked_state brw_sf_linkage = { + .dirty = { + .brw = (BRW_NEW_VS | + BRW_NEW_FS), + .cache = 0, + }, + .update = update_sf_linkage +}; + + +#endif diff --git a/src/gallium/drivers/i965simple/brw_sf.h b/src/gallium/drivers/i965simple/brw_sf.h new file mode 100644 index 0000000000..b7ada47560 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_sf.h @@ -0,0 +1,122 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#ifndef BRW_SF_H +#define BRW_SF_H + +#include "brw_context.h" +#include "brw_eu.h" + + +#define SF_POINTS 0 +#define SF_LINES 1 +#define SF_TRIANGLES 2 +#define SF_UNFILLED_TRIS 3 + + + +struct brw_sf_prog_key { + unsigned vp_output_count:5; + unsigned fp_input_count:5; + + unsigned primitive:2; + unsigned do_twoside_color:1; + unsigned do_flat_shading:1; + unsigned frontface_ccw:1; + unsigned do_point_sprite:1; + + /* Interpolation masks; + */ + unsigned linear_mask; + unsigned persp_mask; + unsigned const_mask; + + +// int SpriteOrigin; +}; + +struct brw_sf_point_tex { + boolean CoordReplace; +}; + +struct brw_sf_compile { + struct brw_compile func; + struct brw_sf_prog_key key; + struct brw_sf_prog_data prog_data; + + struct brw_reg pv; + struct brw_reg det; + struct brw_reg dx0; + struct brw_reg dx2; + struct brw_reg dy0; + struct brw_reg dy2; + + /* z and 1/w passed in seperately: + */ + struct brw_reg z[3]; + struct brw_reg inv_w[3]; + + /* The vertices: + */ + struct brw_reg vert[3]; + + /* Temporaries, allocated after last vertex reg. 
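+    * (allocated by alloc_regs() in brw_sf_emit.c)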
+ */ + struct brw_reg inv_det; + struct brw_reg a1_sub_a0; + struct brw_reg a2_sub_a0; + struct brw_reg tmp; + + struct brw_reg m1Cx; + struct brw_reg m2Cy; + struct brw_reg m3C0; + + unsigned nr_verts; + unsigned nr_attrs; + unsigned nr_attr_regs; + unsigned nr_setup_attrs; + unsigned nr_setup_regs; +#if 0 + ubyte attr_to_idx[VERT_RESULT_MAX]; + ubyte idx_to_attr[VERT_RESULT_MAX]; + struct brw_sf_point_tex point_attrs[VERT_RESULT_MAX]; +#endif +}; + + +void brw_emit_tri_setup( struct brw_sf_compile *c ); +void brw_emit_line_setup( struct brw_sf_compile *c ); +void brw_emit_point_setup( struct brw_sf_compile *c ); +void brw_emit_point_sprite_setup( struct brw_sf_compile *c ); +void brw_emit_anyprim_setup( struct brw_sf_compile *c ); + +#endif diff --git a/src/gallium/drivers/i965simple/brw_sf_emit.c b/src/gallium/drivers/i965simple/brw_sf_emit.c new file mode 100644 index 0000000000..78d6fa5e9e --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_sf_emit.c @@ -0,0 +1,382 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_defines.h" +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_util.h" +#include "brw_sf.h" + + + +/*********************************************************************** + * Triangle setup. + */ + + +static void alloc_regs( struct brw_sf_compile *c ) +{ + unsigned reg, i; + + /* Values computed by fixed function unit: + */ + c->pv = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD); + c->det = brw_vec1_grf(1, 2); + c->dx0 = brw_vec1_grf(1, 3); + c->dx2 = brw_vec1_grf(1, 4); + c->dy0 = brw_vec1_grf(1, 5); + c->dy2 = brw_vec1_grf(1, 6); + + /* z and 1/w passed in seperately: + */ + c->z[0] = brw_vec1_grf(2, 0); + c->inv_w[0] = brw_vec1_grf(2, 1); + c->z[1] = brw_vec1_grf(2, 2); + c->inv_w[1] = brw_vec1_grf(2, 3); + c->z[2] = brw_vec1_grf(2, 4); + c->inv_w[2] = brw_vec1_grf(2, 5); + + /* The vertices: + */ + reg = 3; + for (i = 0; i < c->nr_verts; i++) { + c->vert[i] = brw_vec8_grf(reg, 0); + reg += c->nr_attr_regs; + } + + /* Temporaries, allocated after last vertex reg. 
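+    * GRF numbering simply continues past the last vertex register;
+    * the final count is recorded in prog_data.total_grf below.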
+ */ + c->inv_det = brw_vec1_grf(reg, 0); reg++; + c->a1_sub_a0 = brw_vec8_grf(reg, 0); reg++; + c->a2_sub_a0 = brw_vec8_grf(reg, 0); reg++; + c->tmp = brw_vec8_grf(reg, 0); reg++; + + /* Note grf allocation: + */ + c->prog_data.total_grf = reg; + + + /* Outputs of this program - interpolation coefficients for + * rasterization: + */ + c->m1Cx = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 1, 0); + c->m2Cy = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 2, 0); + c->m3C0 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 3, 0); +} + + +static void copy_z_inv_w( struct brw_sf_compile *c ) +{ + struct brw_compile *p = &c->func; + unsigned i; + + brw_push_insn_state(p); + + /* Copy both scalars with a single MOV: + */ + for (i = 0; i < c->nr_verts; i++) + brw_MOV(p, vec2(suboffset(c->vert[i], 2)), vec2(c->z[i])); + + brw_pop_insn_state(p); +} + + +static void invert_det( struct brw_sf_compile *c) +{ + brw_math(&c->func, + c->inv_det, + BRW_MATH_FUNCTION_INV, + BRW_MATH_SATURATE_NONE, + 0, + c->det, + BRW_MATH_DATA_SCALAR, + BRW_MATH_PRECISION_FULL); + +} + +#define NON_PERPECTIVE_ATTRS (FRAG_BIT_WPOS | \ + FRAG_BIT_COL0 | \ + FRAG_BIT_COL1) + +static boolean calculate_masks( struct brw_sf_compile *c, + unsigned reg, + ushort *pc, + ushort *pc_persp, + ushort *pc_linear) +{ + boolean is_last_attr = (reg == c->nr_setup_regs - 1); + unsigned persp_mask = c->key.persp_mask; + unsigned linear_mask = c->key.linear_mask; + + debug_printf("persp_mask: %x\n", persp_mask); + debug_printf("linear_mask: %x\n", linear_mask); + + *pc_persp = 0; + *pc_linear = 0; + *pc = 0xf; + + if (persp_mask & (1 << (reg*2))) + *pc_persp = 0xf; + + if (linear_mask & (1 << (reg*2))) + *pc_linear = 0xf; + + /* Maybe only processs one attribute on the final round: + */ + if (reg*2+1 < c->nr_setup_attrs) { + *pc |= 0xf0; + + if (persp_mask & (1 << (reg*2+1))) + *pc_persp |= 0xf0; + + if (linear_mask & (1 << (reg*2+1))) + *pc_linear |= 0xf0; + } + + debug_printf("pc: %x\n", *pc); + debug_printf("pc_persp: %x\n", *pc_persp); + debug_printf("pc_linear: %x\n", *pc_linear); + + + return is_last_attr; +} + + + +void brw_emit_tri_setup( struct brw_sf_compile *c ) +{ + struct brw_compile *p = &c->func; + unsigned i; + + debug_printf("%s START ==============\n", __FUNCTION__); + + c->nr_verts = 3; + alloc_regs(c); + invert_det(c); + copy_z_inv_w(c); + + + for (i = 0; i < c->nr_setup_regs; i++) + { + /* Pair of incoming attributes: + */ + struct brw_reg a0 = offset(c->vert[0], i); + struct brw_reg a1 = offset(c->vert[1], i); + struct brw_reg a2 = offset(c->vert[2], i); + ushort pc = 0, pc_persp = 0, pc_linear = 0; + boolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + + if (pc_persp) + { + brw_set_predicate_control_flag_value(p, pc_persp); + brw_MUL(p, a0, a0, c->inv_w[0]); + brw_MUL(p, a1, a1, c->inv_w[1]); + brw_MUL(p, a2, a2, c->inv_w[2]); + } + + + /* Calculate coefficients for interpolated values: + */ + if (pc_linear) + { + brw_set_predicate_control_flag_value(p, pc_linear); + + brw_ADD(p, c->a1_sub_a0, a1, negate(a0)); + brw_ADD(p, c->a2_sub_a0, a2, negate(a0)); + + /* calculate dA/dx + */ + brw_MUL(p, brw_null_reg(), c->a1_sub_a0, c->dy2); + brw_MAC(p, c->tmp, c->a2_sub_a0, negate(c->dy0)); + brw_MUL(p, c->m1Cx, c->tmp, c->inv_det); + + /* calculate dA/dy + */ + brw_MUL(p, brw_null_reg(), c->a2_sub_a0, c->dx0); + brw_MAC(p, c->tmp, c->a1_sub_a0, negate(c->dx2)); + brw_MUL(p, c->m2Cy, c->tmp, c->inv_det); + } + + { + brw_set_predicate_control_flag_value(p, pc); + /* start point for interpolation + */ + brw_MOV(p, c->m3C0, a0); + + /* 
Copy m0..m3 to URB. m0 is implicitly copied from r0 in + * the send instruction: + */ + brw_urb_WRITE(p, + brw_null_reg(), + 0, + brw_vec8_grf(0, 0), /* r0, will be copied to m0 */ + 0, /* allocate */ + 1, /* used */ + 4, /* msg len */ + 0, /* response len */ + last, /* eot */ + last, /* writes complete */ + i*4, /* offset */ + BRW_URB_SWIZZLE_TRANSPOSE); /* XXX: Swizzle control "SF to windower" */ + } + } + + debug_printf("%s DONE ==============\n", __FUNCTION__); + +} + + + +void brw_emit_line_setup( struct brw_sf_compile *c ) +{ + struct brw_compile *p = &c->func; + unsigned i; + + + c->nr_verts = 2; + alloc_regs(c); + invert_det(c); + copy_z_inv_w(c); + + for (i = 0; i < c->nr_setup_regs; i++) + { + /* Pair of incoming attributes: + */ + struct brw_reg a0 = offset(c->vert[0], i); + struct brw_reg a1 = offset(c->vert[1], i); + ushort pc, pc_persp, pc_linear; + boolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + + if (pc_persp) + { + brw_set_predicate_control_flag_value(p, pc_persp); + brw_MUL(p, a0, a0, c->inv_w[0]); + brw_MUL(p, a1, a1, c->inv_w[1]); + } + + /* Calculate coefficients for position, color: + */ + if (pc_linear) { + brw_set_predicate_control_flag_value(p, pc_linear); + + brw_ADD(p, c->a1_sub_a0, a1, negate(a0)); + + brw_MUL(p, c->tmp, c->a1_sub_a0, c->dx0); + brw_MUL(p, c->m1Cx, c->tmp, c->inv_det); + + brw_MUL(p, c->tmp, c->a1_sub_a0, c->dy0); + brw_MUL(p, c->m2Cy, c->tmp, c->inv_det); + } + + { + brw_set_predicate_control_flag_value(p, pc); + + /* start point for interpolation + */ + brw_MOV(p, c->m3C0, a0); + + /* Copy m0..m3 to URB. + */ + brw_urb_WRITE(p, + brw_null_reg(), + 0, + brw_vec8_grf(0, 0), + 0, /* allocate */ + 1, /* used */ + 4, /* msg len */ + 0, /* response len */ + last, /* eot */ + last, /* writes complete */ + i*4, /* urb destination offset */ + BRW_URB_SWIZZLE_TRANSPOSE); + } + } +} + + +/* Points setup - several simplifications as all attributes are + * constant across the face of the point (point sprites excluded!) + */ +void brw_emit_point_setup( struct brw_sf_compile *c ) +{ + struct brw_compile *p = &c->func; + unsigned i; + + c->nr_verts = 1; + alloc_regs(c); + copy_z_inv_w(c); + + brw_MOV(p, c->m1Cx, brw_imm_ud(0)); /* zero - move out of loop */ + brw_MOV(p, c->m2Cy, brw_imm_ud(0)); /* zero - move out of loop */ + + for (i = 0; i < c->nr_setup_regs; i++) + { + struct brw_reg a0 = offset(c->vert[0], i); + ushort pc, pc_persp, pc_linear; + boolean last = calculate_masks(c, i, &pc, &pc_persp, &pc_linear); + + if (pc_persp) + { + /* This seems odd as the values are all constant, but the + * fragment shader will be expecting it: + */ + brw_set_predicate_control_flag_value(p, pc_persp); + brw_MUL(p, a0, a0, c->inv_w[0]); + } + + + /* The delta values are always zero, just send the starting + * coordinate. Again, this is to fit in with the interpolation + * code in the fragment shader. + */ + { + brw_set_predicate_control_flag_value(p, pc); + + brw_MOV(p, c->m3C0, a0); /* constant value */ + + /* Copy m0..m3 to URB. 
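+       * The URB_WRITE below is 4 message registers long and is
+       * marked EOT on the final attribute pair.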
+ */ + brw_urb_WRITE(p, + brw_null_reg(), + 0, + brw_vec8_grf(0, 0), + 0, /* allocate */ + 1, /* used */ + 4, /* msg len */ + 0, /* response len */ + last, /* eot */ + last, /* writes complete */ + i*4, /* urb destination offset */ + BRW_URB_SWIZZLE_TRANSPOSE); + } + } +} diff --git a/src/gallium/drivers/i965simple/brw_sf_state.c b/src/gallium/drivers/i965simple/brw_sf_state.c new file mode 100644 index 0000000000..2a5de61c21 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_sf_state.c @@ -0,0 +1,181 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "util/u_math.h" +#include "util/u_memory.h" + + +static void upload_sf_vp(struct brw_context *brw) +{ + struct brw_sf_viewport sfv; + + memset(&sfv, 0, sizeof(sfv)); + + + /* BRW_NEW_VIEWPORT */ + { + const float *scale = brw->attribs.Viewport.scale; + const float *trans = brw->attribs.Viewport.translate; + + sfv.viewport.m00 = scale[0]; + sfv.viewport.m11 = scale[1]; + sfv.viewport.m22 = scale[2]; + sfv.viewport.m30 = trans[0]; + sfv.viewport.m31 = trans[1]; + sfv.viewport.m32 = trans[2]; + } + + /* _NEW_SCISSOR */ + sfv.scissor.xmin = brw->attribs.Scissor.minx; + sfv.scissor.xmax = brw->attribs.Scissor.maxx - 1; + sfv.scissor.ymin = brw->attribs.Scissor.miny; + sfv.scissor.ymax = brw->attribs.Scissor.maxy - 1; + + brw->sf.vp_gs_offset = brw_cache_data( &brw->cache[BRW_SF_VP], &sfv ); +} + +const struct brw_tracked_state brw_sf_vp = { + .dirty = { + .brw = (BRW_NEW_SCISSOR | + BRW_NEW_VIEWPORT), + .cache = 0 + }, + .update = upload_sf_vp +}; + +static void upload_sf_unit( struct brw_context *brw ) +{ + struct brw_sf_unit_state sf; + memset(&sf, 0, sizeof(sf)); + + /* CACHE_NEW_SF_PROG */ + sf.thread0.grf_reg_count = align(brw->sf.prog_data->total_grf, 16) / 16 - 1; + sf.thread0.kernel_start_pointer = brw->sf.prog_gs_offset >> 6; + sf.thread3.urb_entry_read_length = brw->sf.prog_data->urb_read_length; + + sf.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; + sf.thread3.dispatch_grf_start_reg = 3; + sf.thread3.urb_entry_read_offset = 1; + + /* BRW_NEW_URB_FENCE */ + sf.thread4.nr_urb_entries = 
brw->urb.nr_sf_entries;
+   sf.thread4.urb_entry_allocation_size = brw->urb.sfsize - 1;
+   sf.thread4.max_threads = MIN2(12, brw->urb.nr_sf_entries / 2) - 1;
+
+   if (BRW_DEBUG & DEBUG_SINGLE_THREAD)
+      sf.thread4.max_threads = 0;
+
+   if (BRW_DEBUG & DEBUG_STATS)
+      sf.thread4.stats_enable = 1;
+
+   /* CACHE_NEW_SF_VP */
+   sf.sf5.sf_viewport_state_offset = brw->sf.vp_gs_offset >> 5;
+   sf.sf5.viewport_transform = 1;
+
+   /* BRW_NEW_RASTER */
+   if (brw->attribs.Raster->scissor)
+      sf.sf6.scissor = 1;
+
+#if 0
+   if (brw->attribs.Polygon->FrontFace == GL_CCW)
+      sf.sf5.front_winding = BRW_FRONTWINDING_CCW;
+   else
+      sf.sf5.front_winding = BRW_FRONTWINDING_CW;
+
+
+   if (brw->attribs.Polygon->CullFlag) {
+      switch (brw->attribs.Polygon->CullFaceMode) {
+      case GL_FRONT:
+         sf.sf6.cull_mode = BRW_CULLMODE_FRONT;
+         break;
+      case GL_BACK:
+         sf.sf6.cull_mode = BRW_CULLMODE_BACK;
+         break;
+      case GL_FRONT_AND_BACK:
+         sf.sf6.cull_mode = BRW_CULLMODE_BOTH;
+         break;
+      default:
+         assert(0);
+         break;
+      }
+   }
+   else
+      sf.sf6.cull_mode = BRW_CULLMODE_NONE;
+#else
+   sf.sf5.front_winding = BRW_FRONTWINDING_CCW;
+   sf.sf6.cull_mode = BRW_CULLMODE_NONE;
+#endif
+
+   sf.sf6.line_width = CLAMP(brw->attribs.Raster->line_width, 1.0, 5.0) * (1<<1);
+
+   sf.sf6.line_endcap_aa_region_width = 1;
+   if (brw->attribs.Raster->line_smooth)
+      sf.sf6.aa_enable = 1;
+   else if (sf.sf6.line_width <= 0x2)
+      sf.sf6.line_width = 0;
+
+   sf.sf6.point_rast_rule = 1;  /* opengl conventions */
+
+   sf.sf7.sprite_point = brw->attribs.Raster->point_sprite;
+   sf.sf7.point_size = CLAMP(brw->attribs.Raster->point_size, 1.0, 255.0) * (1<<3);
+   sf.sf7.use_point_size_state = !brw->attribs.Raster->point_size_per_vertex;
+
+   /* might be BRW_NEW_PRIMITIVE if we have to adjust pv for polygons:
+    */
+   sf.sf7.trifan_pv = 2;
+   sf.sf7.linestrip_pv = 1;
+   sf.sf7.tristrip_pv = 2;
+   sf.sf7.line_last_pixel_enable = 0;
+
+   /* Set bias for OpenGL rasterization rules:
+    */
+   sf.sf6.dest_org_vbias = 0x8;
+   sf.sf6.dest_org_hbias = 0x8;
+
+   brw->sf.state_gs_offset = brw_cache_data( &brw->cache[BRW_SF_UNIT], &sf );
+}
+
+
+const struct brw_tracked_state brw_sf_unit = {
+   .dirty = {
+      .brw = (BRW_NEW_RASTERIZER |
+              BRW_NEW_URB_FENCE),
+      .cache = (CACHE_NEW_SF_VP |
+                CACHE_NEW_SF_PROG)
+   },
+   .update = upload_sf_unit
+};
+
+
diff --git a/src/gallium/drivers/i965simple/brw_shader_info.c b/src/gallium/drivers/i965simple/brw_shader_info.c
new file mode 100644
index 0000000000..86d877d7ef
--- /dev/null
+++ b/src/gallium/drivers/i965simple/brw_shader_info.c
@@ -0,0 +1,48 @@
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "util/u_memory.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/tgsi_parse.h"
+
+
+/**
+ * XXX this is obsolete now and no longer compiled.
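+ * brw_state.c uses tgsi_scan_shader() to gather this information instead.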
+ */ +void brw_shader_info(const struct tgsi_token *tokens, + struct brw_shader_info *info ) +{ + struct tgsi_parse_context parse; + int done = 0; + + tgsi_parse_init( &parse, tokens ); + + while( !done && + !tgsi_parse_end_of_tokens( &parse ) ) + { + tgsi_parse_token( &parse ); + + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + { + const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration; + unsigned last = decl->DeclarationRange.Last; + + // Broken by crazy wpos init: + //assert( info->nr_regs[decl->Declaration.File] <= last); + + info->nr_regs[decl->Declaration.File] = MAX2(info->nr_regs[decl->Declaration.File], + last+1); + break; + } + case TGSI_TOKEN_TYPE_IMMEDIATE: + case TGSI_TOKEN_TYPE_INSTRUCTION: + default: + done = 1; + break; + } + } + + tgsi_parse_free (&parse); + +} diff --git a/src/gallium/drivers/i965simple/brw_state.c b/src/gallium/drivers/i965simple/brw_state.c new file mode 100644 index 0000000000..af46cb546f --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_state.c @@ -0,0 +1,469 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: Zack Rusin <zack@tungstengraphics.com> + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "pipe/p_winsys.h" +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_parse.h" + +#include "brw_context.h" +#include "brw_defines.h" +#include "brw_state.h" +#include "brw_draw.h" + + +#define DUP( TYPE, VAL ) \ +do { \ + struct TYPE *x = malloc(sizeof(*x)); \ + memcpy(x, VAL, sizeof(*x) ); \ + return x; \ +} while (0) + +/************************************************************************ + * Blend + */ +static void * +brw_create_blend_state(struct pipe_context *pipe, + const struct pipe_blend_state *blend) +{ + DUP( pipe_blend_state, blend ); +} + +static void brw_bind_blend_state(struct pipe_context *pipe, + void *blend) +{ + struct brw_context *brw = brw_context(pipe); + + brw->attribs.Blend = (struct pipe_blend_state*)blend; + brw->state.dirty.brw |= BRW_NEW_BLEND; +} + + +static void brw_delete_blend_state(struct pipe_context *pipe, void *blend) +{ + free(blend); +} + +static void brw_set_blend_color( struct pipe_context *pipe, + const struct pipe_blend_color *blend_color ) +{ + struct brw_context *brw = brw_context(pipe); + + brw->attribs.BlendColor = *blend_color; + + brw->state.dirty.brw |= BRW_NEW_BLEND; +} + +/************************************************************************ + * Sampler + */ + +static void * +brw_create_sampler_state(struct pipe_context *pipe, + const struct pipe_sampler_state *sampler) +{ + DUP( pipe_sampler_state, sampler ); +} + +static void brw_bind_sampler_states(struct pipe_context *pipe, + unsigned num, void **sampler) +{ + struct brw_context *brw = brw_context(pipe); + + assert(num <= PIPE_MAX_SAMPLERS); + + /* Check for no-op */ + if (num == brw->num_samplers && + !memcmp(brw->attribs.Samplers, sampler, num * sizeof(void *))) + return; + + memcpy(brw->attribs.Samplers, sampler, num * sizeof(void *)); + memset(&brw->attribs.Samplers[num], 0, (PIPE_MAX_SAMPLERS - num) * + sizeof(void *)); + + brw->num_samplers = num; + + brw->state.dirty.brw |= BRW_NEW_SAMPLER; +} + +static void brw_delete_sampler_state(struct pipe_context *pipe, + void *sampler) +{ + free(sampler); +} + + +/************************************************************************ + * Depth stencil + */ + +static void * +brw_create_depth_stencil_state(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *depth_stencil) +{ + DUP( pipe_depth_stencil_alpha_state, depth_stencil ); +} + +static void brw_bind_depth_stencil_state(struct pipe_context *pipe, + void *depth_stencil) +{ + struct brw_context *brw = brw_context(pipe); + + brw->attribs.DepthStencil = (const struct pipe_depth_stencil_alpha_state *)depth_stencil; + + brw->state.dirty.brw |= BRW_NEW_DEPTH_STENCIL; +} + +static void brw_delete_depth_stencil_state(struct pipe_context *pipe, + void *depth_stencil) +{ + free(depth_stencil); +} + +/************************************************************************ + * Scissor + */ +static void brw_set_scissor_state( struct pipe_context *pipe, + const struct pipe_scissor_state *scissor ) +{ + struct brw_context *brw = brw_context(pipe); + + memcpy( &brw->attribs.Scissor, scissor, sizeof(*scissor) ); + brw->state.dirty.brw |= BRW_NEW_SCISSOR; +} + + +/************************************************************************ + * Stipple + */ + +static void 
brw_set_polygon_stipple( struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple ) +{ +} + + +/************************************************************************ + * Fragment shader + */ + +static void * brw_create_fs_state(struct pipe_context *pipe, + const struct pipe_shader_state *shader) +{ + struct brw_fragment_program *brw_fp = CALLOC_STRUCT(brw_fragment_program); + + brw_fp->program.tokens = tgsi_dup_tokens(shader->tokens); + brw_fp->id = brw_context(pipe)->program_id++; + + tgsi_scan_shader(shader->tokens, &brw_fp->info); + +#if 0 + brw_shader_info(shader->tokens, + &brw_fp->info2); +#endif + + tgsi_dump(shader->tokens, 0); + + + return (void *)brw_fp; +} + +static void brw_bind_fs_state(struct pipe_context *pipe, void *shader) +{ + struct brw_context *brw = brw_context(pipe); + + brw->attribs.FragmentProgram = (struct brw_fragment_program *)shader; + brw->state.dirty.brw |= BRW_NEW_FS; +} + +static void brw_delete_fs_state(struct pipe_context *pipe, void *shader) +{ + struct brw_fragment_program *brw_fp = (struct brw_fragment_program *) shader; + + FREE((void *) brw_fp->program.tokens); + FREE(brw_fp); +} + + +/************************************************************************ + * Vertex shader and other TNL state + */ + +static void *brw_create_vs_state(struct pipe_context *pipe, + const struct pipe_shader_state *shader) +{ + struct brw_vertex_program *brw_vp = CALLOC_STRUCT(brw_vertex_program); + + brw_vp->program.tokens = tgsi_dup_tokens(shader->tokens); + brw_vp->id = brw_context(pipe)->program_id++; + + tgsi_scan_shader(shader->tokens, &brw_vp->info); + +#if 0 + brw_shader_info(shader->tokens, + &brw_vp->info2); +#endif + tgsi_dump(shader->tokens, 0); + + return (void *)brw_vp; +} + +static void brw_bind_vs_state(struct pipe_context *pipe, void *vs) +{ + struct brw_context *brw = brw_context(pipe); + + brw->attribs.VertexProgram = (struct brw_vertex_program *)vs; + brw->state.dirty.brw |= BRW_NEW_VS; + + debug_printf("YYYYYYYYYYYYY BINDING VERTEX SHADER\n"); +} + +static void brw_delete_vs_state(struct pipe_context *pipe, void *shader) +{ + struct brw_vertex_program *brw_vp = (struct brw_vertex_program *) shader; + + FREE((void *) brw_vp->program.tokens); + FREE(brw_vp); +} + + +static void brw_set_clip_state( struct pipe_context *pipe, + const struct pipe_clip_state *clip ) +{ + struct brw_context *brw = brw_context(pipe); + + brw->attribs.Clip = *clip; +} + + +static void brw_set_viewport_state( struct pipe_context *pipe, + const struct pipe_viewport_state *viewport ) +{ + struct brw_context *brw = brw_context(pipe); + + brw->attribs.Viewport = *viewport; /* struct copy */ + brw->state.dirty.brw |= BRW_NEW_VIEWPORT; + + /* pass the viewport info to the draw module */ + //draw_set_viewport_state(brw->draw, viewport); +} + + +static void brw_set_vertex_buffers(struct pipe_context *pipe, + unsigned count, + const struct pipe_vertex_buffer *buffers) +{ + struct brw_context *brw = brw_context(pipe); + memcpy(brw->vb.vbo_array, buffers, count * sizeof(buffers[0])); +} + +static void brw_set_vertex_elements(struct pipe_context *pipe, + unsigned count, + const struct pipe_vertex_element *elements) +{ + /* flush ? 
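+    * No flush is done here; the translated elements simply overwrite
+    * brw->vb.inputs[] below.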
*/ + struct brw_context *brw = brw_context(pipe); + uint i; + + assert(count <= PIPE_MAX_ATTRIBS); + + for (i = 0; i < count; i++) { + struct brw_vertex_element_state el; + memset(&el, 0, sizeof(el)); + + el.ve0.src_offset = elements[i].src_offset; + el.ve0.src_format = brw_translate_surface_format(elements[i].src_format); + el.ve0.valid = 1; + el.ve0.vertex_buffer_index = elements[i].vertex_buffer_index; + + el.ve1.dst_offset = i * 4; + + el.ve1.vfcomponent3 = BRW_VFCOMPONENT_STORE_SRC; + el.ve1.vfcomponent2 = BRW_VFCOMPONENT_STORE_SRC; + el.ve1.vfcomponent1 = BRW_VFCOMPONENT_STORE_SRC; + el.ve1.vfcomponent0 = BRW_VFCOMPONENT_STORE_SRC; + + switch (elements[i].nr_components) { + case 1: el.ve1.vfcomponent1 = BRW_VFCOMPONENT_STORE_0; + case 2: el.ve1.vfcomponent2 = BRW_VFCOMPONENT_STORE_0; + case 3: el.ve1.vfcomponent3 = BRW_VFCOMPONENT_STORE_1_FLT; + break; + } + + brw->vb.inputs[i] = el; + } +} + + + +/************************************************************************ + * Constant buffers + */ + +static void brw_set_constant_buffer(struct pipe_context *pipe, + uint shader, uint index, + const struct pipe_constant_buffer *buf) +{ + struct brw_context *brw = brw_context(pipe); + + assert(buf == 0 || index == 0); + + brw->attribs.Constants[shader] = buf; + brw->state.dirty.brw |= BRW_NEW_CONSTANTS; +} + + +/************************************************************************ + * Texture surfaces + */ + + +static void brw_set_sampler_textures(struct pipe_context *pipe, + unsigned num, + struct pipe_texture **texture) +{ + struct brw_context *brw = brw_context(pipe); + uint i; + + assert(num <= PIPE_MAX_SAMPLERS); + + /* Check for no-op */ + if (num == brw->num_textures && + !memcmp(brw->attribs.Texture, texture, num * + sizeof(struct pipe_texture *))) + return; + + for (i = 0; i < num; i++) + pipe_texture_reference((struct pipe_texture **) &brw->attribs.Texture[i], + texture[i]); + + for (i = num; i < brw->num_textures; i++) + pipe_texture_reference((struct pipe_texture **) &brw->attribs.Texture[i], + NULL); + + brw->num_textures = num; + + brw->state.dirty.brw |= BRW_NEW_TEXTURE; +} + + +/************************************************************************ + * Render targets, etc + */ + +static void brw_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct brw_context *brw = brw_context(pipe); + + brw->attribs.FrameBuffer = *fb; /* struct copy */ + + brw->state.dirty.brw |= BRW_NEW_FRAMEBUFFER; +} + + + +/************************************************************************ + * Rasterizer state + */ + +static void * +brw_create_rasterizer_state(struct pipe_context *pipe, + const struct pipe_rasterizer_state *rasterizer) +{ + DUP(pipe_rasterizer_state, rasterizer); +} + +static void brw_bind_rasterizer_state( struct pipe_context *pipe, + void *setup ) +{ + struct brw_context *brw = brw_context(pipe); + + brw->attribs.Raster = (struct pipe_rasterizer_state *)setup; + + /* Also pass-through to draw module: + */ + //draw_set_rasterizer_state(brw->draw, setup); + + brw->state.dirty.brw |= BRW_NEW_RASTERIZER; +} + +static void brw_delete_rasterizer_state(struct pipe_context *pipe, + void *setup) +{ + free(setup); +} + + + +void +brw_init_state_functions( struct brw_context *brw ) +{ + brw->pipe.create_blend_state = brw_create_blend_state; + brw->pipe.bind_blend_state = brw_bind_blend_state; + brw->pipe.delete_blend_state = brw_delete_blend_state; + + brw->pipe.create_sampler_state = brw_create_sampler_state; + 
brw->pipe.bind_sampler_states = brw_bind_sampler_states; + brw->pipe.delete_sampler_state = brw_delete_sampler_state; + + brw->pipe.create_depth_stencil_alpha_state = brw_create_depth_stencil_state; + brw->pipe.bind_depth_stencil_alpha_state = brw_bind_depth_stencil_state; + brw->pipe.delete_depth_stencil_alpha_state = brw_delete_depth_stencil_state; + + brw->pipe.create_rasterizer_state = brw_create_rasterizer_state; + brw->pipe.bind_rasterizer_state = brw_bind_rasterizer_state; + brw->pipe.delete_rasterizer_state = brw_delete_rasterizer_state; + brw->pipe.create_fs_state = brw_create_fs_state; + brw->pipe.bind_fs_state = brw_bind_fs_state; + brw->pipe.delete_fs_state = brw_delete_fs_state; + brw->pipe.create_vs_state = brw_create_vs_state; + brw->pipe.bind_vs_state = brw_bind_vs_state; + brw->pipe.delete_vs_state = brw_delete_vs_state; + + brw->pipe.set_blend_color = brw_set_blend_color; + brw->pipe.set_clip_state = brw_set_clip_state; + brw->pipe.set_constant_buffer = brw_set_constant_buffer; + brw->pipe.set_framebuffer_state = brw_set_framebuffer_state; + +// brw->pipe.set_feedback_state = brw_set_feedback_state; +// brw->pipe.set_feedback_buffer = brw_set_feedback_buffer; + + brw->pipe.set_polygon_stipple = brw_set_polygon_stipple; + brw->pipe.set_scissor_state = brw_set_scissor_state; + brw->pipe.set_sampler_textures = brw_set_sampler_textures; + brw->pipe.set_viewport_state = brw_set_viewport_state; + brw->pipe.set_vertex_buffers = brw_set_vertex_buffers; + brw->pipe.set_vertex_elements = brw_set_vertex_elements; +} diff --git a/src/gallium/drivers/i965simple/brw_state.h b/src/gallium/drivers/i965simple/brw_state.h new file mode 100644 index 0000000000..de0a6371b8 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_state.h @@ -0,0 +1,151 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#ifndef BRW_STATE_H +#define BRW_STATE_H + +#include "brw_context.h" +#include "brw_winsys.h" + + +const struct brw_tracked_state brw_blend_constant_color; +const struct brw_tracked_state brw_cc_unit; +const struct brw_tracked_state brw_cc_vp; +const struct brw_tracked_state brw_clip_prog; +const struct brw_tracked_state brw_clip_unit; +const struct brw_tracked_state brw_constant_buffer_state; +const struct brw_tracked_state brw_constant_buffer; +const struct brw_tracked_state brw_curbe_offsets; +const struct brw_tracked_state brw_invarient_state; +const struct brw_tracked_state brw_gs_prog; +const struct brw_tracked_state brw_gs_unit; +const struct brw_tracked_state brw_drawing_rect; +const struct brw_tracked_state brw_line_stipple; +const struct brw_tracked_state brw_pipelined_state_pointers; +const struct brw_tracked_state brw_binding_table_pointers; +const struct brw_tracked_state brw_depthbuffer; +const struct brw_tracked_state brw_polygon_stipple_offset; +const struct brw_tracked_state brw_polygon_stipple; +const struct brw_tracked_state brw_program_parameters; +const struct brw_tracked_state brw_recalculate_urb_fence; +const struct brw_tracked_state brw_sf_prog; +const struct brw_tracked_state brw_sf_unit; +const struct brw_tracked_state brw_sf_vp; +const struct brw_tracked_state brw_state_base_address; +const struct brw_tracked_state brw_urb_fence; +const struct brw_tracked_state brw_vertex_state; +const struct brw_tracked_state brw_vs_prog; +const struct brw_tracked_state brw_vs_unit; +const struct brw_tracked_state brw_wm_prog; +const struct brw_tracked_state brw_wm_samplers; +const struct brw_tracked_state brw_wm_surfaces; +const struct brw_tracked_state brw_wm_unit; + +const struct brw_tracked_state brw_psp_urb_cbs; + +const struct brw_tracked_state brw_active_vertprog; +const struct brw_tracked_state brw_tnl_vertprog; +const struct brw_tracked_state brw_pipe_control; + +const struct brw_tracked_state brw_clear_surface_cache; +const struct brw_tracked_state brw_clear_batch_cache; + +/*********************************************************************** + * brw_state_cache.c + */ +unsigned brw_cache_data(struct brw_cache *cache, + const void *data ); + +unsigned brw_cache_data_sz(struct brw_cache *cache, + const void *data, + unsigned data_sz); + +unsigned brw_upload_cache( struct brw_cache *cache, + const void *key, + unsigned key_sz, + const void *data, + unsigned data_sz, + const void *aux, + void *aux_return ); + +boolean brw_search_cache( struct brw_cache *cache, + const void *key, + unsigned key_size, + void *aux_return, + unsigned *offset_return); + +void brw_init_caches( struct brw_context *brw ); +void brw_destroy_caches( struct brw_context *brw ); + +static inline struct pipe_buffer *brw_cache_buffer(struct brw_context *brw, + enum brw_cache_id id) +{ + return brw->cache[id].pool->buffer; +} + +/*********************************************************************** + * brw_state_batch.c + */ +#define BRW_CACHED_BATCH_STRUCT(brw, s) brw_cached_batch_struct( brw, (s), sizeof(*(s)) ) + +boolean brw_cached_batch_struct( struct brw_context *brw, + const void *data, + unsigned sz ); + +void brw_destroy_batch_cache( struct brw_context *brw ); + + +/*********************************************************************** + * brw_state_pool.c + */ +void brw_init_pools( struct brw_context *brw ); +void brw_destroy_pools( struct 
brw_context *brw ); + +boolean brw_pool_alloc( struct brw_mem_pool *pool, + unsigned size, + unsigned alignment, + unsigned *offset_return); + +void brw_pool_fence( struct brw_context *brw, + struct brw_mem_pool *pool, + unsigned fence ); + + +void brw_pool_check_wrap( struct brw_context *brw, + struct brw_mem_pool *pool ); + +void brw_clear_all_caches( struct brw_context *brw ); +void brw_invalidate_pools( struct brw_context *brw ); +void brw_clear_batch_cache_flush( struct brw_context *brw ); + + +#endif diff --git a/src/gallium/drivers/i965simple/brw_state_batch.c b/src/gallium/drivers/i965simple/brw_state_batch.c new file mode 100644 index 0000000000..43a1c89fc4 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_state_batch.c @@ -0,0 +1,113 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "brw_state.h" +#include "brw_winsys.h" + +#include "util/u_memory.h" + +/* A facility similar to the data caching code above, which aims to + * prevent identical commands being issued repeatedly. 
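+ *
+ * In outline (a hedged sketch of the function below, not a verbatim
+ * copy): one cached copy of the last-emitted packet is kept per opcode,
+ * and the packet only goes into the batchbuffer again when its bytes
+ * actually changed:
+ *
+ *    if (item->sz != sz) {
+ *       FREE(item->header);            /* size changed: replace the copy */
+ *       item->header = MALLOC(sz);
+ *       item->sz = sz;
+ *    }
+ *    else if (memcmp(item->header, data, sz) == 0)
+ *       return FALSE;                  /* identical packet: skip the emit */
+ *    memcpy(item->header, data, sz);
+ *    brw_batchbuffer_data(brw->winsys, data, sz);
+ *    return TRUE;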
+ */ +boolean brw_cached_batch_struct( struct brw_context *brw, + const void *data, + unsigned sz ) +{ + struct brw_cached_batch_item *item = brw->cached_batch_items; + struct header *newheader = (struct header *)data; + + if (brw->emit_state_always) { + brw_batchbuffer_data(brw->winsys, data, sz); + return TRUE; + } + + while (item) { + if (item->header->opcode == newheader->opcode) { + if (item->sz == sz && memcmp(item->header, newheader, sz) == 0) + return FALSE; + if (item->sz != sz) { + FREE(item->header); + item->header = MALLOC(sz); + item->sz = sz; + } + goto emit; + } + item = item->next; + } + + assert(!item); + item = CALLOC_STRUCT(brw_cached_batch_item); + item->header = MALLOC(sz); + item->sz = sz; + item->next = brw->cached_batch_items; + brw->cached_batch_items = item; + +emit: + memcpy(item->header, newheader, sz); + brw_batchbuffer_data(brw->winsys, data, sz); + return TRUE; +} + +static void clear_batch_cache( struct brw_context *brw ) +{ + struct brw_cached_batch_item *item = brw->cached_batch_items; + + while (item) { + struct brw_cached_batch_item *next = item->next; + free((void *)item->header); + free(item); + item = next; + } + + brw->cached_batch_items = NULL; + + + brw_clear_all_caches(brw); + + brw_invalidate_pools(brw); +} + +void brw_clear_batch_cache_flush( struct brw_context *brw ) +{ + clear_batch_cache(brw); + +/* brw_do_flush(brw, BRW_FLUSH_STATE_CACHE|BRW_FLUSH_READ_CACHE); */ + + brw->state.dirty.brw |= ~0; + brw->state.dirty.cache |= ~0; +} + + + +void brw_destroy_batch_cache( struct brw_context *brw ) +{ + clear_batch_cache(brw); +} diff --git a/src/gallium/drivers/i965simple/brw_state_cache.c b/src/gallium/drivers/i965simple/brw_state_cache.c new file mode 100644 index 0000000000..094248fa69 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_state_cache.c @@ -0,0 +1,443 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_state.h" + +#include "brw_wm.h" +#include "brw_vs.h" +#include "brw_clip.h" +#include "brw_sf.h" +#include "brw_gs.h" + +#include "util/u_memory.h" + + + +/*********************************************************************** + * Check cache for uploaded version of struct, else upload new one. + * Fail when memory is exhausted. + * + * XXX: FIXME: Currently search is so slow it would be quicker to + * regenerate the data every time... + */ + +static unsigned hash_key( const void *key, unsigned key_size ) +{ + unsigned *ikey = (unsigned *)key; + unsigned hash = 0, i; + + assert(key_size % 4 == 0); + + /* I'm sure this can be improved on: + */ + for (i = 0; i < key_size/4; i++) + hash ^= ikey[i]; + + return hash; +} + +static struct brw_cache_item *search_cache( struct brw_cache *cache, + unsigned hash, + const void *key, + unsigned key_size) +{ + struct brw_cache_item *c; + + for (c = cache->items[hash % cache->size]; c; c = c->next) { + if (c->hash == hash && + c->key_size == key_size && + memcmp(c->key, key, key_size) == 0) + return c; + } + + return NULL; +} + + +static void rehash( struct brw_cache *cache ) +{ + struct brw_cache_item **items; + struct brw_cache_item *c, *next; + unsigned size, i; + + size = cache->size * 3; + items = (struct brw_cache_item**) MALLOC(size * sizeof(*items)); + memset(items, 0, size * sizeof(*items)); + + for (i = 0; i < cache->size; i++) + for (c = cache->items[i]; c; c = next) { + next = c->next; + c->next = items[c->hash % size]; + items[c->hash % size] = c; + } + + FREE(cache->items); + cache->items = items; + cache->size = size; +} + + +boolean brw_search_cache( struct brw_cache *cache, + const void *key, + unsigned key_size, + void *aux_return, + unsigned *offset_return) +{ + struct brw_cache_item *item; + unsigned addr = 0; + unsigned hash = hash_key(key, key_size); + + item = search_cache(cache, hash, key, key_size); + + if (item) { + if (aux_return) + *(void **)aux_return = (void *)((char *)item->key + item->key_size); + + *offset_return = addr = item->offset; + } + + if (item == NULL || addr != cache->last_addr) { + cache->brw->state.dirty.cache |= 1<<cache->id; + cache->last_addr = addr; + } + + return item != NULL; +} + +unsigned brw_upload_cache( struct brw_cache *cache, + const void *key, + unsigned key_size, + const void *data, + unsigned data_size, + const void *aux, + void *aux_return ) +{ + unsigned offset; + struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item); + unsigned hash = hash_key(key, key_size); + void *tmp = MALLOC(key_size + cache->aux_size); + + if (!brw_pool_alloc(cache->pool, data_size, 1 << 6, &offset)) { + /* Should not be possible: + */ + debug_printf("brw_pool_alloc failed\n"); + exit(1); + } + + memcpy(tmp, key, key_size); + + if (cache->aux_size) + memcpy(tmp+key_size, aux, cache->aux_size); + + item->key = tmp; + item->hash = hash; + item->key_size = key_size; + item->offset = offset; + item->data_size = data_size; + + if (++cache->n_items > cache->size * 1.5) + rehash(cache); + + hash %= cache->size; + item->next = cache->items[hash]; + cache->items[hash] = item; + + if (aux_return) { + assert(cache->aux_size); + *(void **)aux_return = (void *)((char *)item->key + item->key_size); + } + + if (BRW_DEBUG & DEBUG_STATE) + debug_printf("upload %s: %d bytes to pool buffer %p offset %x\n", + cache->name, + data_size, + (void*)cache->pool->buffer, + 
offset); + + /* Copy data to the buffer: + */ + cache->brw->winsys->buffer_subdata_typed(cache->brw->winsys, + cache->pool->buffer, + offset, + data_size, + data, + cache->id); + + cache->brw->state.dirty.cache |= 1<<cache->id; + cache->last_addr = offset; + + return offset; +} + +/* This doesn't really work with aux data. Use search/upload instead + */ +unsigned brw_cache_data_sz(struct brw_cache *cache, + const void *data, + unsigned data_size) +{ + unsigned addr; + + if (!brw_search_cache(cache, data, data_size, NULL, &addr)) { + addr = brw_upload_cache(cache, + data, data_size, + data, data_size, + NULL, NULL); + } + + return addr; +} + +unsigned brw_cache_data(struct brw_cache *cache, + const void *data) +{ + return brw_cache_data_sz(cache, data, cache->key_size); +} + +enum pool_type { + DW_SURFACE_STATE, + DW_GENERAL_STATE +}; + +static void brw_init_cache( struct brw_context *brw, + const char *name, + unsigned id, + unsigned key_size, + unsigned aux_size, + enum pool_type pool_type) +{ + struct brw_cache *cache = &brw->cache[id]; + cache->brw = brw; + cache->id = id; + cache->name = name; + cache->items = NULL; + + cache->size = 7; + cache->n_items = 0; + cache->items = (struct brw_cache_item **) + CALLOC(cache->size, sizeof(struct brw_cache_item)); + + + cache->key_size = key_size; + cache->aux_size = aux_size; + switch (pool_type) { + case DW_GENERAL_STATE: cache->pool = &brw->pool[BRW_GS_POOL]; break; + case DW_SURFACE_STATE: cache->pool = &brw->pool[BRW_SS_POOL]; break; + default: assert(0); break; + } +} + +void brw_init_caches( struct brw_context *brw ) +{ + + brw_init_cache(brw, + "CC_VP", + BRW_CC_VP, + sizeof(struct brw_cc_viewport), + 0, + DW_GENERAL_STATE); + + brw_init_cache(brw, + "CC_UNIT", + BRW_CC_UNIT, + sizeof(struct brw_cc_unit_state), + 0, + DW_GENERAL_STATE); + + brw_init_cache(brw, + "WM_PROG", + BRW_WM_PROG, + sizeof(struct brw_wm_prog_key), + sizeof(struct brw_wm_prog_data), + DW_GENERAL_STATE); + + brw_init_cache(brw, + "SAMPLER_DEFAULT_COLOR", + BRW_SAMPLER_DEFAULT_COLOR, + sizeof(struct brw_sampler_default_color), + 0, + DW_GENERAL_STATE); + + brw_init_cache(brw, + "SAMPLER", + BRW_SAMPLER, + 0, /* variable key/data size */ + 0, + DW_GENERAL_STATE); + + brw_init_cache(brw, + "WM_UNIT", + BRW_WM_UNIT, + sizeof(struct brw_wm_unit_state), + 0, + DW_GENERAL_STATE); + + brw_init_cache(brw, + "SF_PROG", + BRW_SF_PROG, + sizeof(struct brw_sf_prog_key), + sizeof(struct brw_sf_prog_data), + DW_GENERAL_STATE); + + brw_init_cache(brw, + "SF_VP", + BRW_SF_VP, + sizeof(struct brw_sf_viewport), + 0, + DW_GENERAL_STATE); + + brw_init_cache(brw, + "SF_UNIT", + BRW_SF_UNIT, + sizeof(struct brw_sf_unit_state), + 0, + DW_GENERAL_STATE); + + brw_init_cache(brw, + "VS_UNIT", + BRW_VS_UNIT, + sizeof(struct brw_vs_unit_state), + 0, + DW_GENERAL_STATE); + + brw_init_cache(brw, + "VS_PROG", + BRW_VS_PROG, + sizeof(struct brw_vs_prog_key), + sizeof(struct brw_vs_prog_data), + DW_GENERAL_STATE); + + brw_init_cache(brw, + "CLIP_UNIT", + BRW_CLIP_UNIT, + sizeof(struct brw_clip_unit_state), + 0, + DW_GENERAL_STATE); + + brw_init_cache(brw, + "CLIP_PROG", + BRW_CLIP_PROG, + sizeof(struct brw_clip_prog_key), + sizeof(struct brw_clip_prog_data), + DW_GENERAL_STATE); + + brw_init_cache(brw, + "GS_UNIT", + BRW_GS_UNIT, + sizeof(struct brw_gs_unit_state), + 0, + DW_GENERAL_STATE); + + brw_init_cache(brw, + "GS_PROG", + BRW_GS_PROG, + sizeof(struct brw_gs_prog_key), + sizeof(struct brw_gs_prog_data), + DW_GENERAL_STATE); + + brw_init_cache(brw, + "SS_SURFACE", + BRW_SS_SURFACE, + 
sizeof(struct brw_surface_state), + 0, + DW_SURFACE_STATE); + + brw_init_cache(brw, + "SS_SURF_BIND", + BRW_SS_SURF_BIND, + sizeof(struct brw_surface_binding_table), + 0, + DW_SURFACE_STATE); +} + + +/* When we lose hardware context, need to invalidate the surface cache + * as these structs must be explicitly re-uploaded. They are subject + * to fixup by the memory manager as they contain absolute agp + * offsets, so we need to ensure there is a fresh version of the + * struct available to receive the fixup. + * + * XXX: Need to ensure that there aren't two versions of a surface or + * bufferobj with different backing data active in the same buffer at + * once? Otherwise the cache could confuse them. Maybe better not to + * cache at all? + * + * --> Isn't this the same as saying need to ensure batch is flushed + * before new data is uploaded to an existing buffer? We + * already try to make sure of that. + */ +static void clear_cache( struct brw_cache *cache ) +{ + struct brw_cache_item *c, *next; + unsigned i; + + for (i = 0; i < cache->size; i++) { + for (c = cache->items[i]; c; c = next) { + next = c->next; + free((void *)c->key); + free(c); + } + cache->items[i] = NULL; + } + + cache->n_items = 0; +} + +void brw_clear_all_caches( struct brw_context *brw ) +{ + int i; + + if (BRW_DEBUG & DEBUG_STATE) + debug_printf("%s\n", __FUNCTION__); + + for (i = 0; i < BRW_MAX_CACHE; i++) + clear_cache(&brw->cache[i]); + + if (brw->curbe.last_buf) { + FREE(brw->curbe.last_buf); + brw->curbe.last_buf = NULL; + } + + brw->state.dirty.brw |= ~0; + brw->state.dirty.cache |= ~0; +} + + + + + +void brw_destroy_caches( struct brw_context *brw ) +{ + unsigned i; + + for (i = 0; i < BRW_MAX_CACHE; i++) + clear_cache(&brw->cache[i]); +} diff --git a/src/gallium/drivers/i965simple/brw_state_pool.c b/src/gallium/drivers/i965simple/brw_state_pool.c new file mode 100644 index 0000000000..007dc8f9de --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_state_pool.c @@ -0,0 +1,138 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +/** @file brw_state_pool.c + * Implements the state pool allocator. + * + * For the 965, we create two state pools for state cache entries. 
Objects + * will be allocated into the pools depending on which state base address + * their pointer is relative to in other 965 state. + * + * The state pools are relatively simple: As objects are allocated, increment + * the offset to allocate space. When the pool is "full" (rather, close to + * full), we reset the pool and reset the state cache entries that point into + * the pool. + */ + +#include "pipe/p_winsys.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "brw_context.h" +#include "brw_state.h" + +boolean brw_pool_alloc( struct brw_mem_pool *pool, + unsigned size, + unsigned alignment, + unsigned *offset_return) +{ + unsigned fixup = align(pool->offset, alignment) - pool->offset; + + size = align(size, 4); + + if (pool->offset + fixup + size >= pool->size) { + debug_printf("%s failed\n", __FUNCTION__); + assert(0); + exit(0); + } + + pool->offset += fixup; + *offset_return = pool->offset; + pool->offset += size; + + return TRUE; +} + +static +void brw_invalidate_pool( struct brw_mem_pool *pool ) +{ + if (BRW_DEBUG & DEBUG_STATE) + debug_printf("\n\n\n %s \n\n\n", __FUNCTION__); + + pool->offset = 0; + + brw_clear_all_caches(pool->brw); +} + + +static void brw_init_pool( struct brw_context *brw, + unsigned pool_id, + unsigned size ) +{ + struct brw_mem_pool *pool = &brw->pool[pool_id]; + + pool->size = size; + pool->brw = brw; + + pool->buffer = pipe_buffer_create(brw->pipe.screen, + 4096, + 0 /* DRM_BO_FLAG_MEM_TT */, + size); +} + +static void brw_destroy_pool( struct brw_context *brw, + unsigned pool_id ) +{ + struct brw_mem_pool *pool = &brw->pool[pool_id]; + + pipe_buffer_reference( pool->brw->pipe.screen, + &pool->buffer, + NULL ); +} + + +void brw_pool_check_wrap( struct brw_context *brw, + struct brw_mem_pool *pool ) +{ + if (pool->offset > (pool->size * 3) / 4) { + brw->state.dirty.brw |= BRW_NEW_SCENE; + } + +} + +void brw_init_pools( struct brw_context *brw ) +{ + brw_init_pool(brw, BRW_GS_POOL, 0x80000); + brw_init_pool(brw, BRW_SS_POOL, 0x80000); +} + +void brw_destroy_pools( struct brw_context *brw ) +{ + brw_destroy_pool(brw, BRW_GS_POOL); + brw_destroy_pool(brw, BRW_SS_POOL); +} + + +void brw_invalidate_pools( struct brw_context *brw ) +{ + brw_invalidate_pool(&brw->pool[BRW_GS_POOL]); + brw_invalidate_pool(&brw->pool[BRW_SS_POOL]); +} diff --git a/src/gallium/drivers/i965simple/brw_state_upload.c b/src/gallium/drivers/i965simple/brw_state_upload.c new file mode 100644 index 0000000000..bac9161b5f --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_state_upload.c @@ -0,0 +1,202 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_context.h" +#include "brw_state.h" + +#include "util/u_memory.h" + +/* This is used to initialize brw->state.atoms[]. We could use this + * list directly except for a single atom, brw_constant_buffer, which + * has a .dirty value which changes according to the parameters of the + * current fragment and vertex programs, and so cannot be a static + * value. + */ +const struct brw_tracked_state *atoms[] = +{ + &brw_vs_prog, + &brw_gs_prog, + &brw_clip_prog, + &brw_sf_prog, + &brw_wm_prog, + + /* Once all the programs are done, we know how large urb entry + * sizes need to be and can decide if we need to change the urb + * layout. + */ + &brw_curbe_offsets, + &brw_recalculate_urb_fence, + + + &brw_cc_vp, + &brw_cc_unit, + + &brw_wm_surfaces, /* must do before samplers */ + &brw_wm_samplers, + + &brw_wm_unit, + &brw_sf_vp, + &brw_sf_unit, + &brw_vs_unit, /* always required, enabled or not */ + &brw_clip_unit, + &brw_gs_unit, + + /* Command packets: + */ + &brw_invarient_state, + &brw_state_base_address, + &brw_pipe_control, + + &brw_binding_table_pointers, + &brw_blend_constant_color, + + &brw_drawing_rect, + &brw_depthbuffer, + + &brw_polygon_stipple, + &brw_line_stipple, + + &brw_psp_urb_cbs, + + &brw_constant_buffer +}; + + +void brw_init_state( struct brw_context *brw ) +{ + brw_init_pools(brw); + brw_init_caches(brw); + + brw->state.dirty.brw = ~0; + brw->emit_state_always = 0; +} + + +void brw_destroy_state( struct brw_context *brw ) +{ + brw_destroy_caches(brw); + brw_destroy_batch_cache(brw); + brw_destroy_pools(brw); +} + +/*********************************************************************** + */ + +static boolean check_state( const struct brw_state_flags *a, + const struct brw_state_flags *b ) +{ + return ((a->brw & b->brw) || + (a->cache & b->cache)); +} + +static void accumulate_state( struct brw_state_flags *a, + const struct brw_state_flags *b ) +{ + a->brw |= b->brw; + a->cache |= b->cache; +} + + +static void xor_states( struct brw_state_flags *result, + const struct brw_state_flags *a, + const struct brw_state_flags *b ) +{ + result->brw = a->brw ^ b->brw; + result->cache = a->cache ^ b->cache; +} + + +/*********************************************************************** + * Emit all state: + */ +void brw_validate_state( struct brw_context *brw ) +{ + struct brw_state_flags *state = &brw->state.dirty; + unsigned i; + + if (brw->emit_state_always) + state->brw |= ~0; + + if (state->cache == 0 && + state->brw == 0) + return; + + if (brw->state.dirty.brw & BRW_NEW_SCENE) + brw_clear_batch_cache_flush(brw); + + if (BRW_DEBUG) { + /* Debug version which enforces various sanity checks on the + * state flags which are generated and checked to help ensure + * state atoms are ordered correctly in the list. 
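+ *
+ * The invariant, sketched (treating the brw/cache flag words as one bit
+ * set; hypothetical pseudo-C, the real loop follows): once an atom's
+ * dirty bits have been examined, no later atom in this pass may set
+ * those bits again, otherwise the earlier atom ran against stale state.
+ *
+ *    if (check_state(state, &atom->dirty))
+ *       atom->update(brw);
+ *    examined |= atom->dirty;          /* bits already tested this pass */
+ *    generated = prev ^ *state;        /* bits newly set by update()    */
+ *    assert(!(examined & generated));  /* nothing examined re-dirtied   */
+ *    prev = *state;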
+ */ + struct brw_state_flags examined, prev; + memset(&examined, 0, sizeof(examined)); + prev = *state; + + for (i = 0; i < Elements(atoms); i++) { + const struct brw_tracked_state *atom = atoms[i]; + struct brw_state_flags generated; + + assert(atom->dirty.brw || + atom->dirty.cache); + assert(atom->update); + + if (check_state(state, &atom->dirty)) { + atom->update( brw ); + } + + accumulate_state(&examined, &atom->dirty); + + /* generated = (prev ^ state) + * if (examined & generated) + * fail; + */ + xor_states(&generated, &prev, state); + assert(!check_state(&examined, &generated)); + prev = *state; + } + } + else { + for (i = 0; i < Elements(atoms); i++) { + const struct brw_tracked_state *atom = atoms[i]; + + assert(atom->dirty.brw || + atom->dirty.cache); + assert(atom->update); + + if (check_state(state, &atom->dirty)) + atom->update( brw ); + } + } + + memset(state, 0, sizeof(*state)); +} diff --git a/src/gallium/drivers/i965simple/brw_structs.h b/src/gallium/drivers/i965simple/brw_structs.h new file mode 100644 index 0000000000..bbb087e95d --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_structs.h @@ -0,0 +1,1348 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#ifndef BRW_STRUCTS_H +#define BRW_STRUCTS_H + +#include "pipe/p_compiler.h" + +/* Command packets: + */ +struct header +{ + unsigned length:16; + unsigned opcode:16; +}; + + +union header_union +{ + struct header bits; + unsigned dword; +}; + +struct brw_3d_control +{ + struct + { + unsigned length:8; + unsigned notify_enable:1; + unsigned pad:3; + unsigned wc_flush_enable:1; + unsigned depth_stall_enable:1; + unsigned operation:2; + unsigned opcode:16; + } header; + + struct + { + unsigned pad:2; + unsigned dest_addr_type:1; + unsigned dest_addr:29; + } dest; + + unsigned dword2; + unsigned dword3; +}; + + +struct brw_3d_primitive +{ + struct + { + unsigned length:8; + unsigned pad:2; + unsigned topology:5; + unsigned indexed:1; + unsigned opcode:16; + } header; + + unsigned verts_per_instance; + unsigned start_vert_location; + unsigned instance_count; + unsigned start_instance_location; + unsigned base_vert_location; +}; + +/* These seem to be passed around as function args, so it works out + * better to keep them as #defines: + */ +#define BRW_FLUSH_READ_CACHE 0x1 +#define BRW_FLUSH_STATE_CACHE 0x2 +#define BRW_INHIBIT_FLUSH_RENDER_CACHE 0x4 +#define BRW_FLUSH_SNAPSHOT_COUNTERS 0x8 + +struct brw_mi_flush +{ + unsigned flags:4; + unsigned pad:12; + unsigned opcode:16; +}; + +struct brw_vf_statistics +{ + unsigned statistics_enable:1; + unsigned pad:15; + unsigned opcode:16; +}; + + + +struct brw_binding_table_pointers +{ + struct header header; + unsigned vs; + unsigned gs; + unsigned clp; + unsigned sf; + unsigned wm; +}; + + +struct brw_blend_constant_color +{ + struct header header; + float blend_constant_color[4]; +}; + + +struct brw_depthbuffer +{ + union header_union header; + + union { + struct { + unsigned pitch:18; + unsigned format:3; + unsigned pad:4; + unsigned depth_offset_disable:1; + unsigned tile_walk:1; + unsigned tiled_surface:1; + unsigned pad2:1; + unsigned surface_type:3; + } bits; + unsigned dword; + } dword1; + + unsigned dword2_base_addr; + + union { + struct { + unsigned pad:1; + unsigned mipmap_layout:1; + unsigned lod:4; + unsigned width:13; + unsigned height:13; + } bits; + unsigned dword; + } dword3; + + union { + struct { + unsigned pad:12; + unsigned min_array_element:9; + unsigned depth:11; + } bits; + unsigned dword; + } dword4; +}; + +struct brw_drawrect +{ + struct header header; + unsigned xmin:16; + unsigned ymin:16; + unsigned xmax:16; + unsigned ymax:16; + unsigned xorg:16; + unsigned yorg:16; +}; + + + + +struct brw_global_depth_offset_clamp +{ + struct header header; + float depth_offset_clamp; +}; + +struct brw_indexbuffer +{ + union { + struct + { + unsigned length:8; + unsigned index_format:2; + unsigned cut_index_enable:1; + unsigned pad:5; + unsigned opcode:16; + } bits; + unsigned dword; + + } header; + + unsigned buffer_start; + unsigned buffer_end; +}; + + +struct brw_line_stipple +{ + struct header header; + + struct + { + unsigned pattern:16; + unsigned pad:16; + } bits0; + + struct + { + unsigned repeat_count:9; + unsigned pad:7; + unsigned inverse_repeat_count:16; + } bits1; +}; + + +struct brw_pipelined_state_pointers +{ + struct header header; + + struct { + unsigned pad:5; + unsigned offset:27; + } vs; + + struct + { + unsigned enable:1; + unsigned pad:4; + unsigned offset:27; + } gs; + + struct + { + unsigned enable:1; + unsigned pad:4; + unsigned offset:27; + } clp; + + struct + { 
+ unsigned pad:5; + unsigned offset:27; + } sf; + + struct + { + unsigned pad:5; + unsigned offset:27; + } wm; + + struct + { + unsigned pad:5; + unsigned offset:27; /* KW: check me! */ + } cc; +}; + + +struct brw_polygon_stipple_offset +{ + struct header header; + + struct { + unsigned y_offset:5; + unsigned pad:3; + unsigned x_offset:5; + unsigned pad0:19; + } bits0; +}; + + + +struct brw_polygon_stipple +{ + struct header header; + unsigned stipple[32]; +}; + + + +struct brw_pipeline_select +{ + struct + { + unsigned pipeline_select:1; + unsigned pad:15; + unsigned opcode:16; + } header; +}; + + +struct brw_pipe_control +{ + struct + { + unsigned length:8; + unsigned notify_enable:1; + unsigned pad:2; + unsigned instruction_state_cache_flush_enable:1; + unsigned write_cache_flush_enable:1; + unsigned depth_stall_enable:1; + unsigned post_sync_operation:2; + + unsigned opcode:16; + } header; + + struct + { + unsigned pad:2; + unsigned dest_addr_type:1; + unsigned dest_addr:29; + } bits1; + + unsigned data0; + unsigned data1; +}; + + +struct brw_urb_fence +{ + struct + { + unsigned length:8; + unsigned vs_realloc:1; + unsigned gs_realloc:1; + unsigned clp_realloc:1; + unsigned sf_realloc:1; + unsigned vfe_realloc:1; + unsigned cs_realloc:1; + unsigned pad:2; + unsigned opcode:16; + } header; + + struct + { + unsigned vs_fence:10; + unsigned gs_fence:10; + unsigned clp_fence:10; + unsigned pad:2; + } bits0; + + struct + { + unsigned sf_fence:10; + unsigned vf_fence:10; + unsigned cs_fence:10; + unsigned pad:2; + } bits1; +}; + +struct brw_constant_buffer_state /* previously brw_command_streamer */ +{ + struct header header; + + struct + { + unsigned nr_urb_entries:3; + unsigned pad:1; + unsigned urb_entry_size:5; + unsigned pad0:23; + } bits0; +}; + +struct brw_constant_buffer +{ + struct + { + unsigned length:8; + unsigned valid:1; + unsigned pad:7; + unsigned opcode:16; + } header; + + struct + { + unsigned buffer_length:6; + unsigned buffer_address:26; + } bits0; +}; + +struct brw_state_base_address +{ + struct header header; + + struct + { + unsigned modify_enable:1; + unsigned pad:4; + unsigned general_state_address:27; + } bits0; + + struct + { + unsigned modify_enable:1; + unsigned pad:4; + unsigned surface_state_address:27; + } bits1; + + struct + { + unsigned modify_enable:1; + unsigned pad:4; + unsigned indirect_object_state_address:27; + } bits2; + + struct + { + unsigned modify_enable:1; + unsigned pad:11; + unsigned general_state_upper_bound:20; + } bits3; + + struct + { + unsigned modify_enable:1; + unsigned pad:11; + unsigned indirect_object_state_upper_bound:20; + } bits4; +}; + +struct brw_state_prefetch +{ + struct header header; + + struct + { + unsigned prefetch_count:3; + unsigned pad:3; + unsigned prefetch_pointer:26; + } bits0; +}; + +struct brw_system_instruction_pointer +{ + struct header header; + + struct + { + unsigned pad:4; + unsigned system_instruction_pointer:28; + } bits0; +}; + + + + +/* State structs for the various fixed function units: + */ + + +struct thread0 +{ + unsigned pad0:1; + unsigned grf_reg_count:3; + unsigned pad1:2; + unsigned kernel_start_pointer:26; +}; + +struct thread1 +{ + unsigned ext_halt_exception_enable:1; + unsigned sw_exception_enable:1; + unsigned mask_stack_exception_enable:1; + unsigned timeout_exception_enable:1; + unsigned illegal_op_exception_enable:1; + unsigned pad0:3; + unsigned depth_coef_urb_read_offset:6; /* WM only */ + unsigned pad1:2; + unsigned floating_point_mode:1; + unsigned thread_priority:1; + unsigned 
binding_table_entry_count:8; + unsigned pad3:5; + unsigned single_program_flow:1; +}; + +struct thread2 +{ + unsigned per_thread_scratch_space:4; + unsigned pad0:6; + unsigned scratch_space_base_pointer:22; +}; + + +struct thread3 +{ + unsigned dispatch_grf_start_reg:4; + unsigned urb_entry_read_offset:6; + unsigned pad0:1; + unsigned urb_entry_read_length:6; + unsigned pad1:1; + unsigned const_urb_entry_read_offset:6; + unsigned pad2:1; + unsigned const_urb_entry_read_length:6; + unsigned pad3:1; +}; + + + +struct brw_clip_unit_state +{ + struct thread0 thread0; + struct + { + unsigned pad0:7; + unsigned sw_exception_enable:1; + unsigned pad1:3; + unsigned mask_stack_exception_enable:1; + unsigned pad2:1; + unsigned illegal_op_exception_enable:1; + unsigned pad3:2; + unsigned floating_point_mode:1; + unsigned thread_priority:1; + unsigned binding_table_entry_count:8; + unsigned pad4:5; + unsigned single_program_flow:1; + } thread1; + + struct thread2 thread2; + struct thread3 thread3; + + struct + { + unsigned pad0:9; + unsigned gs_output_stats:1; /* not always */ + unsigned stats_enable:1; + unsigned nr_urb_entries:7; + unsigned pad1:1; + unsigned urb_entry_allocation_size:5; + unsigned pad2:1; + unsigned max_threads:1; /* may be less */ + unsigned pad3:6; + } thread4; + + struct + { + unsigned pad0:13; + unsigned clip_mode:3; + unsigned userclip_enable_flags:8; + unsigned userclip_must_clip:1; + unsigned pad1:1; + unsigned guard_band_enable:1; + unsigned viewport_z_clip_enable:1; + unsigned viewport_xy_clip_enable:1; + unsigned vertex_position_space:1; + unsigned api_mode:1; + unsigned pad2:1; + } clip5; + + struct + { + unsigned pad0:5; + unsigned clipper_viewport_state_ptr:27; + } clip6; + + + float viewport_xmin; + float viewport_xmax; + float viewport_ymin; + float viewport_ymax; +}; + + + +struct brw_cc_unit_state +{ + struct + { + unsigned pad0:3; + unsigned bf_stencil_pass_depth_pass_op:3; + unsigned bf_stencil_pass_depth_fail_op:3; + unsigned bf_stencil_fail_op:3; + unsigned bf_stencil_func:3; + unsigned bf_stencil_enable:1; + unsigned pad1:2; + unsigned stencil_write_enable:1; + unsigned stencil_pass_depth_pass_op:3; + unsigned stencil_pass_depth_fail_op:3; + unsigned stencil_fail_op:3; + unsigned stencil_func:3; + unsigned stencil_enable:1; + } cc0; + + + struct + { + unsigned bf_stencil_ref:8; + unsigned stencil_write_mask:8; + unsigned stencil_test_mask:8; + unsigned stencil_ref:8; + } cc1; + + + struct + { + unsigned logicop_enable:1; + unsigned pad0:10; + unsigned depth_write_enable:1; + unsigned depth_test_function:3; + unsigned depth_test:1; + unsigned bf_stencil_write_mask:8; + unsigned bf_stencil_test_mask:8; + } cc2; + + + struct + { + unsigned pad0:8; + unsigned alpha_test_func:3; + unsigned alpha_test:1; + unsigned blend_enable:1; + unsigned ia_blend_enable:1; + unsigned pad1:1; + unsigned alpha_test_format:1; + unsigned pad2:16; + } cc3; + + struct + { + unsigned pad0:5; + unsigned cc_viewport_state_offset:27; + } cc4; + + struct + { + unsigned pad0:2; + unsigned ia_dest_blend_factor:5; + unsigned ia_src_blend_factor:5; + unsigned ia_blend_function:3; + unsigned statistics_enable:1; + unsigned logicop_func:4; + unsigned pad1:11; + unsigned dither_enable:1; + } cc5; + + struct + { + unsigned clamp_post_alpha_blend:1; + unsigned clamp_pre_alpha_blend:1; + unsigned clamp_range:2; + unsigned pad0:11; + unsigned y_dither_offset:2; + unsigned x_dither_offset:2; + unsigned dest_blend_factor:5; + unsigned src_blend_factor:5; + unsigned blend_function:3; + } cc6; + + struct 
{ + union { + float f; + ubyte ub[4]; + } alpha_ref; + } cc7; +}; + + + +struct brw_sf_unit_state +{ + struct thread0 thread0; + struct thread1 thread1; + struct thread2 thread2; + struct thread3 thread3; + + struct + { + unsigned pad0:10; + unsigned stats_enable:1; + unsigned nr_urb_entries:7; + unsigned pad1:1; + unsigned urb_entry_allocation_size:5; + unsigned pad2:1; + unsigned max_threads:6; + unsigned pad3:1; + } thread4; + + struct + { + unsigned front_winding:1; + unsigned viewport_transform:1; + unsigned pad0:3; + unsigned sf_viewport_state_offset:27; + } sf5; + + struct + { + unsigned pad0:9; + unsigned dest_org_vbias:4; + unsigned dest_org_hbias:4; + unsigned scissor:1; + unsigned disable_2x2_trifilter:1; + unsigned disable_zero_pix_trifilter:1; + unsigned point_rast_rule:2; + unsigned line_endcap_aa_region_width:2; + unsigned line_width:4; + unsigned fast_scissor_disable:1; + unsigned cull_mode:2; + unsigned aa_enable:1; + } sf6; + + struct + { + unsigned point_size:11; + unsigned use_point_size_state:1; + unsigned subpixel_precision:1; + unsigned sprite_point:1; + unsigned pad0:11; + unsigned trifan_pv:2; + unsigned linestrip_pv:2; + unsigned tristrip_pv:2; + unsigned line_last_pixel_enable:1; + } sf7; + +}; + + +struct brw_gs_unit_state +{ + struct thread0 thread0; + struct thread1 thread1; + struct thread2 thread2; + struct thread3 thread3; + + struct + { + unsigned pad0:10; + unsigned stats_enable:1; + unsigned nr_urb_entries:7; + unsigned pad1:1; + unsigned urb_entry_allocation_size:5; + unsigned pad2:1; + unsigned max_threads:1; + unsigned pad3:6; + } thread4; + + struct + { + unsigned sampler_count:3; + unsigned pad0:2; + unsigned sampler_state_pointer:27; + } gs5; + + + struct + { + unsigned max_vp_index:4; + unsigned pad0:26; + unsigned reorder_enable:1; + unsigned pad1:1; + } gs6; +}; + + +struct brw_vs_unit_state +{ + struct thread0 thread0; + struct thread1 thread1; + struct thread2 thread2; + struct thread3 thread3; + + struct + { + unsigned pad0:10; + unsigned stats_enable:1; + unsigned nr_urb_entries:7; + unsigned pad1:1; + unsigned urb_entry_allocation_size:5; + unsigned pad2:1; + unsigned max_threads:4; + unsigned pad3:3; + } thread4; + + struct + { + unsigned sampler_count:3; + unsigned pad0:2; + unsigned sampler_state_pointer:27; + } vs5; + + struct + { + unsigned vs_enable:1; + unsigned vert_cache_disable:1; + unsigned pad0:30; + } vs6; +}; + + +struct brw_wm_unit_state +{ + struct thread0 thread0; + struct thread1 thread1; + struct thread2 thread2; + struct thread3 thread3; + + struct { + unsigned stats_enable:1; + unsigned pad0:1; + unsigned sampler_count:3; + unsigned sampler_state_pointer:27; + } wm4; + + struct + { + unsigned enable_8_pix:1; + unsigned enable_16_pix:1; + unsigned enable_32_pix:1; + unsigned pad0:7; + unsigned legacy_global_depth_bias:1; + unsigned line_stipple:1; + unsigned depth_offset:1; + unsigned polygon_stipple:1; + unsigned line_aa_region_width:2; + unsigned line_endcap_aa_region_width:2; + unsigned early_depth_test:1; + unsigned thread_dispatch_enable:1; + unsigned program_uses_depth:1; + unsigned program_computes_depth:1; + unsigned program_uses_killpixel:1; + unsigned legacy_line_rast: 1; + unsigned pad1:1; + unsigned max_threads:6; + unsigned pad2:1; + } wm5; + + float global_depth_offset_constant; + float global_depth_offset_scale; +}; + +struct brw_sampler_default_color { + float color[4]; +}; + +struct brw_sampler_state +{ + + struct + { + unsigned shadow_function:3; + unsigned lod_bias:11; + unsigned min_filter:3; + 
unsigned mag_filter:3; + unsigned mip_filter:2; + unsigned base_level:5; + unsigned pad:1; + unsigned lod_preclamp:1; + unsigned default_color_mode:1; + unsigned pad0:1; + unsigned disable:1; + } ss0; + + struct + { + unsigned r_wrap_mode:3; + unsigned t_wrap_mode:3; + unsigned s_wrap_mode:3; + unsigned pad:3; + unsigned max_lod:10; + unsigned min_lod:10; + } ss1; + + + struct + { + unsigned pad:5; + unsigned default_color_pointer:27; + } ss2; + + struct + { + unsigned pad:19; + unsigned max_aniso:3; + unsigned chroma_key_mode:1; + unsigned chroma_key_index:2; + unsigned chroma_key_enable:1; + unsigned monochrome_filter_width:3; + unsigned monochrome_filter_height:3; + } ss3; +}; + + +struct brw_clipper_viewport +{ + float xmin; + float xmax; + float ymin; + float ymax; +}; + +struct brw_cc_viewport +{ + float min_depth; + float max_depth; +}; + +struct brw_sf_viewport +{ + struct { + float m00; + float m11; + float m22; + float m30; + float m31; + float m32; + } viewport; + + struct { + short xmin; + short ymin; + short xmax; + short ymax; + } scissor; +}; + +/* Documented in the subsystem/shared-functions/sampler chapter... + */ +struct brw_surface_state +{ + struct { + unsigned cube_pos_z:1; + unsigned cube_neg_z:1; + unsigned cube_pos_y:1; + unsigned cube_neg_y:1; + unsigned cube_pos_x:1; + unsigned cube_neg_x:1; + unsigned pad:4; + unsigned mipmap_layout_mode:1; + unsigned vert_line_stride_ofs:1; + unsigned vert_line_stride:1; + unsigned color_blend:1; + unsigned writedisable_blue:1; + unsigned writedisable_green:1; + unsigned writedisable_red:1; + unsigned writedisable_alpha:1; + unsigned surface_format:9; + unsigned data_return_format:1; + unsigned pad0:1; + unsigned surface_type:3; + } ss0; + + struct { + unsigned base_addr; + } ss1; + + struct { + unsigned pad:2; + unsigned mip_count:4; + unsigned width:13; + unsigned height:13; + } ss2; + + struct { + unsigned tile_walk:1; + unsigned tiled_surface:1; + unsigned pad:1; + unsigned pitch:18; + unsigned depth:11; + } ss3; + + struct { + unsigned pad:19; + unsigned min_array_elt:9; + unsigned min_lod:4; + } ss4; +}; + + + +struct brw_vertex_buffer_state +{ + struct { + unsigned pitch:11; + unsigned pad:15; + unsigned access_type:1; + unsigned vb_index:5; + } vb0; + + unsigned start_addr; + unsigned max_index; +#if 1 + unsigned instance_data_step_rate; /* not included for sequential/random vertices? 
*/ +#endif +}; + +#define BRW_VBP_MAX 17 + +struct brw_vb_array_state { + struct header header; + struct brw_vertex_buffer_state vb[BRW_VBP_MAX]; +}; + + +struct brw_vertex_element_state +{ + struct + { + unsigned src_offset:11; + unsigned pad:5; + unsigned src_format:9; + unsigned pad0:1; + unsigned valid:1; + unsigned vertex_buffer_index:5; + } ve0; + + struct + { + unsigned dst_offset:8; + unsigned pad:8; + unsigned vfcomponent3:4; + unsigned vfcomponent2:4; + unsigned vfcomponent1:4; + unsigned vfcomponent0:4; + } ve1; +}; + +#define BRW_VEP_MAX 18 + +struct brw_vertex_element_packet { + struct header header; + struct brw_vertex_element_state ve[BRW_VEP_MAX]; /* note: less than _TNL_ATTRIB_MAX */ +}; + + +struct brw_urb_immediate { + unsigned opcode:4; + unsigned offset:6; + unsigned swizzle_control:2; + unsigned pad:1; + unsigned allocate:1; + unsigned used:1; + unsigned complete:1; + unsigned response_length:4; + unsigned msg_length:4; + unsigned msg_target:4; + unsigned pad1:3; + unsigned end_of_thread:1; +}; + +/* Instruction format for the execution units: + */ + +struct brw_instruction +{ + struct + { + unsigned opcode:7; + unsigned pad:1; + unsigned access_mode:1; + unsigned mask_control:1; + unsigned dependency_control:2; + unsigned compression_control:2; + unsigned thread_control:2; + unsigned predicate_control:4; + unsigned predicate_inverse:1; + unsigned execution_size:3; + unsigned destreg__conditonalmod:4; /* destreg - send, conditionalmod - others */ + unsigned pad0:2; + unsigned debug_control:1; + unsigned saturate:1; + } header; + + union { + struct + { + unsigned dest_reg_file:2; + unsigned dest_reg_type:3; + unsigned src0_reg_file:2; + unsigned src0_reg_type:3; + unsigned src1_reg_file:2; + unsigned src1_reg_type:3; + unsigned pad:1; + unsigned dest_subreg_nr:5; + unsigned dest_reg_nr:8; + unsigned dest_horiz_stride:2; + unsigned dest_address_mode:1; + } da1; + + struct + { + unsigned dest_reg_file:2; + unsigned dest_reg_type:3; + unsigned src0_reg_file:2; + unsigned src0_reg_type:3; + unsigned pad:6; + int dest_indirect_offset:10; /* offset against the deref'd address reg */ + unsigned dest_subreg_nr:3; /* subnr for the address reg a0.x */ + unsigned dest_horiz_stride:2; + unsigned dest_address_mode:1; + } ia1; + + struct + { + unsigned dest_reg_file:2; + unsigned dest_reg_type:3; + unsigned src0_reg_file:2; + unsigned src0_reg_type:3; + unsigned src1_reg_file:2; + unsigned src1_reg_type:3; + unsigned pad0:1; + unsigned dest_writemask:4; + unsigned dest_subreg_nr:1; + unsigned dest_reg_nr:8; + unsigned pad1:2; + unsigned dest_address_mode:1; + } da16; + + struct + { + unsigned dest_reg_file:2; + unsigned dest_reg_type:3; + unsigned src0_reg_file:2; + unsigned src0_reg_type:3; + unsigned pad0:6; + unsigned dest_writemask:4; + int dest_indirect_offset:6; + unsigned dest_subreg_nr:3; + unsigned pad1:2; + unsigned dest_address_mode:1; + } ia16; + } bits1; + + + union { + struct + { + unsigned src0_subreg_nr:5; + unsigned src0_reg_nr:8; + unsigned src0_abs:1; + unsigned src0_negate:1; + unsigned src0_address_mode:1; + unsigned src0_horiz_stride:2; + unsigned src0_width:3; + unsigned src0_vert_stride:4; + unsigned flag_reg_nr:1; + unsigned pad:6; + } da1; + + struct + { + int src0_indirect_offset:10; + unsigned src0_subreg_nr:3; + unsigned src0_abs:1; + unsigned src0_negate:1; + unsigned src0_address_mode:1; + unsigned src0_horiz_stride:2; + unsigned src0_width:3; + unsigned src0_vert_stride:4; + unsigned flag_reg_nr:1; + unsigned pad:6; + } ia1; + + struct + { + unsigned 
src0_swz_x:2; + unsigned src0_swz_y:2; + unsigned src0_subreg_nr:1; + unsigned src0_reg_nr:8; + unsigned src0_abs:1; + unsigned src0_negate:1; + unsigned src0_address_mode:1; + unsigned src0_swz_z:2; + unsigned src0_swz_w:2; + unsigned pad0:1; + unsigned src0_vert_stride:4; + unsigned flag_reg_nr:1; + unsigned pad1:6; + } da16; + + struct + { + unsigned src0_swz_x:2; + unsigned src0_swz_y:2; + int src0_indirect_offset:6; + unsigned src0_subreg_nr:3; + unsigned src0_abs:1; + unsigned src0_negate:1; + unsigned src0_address_mode:1; + unsigned src0_swz_z:2; + unsigned src0_swz_w:2; + unsigned pad0:1; + unsigned src0_vert_stride:4; + unsigned flag_reg_nr:1; + unsigned pad1:6; + } ia16; + + } bits2; + + union + { + struct + { + unsigned src1_subreg_nr:5; + unsigned src1_reg_nr:8; + unsigned src1_abs:1; + unsigned src1_negate:1; + unsigned pad:1; + unsigned src1_horiz_stride:2; + unsigned src1_width:3; + unsigned src1_vert_stride:4; + unsigned pad0:7; + } da1; + + struct + { + unsigned src1_swz_x:2; + unsigned src1_swz_y:2; + unsigned src1_subreg_nr:1; + unsigned src1_reg_nr:8; + unsigned src1_abs:1; + unsigned src1_negate:1; + unsigned pad0:1; + unsigned src1_swz_z:2; + unsigned src1_swz_w:2; + unsigned pad1:1; + unsigned src1_vert_stride:4; + unsigned pad2:7; + } da16; + + struct + { + int src1_indirect_offset:10; + unsigned src1_subreg_nr:3; + unsigned src1_abs:1; + unsigned src1_negate:1; + unsigned pad0:1; + unsigned src1_horiz_stride:2; + unsigned src1_width:3; + unsigned src1_vert_stride:4; + unsigned flag_reg_nr:1; + unsigned pad1:6; + } ia1; + + struct + { + unsigned src1_swz_x:2; + unsigned src1_swz_y:2; + int src1_indirect_offset:6; + unsigned src1_subreg_nr:3; + unsigned src1_abs:1; + unsigned src1_negate:1; + unsigned pad0:1; + unsigned src1_swz_z:2; + unsigned src1_swz_w:2; + unsigned pad1:1; + unsigned src1_vert_stride:4; + unsigned flag_reg_nr:1; + unsigned pad2:6; + } ia16; + + + struct + { + int jump_count:16; /* note: signed */ + unsigned pop_count:4; + unsigned pad0:12; + } if_else; + + struct { + unsigned function:4; + unsigned int_type:1; + unsigned precision:1; + unsigned saturate:1; + unsigned data_type:1; + unsigned pad0:8; + unsigned response_length:4; + unsigned msg_length:4; + unsigned msg_target:4; + unsigned pad1:3; + unsigned end_of_thread:1; + } math; + + struct { + unsigned binding_table_index:8; + unsigned sampler:4; + unsigned return_format:2; + unsigned msg_type:2; + unsigned response_length:4; + unsigned msg_length:4; + unsigned msg_target:4; + unsigned pad1:3; + unsigned end_of_thread:1; + } sampler; + + struct brw_urb_immediate urb; + + struct { + unsigned binding_table_index:8; + unsigned msg_control:4; + unsigned msg_type:2; + unsigned target_cache:2; + unsigned response_length:4; + unsigned msg_length:4; + unsigned msg_target:4; + unsigned pad1:3; + unsigned end_of_thread:1; + } dp_read; + + struct { + unsigned binding_table_index:8; + unsigned msg_control:3; + unsigned pixel_scoreboard_clear:1; + unsigned msg_type:3; + unsigned send_commit_msg:1; + unsigned response_length:4; + unsigned msg_length:4; + unsigned msg_target:4; + unsigned pad1:3; + unsigned end_of_thread:1; + } dp_write; + + struct { + unsigned pad:16; + unsigned response_length:4; + unsigned msg_length:4; + unsigned msg_target:4; + unsigned pad1:3; + unsigned end_of_thread:1; + } generic; + + int d; + unsigned ud; + } bits3; +}; + + +#endif diff --git a/src/gallium/drivers/i965simple/brw_surface.c b/src/gallium/drivers/i965simple/brw_surface.c new file mode 100644 index 
0000000000..b89756c47b --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_surface.c @@ -0,0 +1,124 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "brw_blit.h" +#include "brw_context.h" +#include "brw_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" +#include "util/u_tile.h" +#include "util/u_rect.h" + + + +/* Assumes all values are within bounds -- no checking at this level - + * do it higher up if required. + */ +static void +brw_surface_copy(struct pipe_context *pipe, + boolean do_flip, + struct pipe_surface *dst, + unsigned dstx, unsigned dsty, + struct pipe_surface *src, + unsigned srcx, unsigned srcy, unsigned width, unsigned height) +{ + assert( dst != src ); + assert( dst->block.size == src->block.size ); + assert( dst->block.width == src->block.height ); + assert( dst->block.height == src->block.height ); + + if (0) { + void *dst_map = pipe->screen->surface_map( pipe->screen, + dst, + PIPE_BUFFER_USAGE_CPU_WRITE ); + + const void *src_map = pipe->screen->surface_map( pipe->screen, + src, + PIPE_BUFFER_USAGE_CPU_READ ); + + pipe_copy_rect(dst_map, + &dst->block, + dst->stride, + dstx, dsty, + width, height, + src_map, + do_flip ? -(int) src->stride : src->stride, + srcx, do_flip ? 
height - 1 - srcy : srcy); + + pipe->screen->surface_unmap(pipe->screen, src); + pipe->screen->surface_unmap(pipe->screen, dst); + } + else { + assert(dst->block.width == 1); + assert(dst->block.height == 1); + brw_copy_blit(brw_context(pipe), + do_flip, + dst->block.size, + (short) src->stride/src->block.size, src->buffer, src->offset, FALSE, + (short) dst->stride/dst->block.size, dst->buffer, dst->offset, FALSE, + (short) srcx, (short) srcy, (short) dstx, (short) dsty, + (short) width, (short) height, PIPE_LOGICOP_COPY); + } +} + + +static void +brw_surface_fill(struct pipe_context *pipe, + struct pipe_surface *dst, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height, unsigned value) +{ + if (0) { + void *dst_map = pipe->screen->surface_map( pipe->screen, + dst, + PIPE_BUFFER_USAGE_CPU_WRITE ); + + pipe_fill_rect(dst_map, &dst->block, dst->stride, dstx, dsty, width, height, value); + + pipe->screen->surface_unmap(pipe->screen, dst); + } + else { + assert(dst->block.width == 1); + assert(dst->block.height == 1); + brw_fill_blit(brw_context(pipe), + dst->block.size, + (short) dst->stride/dst->block.size, + dst->buffer, dst->offset, FALSE, + (short) dstx, (short) dsty, + (short) width, (short) height, + value); + } +} + + +void +brw_init_surface_functions(struct brw_context *brw) +{ + brw->pipe.surface_copy = brw_surface_copy; + brw->pipe.surface_fill = brw_surface_fill; +} diff --git a/src/gallium/drivers/i965simple/brw_tex_layout.c b/src/gallium/drivers/i965simple/brw_tex_layout.c new file mode 100644 index 0000000000..cc0c665e02 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_tex_layout.c @@ -0,0 +1,400 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +/* Code to layout images in a mipmap tree for i965. 
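+ *
+ * All mipmap levels (and, for cube/3D targets, the images within a
+ * level) are packed into a single linear buffer.  A level or image is
+ * identified by an (x, y) position in blocks, which the helpers below
+ * turn into a byte offset:
+ *
+ *    offset = y * tex->stride + x * pt->block.size;
+ *
+ * In the 2D ("i945-style") layout, level 0 sits on top, level 1 goes
+ * directly below it at x == 0, and levels 2..n are stacked in a column
+ * to the right of level 1.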
+ */ + +#include "pipe/p_state.h" +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "brw_context.h" +#include "brw_tex_layout.h" + + +#define FILE_DEBUG_FLAG DEBUG_TEXTURE + +#if 0 +unsigned intel_compressed_alignment(unsigned internalFormat) +{ + unsigned alignment = 4; + + switch (internalFormat) { + case GL_COMPRESSED_RGB_FXT1_3DFX: + case GL_COMPRESSED_RGBA_FXT1_3DFX: + alignment = 8; + break; + + default: + break; + } + + return alignment; +} +#endif + +static unsigned minify( unsigned d ) +{ + return MAX2(1, d>>1); +} + + +static void intel_miptree_set_image_offset(struct brw_texture *tex, + unsigned level, + unsigned img, + unsigned x, unsigned y) +{ + struct pipe_texture *pt = &tex->base; + if (img == 0 && level == 0) + assert(x == 0 && y == 0); + assert(img < tex->nr_images[level]); + + tex->image_offset[level][img] = y * tex->stride + x * pt->block.size; +} + +static void intel_miptree_set_level_info(struct brw_texture *tex, + unsigned level, + unsigned nr_images, + unsigned x, unsigned y, + unsigned w, unsigned h, unsigned d) +{ + struct pipe_texture *pt = &tex->base; + + assert(level < PIPE_MAX_TEXTURE_LEVELS); + + pt->width[level] = w; + pt->height[level] = h; + pt->depth[level] = d; + + pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w); + pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h); + + tex->level_offset[level] = y * tex->stride + x * tex->base.block.size; + tex->nr_images[level] = nr_images; + + /* + DBG("%s level %d size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__, + level, w, h, d, x, y, tex->level_offset[level]); + */ + + /* Not sure when this would happen, but anyway: + */ + if (tex->image_offset[level]) { + FREE(tex->image_offset[level]); + tex->image_offset[level] = NULL; + } + + assert(nr_images); + assert(!tex->image_offset[level]); + + tex->image_offset[level] = (unsigned *) MALLOC(nr_images * sizeof(unsigned)); + tex->image_offset[level][0] = 0; +} + +static void i945_miptree_layout_2d(struct brw_texture *tex) +{ + struct pipe_texture *pt = &tex->base; + const int align_x = 2, align_y = 4; + unsigned level; + unsigned x = 0; + unsigned y = 0; + unsigned width = pt->width[0]; + unsigned height = pt->height[0]; + unsigned nblocksx = pt->nblocksx[0]; + unsigned nblocksy = pt->nblocksy[0]; + + tex->stride = align(pt->nblocksx[0] * pt->block.size, 4); + + /* May need to adjust pitch to accomodate the placement of + * the 2nd mipmap level. This occurs when the alignment + * constraints of mipmap placement push the right edge of the + * 2nd mipmap level out past the width of its parent. + */ + if (pt->last_level > 0) { + unsigned mip1_nblocksx + = align(pf_get_nblocksx(&pt->block, minify(width)), align_x) + + pf_get_nblocksx(&pt->block, minify(minify(width))); + + if (mip1_nblocksx > nblocksx) + tex->stride = mip1_nblocksx * pt->block.size; + } + + /* Pitch must be a whole number of dwords + */ + tex->stride = align(tex->stride, 64); + tex->total_nblocksy = 0; + + for (level = 0; level <= pt->last_level; level++) { + intel_miptree_set_level_info(tex, level, 1, x, y, width, + height, 1); + + nblocksy = align(nblocksy, align_y); + + /* Because the images are packed better, the final offset + * might not be the maximal one: + */ + tex->total_nblocksy = MAX2(tex->total_nblocksy, y + nblocksy); + + /* Layout_below: step right after second mipmap level. 
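+ *
+ * Worked example (assuming 1x1 blocks and the align_x == 2 /
+ * align_y == 4 values set above) for an 8x8 texture with four levels:
+ *
+ *    level 0: 8x8 at (0, 0)    then y += 8
+ *    level 1: 4x4 at (0, 8)    then x += align(4, 2) = 4
+ *    level 2: 2x2 at (4, 8)    then y += align(2, 4) = 4
+ *    level 3: 1x1 at (4, 12)
+ *
+ * giving total_nblocksy = 16.  Levels 2 and 3 form the column to the
+ * right of level 1, which is why only level 1 advances x.  (The level-1
+ * overhang check above is a no-op here: align(4, 2) + 2 = 6 <= 8.)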
+ */ + if (level == 1) { + x += align(nblocksx, align_x); + } + else { + y += nblocksy; + } + + width = minify(width); + height = minify(height); + nblocksx = pf_get_nblocksx(&pt->block, width); + nblocksy = pf_get_nblocksy(&pt->block, height); + } +} + +static boolean brw_miptree_layout(struct brw_texture *tex) +{ + struct pipe_texture *pt = &tex->base; + /* XXX: these vary depending on image format: + */ +/* int align_w = 4; */ + + switch (pt->target) { + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_3D: { + unsigned width = pt->width[0]; + unsigned height = pt->height[0]; + unsigned depth = pt->depth[0]; + unsigned nblocksx = pt->nblocksx[0]; + unsigned nblocksy = pt->nblocksy[0]; + unsigned pack_x_pitch, pack_x_nr; + unsigned pack_y_pitch; + unsigned level; + unsigned align_h = 2; + unsigned align_w = 4; + + tex->total_nblocksy = 0; + + tex->stride = align(pt->nblocksx[0], 4); + pack_y_pitch = align(pt->nblocksy[0], align_h); + + pack_x_pitch = tex->stride / pt->block.size; + pack_x_nr = 1; + + for (level = 0; level <= pt->last_level; level++) { + unsigned nr_images = pt->target == PIPE_TEXTURE_3D ? depth : 6; + int x = 0; + int y = 0; + uint q, j; + + intel_miptree_set_level_info(tex, level, nr_images, + 0, tex->total_nblocksy, + width, height, depth); + + for (q = 0; q < nr_images;) { + for (j = 0; j < pack_x_nr && q < nr_images; j++, q++) { + intel_miptree_set_image_offset(tex, level, q, x, y); + x += pack_x_pitch; + } + + x = 0; + y += pack_y_pitch; + } + + + tex->total_nblocksy += y; + width = minify(width); + height = minify(height); + depth = minify(depth); + nblocksx = pf_get_nblocksx(&pt->block, width); + nblocksy = pf_get_nblocksy(&pt->block, height); + + if (pt->compressed) { + pack_y_pitch = (height + 3) / 4; + + if (pack_x_pitch > align(width, align_w)) { + pack_x_pitch = align(width, align_w); + pack_x_nr <<= 1; + } + } else { + if (pack_x_pitch > 4) { + pack_x_pitch >>= 1; + pack_x_nr <<= 1; + assert(pack_x_pitch * pack_x_nr * pt->block.size <= tex->stride); + } + + if (pack_y_pitch > 2) { + pack_y_pitch >>= 1; + pack_y_pitch = align(pack_y_pitch, align_h); + } + } + + } + break; + } + + default: + i945_miptree_layout_2d(tex); + break; + } +#if 0 + PRINT("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__, + pt->pitch, + pt->total_nblocksy, + pt->block.size, + pt->stride * pt->total_nblocksy ); +#endif + + return TRUE; +} + + +static struct pipe_texture * +brw_texture_create_screen(struct pipe_screen *screen, + const struct pipe_texture *templat) +{ + struct pipe_winsys *ws = screen->winsys; + struct brw_texture *tex = CALLOC_STRUCT(brw_texture); + + if (tex) { + tex->base = *templat; + tex->base.refcount = 1; + + tex->base.nblocksx[0] = pf_get_nblocksx(&tex->base.block, tex->base.width[0]); + tex->base.nblocksy[0] = pf_get_nblocksy(&tex->base.block, tex->base.height[0]); + + if (brw_miptree_layout(tex)) + tex->buffer = ws->buffer_create(ws, 64, + PIPE_BUFFER_USAGE_PIXEL, + tex->stride * + tex->total_nblocksy); + + if (!tex->buffer) { + FREE(tex); + return NULL; + } + } + + return &tex->base; +} + + +static void +brw_texture_release_screen(struct pipe_screen *screen, + struct pipe_texture **pt) +{ + if (!*pt) + return; + + /* + DBG("%s %p refcount will be %d\n", + __FUNCTION__, (void *) *pt, (*pt)->refcount - 1); + */ + if (--(*pt)->refcount <= 0) { + struct pipe_winsys *ws = screen->winsys; + struct brw_texture *tex = (struct brw_texture *)*pt; + uint i; + + /* + DBG("%s deleting %p\n", __FUNCTION__, (void *) tex); + */ + + winsys_buffer_reference(ws, &tex->buffer, NULL); + + for (i 
= 0; i < PIPE_MAX_TEXTURE_LEVELS; i++) + if (tex->image_offset[i]) + free(tex->image_offset[i]); + + free(tex); + } + *pt = NULL; +} + + +static struct pipe_surface * +brw_get_tex_surface_screen(struct pipe_screen *screen, + struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice) +{ + struct pipe_winsys *ws = screen->winsys; + struct brw_texture *tex = (struct brw_texture *)pt; + struct pipe_surface *ps; + unsigned offset; /* in bytes */ + + offset = tex->level_offset[level]; + + if (pt->target == PIPE_TEXTURE_CUBE) { + offset += tex->image_offset[level][face]; + } + else if (pt->target == PIPE_TEXTURE_3D) { + offset += tex->image_offset[level][zslice]; + } + else { + assert(face == 0); + assert(zslice == 0); + } + + ps = ws->surface_alloc(ws); + if (ps) { + assert(ps->format); + assert(ps->refcount); + winsys_buffer_reference(ws, &ps->buffer, tex->buffer); + ps->format = pt->format; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->block = pt->block; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = tex->stride; + ps->offset = offset; + } + return ps; +} + + +void +brw_init_texture_functions(struct brw_context *brw) +{ +// brw->pipe.texture_update = brw_texture_update; +} + + +void +brw_init_screen_texture_funcs(struct pipe_screen *screen) +{ + screen->texture_create = brw_texture_create_screen; + screen->texture_release = brw_texture_release_screen; + screen->get_tex_surface = brw_get_tex_surface_screen; +} + diff --git a/src/gallium/drivers/i965simple/brw_tex_layout.h b/src/gallium/drivers/i965simple/brw_tex_layout.h new file mode 100644 index 0000000000..a6b6ba8146 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_tex_layout.h @@ -0,0 +1,44 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + + +#ifndef BRW_TEX_LAYOUT_H +#define BRW_TEX_LAYOUT_H + + +struct brw_context; +struct pipe_screen; + + +extern void +brw_init_texture_functions(struct brw_context *brw); + +extern void +brw_init_screen_texture_funcs(struct pipe_screen *screen); + + +#endif diff --git a/src/gallium/drivers/i965simple/brw_urb.c b/src/gallium/drivers/i965simple/brw_urb.c new file mode 100644 index 0000000000..101a4367b9 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_urb.c @@ -0,0 +1,186 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_context.h" +//#include "brw_state.h" +#include "brw_batch.h" +#include "brw_defines.h" + +#define VS 0 +#define GS 1 +#define CLP 2 +#define SF 3 +#define CS 4 + +/* XXX: Are the min_entry_size numbers useful? + * XXX: Verify min_nr_entries, esp for VS. + * XXX: Verify SF min_entry_size. + */ +static const struct { + unsigned min_nr_entries; + unsigned preferred_nr_entries; + unsigned min_entry_size; + unsigned max_entry_size; +} limits[CS+1] = { + { 8, 32, 1, 5 }, /* vs */ + { 4, 8, 1, 5 }, /* gs */ + { 6, 8, 1, 5 }, /* clp */ + { 1, 8, 1, 12 }, /* sf */ + { 1, 4, 1, 32 } /* cs */ +}; + + +static boolean check_urb_layout( struct brw_context *brw ) +{ + brw->urb.vs_start = 0; + brw->urb.gs_start = brw->urb.nr_vs_entries * brw->urb.vsize; + brw->urb.clip_start = brw->urb.gs_start + brw->urb.nr_gs_entries * brw->urb.vsize; + brw->urb.sf_start = brw->urb.clip_start + brw->urb.nr_clip_entries * brw->urb.vsize; + brw->urb.cs_start = brw->urb.sf_start + brw->urb.nr_sf_entries * brw->urb.sfsize; + + return brw->urb.cs_start + brw->urb.nr_cs_entries * brw->urb.csize <= 256; +} + +/* Most minimal update, forces re-emit of URB fence packet after GS + * unit turned on/off. 
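+ * check_urb_layout() above lays the five sections out back to back
+ * (VS, GS, CLIP, SF, CS) and succeeds only if they fit in the 256
+ * available URB register pairs.  E.g. with the preferred entry counts
+ * (32/8/8/8/4) and entry sizes of 1 throughout, the sections would
+ * end at 32, 40, 48, 56 and 60, comfortably under the limit.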
+ */ +static void recalculate_urb_fence( struct brw_context *brw ) +{ + unsigned csize = brw->curbe.total_size; + unsigned vsize = brw->vs.prog_data->urb_entry_size; + unsigned sfsize = brw->sf.prog_data->urb_entry_size; + + if (csize < limits[CS].min_entry_size) + csize = limits[CS].min_entry_size; + + if (vsize < limits[VS].min_entry_size) + vsize = limits[VS].min_entry_size; + + if (sfsize < limits[SF].min_entry_size) + sfsize = limits[SF].min_entry_size; + + if (brw->urb.vsize < vsize || + brw->urb.sfsize < sfsize || + brw->urb.csize < csize || + (brw->urb.constrained && (brw->urb.vsize > brw->urb.vsize || + brw->urb.sfsize > brw->urb.sfsize || + brw->urb.csize > brw->urb.csize))) { + + + brw->urb.csize = csize; + brw->urb.sfsize = sfsize; + brw->urb.vsize = vsize; + + brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries; + brw->urb.nr_gs_entries = limits[GS].preferred_nr_entries; + brw->urb.nr_clip_entries = limits[CLP].preferred_nr_entries; + brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries; + brw->urb.nr_cs_entries = limits[CS].preferred_nr_entries; + + if (!check_urb_layout(brw)) { + brw->urb.nr_vs_entries = limits[VS].min_nr_entries; + brw->urb.nr_gs_entries = limits[GS].min_nr_entries; + brw->urb.nr_clip_entries = limits[CLP].min_nr_entries; + brw->urb.nr_sf_entries = limits[SF].min_nr_entries; + brw->urb.nr_cs_entries = limits[CS].min_nr_entries; + + brw->urb.constrained = 1; + + if (!check_urb_layout(brw)) { + /* This is impossible, given the maximal sizes of urb + * entries and the values for minimum nr of entries + * provided above. + */ + debug_printf("couldn't calculate URB layout!\n"); + exit(1); + } + + if (BRW_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS)) + debug_printf("URB CONSTRAINED\n"); + } + else + brw->urb.constrained = 0; + + if (BRW_DEBUG & DEBUG_URB) + debug_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n", + brw->urb.vs_start, + brw->urb.gs_start, + brw->urb.clip_start, + brw->urb.sf_start, + brw->urb.cs_start, + 256); + + brw->state.dirty.brw |= BRW_NEW_URB_FENCE; + } +} + + +const struct brw_tracked_state brw_recalculate_urb_fence = { + .dirty = { + .brw = BRW_NEW_CURBE_OFFSETS, + .cache = (CACHE_NEW_VS_PROG | + CACHE_NEW_SF_PROG) + }, + .update = recalculate_urb_fence +}; + + + + + +void brw_upload_urb_fence(struct brw_context *brw) +{ + struct brw_urb_fence uf; + memset(&uf, 0, sizeof(uf)); + + uf.header.opcode = CMD_URB_FENCE; + uf.header.length = sizeof(uf)/4-2; + uf.header.vs_realloc = 1; + uf.header.gs_realloc = 1; + uf.header.clp_realloc = 1; + uf.header.sf_realloc = 1; + uf.header.vfe_realloc = 1; + uf.header.cs_realloc = 1; + + /* The ordering below is correct, not the layout in the + * instruction. + * + * There are 256 urb reg pairs in total. + */ + uf.bits0.vs_fence = brw->urb.gs_start; + uf.bits0.gs_fence = brw->urb.clip_start; + uf.bits0.clp_fence = brw->urb.sf_start; + uf.bits1.sf_fence = brw->urb.cs_start; + uf.bits1.cs_fence = 256; + + BRW_BATCH_STRUCT(brw, &uf); +} diff --git a/src/gallium/drivers/i965simple/brw_util.c b/src/gallium/drivers/i965simple/brw_util.c new file mode 100644 index 0000000000..42391d7c8c --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_util.c @@ -0,0 +1,104 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_util.h" +#include "brw_defines.h" + +#include "pipe/p_defines.h" + +unsigned brw_count_bits( unsigned val ) +{ + unsigned i; + for (i = 0; val ; val >>= 1) + if (val & 1) + i++; + return i; +} + + +unsigned brw_translate_blend_equation( int mode ) +{ + switch (mode) { + case PIPE_BLEND_ADD: + return BRW_BLENDFUNCTION_ADD; + case PIPE_BLEND_MIN: + return BRW_BLENDFUNCTION_MIN; + case PIPE_BLEND_MAX: + return BRW_BLENDFUNCTION_MAX; + case PIPE_BLEND_SUBTRACT: + return BRW_BLENDFUNCTION_SUBTRACT; + case PIPE_BLEND_REVERSE_SUBTRACT: + return BRW_BLENDFUNCTION_REVERSE_SUBTRACT; + default: + assert(0); + return BRW_BLENDFUNCTION_ADD; + } +} + +unsigned brw_translate_blend_factor( int factor ) +{ + switch(factor) { + case PIPE_BLENDFACTOR_ZERO: + return BRW_BLENDFACTOR_ZERO; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return BRW_BLENDFACTOR_SRC_ALPHA; + case PIPE_BLENDFACTOR_ONE: + return BRW_BLENDFACTOR_ONE; + case PIPE_BLENDFACTOR_SRC_COLOR: + return BRW_BLENDFACTOR_SRC_COLOR; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return BRW_BLENDFACTOR_INV_SRC_COLOR; + case PIPE_BLENDFACTOR_DST_COLOR: + return BRW_BLENDFACTOR_DST_COLOR; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return BRW_BLENDFACTOR_INV_DST_COLOR; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return BRW_BLENDFACTOR_INV_SRC_ALPHA; + case PIPE_BLENDFACTOR_DST_ALPHA: + return BRW_BLENDFACTOR_DST_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return BRW_BLENDFACTOR_INV_DST_ALPHA; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return BRW_BLENDFACTOR_SRC_ALPHA_SATURATE; + case PIPE_BLENDFACTOR_CONST_COLOR: + return BRW_BLENDFACTOR_CONST_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return BRW_BLENDFACTOR_INV_CONST_COLOR; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return BRW_BLENDFACTOR_CONST_ALPHA; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return BRW_BLENDFACTOR_INV_CONST_ALPHA; + default: + assert(0); + return BRW_BLENDFACTOR_ZERO; + } +} diff --git a/src/gallium/drivers/i965simple/brw_util.h b/src/gallium/drivers/i965simple/brw_util.h new file mode 100644 index 0000000000..d60e5934db --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_util.h @@ -0,0 +1,43 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. 
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#ifndef BRW_UTIL_H +#define BRW_UTIL_H + +#include "pipe/p_state.h" + +extern unsigned brw_count_bits( unsigned val ); +extern unsigned brw_translate_blend_factor( int factor ); +extern unsigned brw_translate_blend_equation( int mode ); + + +#endif diff --git a/src/gallium/drivers/i965simple/brw_vs.c b/src/gallium/drivers/i965simple/brw_vs.c new file mode 100644 index 0000000000..92327e896d --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_vs.c @@ -0,0 +1,120 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_context.h" +#include "brw_vs.h" +#include "brw_util.h" +#include "brw_state.h" + + +static void do_vs_prog( struct brw_context *brw, + const struct brw_vertex_program *vp, + struct brw_vs_prog_key *key ) +{ + unsigned program_size; + const unsigned *program; + struct brw_vs_compile c; + + memset(&c, 0, sizeof(c)); + memcpy(&c.key, key, sizeof(*key)); + + brw_init_compile(&c.func); + c.vp = vp; + + c.prog_data.outputs_written = vp->info.num_outputs; + c.prog_data.inputs_read = vp->info.num_inputs; + +#if 0 + if (c.key.copy_edgeflag) { + c.prog_data.outputs_written |= 1<<VERT_RESULT_EDGE; + c.prog_data.inputs_read |= 1<<VERT_ATTRIB_EDGEFLAG; + } +#endif + + /* Emit GEN4 code. + */ + brw_vs_emit(&c); + + /* get the program + */ + program = brw_get_program(&c.func, &program_size); + + /* + */ + brw->vs.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_VS_PROG], + &c.key, + sizeof(c.key), + program, + program_size, + &c.prog_data, + &brw->vs.prog_data); +} + + +static void brw_upload_vs_prog( struct brw_context *brw ) +{ + struct brw_vs_prog_key key; + const struct brw_vertex_program *vp = brw->attribs.VertexProgram; + + assert(vp); + + memset(&key, 0, sizeof(key)); + + /* Just upload the program verbatim for now. Always send it all + * the inputs it asks for, whether they are varying or not. + */ + key.program_string_id = vp->id; + key.nr_userclip = brw->attribs.Clip.nr; + key.copy_edgeflag = (brw->attribs.Raster->fill_cw != PIPE_POLYGON_MODE_FILL || + brw->attribs.Raster->fill_ccw != PIPE_POLYGON_MODE_FILL); + + /* Make an early check for the key. + */ + if (brw_search_cache(&brw->cache[BRW_VS_PROG], + &key, sizeof(key), + &brw->vs.prog_data, + &brw->vs.prog_gs_offset)) + return; + + do_vs_prog(brw, vp, &key); +} + + +/* See brw_vs.c: + */ +const struct brw_tracked_state brw_vs_prog = { + .dirty = { + .brw = BRW_NEW_VS, + .cache = 0 + }, + .update = brw_upload_vs_prog +}; diff --git a/src/gallium/drivers/i965simple/brw_vs.h b/src/gallium/drivers/i965simple/brw_vs.h new file mode 100644 index 0000000000..070f9dfcae --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_vs.h @@ -0,0 +1,82 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#ifndef BRW_VS_H +#define BRW_VS_H + + +#include "brw_context.h" +#include "brw_eu.h" + + +struct brw_vs_prog_key { + unsigned program_string_id; + unsigned nr_userclip:4; + unsigned copy_edgeflag:1; + unsigned know_w_is_one:1; + unsigned pad:26; +}; + + +struct brw_vs_compile { + struct brw_compile func; + struct brw_vs_prog_key key; + struct brw_vs_prog_data prog_data; + + const struct brw_vertex_program *vp; + + unsigned nr_inputs; + + unsigned first_output; + unsigned nr_outputs; + + unsigned first_tmp; + unsigned last_tmp; + + struct brw_reg r0; + struct brw_reg r1; + struct brw_reg regs[12][128]; + struct brw_reg tmp; + struct brw_reg stack; + + struct { + boolean used_in_src; + struct brw_reg reg; + } output_regs[128]; + + struct brw_reg userplane[6]; + +}; + +void brw_vs_emit( struct brw_vs_compile *c ); + +#endif diff --git a/src/gallium/drivers/i965simple/brw_vs_emit.c b/src/gallium/drivers/i965simple/brw_vs_emit.c new file mode 100644 index 0000000000..34dbc0624d --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_vs_emit.c @@ -0,0 +1,1330 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "brw_context.h" +#include "brw_vs.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" + +struct brw_prog_info { + unsigned num_temps; + unsigned num_addrs; + unsigned num_consts; + + unsigned writes_psize; + + unsigned pos_idx; + unsigned result_edge_idx; + unsigned edge_flag_idx; + unsigned psize_idx; +}; + +/* Do things as simply as possible. Allocate and populate all regs + * ahead of time. 
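+ * The GRF is carved up in a fixed order: r0, the curbe (user clip
+ * planes followed by constants and immediates), one register per
+ * input, the position output, program temporaries, address registers,
+ * shadow copies of outputs that are also read as sources, the call
+ * stack and, last, scratch temporaries handed out by get_tmp().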
+ */ +static void brw_vs_alloc_regs( struct brw_vs_compile *c, + struct brw_prog_info *info ) +{ + unsigned i, reg = 0, mrf; + unsigned nr_params; + + /* r0 -- reserved as usual + */ + c->r0 = brw_vec8_grf(reg, 0); reg++; + + /* User clip planes from curbe: + */ + if (c->key.nr_userclip) { + for (i = 0; i < c->key.nr_userclip; i++) { + c->userplane[i] = stride( brw_vec4_grf(reg+3+i/2, (i%2) * 4), 0, 4, 1); + } + + /* Deal with curbe alignment: + */ + reg += ((6+c->key.nr_userclip+3)/4)*2; + } + + /* Vertex program parameters from curbe: + */ + nr_params = c->prog_data.max_const; + for (i = 0; i < nr_params; i++) { + c->regs[TGSI_FILE_CONSTANT][i] = stride(brw_vec4_grf(reg+i/2, (i%2) * 4), 0, 4, 1); + } + reg += (nr_params+1)/2; + c->prog_data.curb_read_length = reg - 1; + + + + /* Allocate input regs: + */ + c->nr_inputs = c->vp->info.num_inputs; + for (i = 0; i < c->nr_inputs; i++) { + c->regs[TGSI_FILE_INPUT][i] = brw_vec8_grf(reg, 0); + reg++; + } + + + /* Allocate outputs: TODO: could organize the non-position outputs + * to go straight into message regs. + */ + c->nr_outputs = 0; + c->first_output = reg; + mrf = 4; + for (i = 0; i < c->vp->info.num_outputs; i++) { + c->nr_outputs++; +#if 0 + if (i == VERT_RESULT_HPOS) { + c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0); + reg++; + } + else if (i == VERT_RESULT_PSIZ) { + c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0); + reg++; + mrf++; /* just a placeholder? XXX fix later stages & remove this */ + } + else { + c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf); + mrf++; + } +#else + /*treat pos differently for now */ + if (i == info->pos_idx) { + c->regs[TGSI_FILE_OUTPUT][i] = brw_vec8_grf(reg, 0); + reg++; + } else { + c->regs[TGSI_FILE_OUTPUT][i] = brw_message_reg(mrf); + mrf++; + } +#endif + } + + /* Allocate program temporaries: + */ + for (i = 0; i < info->num_temps; i++) { + c->regs[TGSI_FILE_TEMPORARY][i] = brw_vec8_grf(reg, 0); + reg++; + } + + /* Address reg(s). Don't try to use the internal address reg until + * deref time. + */ + for (i = 0; i < info->num_addrs; i++) { + c->regs[TGSI_FILE_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE, + reg, + 0, + BRW_REGISTER_TYPE_D, + BRW_VERTICAL_STRIDE_8, + BRW_WIDTH_8, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XXXX, + TGSI_WRITEMASK_X); + reg++; + } + + for (i = 0; i < 128; i++) { + if (c->output_regs[i].used_in_src) { + c->output_regs[i].reg = brw_vec8_grf(reg, 0); + reg++; + } + } + + c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0); + reg += 2; + + + /* Some opcodes need an internal temporary: + */ + c->first_tmp = reg; + c->last_tmp = reg; /* for allocation purposes */ + + /* Each input reg holds data from two vertices. 
The + * urb_read_length is the number of registers read from *each* + * vertex urb, so is half the amount: + */ + c->prog_data.urb_read_length = (c->nr_inputs+1)/2; + + c->prog_data.urb_entry_size = (c->nr_outputs+2+3)/4; + c->prog_data.total_grf = reg; +} + + +static struct brw_reg get_tmp( struct brw_vs_compile *c ) +{ + struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0); + + if (++c->last_tmp > c->prog_data.total_grf) + c->prog_data.total_grf = c->last_tmp; + + return tmp; +} + +static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp ) +{ + if (tmp.nr == c->last_tmp-1) + c->last_tmp--; +} + +static void release_tmps( struct brw_vs_compile *c ) +{ + c->last_tmp = c->first_tmp; +} + + +static void unalias1( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0, + void (*func)( struct brw_vs_compile *, + struct brw_reg, + struct brw_reg )) +{ + if (dst.file == arg0.file && dst.nr == arg0.nr) { + struct brw_compile *p = &c->func; + struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask); + func(c, tmp, arg0); + brw_MOV(p, dst, tmp); + } + else { + func(c, dst, arg0); + } +} + +static void unalias2( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1, + void (*func)( struct brw_vs_compile *, + struct brw_reg, + struct brw_reg, + struct brw_reg )) +{ + if ((dst.file == arg0.file && dst.nr == arg0.nr) || + (dst.file == arg1.file && dst.nr == arg1.nr)) { + struct brw_compile *p = &c->func; + struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask); + func(c, tmp, arg0, arg1); + brw_MOV(p, dst, tmp); + } + else { + func(c, dst, arg0, arg1); + } +} + +static void emit_sop( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1, + unsigned cond) +{ + brw_push_insn_state(p); + brw_CMP(p, brw_null_reg(), cond, arg0, arg1); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_MOV(p, dst, brw_imm_f(1.0f)); + brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); + brw_MOV(p, dst, brw_imm_f(0.0f)); + brw_pop_insn_state(p); +} + +static void emit_seq( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_EQ); +} + +static void emit_sne( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_NEQ); +} +static void emit_slt( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_L); +} + +static void emit_sle( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_LE); +} + +static void emit_sgt( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_G); +} + +static void emit_sge( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + emit_sop(p, dst, arg0, arg1, BRW_CONDITIONAL_GE); +} + +static void emit_max( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1); + brw_SEL(p, dst, arg1, arg0); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); +} + +static void emit_min( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1 ) +{ + brw_CMP(p, brw_null_reg(), 
BRW_CONDITIONAL_L, arg0, arg1); + brw_SEL(p, dst, arg0, arg1); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); +} + + +static void emit_math1( struct brw_vs_compile *c, + unsigned function, + struct brw_reg dst, + struct brw_reg arg0, + unsigned precision) +{ + /* There are various odd behaviours with SEND on the simulator. In + * addition there are documented issues with the fact that the GEN4 + * processor doesn't do dependency control properly on SEND + * results. So, on balance, this kludge to get around failures + * with writemasked math results looks like it might be necessary + * whether that turns out to be a simulator bug or not: + */ + struct brw_compile *p = &c->func; + struct brw_reg tmp = dst; + boolean need_tmp = (dst.dw1.bits.writemask != 0xf || + dst.file != BRW_GENERAL_REGISTER_FILE); + + if (need_tmp) + tmp = get_tmp(c); + + brw_math(p, + tmp, + function, + BRW_MATH_SATURATE_NONE, + 2, + arg0, + BRW_MATH_DATA_SCALAR, + precision); + + if (need_tmp) { + brw_MOV(p, dst, tmp); + release_tmp(c, tmp); + } +} + +static void emit_math2( struct brw_vs_compile *c, + unsigned function, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1, + unsigned precision) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp = dst; + boolean need_tmp = (dst.dw1.bits.writemask != 0xf || + dst.file != BRW_GENERAL_REGISTER_FILE); + + if (need_tmp) + tmp = get_tmp(c); + + brw_MOV(p, brw_message_reg(3), arg1); + + brw_math(p, + tmp, + function, + BRW_MATH_SATURATE_NONE, + 2, + arg0, + BRW_MATH_DATA_SCALAR, + precision); + + if (need_tmp) { + brw_MOV(p, dst, tmp); + release_tmp(c, tmp); + } +} + + + +static void emit_exp_noalias( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0 ) +{ + struct brw_compile *p = &c->func; + + + if (dst.dw1.bits.writemask & TGSI_WRITEMASK_X) { + struct brw_reg tmp = get_tmp(c); + struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D); + + /* tmp_d = floor(arg0.x) */ + brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0)); + + /* result[0] = 2.0 ^ tmp */ + + /* Adjust exponent for floating point: + * exp += 127 + */ + brw_ADD(p, brw_writemask(tmp_d, TGSI_WRITEMASK_X), tmp_d, brw_imm_d(127)); + + /* Install exponent and sign. + * Excess drops off the edge: + */ + brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), TGSI_WRITEMASK_X), + tmp_d, brw_imm_d(23)); + + release_tmp(c, tmp); + } + + if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Y) { + /* result[1] = arg0.x - floor(arg0.x) */ + brw_FRC(p, brw_writemask(dst, TGSI_WRITEMASK_Y), brw_swizzle1(arg0, 0)); + } + + if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Z) { + /* As with the LOG instruction, we might be better off just + * doing a taylor expansion here, seeing as we have to do all + * the prep work. 
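+ * (EXP wants dst.x = 2^floor(src.x), dst.y = frac(src.x),
+ * dst.z = 2^src.x and dst.w = 1.0; the x component is built above by
+ * adding the 127 bias to floor(src.x) and shifting it straight into
+ * the float exponent field.)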
+ * + * If mathbox partial precision is too low, consider also: + * result[3] = result[0] * EXP(result[1]) + */ + emit_math1(c, + BRW_MATH_FUNCTION_EXP, + brw_writemask(dst, TGSI_WRITEMASK_Z), + brw_swizzle1(arg0, 0), + BRW_MATH_PRECISION_PARTIAL); + } + + if (dst.dw1.bits.writemask & TGSI_WRITEMASK_W) { + /* result[3] = 1.0; */ + brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_W), brw_imm_f(1)); + } +} + + +static void emit_log_noalias( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0 ) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp = dst; + struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD); + struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD); + boolean need_tmp = (dst.dw1.bits.writemask != 0xf || + dst.file != BRW_GENERAL_REGISTER_FILE); + + if (need_tmp) { + tmp = get_tmp(c); + tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD); + } + + /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt + * according to spec: + * + * These almost look likey they could be joined up, but not really + * practical: + * + * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127 + * result[1].i = (x.i & ((1<<23)-1) + (127<<23) + */ + if (dst.dw1.bits.writemask & TGSI_WRITEMASK_XZ) { + brw_AND(p, + brw_writemask(tmp_ud, TGSI_WRITEMASK_X), + brw_swizzle1(arg0_ud, 0), + brw_imm_ud((1U<<31)-1)); + + brw_SHR(p, + brw_writemask(tmp_ud, TGSI_WRITEMASK_X), + tmp_ud, + brw_imm_ud(23)); + + brw_ADD(p, + brw_writemask(tmp, TGSI_WRITEMASK_X), + retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */ + brw_imm_d(-127)); + } + + if (dst.dw1.bits.writemask & TGSI_WRITEMASK_YZ) { + brw_AND(p, + brw_writemask(tmp_ud, TGSI_WRITEMASK_Y), + brw_swizzle1(arg0_ud, 0), + brw_imm_ud((1<<23)-1)); + + brw_OR(p, + brw_writemask(tmp_ud, TGSI_WRITEMASK_Y), + tmp_ud, + brw_imm_ud(127<<23)); + } + + if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Z) { + /* result[2] = result[0] + LOG2(result[1]); */ + + /* Why bother? The above is just a hint how to do this with a + * taylor series. Maybe we *should* use a taylor series as by + * the time all the above has been done it's almost certainly + * quicker than calling the mathbox, even with low precision. 
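+ * (LOG wants dst.x = the exponent of |src.x|, dst.y = its mantissa
+ * scaled into [1,2), dst.z = log2(|src.x|) and dst.w = 1.0; x and y
+ * come from the IEEE bit-twiddling above, and z is reassembled from
+ * them here.)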
+ * + * Options are: + * - result[0] + mathbox.LOG2(result[1]) + * - mathbox.LOG2(arg0.x) + * - result[0] + inline_taylor_approx(result[1]) + */ + emit_math1(c, + BRW_MATH_FUNCTION_LOG, + brw_writemask(tmp, TGSI_WRITEMASK_Z), + brw_swizzle1(tmp, 1), + BRW_MATH_PRECISION_FULL); + + brw_ADD(p, + brw_writemask(tmp, TGSI_WRITEMASK_Z), + brw_swizzle1(tmp, 2), + brw_swizzle1(tmp, 0)); + } + + if (dst.dw1.bits.writemask & TGSI_WRITEMASK_W) { + /* result[3] = 1.0; */ + brw_MOV(p, brw_writemask(tmp, TGSI_WRITEMASK_W), brw_imm_f(1)); + } + + if (need_tmp) { + brw_MOV(p, dst, tmp); + release_tmp(c, tmp); + } +} + + + + +/* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1 + */ +static void emit_dst_noalias( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0, + struct brw_reg arg1) +{ + struct brw_compile *p = &c->func; + + /* There must be a better way to do this: + */ + if (dst.dw1.bits.writemask & TGSI_WRITEMASK_X) + brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_X), brw_imm_f(1.0)); + if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Y) + brw_MUL(p, brw_writemask(dst, TGSI_WRITEMASK_Y), arg0, arg1); + if (dst.dw1.bits.writemask & TGSI_WRITEMASK_Z) + brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_Z), arg0); + if (dst.dw1.bits.writemask & TGSI_WRITEMASK_W) + brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_W), arg1); +} + +static void emit_xpd( struct brw_compile *p, + struct brw_reg dst, + struct brw_reg t, + struct brw_reg u) +{ + brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3)); + brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3)); +} + + + +static void emit_lit_noalias( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0 ) +{ + struct brw_compile *p = &c->func; + struct brw_instruction *if_insn; + struct brw_reg tmp = dst; + boolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE); + + if (need_tmp) + tmp = get_tmp(c); + + brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_YZ), brw_imm_f(0)); + brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_XW), brw_imm_f(1)); + + /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order + * to get all channels active inside the IF. In the clipping code + * we run with NoMask, so it's not an option and we can use + * BRW_EXECUTE_1 for all comparisions. + */ + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0)); + if_insn = brw_IF(p, BRW_EXECUTE_8); + { + brw_MOV(p, brw_writemask(dst, TGSI_WRITEMASK_Y), brw_swizzle1(arg0,0)); + + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0)); + brw_MOV(p, brw_writemask(tmp, TGSI_WRITEMASK_Z), brw_swizzle1(arg0,1)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + + emit_math2(c, + BRW_MATH_FUNCTION_POW, + brw_writemask(dst, TGSI_WRITEMASK_Z), + brw_swizzle1(tmp, 2), + brw_swizzle1(arg0, 3), + BRW_MATH_PRECISION_PARTIAL); + } + + brw_ENDIF(p, if_insn); +} + + + + + +/* TODO: relative addressing! 
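+ * get_reg() just hands back the register statically allocated for a
+ * TGSI (file, index) pair by brw_vs_alloc_regs(); indirect operands
+ * would have to go through deref() below instead.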
+ */ +static struct brw_reg get_reg( struct brw_vs_compile *c, + unsigned file, + unsigned index ) +{ + switch (file) { + case TGSI_FILE_TEMPORARY: + case TGSI_FILE_INPUT: + case TGSI_FILE_OUTPUT: + assert(c->regs[file][index].nr != 0); + return c->regs[file][index]; + case TGSI_FILE_CONSTANT: + assert(c->regs[TGSI_FILE_CONSTANT][index + c->prog_data.num_imm].nr != 0); + return c->regs[TGSI_FILE_CONSTANT][index + c->prog_data.num_imm]; + case TGSI_FILE_IMMEDIATE: + assert(c->regs[TGSI_FILE_CONSTANT][index].nr != 0); + return c->regs[TGSI_FILE_CONSTANT][index]; + case TGSI_FILE_ADDRESS: + assert(index == 0); + return c->regs[file][index]; + + case TGSI_FILE_NULL: /* undef values */ + return brw_null_reg(); + + default: + assert(0); + return brw_null_reg(); + } +} + + + +static struct brw_reg deref( struct brw_vs_compile *c, + struct brw_reg arg, + int offset) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp = vec4(get_tmp(c)); + struct brw_reg vp_address = retype(vec1(get_reg(c, TGSI_FILE_ADDRESS, 0)), BRW_REGISTER_TYPE_UW); + unsigned byte_offset = arg.nr * 32 + arg.subnr + offset * 16; + struct brw_reg indirect = brw_vec4_indirect(0,0); + + { + brw_push_insn_state(p); + brw_set_access_mode(p, BRW_ALIGN_1); + + /* This is pretty clunky - load the address register twice and + * fetch each 4-dword value in turn. There must be a way to do + * this in a single pass, but I couldn't get it to work. + */ + brw_ADD(p, brw_address_reg(0), vp_address, brw_imm_d(byte_offset)); + brw_MOV(p, tmp, indirect); + + brw_ADD(p, brw_address_reg(0), suboffset(vp_address, 8), brw_imm_d(byte_offset)); + brw_MOV(p, suboffset(tmp, 4), indirect); + + brw_pop_insn_state(p); + } + + return vec8(tmp); +} + + +static void emit_arl( struct brw_vs_compile *c, + struct brw_reg dst, + struct brw_reg arg0 ) +{ + struct brw_compile *p = &c->func; + struct brw_reg tmp = dst; + boolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE); + + if (need_tmp) + tmp = get_tmp(c); + + brw_RNDD(p, tmp, arg0); + brw_MUL(p, dst, tmp, brw_imm_d(16)); + + if (need_tmp) + release_tmp(c, tmp); +} + + +/* Will return mangled results for SWZ op. The emit_swz() function + * ignores this result and recalculates taking extended swizzles into + * account. + */ +static struct brw_reg get_arg( struct brw_vs_compile *c, + struct tgsi_src_register *src ) +{ + struct brw_reg reg; + + if (src->File == TGSI_FILE_NULL) + return brw_null_reg(); + +#if 0 + if (src->RelAddr) + reg = deref(c, c->regs[PROGRAM_STATE_VAR][0], src->Index); + else +#endif + reg = get_reg(c, src->File, src->Index); + + /* Convert 3-bit swizzle to 2-bit. + */ + reg.dw1.bits.swizzle = BRW_SWIZZLE4(src->SwizzleX, + src->SwizzleY, + src->SwizzleZ, + src->SwizzleW); + + /* Note this is ok for non-swizzle instructions: + */ + reg.negate = src->Negate ? 
1 : 0; + + return reg; +} + + +static struct brw_reg get_dst( struct brw_vs_compile *c, + const struct tgsi_dst_register *dst ) +{ + struct brw_reg reg = get_reg(c, dst->File, dst->Index); + + reg.dw1.bits.writemask = dst->WriteMask; + + return reg; +} + + + + +static void emit_swz( struct brw_vs_compile *c, + struct brw_reg dst, + struct tgsi_src_register src ) +{ + struct brw_compile *p = &c->func; + unsigned zeros_mask = 0; + unsigned ones_mask = 0; + unsigned src_mask = 0; + ubyte src_swz[4]; + boolean need_tmp = (src.Negate && + dst.file != BRW_GENERAL_REGISTER_FILE); + struct brw_reg tmp = dst; + unsigned i; + + if (need_tmp) + tmp = get_tmp(c); + + for (i = 0; i < 4; i++) { + if (dst.dw1.bits.writemask & (1<<i)) { + ubyte s = 0; + switch(i) { + case 0: + s = src.SwizzleX; + break; + s = src.SwizzleY; + case 1: + break; + s = src.SwizzleZ; + case 2: + break; + s = src.SwizzleW; + case 3: + break; + } + switch (s) { + case TGSI_SWIZZLE_X: + case TGSI_SWIZZLE_Y: + case TGSI_SWIZZLE_Z: + case TGSI_SWIZZLE_W: + src_mask |= 1<<i; + src_swz[i] = s; + break; + case TGSI_EXTSWIZZLE_ZERO: + zeros_mask |= 1<<i; + break; + case TGSI_EXTSWIZZLE_ONE: + ones_mask |= 1<<i; + break; + } + } + } + + /* Do src first, in case dst aliases src: + */ + if (src_mask) { + struct brw_reg arg0; + +#if 0 + if (src.RelAddr) + arg0 = deref(c, c->regs[PROGRAM_STATE_VAR][0], src.Index); + else +#endif + arg0 = get_reg(c, src.File, src.Index); + + arg0 = brw_swizzle(arg0, + src_swz[0], src_swz[1], + src_swz[2], src_swz[3]); + + brw_MOV(p, brw_writemask(tmp, src_mask), arg0); + } + + if (zeros_mask) + brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0)); + + if (ones_mask) + brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1)); + + if (src.Negate) + brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp)); + + if (need_tmp) { + brw_MOV(p, dst, tmp); + release_tmp(c, tmp); + } +} + + + +/* Post-vertex-program processing. Send the results to the URB. + */ +static void emit_vertex_write( struct brw_vs_compile *c, struct brw_prog_info *info) +{ + struct brw_compile *p = &c->func; + struct brw_reg m0 = brw_message_reg(0); + struct brw_reg pos = c->regs[TGSI_FILE_OUTPUT][info->pos_idx]; + struct brw_reg ndc; + + if (c->key.copy_edgeflag) { + brw_MOV(p, + get_reg(c, TGSI_FILE_OUTPUT, info->result_edge_idx), + get_reg(c, TGSI_FILE_INPUT, info->edge_flag_idx)); + } + + + /* Build ndc coords? TODO: Shortcircuit when w is known to be one. 
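+ * Here ndc.xyz = pos.xyz / pos.w and ndc.w = 1/pos.w (the "rhw"
+ * tested for a negative sign further down); when the key guarantees
+ * w == 1.0 the clip-space position is reused directly.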
+ */ + if (!c->key.know_w_is_one) { + ndc = get_tmp(c); + emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL); + brw_MUL(p, brw_writemask(ndc, TGSI_WRITEMASK_XYZ), pos, ndc); + } + else { + ndc = pos; + } + + /* This includes the workaround for -ve rhw, so is no longer an + * optional step: + */ + if (info->writes_psize || + c->key.nr_userclip || + !c->key.know_w_is_one) + { + struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD); + unsigned i; + + brw_MOV(p, header1, brw_imm_ud(0)); + + brw_set_access_mode(p, BRW_ALIGN_16); + + if (info->writes_psize) { + struct brw_reg psiz = c->regs[TGSI_FILE_OUTPUT][info->psize_idx]; + brw_MUL(p, brw_writemask(header1, TGSI_WRITEMASK_W), + brw_swizzle1(psiz, 0), brw_imm_f(1<<11)); + brw_AND(p, brw_writemask(header1, TGSI_WRITEMASK_W), header1, + brw_imm_ud(0x7ff<<8)); + } + + + for (i = 0; i < c->key.nr_userclip; i++) { + brw_set_conditionalmod(p, BRW_CONDITIONAL_L); + brw_DP4(p, brw_null_reg(), pos, c->userplane[i]); + brw_OR(p, brw_writemask(header1, TGSI_WRITEMASK_W), header1, brw_imm_ud(1<<i)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } + + + /* i965 clipping workaround: + * 1) Test for -ve rhw + * 2) If set, + * set ndc = (0,0,0,0) + * set ucp[6] = 1 + * + * Later, clipping will detect ucp[6] and ensure the primitive is + * clipped against all fixed planes. + */ + if (!c->key.know_w_is_one) { + brw_CMP(p, + vec8(brw_null_reg()), + BRW_CONDITIONAL_L, + brw_swizzle1(ndc, 3), + brw_imm_f(0)); + + brw_OR(p, brw_writemask(header1, TGSI_WRITEMASK_W), header1, brw_imm_ud(1<<6)); + brw_MOV(p, ndc, brw_imm_f(0)); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + } + + brw_set_access_mode(p, BRW_ALIGN_1); /* why? */ + brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1); + brw_set_access_mode(p, BRW_ALIGN_16); + + release_tmp(c, header1); + } + else { + brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0)); + } + + + /* Emit the (interleaved) headers for the two vertices - an 8-reg + * of zeros followed by two sets of NDC coordinates: + */ + brw_set_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, offset(m0, 2), ndc); + brw_MOV(p, offset(m0, 3), pos); + + + brw_urb_WRITE(p, + brw_null_reg(), /* dest */ + 0, /* starting mrf reg nr */ + c->r0, /* src */ + 0, /* allocate */ + 1, /* used */ + c->nr_outputs + 3, /* msg len */ + 0, /* response len */ + 1, /* eot */ + 1, /* writes complete */ + 0, /* urb destination offset */ + BRW_URB_SWIZZLE_INTERLEAVE); + +} + +static void +post_vs_emit( struct brw_vs_compile *c, struct brw_instruction *end_inst ) +{ + struct tgsi_parse_context parse; + const struct tgsi_token *tokens = c->vp->program.tokens; + tgsi_parse_init(&parse, tokens); + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) { +#if 0 + struct brw_instruction *brw_inst1, *brw_inst2; + const struct tgsi_full_instruction *inst1, *inst2; + int offset; + inst1 = &parse.FullToken.FullInstruction; + brw_inst1 = inst1->Data; + switch (inst1->Opcode) { + case TGSI_OPCODE_CAL: + case TGSI_OPCODE_BRA: + target_insn = inst1->BranchTarget; + inst2 = &c->vp->program.Base.Instructions[target_insn]; + brw_inst2 = inst2->Data; + offset = brw_inst2 - brw_inst1; + brw_set_src1(brw_inst1, brw_imm_d(offset*16)); + break; + case TGSI_OPCODE_END: + offset = end_inst - brw_inst1; + brw_set_src1(brw_inst1, brw_imm_d(offset*16)); + break; + default: + break; + } +#endif + } + } + 
tgsi_parse_free(&parse); +} + +static void process_declaration(const struct tgsi_full_declaration *decl, + struct brw_prog_info *info) +{ + int first = decl->DeclarationRange.First; + int last = decl->DeclarationRange.Last; + + switch(decl->Declaration.File) { + case TGSI_FILE_CONSTANT: + info->num_consts += last - first + 1; + break; + case TGSI_FILE_INPUT: { + } + break; + case TGSI_FILE_OUTPUT: { + assert(last == first); /* for now */ + if (decl->Declaration.Semantic) { + switch (decl->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: { + info->pos_idx = first; + } + break; + case TGSI_SEMANTIC_COLOR: + break; + case TGSI_SEMANTIC_BCOLOR: + break; + case TGSI_SEMANTIC_FOG: + break; + case TGSI_SEMANTIC_PSIZE: { + info->writes_psize = TRUE; + info->psize_idx = first; + } + break; + case TGSI_SEMANTIC_GENERIC: + break; + } + } + } + break; + case TGSI_FILE_TEMPORARY: { + info->num_temps += (last - first) + 1; + } + break; + case TGSI_FILE_SAMPLER: { + } + break; + case TGSI_FILE_ADDRESS: { + info->num_addrs += (last - first) + 1; + } + break; + case TGSI_FILE_IMMEDIATE: { + } + break; + case TGSI_FILE_NULL: { + } + break; + } +} + +static void process_instruction(struct brw_vs_compile *c, + struct tgsi_full_instruction *inst, + struct brw_prog_info *info) +{ + struct brw_reg args[3], dst; + struct brw_compile *p = &c->func; + /*struct brw_indirect stack_index = brw_indirect(0, 0);*/ + unsigned i; + unsigned index; + unsigned file; + /*FIXME: might not be the only one*/ + const struct tgsi_dst_register *dst_reg = &inst->FullDstRegisters[0].DstRegister; + /* + struct brw_instruction *if_inst[MAX_IFSN]; + unsigned insn, if_insn = 0; + */ + + for (i = 0; i < 3; i++) { + struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i]; + index = src->SrcRegister.Index; + file = src->SrcRegister.File; + if (file == TGSI_FILE_OUTPUT && c->output_regs[index].used_in_src) + args[i] = c->output_regs[index].reg; + else + args[i] = get_arg(c, &src->SrcRegister); + } + + /* Get dest regs. Note that it is possible for a reg to be both + * dst and arg, given the static allocation of registers. So + * care needs to be taken emitting multi-operation instructions. 
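+ * E.g. "DST r0, r0.xxxx, r1" would clobber its own source between
+ * component writes, which is why DST/EXP/LOG/LIT go through the
+ * unalias*() wrappers, writing to a temporary whenever the
+ * destination aliases a source.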
+ */ + index = dst_reg->Index; + file = dst_reg->File; + if (file == TGSI_FILE_OUTPUT && c->output_regs[index].used_in_src) + dst = c->output_regs[index].reg; + else + dst = get_dst(c, dst_reg); + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + brw_MOV(p, dst, brw_abs(args[0])); + break; + case TGSI_OPCODE_ADD: + brw_ADD(p, dst, args[0], args[1]); + break; + case TGSI_OPCODE_DP3: + brw_DP3(p, dst, args[0], args[1]); + break; + case TGSI_OPCODE_DP4: + brw_DP4(p, dst, args[0], args[1]); + break; + case TGSI_OPCODE_DPH: + brw_DPH(p, dst, args[0], args[1]); + break; + case TGSI_OPCODE_DST: + unalias2(c, dst, args[0], args[1], emit_dst_noalias); + break; + case TGSI_OPCODE_EXP: + unalias1(c, dst, args[0], emit_exp_noalias); + break; + case TGSI_OPCODE_EX2: + emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL); + break; + case TGSI_OPCODE_ARL: + emit_arl(c, dst, args[0]); + break; + case TGSI_OPCODE_FLR: + brw_RNDD(p, dst, args[0]); + break; + case TGSI_OPCODE_FRC: + brw_FRC(p, dst, args[0]); + break; + case TGSI_OPCODE_LOG: + unalias1(c, dst, args[0], emit_log_noalias); + break; + case TGSI_OPCODE_LG2: + emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL); + break; + case TGSI_OPCODE_LIT: + unalias1(c, dst, args[0], emit_lit_noalias); + break; + case TGSI_OPCODE_MAD: + brw_MOV(p, brw_acc_reg(), args[2]); + brw_MAC(p, dst, args[0], args[1]); + break; + case TGSI_OPCODE_MAX: + emit_max(p, dst, args[0], args[1]); + break; + case TGSI_OPCODE_MIN: + emit_min(p, dst, args[0], args[1]); + break; + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_SWZ: +#if 0 + /* The args[0] value can't be used here as it won't have + * correctly encoded the full swizzle: + */ + emit_swz(c, dst, inst->SrcReg[0] ); +#endif + brw_MOV(p, dst, args[0]); + break; + case TGSI_OPCODE_MUL: + brw_MUL(p, dst, args[0], args[1]); + break; + case TGSI_OPCODE_POW: + emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL); + break; + case TGSI_OPCODE_RCP: + emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL); + break; + case TGSI_OPCODE_RSQ: + emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, args[0], BRW_MATH_PRECISION_FULL); + break; + + case TGSI_OPCODE_SEQ: + emit_seq(p, dst, args[0], args[1]); + break; + case TGSI_OPCODE_SNE: + emit_sne(p, dst, args[0], args[1]); + break; + case TGSI_OPCODE_SGE: + emit_sge(p, dst, args[0], args[1]); + break; + case TGSI_OPCODE_SGT: + emit_sgt(p, dst, args[0], args[1]); + break; + case TGSI_OPCODE_SLT: + emit_slt(p, dst, args[0], args[1]); + break; + case TGSI_OPCODE_SLE: + emit_sle(p, dst, args[0], args[1]); + break; + case TGSI_OPCODE_SUB: + brw_ADD(p, dst, args[0], negate(args[1])); + break; + case TGSI_OPCODE_XPD: + emit_xpd(p, dst, args[0], args[1]); + break; +#if 0 + case TGSI_OPCODE_IF: + assert(if_insn < MAX_IFSN); + if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8); + break; + case TGSI_OPCODE_ELSE: + if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]); + break; + case TGSI_OPCODE_ENDIF: + assert(if_insn > 0); + brw_ENDIF(p, if_inst[--if_insn]); + break; + case TGSI_OPCODE_BRA: + brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); + brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16)); + brw_set_predicate_control_flag_value(p, 0xff); + break; + case TGSI_OPCODE_CAL: + brw_set_access_mode(p, BRW_ALIGN_1); + brw_ADD(p, deref_1uw(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16)); + brw_set_access_mode(p, BRW_ALIGN_16); + brw_ADD(p, get_addr_reg(stack_index), + get_addr_reg(stack_index), 
brw_imm_d(4)); + inst->Data = &p->store[p->nr_insn]; + brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16)); + break; +#endif + case TGSI_OPCODE_RET: +#if 0 + brw_ADD(p, get_addr_reg(stack_index), + get_addr_reg(stack_index), brw_imm_d(-4)); + brw_set_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, brw_ip_reg(), deref_1uw(stack_index, 0)); + brw_set_access_mode(p, BRW_ALIGN_16); +#else + /*brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));*/ +#endif + break; + case TGSI_OPCODE_END: + brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16)); + break; + case TGSI_OPCODE_BGNSUB: + case TGSI_OPCODE_ENDSUB: + break; + default: + debug_printf("Unsupport opcode %d in vertex shader\n", inst->Instruction.Opcode); + break; + } + + if (dst_reg->File == TGSI_FILE_OUTPUT + && dst_reg->Index != info->pos_idx + && c->output_regs[dst_reg->Index].used_in_src) + brw_MOV(p, get_dst(c, dst_reg), dst); + + release_tmps(c); +} + +/* Emit the fragment program instructions here. + */ +void brw_vs_emit(struct brw_vs_compile *c) +{ +#define MAX_IFSN 32 + struct brw_compile *p = &c->func; + struct brw_instruction *end_inst; + struct tgsi_parse_context parse; + struct brw_indirect stack_index = brw_indirect(0, 0); + const struct tgsi_token *tokens = c->vp->program.tokens; + struct brw_prog_info prog_info; + unsigned allocated_registers = 0; + memset(&prog_info, 0, sizeof(struct brw_prog_info)); + + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + brw_set_access_mode(p, BRW_ALIGN_16); + + tgsi_parse_init(&parse, tokens); + /* Message registers can't be read, so copy the output into GRF register + if they are used in source registers */ + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + unsigned i; + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_INSTRUCTION: { + const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction; + for (i = 0; i < 3; ++i) { + const struct tgsi_src_register *src = &inst->FullSrcRegisters[i].SrcRegister; + unsigned index = src->Index; + unsigned file = src->File; + if (file == TGSI_FILE_OUTPUT) + c->output_regs[index].used_in_src = TRUE; + } + } + break; + default: + /* nothing */ + break; + } + } + tgsi_parse_free(&parse); + + tgsi_parse_init(&parse, tokens); + + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: { + struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration; + process_declaration(decl, &prog_info); + } + break; + case TGSI_TOKEN_TYPE_IMMEDIATE: { + struct tgsi_full_immediate *imm = &parse.FullToken.FullImmediate; + /*assert(imm->Immediate.Size == 4);*/ + c->prog_data.imm_buf[c->prog_data.num_imm][0] = imm->u.ImmediateFloat32[0].Float; + c->prog_data.imm_buf[c->prog_data.num_imm][1] = imm->u.ImmediateFloat32[1].Float; + c->prog_data.imm_buf[c->prog_data.num_imm][2] = imm->u.ImmediateFloat32[2].Float; + c->prog_data.imm_buf[c->prog_data.num_imm][3] = imm->u.ImmediateFloat32[3].Float; + c->prog_data.num_imm++; + } + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: { + struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction; + if (!allocated_registers) { + /* first instruction (declerations finished). 
+ * now that we know what vars are being used allocate + * registers for them.*/ + c->prog_data.num_consts = prog_info.num_consts; + c->prog_data.max_const = prog_info.num_consts + c->prog_data.num_imm; + brw_vs_alloc_regs(c, &prog_info); + + brw_set_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack)); + brw_set_access_mode(p, BRW_ALIGN_16); + allocated_registers = 1; + } + process_instruction(c, inst, &prog_info); + } + break; + } + } + + end_inst = &p->store[p->nr_insn]; + emit_vertex_write(c, &prog_info); + post_vs_emit(c, end_inst); + tgsi_parse_free(&parse); + +} diff --git a/src/gallium/drivers/i965simple/brw_vs_state.c b/src/gallium/drivers/i965simple/brw_vs_state.c new file mode 100644 index 0000000000..1eaff87892 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_vs_state.c @@ -0,0 +1,103 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
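The vertex shader emitter above makes two passes over the TGSI tokens: the first only records which OUTPUT registers are later read as sources (message registers cannot be read, so those outputs need a GRF staging copy), and the second defers register allocation until the first instruction, once every declaration and immediate has been seen. The standalone sketch below models that shape on a toy instruction list; all of the types and field names in it are illustrative stand-ins, not the driver's.

/* Minimal model of the two-pass scan in brw_vs_emit(): pass 1 marks
 * outputs that are read back, pass 2 "allocates" lazily at the first
 * instruction and then emits.  Everything here is a stand-in type.
 */
#include <stdio.h>

enum file { FILE_TEMP, FILE_OUTPUT };

struct toy_inst {
   enum file src_file;
   int       src_index;
   enum file dst_file;
   int       dst_index;
};

#define NUM_OUTPUTS 4

int main(void)
{
   /* OUT[0] is written and then read again by the second instruction. */
   struct toy_inst prog[] = {
      { FILE_TEMP,   0, FILE_OUTPUT, 0 },
      { FILE_OUTPUT, 0, FILE_OUTPUT, 1 },
   };
   int used_in_src[NUM_OUTPUTS] = { 0 };
   int allocated = 0;
   unsigned i;

   /* Pass 1: flag outputs that appear as sources. */
   for (i = 0; i < sizeof(prog) / sizeof(prog[0]); i++)
      if (prog[i].src_file == FILE_OUTPUT)
         used_in_src[prog[i].src_index] = 1;

   /* Pass 2: allocate at the first instruction, then emit. */
   for (i = 0; i < sizeof(prog) / sizeof(prog[0]); i++) {
      if (!allocated) {
         printf("allocating registers now that declarations are known\n");
         allocated = 1;
      }
      printf("inst %u: dst OUT[%d] goes to %s\n", i, prog[i].dst_index,
             used_in_src[prog[i].dst_index] ? "a GRF staging reg"
                                            : "the message reg directly");
   }
   return 0;
}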
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" + +#include "util/u_math.h" +#include "util/u_memory.h" + +static void upload_vs_unit( struct brw_context *brw ) +{ + struct brw_vs_unit_state vs; + + memset(&vs, 0, sizeof(vs)); + + /* CACHE_NEW_VS_PROG */ + vs.thread0.kernel_start_pointer = brw->vs.prog_gs_offset >> 6; + vs.thread0.grf_reg_count = align(brw->vs.prog_data->total_grf, 16) / 16 - 1; + vs.thread3.urb_entry_read_length = brw->vs.prog_data->urb_read_length; + vs.thread3.const_urb_entry_read_length = brw->vs.prog_data->curb_read_length; + vs.thread3.dispatch_grf_start_reg = 1; + + + /* BRW_NEW_URB_FENCE */ + vs.thread4.nr_urb_entries = brw->urb.nr_vs_entries; + vs.thread4.urb_entry_allocation_size = brw->urb.vsize - 1; + vs.thread4.max_threads = MIN2( + MAX2(0, (brw->urb.nr_vs_entries - 6) / 2 - 1), + 15); + + + + if (BRW_DEBUG & DEBUG_SINGLE_THREAD) + vs.thread4.max_threads = 0; + + /* BRW_NEW_CURBE_OFFSETS, _NEW_TRANSFORM */ + if (0 /*brw->attribs.Clip->ClipPlanesEnabled*/) { + /* Note that we read in the userclip planes as well, hence + * clip_start: + */ + vs.thread3.const_urb_entry_read_offset = brw->curbe.clip_start * 2; + } + else { + vs.thread3.const_urb_entry_read_offset = brw->curbe.vs_start * 2; + } + + vs.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; + vs.thread3.urb_entry_read_offset = 0; + + /* No samplers for ARB_vp programs: + */ + vs.vs5.sampler_count = 0; + + if (BRW_DEBUG & DEBUG_STATS) + vs.thread4.stats_enable = 1; + + /* Vertex program always enabled: + */ + vs.vs6.vs_enable = 1; + + brw->vs.state_gs_offset = brw_cache_data( &brw->cache[BRW_VS_UNIT], &vs ); +} + + +const struct brw_tracked_state brw_vs_unit = { + .dirty = { + .brw = (BRW_NEW_CLIP | + BRW_NEW_CURBE_OFFSETS | + BRW_NEW_URB_FENCE), + .cache = CACHE_NEW_VS_PROG + }, + .update = upload_vs_unit +}; diff --git a/src/gallium/drivers/i965simple/brw_winsys.h b/src/gallium/drivers/i965simple/brw_winsys.h new file mode 100644 index 0000000000..ec1e400418 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_winsys.h @@ -0,0 +1,209 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
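The VS unit state filled in above packs a couple of derived quantities: grf_reg_count counts 16-register blocks minus one, and max_threads is computed from the number of URB entries granted to the VS and clamped to at most 15. A quick standalone check of that arithmetic, with local ALIGN/MIN2/MAX2 helpers standing in for util/u_math.h and made-up input values, looks like this:

/* Recomputes the two derived fields from upload_vs_unit() for a sample
 * configuration; the macros are local stand-ins for util/u_math.h.
 */
#include <stdio.h>

#define ALIGN(v, a)  (((v) + (a) - 1) / (a) * (a))
#define MIN2(a, b)   ((a) < (b) ? (a) : (b))
#define MAX2(a, b)   ((a) > (b) ? (a) : (b))

int main(void)
{
   int total_grf     = 37;   /* GRFs used by the compiled VS (example) */
   int nr_vs_entries = 24;   /* URB entries granted to the VS (example) */

   /* "number of 16-register blocks, minus one" encoding */
   int grf_reg_count = ALIGN(total_grf, 16) / 16 - 1;

   /* same expression as upload_vs_unit(), clamped to [0, 15] */
   int max_threads = MIN2(MAX2(0, (nr_vs_entries - 6) / 2 - 1), 15);

   printf("grf_reg_count field = %d (covers %d registers)\n",
          grf_reg_count, (grf_reg_count + 1) * 16);
   printf("max_threads field   = %d\n", max_threads);
   return 0;
}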
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * This is the interface that i965simple requires any window system + * hosting it to implement. This is the only include file in i965simple + * which is public. + * + */ + +#ifndef BRW_WINSYS_H +#define BRW_WINSYS_H + + +#include "pipe/p_defines.h" + + +/* Pipe drivers are (meant to be!) independent of both GL and the + * window system. The window system provides a buffer manager and a + * set of additional hooks for things like command buffer submission, + * etc. + * + * There clearly has to be some agreement between the window system + * driver and the hardware driver about the format of command buffers, + * etc. + */ + +struct pipe_buffer; +struct pipe_fence_handle; +struct pipe_winsys; +struct pipe_screen; + + +/* The pipe driver currently understands the following chipsets: + */ +#define PCI_CHIP_I965_G 0x29A2 +#define PCI_CHIP_I965_Q 0x2992 +#define PCI_CHIP_I965_G_1 0x2982 +#define PCI_CHIP_I965_GM 0x2A02 +#define PCI_CHIP_I965_GME 0x2A12 + + +/* These are the names of all the state caches managed by the driver. + * + * When data is uploaded to a buffer with buffer_subdata, we use the + * special version of that function below so that information about + * what type of data this is can be passed to the winsys backend. + * That in turn allows the correct flags to be set in the aub file + * dump to allow human-readable file dumps later on. + */ + +enum brw_cache_id { + BRW_CC_VP, + BRW_CC_UNIT, + BRW_WM_PROG, + BRW_SAMPLER_DEFAULT_COLOR, + BRW_SAMPLER, + BRW_WM_UNIT, + BRW_SF_PROG, + BRW_SF_VP, + BRW_SF_UNIT, + BRW_VS_UNIT, + BRW_VS_PROG, + BRW_GS_UNIT, + BRW_GS_PROG, + BRW_CLIP_VP, + BRW_CLIP_UNIT, + BRW_CLIP_PROG, + BRW_SS_SURFACE, + BRW_SS_SURF_BIND, + + BRW_MAX_CACHE +}; + +#define BRW_CONSTANT_BUFFER BRW_MAX_CACHE + +/** + * Additional winsys interface for i965simple. + * + * It is an over-simple batchbuffer mechanism. Will want to improve the + * performance of this, perhaps based on the cmdstream stuff. It + * would be pretty impossible to implement swz on top of this + * interface. + * + * Will also need additions/changes to implement static/dynamic + * indirect state. + */ +struct brw_winsys { + + void (*destroy)(struct brw_winsys *); + + /** + * Reserve space on batch buffer. + * + * Returns a null pointer if there is insufficient space in the batch buffer + * to hold the requested number of dwords and relocations. + * + * The number of dwords should also include the number of relocations. + */ + unsigned *(*batch_start)(struct brw_winsys *sws, + unsigned dwords, + unsigned relocs); + + void (*batch_dword)(struct brw_winsys *sws, + unsigned dword); + + /** + * Emit a relocation to a buffer. + * + * Used not only when the buffer addresses are not pinned, but also to + * ensure refered buffers will not be destroyed until the current batch + * buffer execution is finished. + * + * The access flags is a combination of I915_BUFFER_ACCESS_WRITE and + * I915_BUFFER_ACCESS_READ macros. 
+ */ + void (*batch_reloc)(struct brw_winsys *sws, + struct pipe_buffer *buf, + unsigned access_flags, + unsigned delta); + + + /* Not used yet, but really want this: + */ + void (*batch_end)( struct brw_winsys *sws ); + + /** + * Flush the batch buffer. + * + * Fence argument must point to NULL or to a previous fence, and the caller + * must call fence_reference when done with the fence. + */ + void (*batch_flush)(struct brw_winsys *sws, + struct pipe_fence_handle **fence); + + + /* A version of buffer_subdata that includes information for the + * simulator: + */ + void (*buffer_subdata_typed)(struct brw_winsys *sws, + struct pipe_buffer *buf, + unsigned long offset, + unsigned long size, + const void *data, + unsigned data_type); + + + /* A cheat so we don't have to think about relocations in a couple + * of places yet: + */ + unsigned (*get_buffer_offset)( struct brw_winsys *sws, + struct pipe_buffer *buf, + unsigned flags ); + +}; + +#define BRW_BUFFER_ACCESS_WRITE 0x1 +#define BRW_BUFFER_ACCESS_READ 0x2 + +#define BRW_BUFFER_USAGE_LIT_VERTEX (PIPE_BUFFER_USAGE_CUSTOM << 0) + + +struct pipe_context *brw_create(struct pipe_screen *, + struct brw_winsys *, + unsigned pci_id); + +static inline boolean brw_batchbuffer_data(struct brw_winsys *winsys, + const void *data, + unsigned bytes) +{ + static const unsigned incr = sizeof(unsigned); + uint i; + const unsigned *udata = (const unsigned*)(data); + unsigned size = bytes/incr; + + winsys->batch_start(winsys, size, 0); + for (i = 0; i < size; ++i) { + winsys->batch_dword(winsys, udata[i]); + } + winsys->batch_end(winsys); + + return (i == size); +} +#endif diff --git a/src/gallium/drivers/i965simple/brw_wm.c b/src/gallium/drivers/i965simple/brw_wm.c new file mode 100644 index 0000000000..8de565b96c --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm.c @@ -0,0 +1,209 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_context.h" +#include "brw_util.h" +#include "brw_wm.h" +#include "brw_eu.h" +#include "brw_state.h" +#include "util/u_memory.h" + + + +static void do_wm_prog( struct brw_context *brw, + struct brw_fragment_program *fp, + struct brw_wm_prog_key *key) +{ + struct brw_wm_compile *c = CALLOC_STRUCT(brw_wm_compile); + const unsigned *program; + unsigned program_size; + + c->key = *key; + c->fp = fp; + + c->delta_xy[0] = brw_null_reg(); + c->delta_xy[1] = brw_null_reg(); + c->pixel_xy[0] = brw_null_reg(); + c->pixel_xy[1] = brw_null_reg(); + c->pixel_w = brw_null_reg(); + + + debug_printf("XXXXXXXX FP\n"); + + brw_wm_glsl_emit(c); + + /* get the program + */ + program = brw_get_program(&c->func, &program_size); + + /* + */ + brw->wm.prog_gs_offset = brw_upload_cache( &brw->cache[BRW_WM_PROG], + &c->key, + sizeof(c->key), + program, + program_size, + &c->prog_data, + &brw->wm.prog_data ); + + FREE(c); +} + + + +static void brw_wm_populate_key( struct brw_context *brw, + struct brw_wm_prog_key *key ) +{ + /* BRW_NEW_FRAGMENT_PROGRAM */ + struct brw_fragment_program *fp = + (struct brw_fragment_program *)brw->attribs.FragmentProgram; + unsigned lookup = 0; + unsigned line_aa; + + memset(key, 0, sizeof(*key)); + + /* Build the index for table lookup + */ + /* BRW_NEW_DEPTH_STENCIL */ + if (fp->info.uses_kill || + brw->attribs.DepthStencil->alpha.enabled) + lookup |= IZ_PS_KILL_ALPHATEST_BIT; + + if (fp->info.writes_z) + lookup |= IZ_PS_COMPUTES_DEPTH_BIT; + + if (brw->attribs.DepthStencil->depth.enabled) + lookup |= IZ_DEPTH_TEST_ENABLE_BIT; + + if (brw->attribs.DepthStencil->depth.enabled && + brw->attribs.DepthStencil->depth.writemask) /* ?? */ + lookup |= IZ_DEPTH_WRITE_ENABLE_BIT; + + if (brw->attribs.DepthStencil->stencil[0].enabled) { + lookup |= IZ_STENCIL_TEST_ENABLE_BIT; + + if (brw->attribs.DepthStencil->stencil[0].write_mask || + brw->attribs.DepthStencil->stencil[1].write_mask) + lookup |= IZ_STENCIL_WRITE_ENABLE_BIT; + } + + /* XXX: when should this be disabled? 
+ */ + if (1) + lookup |= IZ_EARLY_DEPTH_TEST_BIT; + + + line_aa = AA_NEVER; + + /* _NEW_LINE, _NEW_POLYGON, BRW_NEW_REDUCED_PRIMITIVE */ + if (brw->attribs.Raster->line_smooth) { + if (brw->reduced_primitive == PIPE_PRIM_LINES) { + line_aa = AA_ALWAYS; + } + else if (brw->reduced_primitive == PIPE_PRIM_TRIANGLES) { + if (brw->attribs.Raster->fill_ccw == PIPE_POLYGON_MODE_LINE) { + line_aa = AA_SOMETIMES; + + if (brw->attribs.Raster->fill_cw == PIPE_POLYGON_MODE_LINE || + (brw->attribs.Raster->cull_mode == PIPE_WINDING_CW)) + line_aa = AA_ALWAYS; + } + else if (brw->attribs.Raster->fill_cw == PIPE_POLYGON_MODE_LINE) { + line_aa = AA_SOMETIMES; + + if (brw->attribs.Raster->cull_mode == PIPE_WINDING_CCW) + line_aa = AA_ALWAYS; + } + } + } + + brw_wm_lookup_iz(line_aa, + lookup, + key); + + +#if 0 + /* BRW_NEW_SAMPLER + * + * Not doing any of this at the moment: + */ + for (i = 0; i < BRW_MAX_TEX_UNIT; i++) { + const struct pipe_sampler_state *unit = brw->attribs.Samplers[i]; + + if (unit) { + + if (unit->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + key->shadowtex_mask |= 1<<i; + } + if (t->Image[0][t->BaseLevel]->InternalFormat == GL_YCBCR_MESA) + key->yuvtex_mask |= 1<<i; + } + } +#endif + + + /* Extra info: + */ + key->program_string_id = fp->id; + +} + + +static void brw_upload_wm_prog( struct brw_context *brw ) +{ + struct brw_wm_prog_key key; + struct brw_fragment_program *fp = (struct brw_fragment_program *) + brw->attribs.FragmentProgram; + + brw_wm_populate_key(brw, &key); + + /* Make an early check for the key. + */ + if (brw_search_cache(&brw->cache[BRW_WM_PROG], + &key, sizeof(key), + &brw->wm.prog_data, + &brw->wm.prog_gs_offset)) + return; + + do_wm_prog(brw, fp, &key); +} + + +const struct brw_tracked_state brw_wm_prog = { + .dirty = { + .brw = (BRW_NEW_FS | + BRW_NEW_REDUCED_PRIMITIVE), + .cache = 0 + }, + .update = brw_upload_wm_prog +}; + diff --git a/src/gallium/drivers/i965simple/brw_wm.h b/src/gallium/drivers/i965simple/brw_wm.h new file mode 100644 index 0000000000..b29c4393f0 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm.h @@ -0,0 +1,142 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
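brw_upload_wm_prog() above is the driver's usual state-upload shape: build a small key struct from the bound state, probe the program cache with it, and only run the compiler on a miss. The toy version below keeps that flow but swaps brw_search_cache()/brw_upload_cache() for a fixed array and a linear memcmp() search; the key fields and the "compiled program" value are placeholders.

/* Toy version of the populate-key / search-cache / compile-on-miss flow. */
#include <stdio.h>
#include <string.h>

struct wm_key {
   unsigned computes_depth;
   unsigned program_string_id;
};

struct cache_entry {
   struct wm_key key;
   int           compiled;   /* stand-in for the uploaded program */
};

static struct cache_entry cache[16];
static int cache_size;

static int search_cache(const struct wm_key *key, int *out)
{
   int i;
   for (i = 0; i < cache_size; i++)
      if (memcmp(&cache[i].key, key, sizeof(*key)) == 0) {
         *out = cache[i].compiled;
         return 1;
      }
   return 0;
}

static int compile_and_insert(const struct wm_key *key)
{
   int program = 1000 + cache_size;   /* pretend compile result */
   cache[cache_size].key = *key;
   cache[cache_size].compiled = program;
   cache_size++;
   return program;
}

int main(void)
{
   struct wm_key key = { 0, 42 };
   int prog;

   if (!search_cache(&key, &prog))
      prog = compile_and_insert(&key);   /* first time: compile */
   printf("first lookup  -> %d\n", prog);

   if (!search_cache(&key, &prog))
      prog = compile_and_insert(&key);   /* second time: cache hit */
   printf("second lookup -> %d\n", prog);
   return 0;
}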
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#ifndef BRW_WM_H +#define BRW_WM_H + + +#include "brw_context.h" +#include "brw_eu.h" + +/* A big lookup table is used to figure out which and how many + * additional regs will inserted before the main payload in the WM + * program execution. These mainly relate to depth and stencil + * processing and the early-depth-test optimization. + */ +#define IZ_PS_KILL_ALPHATEST_BIT 0x1 +#define IZ_PS_COMPUTES_DEPTH_BIT 0x2 +#define IZ_DEPTH_WRITE_ENABLE_BIT 0x4 +#define IZ_DEPTH_TEST_ENABLE_BIT 0x8 +#define IZ_STENCIL_WRITE_ENABLE_BIT 0x10 +#define IZ_STENCIL_TEST_ENABLE_BIT 0x20 +#define IZ_EARLY_DEPTH_TEST_BIT 0x40 +#define IZ_BIT_MAX 0x80 + +#define AA_NEVER 0 +#define AA_SOMETIMES 1 +#define AA_ALWAYS 2 + +struct brw_wm_prog_key { + unsigned source_depth_reg:3; + unsigned aa_dest_stencil_reg:3; + unsigned dest_depth_reg:3; + unsigned nr_depth_regs:3; + unsigned shadowtex_mask:8; + unsigned computes_depth:1; /* could be derived from program string */ + unsigned source_depth_to_render_target:1; + unsigned runtime_check_aads_emit:1; + + unsigned yuvtex_mask:8; + + unsigned program_string_id; +}; + + + + + +#define PROGRAM_INTERNAL_PARAM +#define MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS 1024 /* 72 for GL_ARB_f_p */ +#define BRW_WM_MAX_INSN (MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS*3 + PIPE_MAX_ATTRIBS + 3) +#define BRW_WM_MAX_GRF 128 /* hardware limit */ +#define BRW_WM_MAX_VREG (BRW_WM_MAX_INSN * 4) +#define BRW_WM_MAX_REF (BRW_WM_MAX_INSN * 12) +#define BRW_WM_MAX_PARAM 256 +#define BRW_WM_MAX_CONST 256 +#define BRW_WM_MAX_KILLS MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS + +#define PAYLOAD_DEPTH (PIPE_MAX_ATTRIBS) + +#define MAX_IFSN 32 +#define MAX_LOOP_DEPTH 32 + +struct brw_wm_compile { + struct brw_compile func; + struct brw_wm_prog_key key; + struct brw_wm_prog_data prog_data; /* result */ + + struct brw_fragment_program *fp; + + unsigned grf_limit; + unsigned max_wm_grf; + + + struct brw_reg pixel_xy[2]; + struct brw_reg delta_xy[2]; + struct brw_reg pixel_w; + + + struct brw_reg wm_regs[8][32][4]; + + struct brw_reg payload_depth[4]; + struct brw_reg payload_coef[16]; + + struct brw_reg emit_mask_reg; + + struct brw_instruction *if_inst[MAX_IFSN]; + int if_insn; + + struct brw_instruction *loop_inst[MAX_LOOP_DEPTH]; + int loop_insn; + + struct brw_instruction *inst0; + struct brw_instruction *inst1; + + struct brw_reg stack; + struct brw_indirect stack_index; + + unsigned reg_index; + + unsigned tmp_start; + unsigned tmp_index; +}; + + + +void brw_wm_lookup_iz( unsigned line_aa, + unsigned lookup, + struct brw_wm_prog_key *key ); + +void brw_wm_glsl_emit(struct brw_wm_compile *c); +void brw_wm_emit_decls(struct brw_wm_compile *c); + +#endif diff --git a/src/gallium/drivers/i965simple/brw_wm_decl.c b/src/gallium/drivers/i965simple/brw_wm_decl.c new file mode 100644 index 0000000000..d50e66f613 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm_decl.c @@ -0,0 +1,392 @@ + +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_wm.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" + +static struct brw_reg alloc_tmp(struct brw_wm_compile *c) +{ + c->tmp_index++; + c->reg_index = MAX2(c->reg_index, c->tmp_start + c->tmp_index); + return brw_vec8_grf(c->tmp_start + c->tmp_index, 0); +} + +static void release_tmps(struct brw_wm_compile *c) +{ + c->tmp_index = 0; +} + + + +static 
int is_null( struct brw_reg reg ) +{ + return (reg.file == BRW_ARCHITECTURE_REGISTER_FILE && + reg.nr == BRW_ARF_NULL); +} + +static void emit_pixel_xy( struct brw_wm_compile *c ) +{ + if (is_null(c->pixel_xy[0])) { + + struct brw_compile *p = &c->func; + struct brw_reg r1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); + + c->pixel_xy[0] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW)); + c->pixel_xy[1] = vec8(retype(alloc_tmp(c), BRW_REGISTER_TYPE_UW)); + + /* Calculate pixel centers by adding 1 or 0 to each of the + * micro-tile coordinates passed in r1. + */ + brw_ADD(p, + c->pixel_xy[0], + stride(suboffset(r1_uw, 4), 2, 4, 0), + brw_imm_v(0x10101010)); + + brw_ADD(p, + c->pixel_xy[1], + stride(suboffset(r1_uw, 5), 2, 4, 0), + brw_imm_v(0x11001100)); + } +} + + + + + + +static void emit_delta_xy( struct brw_wm_compile *c ) +{ + if (is_null(c->delta_xy[0])) { + struct brw_compile *p = &c->func; + struct brw_reg r1 = brw_vec1_grf(1, 0); + + emit_pixel_xy(c); + + c->delta_xy[0] = alloc_tmp(c); + c->delta_xy[1] = alloc_tmp(c); + + /* Calc delta X,Y by subtracting origin in r1 from the pixel + * centers. + */ + brw_ADD(p, + c->delta_xy[0], + retype(c->pixel_xy[0], BRW_REGISTER_TYPE_UW), + negate(r1)); + + brw_ADD(p, + c->delta_xy[1], + retype(c->pixel_xy[1], BRW_REGISTER_TYPE_UW), + negate(suboffset(r1,1))); + } +} + + + +#if 0 +static void emit_pixel_w( struct brw_wm_compile *c ) +{ + if (is_null(c->pixel_w)) { + struct brw_compile *p = &c->func; + + struct brw_reg interp_wpos = c->coef_wpos; + + c->pixel_w = alloc_tmp(c); + + emit_delta_xy(c); + + /* Calc 1/w - just linterp wpos[3] optimized by putting the + * result straight into a message reg. + */ + struct brw_reg interp3 = brw_vec1_grf(interp_wpos.nr+1, 4); + brw_LINE(p, brw_null_reg(), interp3, c->delta_xy[0]); + brw_MAC(p, brw_message_reg(2), suboffset(interp3, 1), c->delta_xy[1]); + + /* Calc w */ + brw_math_16( p, + c->pixel_w, + BRW_MATH_FUNCTION_INV, + BRW_MATH_SATURATE_NONE, + 2, + brw_null_reg(), + BRW_MATH_PRECISION_FULL); + } +} +#endif + + +static void emit_cinterp(struct brw_wm_compile *c, + int idx, + int mask ) +{ + struct brw_compile *p = &c->func; + struct brw_reg interp[4]; + struct brw_reg coef = c->payload_coef[idx]; + int i; + + interp[0] = brw_vec1_grf(coef.nr, 0); + interp[1] = brw_vec1_grf(coef.nr, 4); + interp[2] = brw_vec1_grf(coef.nr+1, 0); + interp[3] = brw_vec1_grf(coef.nr+1, 4); + + for(i = 0; i < 4; i++ ) { + if (mask & (1<<i)) { + struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i]; + brw_MOV(p, dst, suboffset(interp[i],3)); + } + } +} + +static void emit_linterp(struct brw_wm_compile *c, + int idx, + int mask ) +{ + struct brw_compile *p = &c->func; + struct brw_reg interp[4]; + struct brw_reg coef = c->payload_coef[idx]; + int i; + + emit_delta_xy(c); + + interp[0] = brw_vec1_grf(coef.nr, 0); + interp[1] = brw_vec1_grf(coef.nr, 4); + interp[2] = brw_vec1_grf(coef.nr+1, 0); + interp[3] = brw_vec1_grf(coef.nr+1, 4); + + for(i = 0; i < 4; i++ ) { + if (mask & (1<<i)) { + struct brw_reg dst = c->wm_regs[TGSI_FILE_INPUT][idx][i]; + brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]); + brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]); + } + } +} + +#if 0 +static void emit_pinterp(struct brw_wm_compile *c, + int idx, + int mask ) +{ + struct brw_compile *p = &c->func; + struct brw_reg interp[4]; + struct brw_reg coef = c->payload_coef[idx]; + int i; + + get_delta_xy(c); + get_pixel_w(c); + + interp[0] = brw_vec1_grf(coef.nr, 0); + interp[1] = brw_vec1_grf(coef.nr, 4); + interp[2] = 
brw_vec1_grf(coef.nr+1, 0); + interp[3] = brw_vec1_grf(coef.nr+1, 4); + + for(i = 0; i < 4; i++ ) { + if (mask & (1<<i)) { + struct brw_reg dst = allocate_reg(c, TGSI_FILE_INPUT, idx, i); + brw_LINE(p, brw_null_reg(), interp[i], c->delta_xy[0]); + brw_MAC(p, dst, suboffset(interp[i],1), c->delta_xy[1]); + brw_MUL(p, dst, dst, c->pixel_w); + } + } +} +#endif + + + +#if 0 +static void emit_wpos( ) +{ + struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx); + struct tgsi_full_src_register interp = src_reg(PROGRAM_PAYLOAD, idx); + struct tgsi_full_src_register deltas = get_delta_xy(c); + struct tgsi_full_src_register arg2; + unsigned opcode; + + opcode = WM_LINTERP; + arg2 = src_undef(); + + /* Have to treat wpos.xy specially: + */ + emit_op(c, + WM_WPOSXY, + dst_mask(dst, WRITEMASK_XY), + 0, 0, 0, + get_pixel_xy(c), + src_undef(), + src_undef()); + + dst = dst_mask(dst, WRITEMASK_ZW); + + /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw + */ + emit_op(c, + WM_LINTERP, + dst, + 0, 0, 0, + interp, + deltas, + arg2); +} +#endif + + + + +/* Perform register allocation: + * + * -- r0??? + * -- passthrough depth regs (and stencil/aa??) + * -- curbe ?? + * -- inputs (coefficients) + * + * Use a totally static register allocation. This will perform poorly + * but is an easy way to get started (again). + */ +static void prealloc_reg(struct brw_wm_compile *c) +{ + int i, j; + int nr_curbe_regs = 0; + + /* R0, then some depth related regs: + */ + for (i = 0; i < c->key.nr_depth_regs; i++) { + c->payload_depth[i] = brw_vec8_grf(i*2, 0); + c->reg_index += 2; + } + + + /* Then a copy of our part of the CURBE entry: + */ + { + int nr_constants = c->fp->info.file_max[TGSI_FILE_CONSTANT] + 1; + int index = 0; + + /* XXX number of constants, or highest numbered constant? */ + assert(nr_constants == c->fp->info.file_count[TGSI_FILE_CONSTANT]); + + c->prog_data.max_const = 4*nr_constants; + for (i = 0; i < nr_constants; i++) { + for (j = 0; j < 4; j++, index++) + c->wm_regs[TGSI_FILE_CONSTANT][i][j] = brw_vec1_grf(c->reg_index + index/8, + index%8); + } + + nr_curbe_regs = 2*((4*nr_constants+15)/16); + c->reg_index += nr_curbe_regs; + } + + /* Adjust for parameter coefficients for position, which are + * currently always provided. + */ +// c->position_coef[i] = brw_vec8_grf(c->reg_index, 0); + c->reg_index += 2; + + /* Next we receive the plane coefficients for parameter + * interpolation: + */ + assert(c->fp->info.file_max[TGSI_FILE_INPUT] == c->fp->info.num_inputs); + for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++) { + c->payload_coef[i] = brw_vec8_grf(c->reg_index, 0); + c->reg_index += 2; + } + + c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2; + c->prog_data.urb_read_length = (c->fp->info.num_inputs + 1) * 2; + c->prog_data.curb_read_length = nr_curbe_regs; + + /* That's the end of the payload, now we can start allocating registers. + */ + c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0); + c->reg_index++; + + c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0); + c->reg_index += 2; + + /* Now allocate room for the interpolated inputs and staging + * registers for the outputs: + */ + /* XXX do we want to loop over the _number_ of inputs/outputs or loop + * to the highest input/output index that's used? + * Probably the same, actually. 
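prealloc_reg() above sizes the payload with a little fixed arithmetic: two GRFs per depth entry, 2*((4*nr_constants + 15)/16) GRFs for the CURBE copy (vec4 constants allocated in two-GRF, 16-float chunks), two GRFs for position coefficients and two more per interpolated input. The snippet below just replays that arithmetic for made-up counts so the layout is easy to see; it shares no code with the driver.

/* Replays the payload-size arithmetic from prealloc_reg(). */
#include <stdio.h>

int main(void)
{
   int nr_depth_regs = 1;   /* from the IZ lookup (example value) */
   int nr_constants  = 3;   /* size of the CONST file (example value) */
   int nr_inputs     = 2;   /* interpolated FS inputs (example value) */
   int curbe_regs;
   int reg = 0;

   reg += nr_depth_regs * 2;                 /* R0 and depth regs, 2 each */

   curbe_regs = 2 * ((4 * nr_constants + 15) / 16);
   reg += curbe_regs;                        /* constants, two-GRF chunks */

   reg += 2;                                 /* position coefficients */
   reg += nr_inputs * 2;                     /* two coefficient GRFs/input */

   printf("curbe regs = %d\n", curbe_regs);
   printf("payload ends at GRF %d, urb_read_length = %d\n",
          reg, (nr_inputs + 1) * 2);
   return 0;
}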
+ */ + assert(c->fp->info.file_max[TGSI_FILE_INPUT] + 1 == c->fp->info.num_inputs); + assert(c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1 == c->fp->info.num_outputs); + for (i = 0; i < c->fp->info.file_max[TGSI_FILE_INPUT] + 1; i++) + for (j = 0; j < 4; j++) + c->wm_regs[TGSI_FILE_INPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 ); + + for (i = 0; i < c->fp->info.file_max[TGSI_FILE_OUTPUT] + 1; i++) + for (j = 0; j < 4; j++) + c->wm_regs[TGSI_FILE_OUTPUT][i][j] = brw_vec8_grf( c->reg_index++, 0 ); + + /* Beyond this we should only need registers for internal temporaries: + */ + c->tmp_start = c->reg_index; +} + + + + + +/* Need to interpolate fragment program inputs in as a preamble to the + * shader. A more sophisticated compiler would do this on demand, but + * we'll do it up front: + */ +void brw_wm_emit_decls(struct brw_wm_compile *c) +{ + struct tgsi_parse_context parse; + int done = 0; + + prealloc_reg(c); + + tgsi_parse_init( &parse, c->fp->program.tokens ); + + while( !done && + !tgsi_parse_end_of_tokens( &parse ) ) + { + tgsi_parse_token( &parse ); + + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + { + const struct tgsi_full_declaration *decl = &parse.FullToken.FullDeclaration; + unsigned first = decl->DeclarationRange.First; + unsigned last = decl->DeclarationRange.Last; + unsigned mask = decl->Declaration.UsageMask; /* ? */ + unsigned i; + + if (decl->Declaration.File != TGSI_FILE_INPUT) + break; + + for( i = first; i <= last; i++ ) { + switch (decl->Declaration.Interpolate) { + case TGSI_INTERPOLATE_CONSTANT: + emit_cinterp(c, i, mask); + break; + + case TGSI_INTERPOLATE_LINEAR: + emit_linterp(c, i, mask); + break; + + case TGSI_INTERPOLATE_PERSPECTIVE: + //emit_pinterp(c, i, mask); + emit_linterp(c, i, mask); + break; + } + } + break; + } + case TGSI_TOKEN_TYPE_IMMEDIATE: + case TGSI_TOKEN_TYPE_INSTRUCTION: + default: + done = 1; + break; + } + } + + tgsi_parse_free (&parse); + + release_tmps(c); +} diff --git a/src/gallium/drivers/i965simple/brw_wm_glsl.c b/src/gallium/drivers/i965simple/brw_wm_glsl.c new file mode 100644 index 0000000000..ab6410aa60 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm_glsl.c @@ -0,0 +1,1076 @@ + +#include "brw_context.h" +#include "brw_eu.h" +#include "brw_wm.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" + + + +static int get_scalar_dst_index(struct tgsi_full_instruction *inst) +{ + struct tgsi_dst_register dst = inst->FullDstRegisters[0].DstRegister; + int i; + for (i = 0; i < 4; i++) + if (dst.WriteMask & (1<<i)) + break; + return i; +} + +static struct brw_reg alloc_tmp(struct brw_wm_compile *c) +{ + c->tmp_index++; + c->reg_index = MAX2(c->reg_index, c->tmp_index); + return brw_vec8_grf(c->tmp_start + c->tmp_index, 0); +} + +static void release_tmps(struct brw_wm_compile *c) +{ + c->tmp_index = 0; +} + + +static struct brw_reg +get_reg(struct brw_wm_compile *c, int file, int index, int component ) +{ + switch (file) { + case TGSI_FILE_NULL: + return brw_null_reg(); + + case TGSI_FILE_SAMPLER: + /* Should never get here: + */ + assert (0); + return brw_null_reg(); + + case TGSI_FILE_IMMEDIATE: + /* These need a different path: + */ + assert(0); + return brw_null_reg(); + + + case TGSI_FILE_CONSTANT: + case TGSI_FILE_INPUT: + case TGSI_FILE_OUTPUT: + case TGSI_FILE_TEMPORARY: + case TGSI_FILE_ADDRESS: + return c->wm_regs[file][index][component]; + + default: + assert(0); + return brw_null_reg(); + } +} + + +static struct brw_reg 
get_dst_reg(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst, + int component) +{ + return get_reg(c, + inst->FullDstRegisters[0].DstRegister.File, + inst->FullDstRegisters[0].DstRegister.Index, + component); +} + +static int get_swz( struct tgsi_src_register src, int index ) +{ + switch (index & 3) { + case 0: return src.SwizzleX; + case 1: return src.SwizzleY; + case 2: return src.SwizzleZ; + case 3: return src.SwizzleW; + default: return 0; + } +} + +static int get_ext_swz( struct tgsi_src_register_ext_swz src, int index ) +{ + switch (index & 3) { + case 0: return src.ExtSwizzleX; + case 1: return src.ExtSwizzleY; + case 2: return src.ExtSwizzleZ; + case 3: return src.ExtSwizzleW; + default: return 0; + } +} + +static struct brw_reg get_src_reg(struct brw_wm_compile *c, + struct tgsi_full_src_register *src, + int index) +{ + struct brw_reg reg; + int component = index; + int neg = 0; + int abs = 0; + + if (src->SrcRegister.Negate) + neg = 1; + + component = get_swz(src->SrcRegister, component); + + /* Yes, there are multiple negates: + */ + switch (component & 3) { + case 0: neg ^= src->SrcRegisterExtSwz.NegateX; break; + case 1: neg ^= src->SrcRegisterExtSwz.NegateY; break; + case 2: neg ^= src->SrcRegisterExtSwz.NegateZ; break; + case 3: neg ^= src->SrcRegisterExtSwz.NegateW; break; + } + + /* And multiple swizzles, fun isn't it: + */ + component = get_ext_swz(src->SrcRegisterExtSwz, component); + + /* Not handling indirect lookups yet: + */ + assert(src->SrcRegister.Indirect == 0); + + /* Don't know what dimension means: + */ + assert(src->SrcRegister.Dimension == 0); + + /* Will never handle any of this stuff: + */ + assert(src->SrcRegisterExtMod.Complement == 0); + assert(src->SrcRegisterExtMod.Bias == 0); + assert(src->SrcRegisterExtMod.Scale2X == 0); + + if (src->SrcRegisterExtMod.Absolute) + abs = 1; + + /* Another negate! This is a post-absolute negate, which we + * can't do. Need to clean the crap out of tgsi somehow. + */ + assert(src->SrcRegisterExtMod.Negate == 0); + + switch( component ) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + reg = get_reg(c, + src->SrcRegister.File, + src->SrcRegister.Index, + component ); + + if (neg) + reg = negate(reg); + + if (abs) + reg = brw_abs(reg); + + break; + + /* XXX: this won't really work in the general case, but we know + * that the extended swizzle is only allowed in the SWZ + * instruction (right??), in which case using an immediate + * directly will work. 
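get_src_reg() above folds the plain swizzle, the per-channel negate bits and the extended swizzle into one component selector before the switch that follows turns it into a register or an immediate 0.0/1.0. The scalar model below walks the same resolution order on plain floats; the SEL_ZERO/SEL_ONE constants are local to the example, not the TGSI_EXTSWIZZLE_* values, and the base source negate and absolute-value modifiers are left out for brevity.

/* Scalar model of the per-component resolution: first-level swizzle,
 * per-channel negate, then an extended swizzle that may select 0 or 1.
 * Selector encoding is local: 0-3 pick x/y/z/w, 4 is ZERO, 5 is ONE.
 */
#include <stdio.h>

#define SEL_ZERO 4
#define SEL_ONE  5

static float resolve(const float src[4], const int swz[4],
                     const int ext_swz[4], const int neg[4], int component)
{
   int chan   = swz[component & 3];   /* first-level swizzle */
   int negate = neg[chan & 3];        /* per-channel negate, post-swizzle */
   int sel    = ext_swz[chan & 3];    /* extended swizzle / constants */

   if (sel == SEL_ZERO)
      return 0.0f;
   if (sel == SEL_ONE)
      return negate ? -1.0f : 1.0f;
   return negate ? -src[sel] : src[sel];
}

int main(void)
{
   float src[4]   = { 1.0f, 2.0f, 3.0f, 4.0f };
   int swz[4]     = { 3, 2, 1, 0 };              /* .wzyx */
   int ext_swz[4] = { 0, 1, SEL_ZERO, SEL_ONE }; /* z -> 0.0, w -> 1.0 */
   int neg[4]     = { 0, 0, 0, 0 };
   int i;

   for (i = 0; i < 4; i++)
      printf("component %d -> %g\n", i,
             resolve(src, swz, ext_swz, neg, i));
   return 0;
}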
+ */ + case TGSI_EXTSWIZZLE_ZERO: + reg = brw_imm_f(0); + break; + + case TGSI_EXTSWIZZLE_ONE: + if (neg && !abs) + reg = brw_imm_f(-1.0); + else + reg = brw_imm_f(1.0); + break; + + default: + assert(0); + break; + } + + + return reg; +} + +static void emit_abs( struct brw_wm_compile *c, + struct tgsi_full_instruction *inst) +{ + unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; + + int i; + struct brw_compile *p = &c->func; + brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE); + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + struct brw_reg src, dst; + dst = get_dst_reg(c, inst, i); + src = get_src_reg(c, &inst->FullSrcRegisters[0], i); + brw_MOV(p, dst, brw_abs(src)); /* NOTE */ + } + } + brw_set_saturate(p, 0); +} + + +static void emit_xpd(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst) +{ + int i; + struct brw_compile *p = &c->func; + unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; + for (i = 0; i < 4; i++) { + unsigned i2 = (i+2)%3; + unsigned i1 = (i+1)%3; + if (mask & (1<<i)) { + struct brw_reg src0, src1, dst; + dst = get_dst_reg(c, inst, i); + src0 = negate(get_src_reg(c, &inst->FullSrcRegisters[0], i2)); + src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i1); + brw_MUL(p, brw_null_reg(), src0, src1); + src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i1); + src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i2); + brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE); + brw_MAC(p, dst, src0, src1); + brw_set_saturate(p, 0); + } + } + brw_set_saturate(p, 0); +} + +static void emit_dp3(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst) +{ + struct brw_reg src0[3], src1[3], dst; + int i; + struct brw_compile *p = &c->func; + for (i = 0; i < 3; i++) { + src0[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i); + src1[i] = get_src_reg(c, &inst->FullSrcRegisters[1], i); + } + + dst = get_dst_reg(c, inst, get_scalar_dst_index(inst)); + brw_MUL(p, brw_null_reg(), src0[0], src1[0]); + brw_MAC(p, brw_null_reg(), src0[1], src1[1]); + brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); + brw_MAC(p, dst, src0[2], src1[2]); + brw_set_saturate(p, 0); +} + +static void emit_dp4(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst) +{ + struct brw_reg src0[4], src1[4], dst; + int i; + struct brw_compile *p = &c->func; + for (i = 0; i < 4; i++) { + src0[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i); + src1[i] = get_src_reg(c, &inst->FullSrcRegisters[1], i); + } + dst = get_dst_reg(c, inst, get_scalar_dst_index(inst)); + brw_MUL(p, brw_null_reg(), src0[0], src1[0]); + brw_MAC(p, brw_null_reg(), src0[1], src1[1]); + brw_MAC(p, brw_null_reg(), src0[2], src1[2]); + brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); + brw_MAC(p, dst, src0[3], src1[3]); + brw_set_saturate(p, 0); +} + +static void emit_dph(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst) +{ + struct brw_reg src0[4], src1[4], dst; + int i; + struct brw_compile *p = &c->func; + for (i = 0; i < 4; i++) { + src0[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i); + src1[i] = get_src_reg(c, &inst->FullSrcRegisters[1], i); + } + dst = get_dst_reg(c, inst, get_scalar_dst_index(inst)); + brw_MUL(p, brw_null_reg(), src0[0], src1[0]); + brw_MAC(p, brw_null_reg(), src0[1], src1[1]); + brw_MAC(p, dst, src0[2], src1[2]); + brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 
1 : 0); + brw_ADD(p, dst, src0[3], src1[3]); + brw_set_saturate(p, 0); +} + +static void emit_math1(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst, unsigned func) +{ + struct brw_compile *p = &c->func; + struct brw_reg src0, dst; + + src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0); + dst = get_dst_reg(c, inst, get_scalar_dst_index(inst)); + brw_MOV(p, brw_message_reg(2), src0); + brw_math(p, + dst, + func, + ((inst->Instruction.Saturate != TGSI_SAT_NONE) + ? BRW_MATH_SATURATE_SATURATE + : BRW_MATH_SATURATE_NONE), + 2, + brw_null_reg(), + BRW_MATH_DATA_VECTOR, + BRW_MATH_PRECISION_FULL); +} + + +static void emit_alu2(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst, + unsigned opcode) +{ + struct brw_compile *p = &c->func; + struct brw_reg src0, src1, dst; + unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; + int i; + brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); + for (i = 0 ; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i); + src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i); + brw_alu2(p, opcode, dst, src0, src1); + } + } + brw_set_saturate(p, 0); +} + + +static void emit_alu1(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst, + unsigned opcode) +{ + struct brw_compile *p = &c->func; + struct brw_reg src0, dst; + unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; + int i; + brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); + for (i = 0 ; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i); + brw_alu1(p, opcode, dst, src0); + } + } + if (inst->Instruction.Saturate != TGSI_SAT_NONE) + brw_set_saturate(p, 0); +} + + +static void emit_max(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst) +{ + struct brw_compile *p = &c->func; + unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; + struct brw_reg src0, src1, dst; + int i; + brw_push_insn_state(p); + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i); + src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i); + brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); + brw_MOV(p, dst, src0); + brw_set_saturate(p, 0); + + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src0, src1); + brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); + brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); + brw_MOV(p, dst, src1); + brw_set_saturate(p, 0); + brw_set_predicate_control_flag_value(p, 0xff); + } + } + brw_pop_insn_state(p); +} + +static void emit_min(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst) +{ + struct brw_compile *p = &c->func; + unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; + struct brw_reg src0, src1, dst; + int i; + brw_push_insn_state(p); + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i); + src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i); + brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); + brw_MOV(p, dst, src0); + brw_set_saturate(p, 0); + + brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, src1, src0); + brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 
1 : 0); + brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); + brw_MOV(p, dst, src1); + brw_set_saturate(p, 0); + brw_set_predicate_control_flag_value(p, 0xff); + } + } + brw_pop_insn_state(p); +} + +static void emit_pow(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst) +{ + struct brw_compile *p = &c->func; + struct brw_reg dst, src0, src1; + dst = get_dst_reg(c, inst, get_scalar_dst_index(inst)); + src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0); + src1 = get_src_reg(c, &inst->FullSrcRegisters[1], 0); + + brw_MOV(p, brw_message_reg(2), src0); + brw_MOV(p, brw_message_reg(3), src1); + + brw_math(p, + dst, + BRW_MATH_FUNCTION_POW, + (inst->Instruction.Saturate != TGSI_SAT_NONE + ? BRW_MATH_SATURATE_SATURATE + : BRW_MATH_SATURATE_NONE), + 2, + brw_null_reg(), + BRW_MATH_DATA_VECTOR, + BRW_MATH_PRECISION_FULL); +} + +static void emit_lrp(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst) +{ + struct brw_compile *p = &c->func; + unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; + struct brw_reg dst, tmp1, tmp2, src0, src1, src2; + int i; + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i); + + src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i); + + if (src1.nr == dst.nr) { + tmp1 = alloc_tmp(c); + brw_MOV(p, tmp1, src1); + } else + tmp1 = src1; + + src2 = get_src_reg(c, &inst->FullSrcRegisters[2], i); + if (src2.nr == dst.nr) { + tmp2 = alloc_tmp(c); + brw_MOV(p, tmp2, src2); + } else + tmp2 = src2; + + brw_ADD(p, dst, negate(src0), brw_imm_f(1.0)); + brw_MUL(p, brw_null_reg(), dst, tmp2); + brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 1 : 0); + brw_MAC(p, dst, src0, tmp1); + brw_set_saturate(p, 0); + } + release_tmps(c); + } +} + +static void emit_kil(struct brw_wm_compile *c) +{ + struct brw_compile *p = &c->func; + struct brw_reg depth = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW); + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_NOT(p, c->emit_mask_reg, brw_mask_reg(1)); //IMASK + brw_AND(p, depth, c->emit_mask_reg, depth); + brw_pop_insn_state(p); +} + +static void emit_mad(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst) +{ + struct brw_compile *p = &c->func; + unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; + struct brw_reg dst, src0, src1, src2; + int i; + + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i); + src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i); + src2 = get_src_reg(c, &inst->FullSrcRegisters[2], i); + brw_MUL(p, dst, src0, src1); + + brw_set_saturate(p, (inst->Instruction.Saturate != TGSI_SAT_NONE) ? 
1 : 0); + brw_ADD(p, dst, dst, src2); + brw_set_saturate(p, 0); + } + } +} + +static void emit_sop(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst, unsigned cond) +{ + struct brw_compile *p = &c->func; + unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; + struct brw_reg dst, src0, src1; + int i; + + brw_push_insn_state(p); + for (i = 0; i < 4; i++) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + src0 = get_src_reg(c, &inst->FullSrcRegisters[0], i); + src1 = get_src_reg(c, &inst->FullSrcRegisters[1], i); + brw_CMP(p, brw_null_reg(), cond, src0, src1); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + brw_MOV(p, dst, brw_imm_f(0.0)); + brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); + brw_MOV(p, dst, brw_imm_f(1.0)); + } + } + brw_pop_insn_state(p); +} + + +static void emit_ddx(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst) +{ + struct brw_compile *p = &c->func; + unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; + struct brw_reg interp[4]; + struct brw_reg dst; + struct brw_reg src0, w; + unsigned nr, i; + src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0); + w = get_src_reg(c, &inst->FullSrcRegisters[1], 3); + nr = src0.nr; + interp[0] = brw_vec1_grf(nr, 0); + interp[1] = brw_vec1_grf(nr, 4); + interp[2] = brw_vec1_grf(nr+1, 0); + interp[3] = brw_vec1_grf(nr+1, 4); + brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE); + for(i = 0; i < 4; i++ ) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + brw_MOV(p, dst, interp[i]); + brw_MUL(p, dst, dst, w); + } + } + brw_set_saturate(p, 0); +} + +static void emit_ddy(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst) +{ + struct brw_compile *p = &c->func; + unsigned mask = inst->FullDstRegisters[0].DstRegister.WriteMask; + struct brw_reg interp[4]; + struct brw_reg dst; + struct brw_reg src0, w; + unsigned nr, i; + + src0 = get_src_reg(c, &inst->FullSrcRegisters[0], 0); + nr = src0.nr; + w = get_src_reg(c, &inst->FullSrcRegisters[1], 3); + interp[0] = brw_vec1_grf(nr, 0); + interp[1] = brw_vec1_grf(nr, 4); + interp[2] = brw_vec1_grf(nr+1, 0); + interp[3] = brw_vec1_grf(nr+1, 4); + brw_set_saturate(p, inst->Instruction.Saturate != TGSI_SAT_NONE); + for(i = 0; i < 4; i++ ) { + if (mask & (1<<i)) { + dst = get_dst_reg(c, inst, i); + brw_MOV(p, dst, suboffset(interp[i], 1)); + brw_MUL(p, dst, dst, w); + } + } + brw_set_saturate(p, 0); +} + +/* TODO + BIAS on SIMD8 not workind yet... 
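emit_lrp() above builds LRP without a three-source instruction: an ADD forms 1 - a, a MUL folds (1 - a)*c into the accumulator, and a MAC adds a*b, with temporaries copied first whenever a source aliases the destination. The check below replays that sequence on plain floats against the reference a*b + (1 - a)*c; "acc" is just a local variable playing the accumulator.

/* Arithmetic check of the LRP lowering used by emit_lrp(). */
#include <stdio.h>
#include <math.h>

static float lrp_reference(float a, float b, float c)
{
   return a * b + (1.0f - a) * c;
}

static float lrp_as_emitted(float a, float b, float c)
{
   float dst, acc;

   dst = -a + 1.0f;     /* ADD dst, -src0, 1.0        */
   acc = dst * c;       /* MUL null, dst, src2 -> acc */
   dst = a * b + acc;   /* MAC dst, src0, src1        */
   return dst;
}

int main(void)
{
   float a = 0.25f, b = 8.0f, c = -2.0f;
   float ref = lrp_reference(a, b, c);
   float got = lrp_as_emitted(a, b, c);

   printf("reference %f, emitted form %f, diff %g\n",
          ref, got, fabsf(ref - got));
   return 0;
}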
+*/ +static void emit_txb(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst) +{ +#if 0 + struct brw_compile *p = &c->func; + struct brw_reg payload_reg = c->payload_depth[0]; + struct brw_reg dst[4], src[4]; + unsigned i; + for (i = 0; i < 4; i++) + dst[i] = get_dst_reg(c, inst, i); + for (i = 0; i < 4; i++) + src[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i); + +#if 0 + switch (inst->TexSrcTarget) { + case TEXTURE_1D_INDEX: + brw_MOV(p, brw_message_reg(2), src[0]); + brw_MOV(p, brw_message_reg(3), brw_imm_f(0)); + brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); + break; + case TEXTURE_2D_INDEX: + case TEXTURE_RECT_INDEX: + brw_MOV(p, brw_message_reg(2), src[0]); + brw_MOV(p, brw_message_reg(3), src[1]); + brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); + break; + default: + brw_MOV(p, brw_message_reg(2), src[0]); + brw_MOV(p, brw_message_reg(3), src[1]); + brw_MOV(p, brw_message_reg(4), src[2]); + break; + } +#else + brw_MOV(p, brw_message_reg(2), src[0]); + brw_MOV(p, brw_message_reg(3), src[1]); + brw_MOV(p, brw_message_reg(4), brw_imm_f(0)); +#endif + + brw_MOV(p, brw_message_reg(5), src[3]); + brw_MOV(p, brw_message_reg(6), brw_imm_f(0)); + brw_SAMPLE(p, + retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), + 1, + retype(payload_reg, BRW_REGISTER_TYPE_UW), + inst->TexSrcUnit + 1, /* surface */ + inst->TexSrcUnit, /* sampler */ + inst->FullDstRegisters[0].DstRegister.WriteMask, + BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS, + 4, + 4, + 0); +#endif +} + +static void emit_tex(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst) +{ +#if 0 + struct brw_compile *p = &c->func; + struct brw_reg payload_reg = c->payload_depth[0]; + struct brw_reg dst[4], src[4]; + unsigned msg_len; + unsigned i, nr; + unsigned emit; + boolean shadow = (c->key.shadowtex_mask & (1<<inst->TexSrcUnit)) ? 1 : 0; + + for (i = 0; i < 4; i++) + dst[i] = get_dst_reg(c, inst, i); + for (i = 0; i < 4; i++) + src[i] = get_src_reg(c, &inst->FullSrcRegisters[0], i); + +#if 0 + switch (inst->TexSrcTarget) { + case TEXTURE_1D_INDEX: + emit = WRITEMASK_X; + nr = 1; + break; + case TEXTURE_2D_INDEX: + case TEXTURE_RECT_INDEX: + emit = WRITEMASK_XY; + nr = 2; + break; + default: + emit = WRITEMASK_XYZ; + nr = 3; + break; + } +#else + emit = WRITEMASK_XY; + nr = 2; +#endif + + msg_len = 1; + + for (i = 0; i < nr; i++) { + static const unsigned swz[4] = {0,1,2,2}; + if (emit & (1<<i)) + brw_MOV(p, brw_message_reg(msg_len+1), src[swz[i]]); + else + brw_MOV(p, brw_message_reg(msg_len+1), brw_imm_f(0)); + msg_len += 1; + } + + if (shadow) { + brw_MOV(p, brw_message_reg(5), brw_imm_f(0)); + brw_MOV(p, brw_message_reg(6), src[2]); + } + + brw_SAMPLE(p, + retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), + 1, + retype(payload_reg, BRW_REGISTER_TYPE_UW), + inst->TexSrcUnit + 1, /* surface */ + inst->TexSrcUnit, /* sampler */ + inst->FullDstRegisters[0].DstRegister.WriteMask, + BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE, + 4, + shadow ? 6 : 4, + 0); + + if (shadow) + brw_MOV(p, dst[3], brw_imm_f(1.0)); +#endif +} + + + + + + + + +static void emit_fb_write(struct brw_wm_compile *c, + struct tgsi_full_instruction *inst) +{ + struct brw_compile *p = &c->func; + int nr = 2; + int channel; + int base_reg = 0; + + // src0 = output color + // src1 = payload_depth[0] + // src2 = output depth + // dst = ??? 
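The register bookkeeping in emit_fb_write() below reduces to a small layout computation: m1 carries the pass-through control header copied from r1, the color channels start at m2 (one slot later when an AA/stencil register is reserved), and the final value of nr is the message length. The sketch restates those numbers with plain integers; it is lifted from the function's arithmetic, not from an independent description of the FB write message.

/* Prints the message-register layout the way emit_fb_write() counts it. */
#include <stdio.h>

static void layout(int aa_dest_stencil_reg)
{
   int base_reg = 0;
   int nr = 2;
   int channel;

   if (aa_dest_stencil_reg)
      nr += 1;                      /* reserve a slot for AA/stencil */

   printf("%s AA slot:\n", aa_dest_stencil_reg ? "with" : "without");
   printf("  m%d: control header copied from r1\n", base_reg + 1);
   for (channel = 0; channel < 4; channel++)
      printf("  m%d: color channel %d\n", nr + channel, channel);

   nr += 8;                         /* skip over the color block */
   printf("  message length = %d\n\n", nr);
}

int main(void)
{
   layout(0);
   layout(1);
   return 0;
}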
+ + + + /* Reserve a space for AA - may not be needed: + */ + if (c->key.aa_dest_stencil_reg) + nr += 1; + + { + brw_push_insn_state(p); + for (channel = 0; channel < 4; channel++) { + struct brw_reg src0 = c->wm_regs[TGSI_FILE_OUTPUT][0][channel]; + + /* mov (8) m2.0<1>:ud r28.0<8;8,1>:ud { Align1 } */ + /* mov (8) m6.0<1>:ud r29.0<8;8,1>:ud { Align1 SecHalf } */ + brw_MOV(p, brw_message_reg(nr + channel), src0); + } + /* skip over the regs populated above: */ + nr += 8; + brw_pop_insn_state(p); + } + + + /* Pass through control information: + */ + /* mov (8) m1.0<1>:ud r1.0<8;8,1>:ud { Align1 NoMask } */ + { + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); /* ? */ + brw_MOV(p, + brw_message_reg(base_reg + 1), + brw_vec8_grf(1, 0)); + brw_pop_insn_state(p); + } + + /* Send framebuffer write message: */ + brw_fb_WRITE(p, + retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW), + base_reg, + retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW), + 0, /* render surface always 0 */ + nr, + 0, + 1); + +} + + +static void brw_wm_emit_instruction( struct brw_wm_compile *c, + struct tgsi_full_instruction *inst ) +{ + struct brw_compile *p = &c->func; + +#if 0 + if (inst->CondUpdate) + brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ); + else + brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE); +#else + brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE); +#endif + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + emit_abs(c, inst); + break; + case TGSI_OPCODE_ADD: + emit_alu2(c, inst, BRW_OPCODE_ADD); + break; + case TGSI_OPCODE_SUB: + assert(0); +// emit_alu2(c, inst, BRW_OPCODE_SUB); + break; + case TGSI_OPCODE_FRC: + emit_alu1(c, inst, BRW_OPCODE_FRC); + break; + case TGSI_OPCODE_FLR: + assert(0); +// emit_alu1(c, inst, BRW_OPCODE_FLR); + break; + case TGSI_OPCODE_LRP: + emit_lrp(c, inst); + break; + case TGSI_OPCODE_INT: + emit_alu1(c, inst, BRW_OPCODE_RNDD); + break; + case TGSI_OPCODE_MOV: + emit_alu1(c, inst, BRW_OPCODE_MOV); + break; + case TGSI_OPCODE_DP3: + emit_dp3(c, inst); + break; + case TGSI_OPCODE_DP4: + emit_dp4(c, inst); + break; + case TGSI_OPCODE_XPD: + emit_xpd(c, inst); + break; + case TGSI_OPCODE_DPH: + emit_dph(c, inst); + break; + case TGSI_OPCODE_RCP: + emit_math1(c, inst, BRW_MATH_FUNCTION_INV); + break; + case TGSI_OPCODE_RSQ: + emit_math1(c, inst, BRW_MATH_FUNCTION_RSQ); + break; + case TGSI_OPCODE_SIN: + emit_math1(c, inst, BRW_MATH_FUNCTION_SIN); + break; + case TGSI_OPCODE_COS: + emit_math1(c, inst, BRW_MATH_FUNCTION_COS); + break; + case TGSI_OPCODE_EX2: + emit_math1(c, inst, BRW_MATH_FUNCTION_EXP); + break; + case TGSI_OPCODE_LG2: + emit_math1(c, inst, BRW_MATH_FUNCTION_LOG); + break; + case TGSI_OPCODE_MAX: + emit_max(c, inst); + break; + case TGSI_OPCODE_MIN: + emit_min(c, inst); + break; + case TGSI_OPCODE_DDX: + emit_ddx(c, inst); + break; + case TGSI_OPCODE_DDY: + emit_ddy(c, inst); + break; + case TGSI_OPCODE_SLT: + emit_sop(c, inst, BRW_CONDITIONAL_L); + break; + case TGSI_OPCODE_SLE: + emit_sop(c, inst, BRW_CONDITIONAL_LE); + break; + case TGSI_OPCODE_SGT: + emit_sop(c, inst, BRW_CONDITIONAL_G); + break; + case TGSI_OPCODE_SGE: + emit_sop(c, inst, BRW_CONDITIONAL_GE); + break; + case TGSI_OPCODE_SEQ: + emit_sop(c, inst, BRW_CONDITIONAL_EQ); + break; + case TGSI_OPCODE_SNE: + emit_sop(c, inst, BRW_CONDITIONAL_NEQ); + break; + case TGSI_OPCODE_MUL: + emit_alu2(c, inst, BRW_OPCODE_MUL); + break; + case TGSI_OPCODE_POW: + emit_pow(c, inst); + break; + case TGSI_OPCODE_MAD: + emit_mad(c, inst); + break; + case TGSI_OPCODE_TEX: + emit_tex(c, 
inst); + break; + case TGSI_OPCODE_TXB: + emit_txb(c, inst); + break; + case TGSI_OPCODE_TEXKILL: + emit_kil(c); + break; + case TGSI_OPCODE_IF: + assert(c->if_insn < MAX_IFSN); + c->if_inst[c->if_insn++] = brw_IF(p, BRW_EXECUTE_8); + break; + case TGSI_OPCODE_ELSE: + c->if_inst[c->if_insn-1] = brw_ELSE(p, c->if_inst[c->if_insn-1]); + break; + case TGSI_OPCODE_ENDIF: + assert(c->if_insn > 0); + brw_ENDIF(p, c->if_inst[--c->if_insn]); + break; + case TGSI_OPCODE_BGNSUB: + case TGSI_OPCODE_ENDSUB: + break; + case TGSI_OPCODE_CAL: + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_set_access_mode(p, BRW_ALIGN_1); + brw_ADD(p, deref_1ud(c->stack_index, 0), brw_ip_reg(), brw_imm_d(3*16)); + brw_set_access_mode(p, BRW_ALIGN_16); + brw_ADD(p, + get_addr_reg(c->stack_index), + get_addr_reg(c->stack_index), brw_imm_d(4)); +// orig_inst = inst->Data; +// orig_inst->Data = &p->store[p->nr_insn]; + assert(0); + brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16)); + brw_pop_insn_state(p); + break; + + case TGSI_OPCODE_RET: +#if 0 + brw_push_insn_state(p); + brw_set_mask_control(p, BRW_MASK_DISABLE); + brw_ADD(p, + get_addr_reg(c->stack_index), + get_addr_reg(c->stack_index), brw_imm_d(-4)); + brw_set_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, brw_ip_reg(), deref_1ud(c->stack_index, 0)); + brw_set_access_mode(p, BRW_ALIGN_16); + brw_pop_insn_state(p); +#else + emit_fb_write(c, inst); +#endif + + break; + case TGSI_OPCODE_LOOP: + c->loop_inst[c->loop_insn++] = brw_DO(p, BRW_EXECUTE_8); + break; + case TGSI_OPCODE_BRK: + brw_BREAK(p); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + break; + case TGSI_OPCODE_CONT: + brw_CONT(p); + brw_set_predicate_control(p, BRW_PREDICATE_NONE); + break; + case TGSI_OPCODE_ENDLOOP: + c->loop_insn--; + c->inst0 = c->inst1 = brw_WHILE(p, c->loop_inst[c->loop_insn]); + /* patch all the BREAK instructions from + last BEGINLOOP */ + while (c->inst0 > c->loop_inst[c->loop_insn]) { + c->inst0--; + if (c->inst0->header.opcode == BRW_OPCODE_BREAK) { + c->inst0->bits3.if_else.jump_count = c->inst1 - c->inst0 + 1; + c->inst0->bits3.if_else.pop_count = 0; + } else if (c->inst0->header.opcode == BRW_OPCODE_CONTINUE) { + c->inst0->bits3.if_else.jump_count = c->inst1 - c->inst0; + c->inst0->bits3.if_else.pop_count = 0; + } + } + break; + case TGSI_OPCODE_END: + emit_fb_write(c, inst); + break; + + default: + debug_printf("unsupported IR in fragment shader %d\n", + inst->Instruction.Opcode); + } +#if 0 + if (inst->CondUpdate) + brw_set_predicate_control(p, BRW_PREDICATE_NORMAL); + else + brw_set_predicate_control(p, BRW_PREDICATE_NONE); +#endif +} + + + + + + +void brw_wm_glsl_emit(struct brw_wm_compile *c) +{ + struct tgsi_parse_context parse; + struct brw_compile *p = &c->func; + + brw_init_compile(&c->func); + brw_set_compression_control(p, BRW_COMPRESSION_NONE); + + c->reg_index = 0; + c->if_insn = 0; + c->loop_insn = 0; + c->stack_index = brw_indirect(0,0); + + /* Do static register allocation and parameter interpolation: + */ + brw_wm_emit_decls( c ); + + /* Emit the actual program. All done with very direct translation, + * hopefully we can improve on this shortly... 
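The TGSI_OPCODE_ENDLOOP case above patches control flow after the fact: BREAK and CONTINUE are emitted before their targets are known, and once the WHILE is placed the emitter walks backwards to the matching DO filling in their jump counts. The standalone model below does the same walk over an array of toy instructions; the opcode names and the jump field are stand-ins, not EU encodings.

/* Models the ENDLOOP back-patching of BREAK/CONTINUE jump distances. */
#include <stdio.h>

enum op { OP_DO, OP_ADD, OP_BREAK, OP_CONTINUE, OP_WHILE };

struct toy_inst {
   enum op op;
   int     jump;   /* distance in instructions, patched after the WHILE */
};

int main(void)
{
   struct toy_inst store[] = {
      { OP_DO, 0 }, { OP_ADD, 0 }, { OP_BREAK, 0 },
      { OP_CONTINUE, 0 }, { OP_ADD, 0 }, { OP_WHILE, 0 },
   };
   int loop_start = 0;   /* index of the DO    */
   int while_ip   = 5;   /* index of the WHILE */
   int ip;

   /* Walk backwards from the WHILE to the DO, as the ENDLOOP case does. */
   for (ip = while_ip - 1; ip > loop_start; ip--) {
      if (store[ip].op == OP_BREAK)
         store[ip].jump = while_ip - ip + 1;   /* one past the WHILE */
      else if (store[ip].op == OP_CONTINUE)
         store[ip].jump = while_ip - ip;       /* to the WHILE       */
   }

   for (ip = 0; ip <= while_ip; ip++)
      printf("inst %d: op %d jump %d\n", ip, store[ip].op, store[ip].jump);
   return 0;
}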
+ */ + brw_MOV(p, get_addr_reg(c->stack_index), brw_address(c->stack)); + + tgsi_parse_init( &parse, c->fp->program.tokens ); + + while( !tgsi_parse_end_of_tokens( &parse ) ) + { + tgsi_parse_token( &parse ); + + switch( parse.FullToken.Token.Type ) { + case TGSI_TOKEN_TYPE_DECLARATION: + /* already done */ + break; + + case TGSI_TOKEN_TYPE_IMMEDIATE: + /* not handled yet */ + assert(0); + break; + + case TGSI_TOKEN_TYPE_INSTRUCTION: + brw_wm_emit_instruction(c, &parse.FullToken.FullInstruction); + break; + + default: + assert( 0 ); + } + } + + tgsi_parse_free (&parse); + + /* Fix up call targets: + */ +#if 0 + { + unsigned nr_insns = c->fp->program.Base.NumInstructions; + unsigned insn, target_insn; + struct tgsi_full_instruction *inst1, *inst2; + struct brw_instruction *brw_inst1, *brw_inst2; + int offset; + for (insn = 0; insn < nr_insns; insn++) { + inst1 = &c->fp->program.Base.Instructions[insn]; + brw_inst1 = inst1->Data; + switch (inst1->Opcode) { + case TGSI_OPCODE_CAL: + target_insn = inst1->BranchTarget; + inst2 = &c->fp->program.Base.Instructions[target_insn]; + brw_inst2 = inst2->Data; + offset = brw_inst2 - brw_inst1; + brw_set_src1(brw_inst1, brw_imm_d(offset*16)); + break; + default: + break; + } + } + } +#endif + + c->prog_data.total_grf = c->reg_index; + c->prog_data.total_scratch = 0; +} diff --git a/src/gallium/drivers/i965simple/brw_wm_iz.c b/src/gallium/drivers/i965simple/brw_wm_iz.c new file mode 100644 index 0000000000..6c5f25bf39 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm_iz.c @@ -0,0 +1,214 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "brw_wm.h" + + +#undef P /* prompted depth */ +#undef C /* computed */ +#undef N /* non-promoted? 
*/ + +#define P 0 +#define C 1 +#define N 2 + +const struct { + unsigned mode:2; + unsigned sd_present:1; + unsigned sd_to_rt:1; + unsigned dd_present:1; + unsigned ds_present:1; +} wm_iz_table[IZ_BIT_MAX] = +{ + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { C, 0, 1, 0, 0 }, + { C, 0, 1, 0, 0 }, + { C, 1, 1, 0, 0 }, + { C, 1, 1, 0, 0 }, + { C, 0, 1, 0, 0 }, + { C, 0, 1, 0, 0 }, + { C, 1, 1, 1, 0 }, + { C, 1, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { C, 1, 1, 1, 0 }, + { C, 1, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { C, 0, 1, 0, 0 }, + { C, 0, 1, 0, 0 }, + { C, 1, 1, 0, 0 }, + { C, 1, 1, 0, 0 }, + { C, 0, 1, 0, 0 }, + { C, 0, 1, 0, 0 }, + { C, 1, 1, 1, 0 }, + { C, 1, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { C, 1, 1, 1, 0 }, + { C, 1, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 0, 0, 1 }, + { C, 0, 0, 0, 1 }, + { C, 0, 1, 0, 1 }, + { C, 0, 1, 0, 1 }, + { C, 1, 1, 0, 1 }, + { C, 1, 1, 0, 1 }, + { C, 0, 1, 0, 1 }, + { C, 0, 1, 0, 1 }, + { C, 1, 1, 1, 1 }, + { C, 1, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { C, 1, 1, 1, 1 }, + { C, 1, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { C, 0, 0, 0, 1 }, + { C, 0, 0, 0, 1 }, + { C, 0, 1, 0, 1 }, + { C, 0, 1, 0, 1 }, + { C, 1, 1, 0, 1 }, + { C, 1, 1, 0, 1 }, + { C, 0, 1, 0, 1 }, + { C, 0, 1, 0, 1 }, + { C, 1, 1, 1, 1 }, + { C, 1, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { C, 1, 1, 1, 1 }, + { C, 1, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 0 }, + { N, 0, 1, 0, 0 }, + { N, 0, 1, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 0 }, + { N, 0, 1, 0, 0 }, + { N, 0, 1, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 0 }, + { C, 0, 1, 1, 0 }, + { C, 0, 1, 1, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 1 }, + { N, 0, 1, 0, 1 }, + { N, 0, 1, 0, 1 }, + { P, 0, 0, 0, 0 }, + { P, 0, 0, 0, 0 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { P, 0, 0, 0, 0 }, + { N, 1, 1, 0, 1 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { P, 0, 0, 0, 0 }, + { C, 0, 0, 0, 1 }, + { P, 0, 0, 0, 0 }, + { C, 0, 1, 0, 1 }, + { P, 0, 0, 0, 0 }, + { C, 1, 1, 0, 1 }, + { C, 0, 1, 0, 1 }, + { C, 0, 1, 0, 1 }, + { P, 0, 0, 0, 0 }, + { C, 1, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { P, 0, 0, 0, 0 }, + { C, 1, 1, 1, 1 }, + { C, 0, 1, 1, 1 }, + { C, 0, 1, 1, 1 } +}; + +void brw_wm_lookup_iz( unsigned line_aa, + unsigned lookup, + struct brw_wm_prog_key *key ) +{ + unsigned reg = 2; + + assert (lookup < IZ_BIT_MAX); + + if (lookup & IZ_PS_COMPUTES_DEPTH_BIT) + key->computes_depth = 1; + + if (wm_iz_table[lookup].sd_present) { + key->source_depth_reg = reg; + reg += 2; + } + + if (wm_iz_table[lookup].sd_to_rt) + key->source_depth_to_render_target = 1; + + if (wm_iz_table[lookup].ds_present || line_aa != AA_NEVER) { + key->aa_dest_stencil_reg = reg; + key->runtime_check_aads_emit = (!wm_iz_table[lookup].ds_present && + line_aa == AA_SOMETIMES); + reg++; + } + + if (wm_iz_table[lookup].dd_present) { + 
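+      /* Like source depth above, destination depth occupies a register
+       * pair, hence the reg += 2 that follows.
+       */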
key->dest_depth_reg = reg; + reg+=2; + } + + key->nr_depth_regs = (reg+1)/2; +} + diff --git a/src/gallium/drivers/i965simple/brw_wm_sampler_state.c b/src/gallium/drivers/i965simple/brw_wm_sampler_state.c new file mode 100644 index 0000000000..52b2909a65 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm_sampler_state.c @@ -0,0 +1,275 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" + +#include "util/u_math.h" +#include "util/u_memory.h" + + +#define COMPAREFUNC_ALWAYS 0 +#define COMPAREFUNC_NEVER 0x1 +#define COMPAREFUNC_LESS 0x2 +#define COMPAREFUNC_EQUAL 0x3 +#define COMPAREFUNC_LEQUAL 0x4 +#define COMPAREFUNC_GREATER 0x5 +#define COMPAREFUNC_NOTEQUAL 0x6 +#define COMPAREFUNC_GEQUAL 0x7 + +/* Samplers aren't strictly wm state from the hardware's perspective, + * but that is the only situation in which we use them in this driver. + */ + +static int intel_translate_shadow_compare_func(unsigned func) +{ + switch(func) { + case PIPE_FUNC_NEVER: + return COMPAREFUNC_ALWAYS; + case PIPE_FUNC_LESS: + return COMPAREFUNC_LEQUAL; + case PIPE_FUNC_LEQUAL: + return COMPAREFUNC_LESS; + case PIPE_FUNC_GREATER: + return COMPAREFUNC_GEQUAL; + case PIPE_FUNC_GEQUAL: + return COMPAREFUNC_GREATER; + case PIPE_FUNC_NOTEQUAL: + return COMPAREFUNC_EQUAL; + case PIPE_FUNC_EQUAL: + return COMPAREFUNC_NOTEQUAL; + case PIPE_FUNC_ALWAYS: + return COMPAREFUNC_NEVER; + } + + debug_printf("Unknown value in %s: %x\n", __FUNCTION__, func); + return COMPAREFUNC_NEVER; +} + +/* The brw (and related graphics cores) do not support GL_CLAMP. The + * Intel drivers for "other operating systems" implement GL_CLAMP as + * GL_CLAMP_TO_EDGE, so the same is done here. 
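+ * As a result both PIPE_TEX_WRAP_CLAMP and PIPE_TEX_WRAP_CLAMP_TO_EDGE
+ * map to BRW_TEXCOORDMODE_CLAMP in the switch below.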
+ */ +static unsigned translate_wrap_mode( int wrap ) +{ + switch( wrap ) { + case PIPE_TEX_WRAP_REPEAT: + return BRW_TEXCOORDMODE_WRAP; + case PIPE_TEX_WRAP_CLAMP: + return BRW_TEXCOORDMODE_CLAMP; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return BRW_TEXCOORDMODE_CLAMP; /* conform likes it this way */ + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + return BRW_TEXCOORDMODE_CLAMP_BORDER; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return BRW_TEXCOORDMODE_MIRROR; + default: + return BRW_TEXCOORDMODE_WRAP; + } +} + + +static unsigned U_FIXED(float value, unsigned frac_bits) +{ + value *= (1<<frac_bits); + return value < 0 ? 0 : value; +} + +static int S_FIXED(float value, unsigned frac_bits) +{ + return value * (1<<frac_bits); +} + + +static unsigned upload_default_color( struct brw_context *brw, + const float *color ) +{ + struct brw_sampler_default_color sdc; + + COPY_4V(sdc.color, color); + + return brw_cache_data( &brw->cache[BRW_SAMPLER_DEFAULT_COLOR], &sdc ); +} + + +/* + */ +static void brw_update_sampler_state( const struct pipe_sampler_state *pipe_sampler, + unsigned sdc_gs_offset, + struct brw_sampler_state *sampler) +{ + memset(sampler, 0, sizeof(*sampler)); + + switch (pipe_sampler->min_mip_filter) { + case PIPE_TEX_FILTER_NEAREST: + sampler->ss0.min_filter = BRW_MAPFILTER_NEAREST; + break; + case PIPE_TEX_FILTER_LINEAR: + sampler->ss0.min_filter = BRW_MAPFILTER_LINEAR; + break; + case PIPE_TEX_FILTER_ANISO: + sampler->ss0.min_filter = BRW_MAPFILTER_ANISOTROPIC; + break; + default: + break; + } + + switch (pipe_sampler->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + sampler->ss0.mip_filter = BRW_MIPFILTER_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + sampler->ss0.mip_filter = BRW_MIPFILTER_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + sampler->ss0.mip_filter = BRW_MIPFILTER_NONE; + break; + default: + break; + } + /* Set Anisotropy: + */ + switch (pipe_sampler->mag_img_filter) { + case PIPE_TEX_FILTER_NEAREST: + sampler->ss0.mag_filter = BRW_MAPFILTER_NEAREST; + break; + case PIPE_TEX_FILTER_LINEAR: + sampler->ss0.mag_filter = BRW_MAPFILTER_LINEAR; + break; + case PIPE_TEX_FILTER_ANISO: + sampler->ss0.mag_filter = BRW_MAPFILTER_LINEAR; + break; + default: + break; + } + + if (pipe_sampler->max_anisotropy > 2.0) { + sampler->ss3.max_aniso = MAX2((pipe_sampler->max_anisotropy - 2) / 2, + BRW_ANISORATIO_16); + } + + sampler->ss1.s_wrap_mode = translate_wrap_mode(pipe_sampler->wrap_s); + sampler->ss1.r_wrap_mode = translate_wrap_mode(pipe_sampler->wrap_r); + sampler->ss1.t_wrap_mode = translate_wrap_mode(pipe_sampler->wrap_t); + + /* Fulsim complains if I don't do this. Hardware doesn't mind: + */ +#if 0 + if (texObj->Target == GL_TEXTURE_CUBE_MAP_ARB) { + sampler->ss1.r_wrap_mode = BRW_TEXCOORDMODE_CUBE; + sampler->ss1.s_wrap_mode = BRW_TEXCOORDMODE_CUBE; + sampler->ss1.t_wrap_mode = BRW_TEXCOORDMODE_CUBE; + } +#endif + + /* Set shadow function: + */ + if (pipe_sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + /* Shadowing is "enabled" by emitting a particular sampler + * message (sample_c). So need to recompile WM program when + * shadow comparison is enabled on each/any texture unit. 
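+       * intel_translate_shadow_compare_func() above provides the (already
+       * remapped) compare function written to ss0.shadow_function below.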
+ */ + sampler->ss0.shadow_function = intel_translate_shadow_compare_func(pipe_sampler->compare_func); + } + + /* Set LOD bias: + */ + sampler->ss0.lod_bias = S_FIXED(CLAMP(pipe_sampler->lod_bias, -16, 15), 6); + + sampler->ss0.lod_preclamp = 1; /* OpenGL mode */ + sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */ + + /* Set BaseMipLevel, MaxLOD, MinLOD: + * + * XXX: I don't think that using firstLevel, lastLevel works, + * because we always setup the surface state as if firstLevel == + * level zero. Probably have to subtract firstLevel from each of + * these: + */ + sampler->ss0.base_level = U_FIXED(0, 1); + + sampler->ss1.max_lod = U_FIXED(MIN2(MAX2(pipe_sampler->max_lod, 0), 13), 6); + sampler->ss1.min_lod = U_FIXED(MIN2(MAX2(pipe_sampler->min_lod, 0), 13), 6); + + sampler->ss2.default_color_pointer = sdc_gs_offset >> 5; +} + + + +/* All samplers must be uploaded in a single contiguous array, which + * complicates various things. However, this is still too confusing - + * FIXME: simplify all the different new texture state flags. + */ +static void upload_wm_samplers(struct brw_context *brw) +{ + unsigned unit; + unsigned sampler_count = 0; + + /* BRW_NEW_SAMPLER */ + for (unit = 0; unit < brw->num_textures && unit < brw->num_samplers; + unit++) { + /* determine unit enable/disable by looking for a bound texture */ + if (brw->attribs.Texture[unit]) { + const struct pipe_sampler_state *sampler = brw->attribs.Samplers[unit]; + unsigned sdc_gs_offset = upload_default_color(brw, sampler->border_color); + + brw_update_sampler_state(sampler, + sdc_gs_offset, + &brw->wm.sampler[unit]); + + sampler_count = unit + 1; + } + } + + if (brw->wm.sampler_count != sampler_count) { + brw->wm.sampler_count = sampler_count; + brw->state.dirty.cache |= CACHE_NEW_SAMPLER; + } + + brw->wm.sampler_gs_offset = 0; + + if (brw->wm.sampler_count) + brw->wm.sampler_gs_offset = + brw_cache_data_sz(&brw->cache[BRW_SAMPLER], + brw->wm.sampler, + sizeof(struct brw_sampler_state) * brw->wm.sampler_count); +} + +const struct brw_tracked_state brw_wm_samplers = { + .dirty = { + .brw = BRW_NEW_SAMPLER, + .cache = 0 + }, + .update = upload_wm_samplers +}; + diff --git a/src/gallium/drivers/i965simple/brw_wm_state.c b/src/gallium/drivers/i965simple/brw_wm_state.c new file mode 100644 index 0000000000..37a9bf919c --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm_state.c @@ -0,0 +1,195 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" +#include "brw_wm.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +/*********************************************************************** + * WM unit - fragment programs and rasterization + */ +static void upload_wm_unit(struct brw_context *brw ) +{ + struct brw_wm_unit_state wm; + unsigned max_threads; + unsigned per_thread; + + if (BRW_DEBUG & DEBUG_SINGLE_THREAD) + max_threads = 0; + else + max_threads = 31; + + + memset(&wm, 0, sizeof(wm)); + + /* CACHE_NEW_WM_PROG */ + wm.thread0.grf_reg_count = align(brw->wm.prog_data->total_grf, 16) / 16 - 1; + wm.thread0.kernel_start_pointer = brw->wm.prog_gs_offset >> 6; + wm.thread3.dispatch_grf_start_reg = brw->wm.prog_data->first_curbe_grf; + wm.thread3.urb_entry_read_length = brw->wm.prog_data->urb_read_length; + wm.thread3.const_urb_entry_read_length = brw->wm.prog_data->curb_read_length; + + wm.wm5.max_threads = max_threads; + + per_thread = align(brw->wm.prog_data->total_scratch, 1024); + assert(per_thread <= 12 * 1024); + +#if 0 + if (brw->wm.prog_data->total_scratch) { + unsigned total = per_thread * (max_threads + 1); + + /* Scratch space -- just have to make sure there is sufficient + * allocated for the active program and current number of threads. + */ + brw->wm.scratch_buffer_size = total; + if (brw->wm.scratch_buffer && + brw->wm.scratch_buffer_size > brw->wm.scratch_buffer->size) { + dri_bo_unreference(brw->wm.scratch_buffer); + brw->wm.scratch_buffer = NULL; + } + if (!brw->wm.scratch_buffer) { + brw->wm.scratch_buffer = dri_bo_alloc(intel->intelScreen->bufmgr, + "wm scratch", + brw->wm.scratch_buffer_size, + 4096, DRM_BO_FLAG_MEM_TT); + } + } + /* XXX: Scratch buffers are not implemented correectly. + * + * The scratch offset to be programmed into wm is relative to the general + * state base address. However, using dri_bo_alloc/dri_bo_emit_reloc (or + * the previous bmGenBuffers scheme), we get an offset relative to the + * start of framebuffer. Even before then, it was broken in other ways, + * so just fail for now if we hit that path. 
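+    * The assert that follows records that expectation, though note this
+    * whole block is currently compiled out (#if 0).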
+ */ + assert(brw->wm.prog_data->total_scratch == 0); +#endif + + /* CACHE_NEW_SURFACE */ + wm.thread1.binding_table_entry_count = brw->wm.nr_surfaces; + + /* BRW_NEW_CURBE_OFFSETS */ + wm.thread3.const_urb_entry_read_offset = brw->curbe.wm_start * 2; + + wm.thread3.urb_entry_read_offset = 0; + wm.thread1.depth_coef_urb_read_offset = 1; + wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754; + + /* CACHE_NEW_SAMPLER */ + wm.wm4.sampler_count = (brw->wm.sampler_count + 1) / 4; + wm.wm4.sampler_state_pointer = brw->wm.sampler_gs_offset >> 5; + + /* BRW_NEW_FRAGMENT_PROGRAM */ + { + const struct brw_fragment_program *fp = brw->attribs.FragmentProgram; + + if (fp->UsesDepth) + wm.wm5.program_uses_depth = 1; /* as far as we can tell */ + + if (fp->info.writes_z) + wm.wm5.program_computes_depth = 1; + + /* BRW_NEW_ALPHA_TEST */ + if (fp->info.uses_kill || + brw->attribs.DepthStencil->alpha.enabled) + wm.wm5.program_uses_killpixel = 1; + + wm.wm5.enable_8_pix = 1; + } + + wm.wm5.thread_dispatch_enable = 1; /* AKA: color_write */ + wm.wm5.legacy_line_rast = 0; + wm.wm5.legacy_global_depth_bias = 0; + wm.wm5.early_depth_test = 1; /* never need to disable */ + wm.wm5.line_aa_region_width = 0; + wm.wm5.line_endcap_aa_region_width = 1; + + /* BRW_NEW_RASTERIZER */ + if (brw->attribs.Raster->poly_stipple_enable) + wm.wm5.polygon_stipple = 1; + +#if 0 + if (brw->attribs.Polygon->OffsetFill) { + wm.wm5.depth_offset = 1; + /* Something wierd going on with legacy_global_depth_bias, + * offset_constant, scaling and MRD. This value passes glean + * but gives some odd results elsewere (eg. the + * quad-offset-units test). + */ + wm.global_depth_offset_constant = brw->attribs.Polygon->OffsetUnits * 2; + + /* This is the only value that passes glean: + */ + wm.global_depth_offset_scale = brw->attribs.Polygon->OffsetFactor; + } +#endif + + if (brw->attribs.Raster->line_stipple_enable) { + wm.wm5.line_stipple = 1; + } + + if (BRW_DEBUG & DEBUG_STATS) + wm.wm4.stats_enable = 1; + + brw->wm.state_gs_offset = brw_cache_data( &brw->cache[BRW_WM_UNIT], &wm ); + + if (brw->wm.prog_data->total_scratch) { + /* + dri_emit_reloc(brw->cache[BRW_WM_UNIT].pool->buffer, + DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE, + (per_thread / 1024) - 1, + brw->wm.state_gs_offset + + ((char *)&wm.thread2 - (char *)&wm), + brw->wm.scratch_buffer); + */ + } else { + wm.thread2.scratch_space_base_pointer = 0; + } +} + +const struct brw_tracked_state brw_wm_unit = { + .dirty = { + .brw = (BRW_NEW_RASTERIZER | + BRW_NEW_ALPHA_TEST | + BRW_NEW_FS | + BRW_NEW_CURBE_OFFSETS), + + .cache = (CACHE_NEW_SURFACE | + CACHE_NEW_WM_PROG | + CACHE_NEW_SAMPLER) + }, + .update = upload_wm_unit +}; + diff --git a/src/gallium/drivers/i965simple/brw_wm_surface_state.c b/src/gallium/drivers/i965simple/brw_wm_surface_state.c new file mode 100644 index 0000000000..1a326f9918 --- /dev/null +++ b/src/gallium/drivers/i965simple/brw_wm_surface_state.c @@ -0,0 +1,304 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. 
+ + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "brw_context.h" +#include "brw_state.h" +#include "brw_defines.h" + +static unsigned translate_tex_target( enum pipe_texture_target target ) +{ + switch (target) { + case PIPE_TEXTURE_1D: + return BRW_SURFACE_1D; + + case PIPE_TEXTURE_2D: + return BRW_SURFACE_2D; + + case PIPE_TEXTURE_3D: + return BRW_SURFACE_3D; + + case PIPE_TEXTURE_CUBE: + return BRW_SURFACE_CUBE; + + default: + assert(0); + return 0; + } +} + +static unsigned translate_tex_format( enum pipe_format pipe_format ) +{ + switch( pipe_format ) { + case PIPE_FORMAT_L8_UNORM: + return BRW_SURFACEFORMAT_L8_UNORM; + + case PIPE_FORMAT_I8_UNORM: + return BRW_SURFACEFORMAT_I8_UNORM; + + case PIPE_FORMAT_A8_UNORM: + return BRW_SURFACEFORMAT_A8_UNORM; + + case PIPE_FORMAT_A8L8_UNORM: + return BRW_SURFACEFORMAT_L8A8_UNORM; + + case PIPE_FORMAT_R8G8B8_UNORM: + assert(0); /* not supported for sampling */ + return BRW_SURFACEFORMAT_R8G8B8_UNORM; + + case PIPE_FORMAT_B8G8R8A8_UNORM: + return BRW_SURFACEFORMAT_B8G8R8A8_UNORM; + + case PIPE_FORMAT_R8G8B8A8_UNORM: + return BRW_SURFACEFORMAT_R8G8B8A8_UNORM; + + case PIPE_FORMAT_R5G6B5_UNORM: + return BRW_SURFACEFORMAT_B5G6R5_UNORM; + + case PIPE_FORMAT_A1R5G5B5_UNORM: + return BRW_SURFACEFORMAT_B5G5R5A1_UNORM; + + case PIPE_FORMAT_A4R4G4B4_UNORM: + return BRW_SURFACEFORMAT_B4G4R4A4_UNORM; + + case PIPE_FORMAT_YCBCR_REV: + return BRW_SURFACEFORMAT_YCRCB_NORMAL; + + case PIPE_FORMAT_YCBCR: + return BRW_SURFACEFORMAT_YCRCB_SWAPUVY; +#if 0 + case PIPE_FORMAT_RGB_FXT1: + case PIPE_FORMAT_RGBA_FXT1: + return BRW_SURFACEFORMAT_FXT1; +#endif + + case PIPE_FORMAT_Z16_UNORM: + return BRW_SURFACEFORMAT_I16_UNORM; +#if 0 + case PIPE_FORMAT_RGB_DXT1: + return BRW_SURFACEFORMAT_DXT1_RGB; + + case PIPE_FORMAT_RGBA_DXT1: + return BRW_SURFACEFORMAT_BC1_UNORM; + + case PIPE_FORMAT_RGBA_DXT3: + return BRW_SURFACEFORMAT_BC2_UNORM; + + case PIPE_FORMAT_RGBA_DXT5: + return BRW_SURFACEFORMAT_BC3_UNORM; + + case PIPE_FORMAT_SRGBA8: + return BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB; + case PIPE_FORMAT_SRGB_DXT1: + return BRW_SURFACEFORMAT_BC1_UNORM_SRGB; +#endif + + default: + assert(0); + return 0; + } +} + +static unsigned brw_buffer_offset(struct brw_context *brw, + struct pipe_buffer *buffer) +{ + return brw->winsys->get_buffer_offset(brw->winsys, + buffer, + 0); +} + +static 
+void brw_update_texture_surface( struct brw_context *brw, + unsigned unit ) +{ + const struct brw_texture *tObj = brw->attribs.Texture[unit]; + struct brw_surface_state surf; + + memset(&surf, 0, sizeof(surf)); + + surf.ss0.mipmap_layout_mode = BRW_SURFACE_MIPMAPLAYOUT_BELOW; + surf.ss0.surface_type = translate_tex_target(tObj->base.target); + surf.ss0.surface_format = translate_tex_format(tObj->base.format); + + /* This is ok for all textures with channel width 8bit or less: + */ +/* surf.ss0.data_return_format = BRW_SURFACERETURNFORMAT_S1; */ + + /* Updated in emit_reloc */ + surf.ss1.base_addr = brw_buffer_offset( brw, tObj->buffer ); + + surf.ss2.mip_count = tObj->base.last_level; + surf.ss2.width = tObj->base.width[0] - 1; + surf.ss2.height = tObj->base.height[0] - 1; + + surf.ss3.tile_walk = BRW_TILEWALK_XMAJOR; + surf.ss3.tiled_surface = 0; /* always zero */ + surf.ss3.pitch = tObj->stride - 1; + surf.ss3.depth = tObj->base.depth[0] - 1; + + surf.ss4.min_lod = 0; + + if (tObj->base.target == PIPE_TEXTURE_CUBE) { + surf.ss0.cube_pos_x = 1; + surf.ss0.cube_pos_y = 1; + surf.ss0.cube_pos_z = 1; + surf.ss0.cube_neg_x = 1; + surf.ss0.cube_neg_y = 1; + surf.ss0.cube_neg_z = 1; + } + + brw->wm.bind.surf_ss_offset[unit + 1] = + brw_cache_data( &brw->cache[BRW_SS_SURFACE], &surf ); +} + + + +#define OFFSET(TYPE, FIELD) ( (unsigned)&(((TYPE *)0)->FIELD) ) + + +static void upload_wm_surfaces(struct brw_context *brw ) +{ + unsigned i; + + { + struct brw_surface_state surf; + + /* BRW_NEW_FRAMEBUFFER + */ + struct pipe_surface *pipe_surface = brw->attribs.FrameBuffer.cbufs[0];/*fixme*/ + + memset(&surf, 0, sizeof(surf)); + + if (pipe_surface != NULL) { + if (pipe_surface->block.size == 4) + surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; + else + surf.ss0.surface_format = BRW_SURFACEFORMAT_B5G6R5_UNORM; + + surf.ss0.surface_type = BRW_SURFACE_2D; + + surf.ss1.base_addr = brw_buffer_offset( brw, pipe_surface->buffer ); + + surf.ss2.width = pipe_surface->width - 1; + surf.ss2.height = pipe_surface->height - 1; + surf.ss3.tile_walk = BRW_TILEWALK_XMAJOR; + surf.ss3.tiled_surface = 0; + surf.ss3.pitch = pipe_surface->stride - 1; + } else { + surf.ss0.surface_format = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; + surf.ss0.surface_type = BRW_SURFACE_NULL; + } + + /* BRW_NEW_BLEND */ + surf.ss0.color_blend = (!brw->attribs.Blend->logicop_enable && + brw->attribs.Blend->blend_enable); + + + surf.ss0.writedisable_red = !(brw->attribs.Blend->colormask & PIPE_MASK_R); + surf.ss0.writedisable_green = !(brw->attribs.Blend->colormask & PIPE_MASK_G); + surf.ss0.writedisable_blue = !(brw->attribs.Blend->colormask & PIPE_MASK_B); + surf.ss0.writedisable_alpha = !(brw->attribs.Blend->colormask & PIPE_MASK_A); + + + + + brw->wm.bind.surf_ss_offset[0] = brw_cache_data( &brw->cache[BRW_SS_SURFACE], &surf ); + + brw->wm.nr_surfaces = 1; + } + + + /* BRW_NEW_TEXTURE + */ + for (i = 0; i < brw->num_textures && i < brw->num_samplers; i++) { + const struct brw_texture *texUnit = brw->attribs.Texture[i]; + + if (texUnit && + texUnit->base.refcount/*(texUnit->refcount > 0) == really used */) { + + brw_update_texture_surface(brw, i); + + brw->wm.nr_surfaces = i+2; + } + else { + brw->wm.bind.surf_ss_offset[i+1] = 0; + } + } + + brw->wm.bind_ss_offset = brw_cache_data( &brw->cache[BRW_SS_SURF_BIND], + &brw->wm.bind ); +} + + +/* KW: Will find a different way to acheive this, see for example the + * state caches with relocs in the i915 swz driver. 
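+ * Until then surface base addresses are filled in directly from
+ * winsys->get_buffer_offset() (via brw_buffer_offset()) when the surface
+ * state is built above.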
+ */ +#if 0 +static void emit_reloc_wm_surfaces(struct brw_context *brw) +{ + int unit; + + if (brw->state.draw_region != NULL) { + /* Emit framebuffer relocation */ + dri_emit_reloc(brw_cache_buffer(brw, BRW_SS_SURFACE), + DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE, + 0, + brw->wm.bind.surf_ss_offset[0] + + offsetof(struct brw_surface_state, ss1), + brw->state.draw_region->buffer); + } + + /* Emit relocations for texture buffers */ + for (unit = 0; unit < BRW_MAX_TEX_UNIT; unit++) { + struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[unit]; + struct gl_texture_object *tObj = texUnit->_Current; + struct intel_texture_object *intelObj = intel_texture_object(tObj); + + if (texUnit->_ReallyEnabled && intelObj->mt != NULL) { + dri_emit_reloc(brw_cache_buffer(brw, BRW_SS_SURFACE), + DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, + 0, + brw->wm.bind.surf_ss_offset[unit + 1] + + offsetof(struct brw_surface_state, ss1), + intelObj->mt->region->buffer); + } + } +} +#endif + +const struct brw_tracked_state brw_wm_surfaces = { + .dirty = { + .brw = (BRW_NEW_FRAMEBUFFER | + BRW_NEW_BLEND | + BRW_NEW_TEXTURE), + .cache = 0 + }, + .update = upload_wm_surfaces, +}; diff --git a/src/gallium/drivers/nouveau/nouveau_bo.h b/src/gallium/drivers/nouveau/nouveau_bo.h new file mode 100644 index 0000000000..65b138283c --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_bo.h @@ -0,0 +1,53 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __NOUVEAU_BO_H__ +#define __NOUVEAU_BO_H__ + +/* Relocation/Buffer type flags */ +#define NOUVEAU_BO_VRAM (1 << 0) +#define NOUVEAU_BO_GART (1 << 1) +#define NOUVEAU_BO_RD (1 << 2) +#define NOUVEAU_BO_WR (1 << 3) +#define NOUVEAU_BO_RDWR (NOUVEAU_BO_RD | NOUVEAU_BO_WR) +#define NOUVEAU_BO_MAP (1 << 4) +#define NOUVEAU_BO_PIN (1 << 5) +#define NOUVEAU_BO_LOW (1 << 6) +#define NOUVEAU_BO_HIGH (1 << 7) +#define NOUVEAU_BO_OR (1 << 8) +#define NOUVEAU_BO_LOCAL (1 << 9) +#define NOUVEAU_BO_TILED (1 << 10) +#define NOUVEAU_BO_ZTILE (1 << 11) +#define NOUVEAU_BO_DUMMY (1 << 31) + +struct nouveau_bo { + struct nouveau_device *device; + uint64_t handle; + + uint64_t size; + void *map; + + uint32_t flags; + uint64_t offset; +}; + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_channel.h b/src/gallium/drivers/nouveau/nouveau_channel.h new file mode 100644 index 0000000000..cd99a676bd --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_channel.h @@ -0,0 +1,40 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __NOUVEAU_CHANNEL_H__ +#define __NOUVEAU_CHANNEL_H__ + +struct nouveau_channel { + struct nouveau_device *device; + int id; + + struct nouveau_pushbuf *pushbuf; + + struct nouveau_grobj *nullobj; + struct nouveau_grobj *vram; + struct nouveau_grobj *gart; + + void *user_private; + void (*hang_notify)(struct nouveau_channel *); +}; + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_class.h b/src/gallium/drivers/nouveau/nouveau_class.h new file mode 100644 index 0000000000..3df3d7b2b8 --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_class.h @@ -0,0 +1,8006 @@ +/************************************************************************* + + Autogenerated file, do not edit ! + +************************************************************************** + + Copyright (C) 2006-2008 : + Dmitry Baryshkov, + Laurent Carlier, + Matthieu Castet, + Dawid Gajownik, + Jeremy Kolb, + Stephane Loeuillet, + Patrice Mandin, + Stephane Marchesin, + Serge Martin, + Sylvain Munaut, + Simon Raffeiner, + Ben Skeggs, + Erik Waling, + koala_br, + +All Rights Reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice (including the +next paragraph) shall be included in all copies or substantial +portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +*************************************************************************/ + + +#ifndef NOUVEAU_REG_H +#define NOUVEAU_REG_H 1 + + +#define NV01_ROOT 0x00000001 + + + +#define NV01_CONTEXT_DMA 0x00000002 + + + +#define NV01_DEVICE 0x00000003 + + + +#define NV01_TIMER 0x00000004 + +#define NV01_TIMER_SYNCHRONIZE 0x00000100 +#define NV01_TIMER_STOP_ALARM 0x00000104 +#define NV01_TIMER_DMA_NOTIFY 0x00000180 +#define NV01_TIMER_TIME(x) (0x00000300+((x)*4)) +#define NV01_TIMER_TIME__SIZE 0x00000002 +#define NV01_TIMER_ALARM_NOTIFY 0x00000308 + + +#define NV_IMAGE_STENCIL 0x00000010 + +#define NV_IMAGE_STENCIL_NOTIFY 0x00000104 +#define NV_IMAGE_STENCIL_DMA_NOTIFY 0x00000180 +#define NV_IMAGE_STENCIL_IMAGE_OUTPUT 0x00000200 +#define NV_IMAGE_STENCIL_IMAGE_INPUT(x) (0x00000204+((x)*4)) +#define NV_IMAGE_STENCIL_IMAGE_INPUT__SIZE 0x00000002 + + +#define NV_IMAGE_BLEND_AND 0x00000011 + +#define NV_IMAGE_BLEND_AND_NOP 0x00000100 +#define NV_IMAGE_BLEND_AND_NOTIFY 0x00000104 +#define NV_IMAGE_BLEND_AND_DMA_NOTIFY 0x00000180 +#define NV_IMAGE_BLEND_AND_IMAGE_OUTPUT 0x00000200 +#define NV_IMAGE_BLEND_AND_BETA_INPUT 0x00000204 +#define NV_IMAGE_BLEND_AND_IMAGE_INPUT 0x00000208 + + +#define NV01_CONTEXT_BETA1 0x00000012 + +#define NV01_CONTEXT_BETA1_NOP 0x00000100 +#define NV01_CONTEXT_BETA1_NOTIFY 0x00000104 +#define NV01_CONTEXT_BETA1_DMA_NOTIFY 0x00000180 +#define NV01_CONTEXT_BETA1_BETA_1D31 0x00000300 + + +#define NV_IMAGE_ROP_AND 0x00000013 + +#define NV_IMAGE_ROP_AND_NOTIFY 0x00000104 +#define NV_IMAGE_ROP_AND_DMA_NOTIFY 0x00000180 +#define NV_IMAGE_ROP_AND_IMAGE_OUTPUT 0x00000200 +#define NV_IMAGE_ROP_AND_ROP_INPUT 0x00000204 +#define NV_IMAGE_ROP_AND_IMAGE_INPUT(x) (0x00000208+((x)*4)) +#define NV_IMAGE_ROP_AND_IMAGE_INPUT__SIZE 0x00000002 + + +#define NV_IMAGE_COLOR_KEY 0x00000015 + + + +#define NV01_CONTEXT_COLOR_KEY 0x00000017 + +#define NV01_CONTEXT_COLOR_KEY_NOP 0x00000100 +#define NV01_CONTEXT_COLOR_KEY_NOTIFY 0x00000104 +#define NV01_CONTEXT_COLOR_KEY_DMA_NOTIFY 0x00000180 +#define NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT 0x00000300 +#define NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X16A8Y8 0x00000001 +#define NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X24Y8 0x00000002 +#define NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X16A1R5G5B5 0x00000003 +#define NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X17R5G5B5 0x00000004 +#define NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_A8R8G8B8 0x00000005 +#define NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X8R8G8B8 0x00000006 +#define 
NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_A16Y16 0x00000007 +#define NV01_CONTEXT_COLOR_KEY_COLOR_FORMAT_X16Y16 0x00000008 +#define NV01_CONTEXT_COLOR_KEY_COLOR 0x00000304 + + +#define NV01_CONTEXT_PATTERN 0x00000018 + +#define NV01_CONTEXT_PATTERN_NOP 0x00000100 +#define NV01_CONTEXT_PATTERN_NOTIFY 0x00000104 +#define NV01_CONTEXT_PATTERN_DMA_NOTIFY 0x00000180 +#define NV01_CONTEXT_PATTERN_COLOR_FORMAT 0x00000300 +#define NV01_CONTEXT_PATTERN_MONOCHROME_FORMAT 0x00000304 +#define NV01_CONTEXT_PATTERN_SHAPE 0x00000308 +#define NV01_CONTEXT_PATTERN_COLOR(x) (0x00000310+((x)*4)) +#define NV01_CONTEXT_PATTERN_COLOR__SIZE 0x00000002 +#define NV01_CONTEXT_PATTERN_PATTERN(x) (0x00000318+((x)*4)) +#define NV01_CONTEXT_PATTERN_PATTERN__SIZE 0x00000002 + + +#define NV01_CONTEXT_CLIP_RECTANGLE 0x00000019 + +#define NV01_CONTEXT_CLIP_RECTANGLE_NOP 0x00000100 +#define NV01_CONTEXT_CLIP_RECTANGLE_NOTIFY 0x00000104 +#define NV01_CONTEXT_CLIP_RECTANGLE_DMA_NOTIFY 0x00000180 +#define NV01_CONTEXT_CLIP_RECTANGLE_POINT 0x00000300 +#define NV01_CONTEXT_CLIP_RECTANGLE_POINT_X_SHIFT 0 +#define NV01_CONTEXT_CLIP_RECTANGLE_POINT_X_MASK 0x0000ffff +#define NV01_CONTEXT_CLIP_RECTANGLE_POINT_Y_SHIFT 16 +#define NV01_CONTEXT_CLIP_RECTANGLE_POINT_Y_MASK 0xffff0000 +#define NV01_CONTEXT_CLIP_RECTANGLE_SIZE 0x00000304 +#define NV01_CONTEXT_CLIP_RECTANGLE_SIZE_W_SHIFT 0 +#define NV01_CONTEXT_CLIP_RECTANGLE_SIZE_W_MASK 0x0000ffff +#define NV01_CONTEXT_CLIP_RECTANGLE_SIZE_H_SHIFT 16 +#define NV01_CONTEXT_CLIP_RECTANGLE_SIZE_H_MASK 0xffff0000 + + +#define NV01_RENDER_SOLID_LINE 0x0000001c + +#define NV01_RENDER_SOLID_LINE_NOP 0x00000100 +#define NV01_RENDER_SOLID_LINE_NOTIFY 0x00000104 +#define NV01_RENDER_SOLID_LINE_PATCH 0x0000010c +#define NV01_RENDER_SOLID_LINE_DMA_NOTIFY 0x00000180 +#define NV01_RENDER_SOLID_LINE_CLIP_RECTANGLE 0x00000184 +#define NV01_RENDER_SOLID_LINE_PATTERN 0x00000188 +#define NV01_RENDER_SOLID_LINE_ROP 0x0000018c +#define NV01_RENDER_SOLID_LINE_BETA1 0x00000190 +#define NV01_RENDER_SOLID_LINE_SURFACE 0x00000194 +#define NV01_RENDER_SOLID_LINE_OPERATION 0x000002fc +#define NV01_RENDER_SOLID_LINE_OPERATION_SRCCOPY_AND 0x00000000 +#define NV01_RENDER_SOLID_LINE_OPERATION_ROP_AND 0x00000001 +#define NV01_RENDER_SOLID_LINE_OPERATION_BLEND_AND 0x00000002 +#define NV01_RENDER_SOLID_LINE_OPERATION_SRCCOPY 0x00000003 +#define NV01_RENDER_SOLID_LINE_OPERATION_SRCCOPY_PREMULT 0x00000004 +#define NV01_RENDER_SOLID_LINE_OPERATION_BLEND_PREMULT 0x00000005 +#define NV01_RENDER_SOLID_LINE_COLOR_FORMAT 0x00000300 +#define NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X16A8Y8 0x00000001 +#define NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X24Y8 0x00000002 +#define NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X16A1R5G5B5 0x00000003 +#define NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X17R5G5B5 0x00000004 +#define NV01_RENDER_SOLID_LINE_COLOR_FORMAT_A8R8G8B8 0x00000005 +#define NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X8R8G8B8 0x00000006 +#define NV01_RENDER_SOLID_LINE_COLOR_FORMAT_A16Y16 0x00000007 +#define NV01_RENDER_SOLID_LINE_COLOR_FORMAT_X16Y16 0x00000008 +#define NV01_RENDER_SOLID_LINE_COLOR 0x00000304 +#define NV01_RENDER_SOLID_LINE_LINE_POINT0(x) (0x00000400+((x)*8)) +#define NV01_RENDER_SOLID_LINE_LINE_POINT0__SIZE 0x00000010 +#define NV01_RENDER_SOLID_LINE_LINE_POINT0_X_SHIFT 0 +#define NV01_RENDER_SOLID_LINE_LINE_POINT0_X_MASK 0x0000ffff +#define NV01_RENDER_SOLID_LINE_LINE_POINT0_Y_SHIFT 16 +#define NV01_RENDER_SOLID_LINE_LINE_POINT0_Y_MASK 0xffff0000 +#define NV01_RENDER_SOLID_LINE_LINE_POINT1(x) (0x00000404+((x)*8)) +#define 
NV01_RENDER_SOLID_LINE_LINE_POINT1__SIZE 0x00000010 +#define NV01_RENDER_SOLID_LINE_LINE_POINT1_X_SHIFT 0 +#define NV01_RENDER_SOLID_LINE_LINE_POINT1_X_MASK 0x0000ffff +#define NV01_RENDER_SOLID_LINE_LINE_POINT1_Y_SHIFT 16 +#define NV01_RENDER_SOLID_LINE_LINE_POINT1_Y_MASK 0xffff0000 +#define NV01_RENDER_SOLID_LINE_LINE32_POINT0_X(x) (0x00000480+((x)*16)) +#define NV01_RENDER_SOLID_LINE_LINE32_POINT0_X__SIZE 0x00000010 +#define NV01_RENDER_SOLID_LINE_LINE32_POINT0_Y(x) (0x00000484+((x)*16)) +#define NV01_RENDER_SOLID_LINE_LINE32_POINT0_Y__SIZE 0x00000010 +#define NV01_RENDER_SOLID_LINE_LINE32_POINT1_X(x) (0x00000488+((x)*16)) +#define NV01_RENDER_SOLID_LINE_LINE32_POINT1_X__SIZE 0x00000010 +#define NV01_RENDER_SOLID_LINE_LINE32_POINT1_Y(x) (0x0000048c+((x)*16)) +#define NV01_RENDER_SOLID_LINE_LINE32_POINT1_Y__SIZE 0x00000010 +#define NV01_RENDER_SOLID_LINE_POLYLINE(x) (0x00000500+((x)*4)) +#define NV01_RENDER_SOLID_LINE_POLYLINE__SIZE 0x00000020 +#define NV01_RENDER_SOLID_LINE_POLYLINE_X_SHIFT 0 +#define NV01_RENDER_SOLID_LINE_POLYLINE_X_MASK 0x0000ffff +#define NV01_RENDER_SOLID_LINE_POLYLINE_Y_SHIFT 16 +#define NV01_RENDER_SOLID_LINE_POLYLINE_Y_MASK 0xffff0000 +#define NV01_RENDER_SOLID_LINE_POLYLINE32_POINT_X(x) (0x00000580+((x)*8)) +#define NV01_RENDER_SOLID_LINE_POLYLINE32_POINT_X__SIZE 0x00000010 +#define NV01_RENDER_SOLID_LINE_POLYLINE32_POINT_Y(x) (0x00000584+((x)*8)) +#define NV01_RENDER_SOLID_LINE_POLYLINE32_POINT_Y__SIZE 0x00000010 +#define NV01_RENDER_SOLID_LINE_CPOLYLINE_COLOR(x) (0x00000600+((x)*8)) +#define NV01_RENDER_SOLID_LINE_CPOLYLINE_COLOR__SIZE 0x00000010 +#define NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT(x) (0x00000604+((x)*8)) +#define NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT__SIZE 0x00000010 +#define NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT_X_SHIFT 0 +#define NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT_X_MASK 0x0000ffff +#define NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT_Y_SHIFT 16 +#define NV01_RENDER_SOLID_LINE_CPOLYLINE_POINT_Y_MASK 0xffff0000 + + +#define NV01_RENDER_SOLID_TRIANGLE 0x0000001d + +#define NV01_RENDER_SOLID_TRIANGLE_NOP 0x00000100 +#define NV01_RENDER_SOLID_TRIANGLE_NOTIFY 0x00000104 +#define NV01_RENDER_SOLID_TRIANGLE_PATCH 0x0000010c +#define NV01_RENDER_SOLID_TRIANGLE_DMA_NOTIFY 0x00000180 +#define NV01_RENDER_SOLID_TRIANGLE_CLIP_RECTANGLE 0x00000184 +#define NV01_RENDER_SOLID_TRIANGLE_PATTERN 0x00000188 +#define NV01_RENDER_SOLID_TRIANGLE_ROP 0x0000018c +#define NV01_RENDER_SOLID_TRIANGLE_BETA1 0x00000190 +#define NV01_RENDER_SOLID_TRIANGLE_SURFACE 0x00000194 +#define NV01_RENDER_SOLID_TRIANGLE_OPERATION 0x000002fc +#define NV01_RENDER_SOLID_TRIANGLE_OPERATION_SRCCOPY_AND 0x00000000 +#define NV01_RENDER_SOLID_TRIANGLE_OPERATION_ROP_AND 0x00000001 +#define NV01_RENDER_SOLID_TRIANGLE_OPERATION_BLEND_AND 0x00000002 +#define NV01_RENDER_SOLID_TRIANGLE_OPERATION_SRCCOPY 0x00000003 +#define NV01_RENDER_SOLID_TRIANGLE_OPERATION_SRCCOPY_PREMULT 0x00000004 +#define NV01_RENDER_SOLID_TRIANGLE_OPERATION_BLEND_PREMULT 0x00000005 +#define NV01_RENDER_SOLID_TRIANGLE_COLOR_FORMAT 0x00000300 +#define NV01_RENDER_SOLID_TRIANGLE_COLOR 0x00000304 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT0 0x00000310 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT0_X_SHIFT 0 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT0_X_MASK 0x0000ffff +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT0_Y_SHIFT 16 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT0_Y_MASK 0xffff0000 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT1 0x00000314 +#define 
NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT1_X_SHIFT 0 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT1_X_MASK 0x0000ffff +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT1_Y_SHIFT 16 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT1_Y_MASK 0xffff0000 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT2 0x00000318 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT2_X_SHIFT 0 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT2_X_MASK 0x0000ffff +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT2_Y_SHIFT 16 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE_POINT2_Y_MASK 0xffff0000 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT0_X 0x00000320 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT0_Y 0x00000324 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT1_X 0x00000328 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT1_Y 0x0000032c +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT2_X 0x00000330 +#define NV01_RENDER_SOLID_TRIANGLE_TRIANGLE32_POINT2_Y 0x00000334 +#define NV01_RENDER_SOLID_TRIANGLE_TRIMESH(x) (0x00000400+((x)*4)) +#define NV01_RENDER_SOLID_TRIANGLE_TRIMESH__SIZE 0x00000020 +#define NV01_RENDER_SOLID_TRIANGLE_TRIMESH_X_SHIFT 0 +#define NV01_RENDER_SOLID_TRIANGLE_TRIMESH_X_MASK 0x0000ffff +#define NV01_RENDER_SOLID_TRIANGLE_TRIMESH_Y_SHIFT 16 +#define NV01_RENDER_SOLID_TRIANGLE_TRIMESH_Y_MASK 0xffff0000 +#define NV01_RENDER_SOLID_TRIANGLE_TRIMESH32_POINT_X(x) (0x00000480+((x)*8)) +#define NV01_RENDER_SOLID_TRIANGLE_TRIMESH32_POINT_X__SIZE 0x00000010 +#define NV01_RENDER_SOLID_TRIANGLE_TRIMESH32_POINT_Y(x) (0x00000484+((x)*8)) +#define NV01_RENDER_SOLID_TRIANGLE_TRIMESH32_POINT_Y__SIZE 0x00000010 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_COLOR(x) (0x00000500+((x)*16)) +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_COLOR__SIZE 0x00000008 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0(x) (0x00000504+((x)*16)) +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0__SIZE 0x00000008 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0_X_SHIFT 0 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0_X_MASK 0x0000ffff +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0_Y_SHIFT 16 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT0_Y_MASK 0xffff0000 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1(x) (0x00000508+((x)*16)) +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1__SIZE 0x00000008 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1_X_SHIFT 0 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1_X_MASK 0x0000ffff +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1_Y_SHIFT 16 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT1_Y_MASK 0xffff0000 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2(x) (0x0000050c+((x)*16)) +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2__SIZE 0x00000008 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2_X_SHIFT 0 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2_X_MASK 0x0000ffff +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2_Y_SHIFT 16 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIANGLE_POINT2_Y_MASK 0xffff0000 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_COLOR(x) (0x00000580+((x)*8)) +#define NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_COLOR__SIZE 0x00000010 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT(x) (0x00000584+((x)*8)) +#define NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT__SIZE 0x00000010 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT_X_SHIFT 0 +#define NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT_X_MASK 0x0000ffff +#define NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT_Y_SHIFT 16 +#define 
NV01_RENDER_SOLID_TRIANGLE_CTRIMESH_POINT_Y_MASK 0xffff0000 + + +#define NV01_RENDER_SOLID_RECTANGLE 0x0000001e + +#define NV01_RENDER_SOLID_RECTANGLE_NOP 0x00000100 +#define NV01_RENDER_SOLID_RECTANGLE_NOTIFY 0x00000104 +#define NV01_RENDER_SOLID_RECTANGLE_PATCH 0x0000010c +#define NV01_RENDER_SOLID_RECTANGLE_DMA_NOTIFY 0x00000180 +#define NV01_RENDER_SOLID_RECTANGLE_CLIP_RECTANGLE 0x00000184 +#define NV01_RENDER_SOLID_RECTANGLE_PATTERN 0x00000188 +#define NV01_RENDER_SOLID_RECTANGLE_ROP 0x0000018c +#define NV01_RENDER_SOLID_RECTANGLE_BETA1 0x00000190 +#define NV01_RENDER_SOLID_RECTANGLE_SURFACE 0x00000194 +#define NV01_RENDER_SOLID_RECTANGLE_OPERATION 0x000002fc +#define NV01_RENDER_SOLID_RECTANGLE_OPERATION_SRCCOPY_AND 0x00000000 +#define NV01_RENDER_SOLID_RECTANGLE_OPERATION_ROP_AND 0x00000001 +#define NV01_RENDER_SOLID_RECTANGLE_OPERATION_BLEND_AND 0x00000002 +#define NV01_RENDER_SOLID_RECTANGLE_OPERATION_SRCCOPY 0x00000003 +#define NV01_RENDER_SOLID_RECTANGLE_OPERATION_SRCCOPY_PREMULT 0x00000004 +#define NV01_RENDER_SOLID_RECTANGLE_OPERATION_BLEND_PREMULT 0x00000005 +#define NV01_RENDER_SOLID_RECTANGLE_COLOR_FORMAT 0x00000300 +#define NV01_RENDER_SOLID_RECTANGLE_COLOR 0x00000304 +#define NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT(x) (0x00000400+((x)*8)) +#define NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT__SIZE 0x00000010 +#define NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT_X_SHIFT 0 +#define NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT_X_MASK 0x0000ffff +#define NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT_Y_SHIFT 16 +#define NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_POINT_Y_MASK 0xffff0000 +#define NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE(x) (0x00000404+((x)*8)) +#define NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE__SIZE 0x00000010 +#define NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE_W_SHIFT 0 +#define NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE_W_MASK 0x0000ffff +#define NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE_H_SHIFT 16 +#define NV01_RENDER_SOLID_RECTANGLE_RECTANGLE_SIZE_H_MASK 0xffff0000 + + +#define NV01_IMAGE_BLIT 0x0000001f + +#define NV01_IMAGE_BLIT_NOP 0x00000100 +#define NV01_IMAGE_BLIT_NOTIFY 0x00000104 +#define NV01_IMAGE_BLIT_PATCH 0x0000010c +#define NV01_IMAGE_BLIT_DMA_NOTIFY 0x00000180 +#define NV01_IMAGE_BLIT_COLOR_KEY 0x00000184 +#define NV01_IMAGE_BLIT_CLIP_RECTANGLE 0x00000188 +#define NV01_IMAGE_BLIT_PATTERN 0x0000018c +#define NV01_IMAGE_BLIT_ROP 0x00000190 +#define NV01_IMAGE_BLIT_BETA1 0x00000194 +#define NV01_IMAGE_BLIT_SURFACE 0x0000019c +#define NV01_IMAGE_BLIT_OPERATION 0x000002fc +#define NV01_IMAGE_BLIT_IMAGE_INPUT 0x00000204 +#define NV01_IMAGE_BLIT_POINT_IN 0x00000300 +#define NV01_IMAGE_BLIT_POINT_IN_X_SHIFT 0 +#define NV01_IMAGE_BLIT_POINT_IN_X_MASK 0x0000ffff +#define NV01_IMAGE_BLIT_POINT_IN_Y_SHIFT 16 +#define NV01_IMAGE_BLIT_POINT_IN_Y_MASK 0xffff0000 +#define NV01_IMAGE_BLIT_POINT_OUT 0x00000304 +#define NV01_IMAGE_BLIT_POINT_OUT_X_SHIFT 0 +#define NV01_IMAGE_BLIT_POINT_OUT_X_MASK 0x0000ffff +#define NV01_IMAGE_BLIT_POINT_OUT_Y_SHIFT 16 +#define NV01_IMAGE_BLIT_POINT_OUT_Y_MASK 0xffff0000 +#define NV01_IMAGE_BLIT_SIZE 0x00000308 +#define NV01_IMAGE_BLIT_SIZE_W_SHIFT 0 +#define NV01_IMAGE_BLIT_SIZE_W_MASK 0x0000ffff +#define NV01_IMAGE_BLIT_SIZE_H_SHIFT 16 +#define NV01_IMAGE_BLIT_SIZE_H_MASK 0xffff0000 + + +#define NV01_IMAGE_FROM_CPU 0x00000021 + +#define NV01_IMAGE_FROM_CPU_NOP 0x00000100 +#define NV01_IMAGE_FROM_CPU_NOTIFY 0x00000104 +#define NV01_IMAGE_FROM_CPU_PATCH 0x0000010c +#define NV01_IMAGE_FROM_CPU_DMA_NOTIFY 0x00000180 +#define 
NV01_IMAGE_FROM_CPU_COLOR_KEY 0x00000184 +#define NV01_IMAGE_FROM_CPU_CLIP_RECTANGLE 0x00000188 +#define NV01_IMAGE_FROM_CPU_PATTERN 0x0000018c +#define NV01_IMAGE_FROM_CPU_ROP 0x00000190 +#define NV01_IMAGE_FROM_CPU_BETA1 0x00000194 +#define NV01_IMAGE_FROM_CPU_SURFACE 0x00000198 +#define NV01_IMAGE_FROM_CPU_OPERATION 0x000002fc +#define NV01_IMAGE_FROM_CPU_OPERATION_SRCCOPY_AND 0x00000000 +#define NV01_IMAGE_FROM_CPU_OPERATION_ROP_AND 0x00000001 +#define NV01_IMAGE_FROM_CPU_OPERATION_BLEND_AND 0x00000002 +#define NV01_IMAGE_FROM_CPU_OPERATION_SRCCOPY 0x00000003 +#define NV01_IMAGE_FROM_CPU_OPERATION_SRCCOPY_PREMULT 0x00000004 +#define NV01_IMAGE_FROM_CPU_OPERATION_BLEND_PREMULT 0x00000005 +#define NV01_IMAGE_FROM_CPU_COLOR_FORMAT 0x00000300 +#define NV01_IMAGE_FROM_CPU_COLOR_FORMAT_Y8 0x00000001 +#define NV01_IMAGE_FROM_CPU_COLOR_FORMAT_A1R5G5B5 0x00000002 +#define NV01_IMAGE_FROM_CPU_COLOR_FORMAT_X1R5G5B5 0x00000003 +#define NV01_IMAGE_FROM_CPU_COLOR_FORMAT_A8R8G8B8 0x00000004 +#define NV01_IMAGE_FROM_CPU_COLOR_FORMAT_X8R8G8B8 0x00000005 +#define NV01_IMAGE_FROM_CPU_POINT 0x00000304 +#define NV01_IMAGE_FROM_CPU_POINT_X_SHIFT 0 +#define NV01_IMAGE_FROM_CPU_POINT_X_MASK 0x0000ffff +#define NV01_IMAGE_FROM_CPU_POINT_Y_SHIFT 16 +#define NV01_IMAGE_FROM_CPU_POINT_Y_MASK 0xffff0000 +#define NV01_IMAGE_FROM_CPU_SIZE_OUT 0x00000308 +#define NV01_IMAGE_FROM_CPU_SIZE_OUT_W_SHIFT 0 +#define NV01_IMAGE_FROM_CPU_SIZE_OUT_W_MASK 0x0000ffff +#define NV01_IMAGE_FROM_CPU_SIZE_OUT_H_SHIFT 16 +#define NV01_IMAGE_FROM_CPU_SIZE_OUT_H_MASK 0xffff0000 +#define NV01_IMAGE_FROM_CPU_SIZE_IN 0x0000030c +#define NV01_IMAGE_FROM_CPU_SIZE_IN_W_SHIFT 0 +#define NV01_IMAGE_FROM_CPU_SIZE_IN_W_MASK 0x0000ffff +#define NV01_IMAGE_FROM_CPU_SIZE_IN_H_SHIFT 16 +#define NV01_IMAGE_FROM_CPU_SIZE_IN_H_MASK 0xffff0000 +#define NV01_IMAGE_FROM_CPU_COLOR(x) (0x00000400+((x)*4)) +#define NV01_IMAGE_FROM_CPU_COLOR__SIZE 0x00000020 + + +#define NV01_NULL 0x00000030 + + + +#define NV03_STRETCHED_IMAGE_FROM_CPU 0x00000036 + +#define NV03_STRETCHED_IMAGE_FROM_CPU_NOP 0x00000100 +#define NV03_STRETCHED_IMAGE_FROM_CPU_NOTIFY 0x00000104 +#define NV03_STRETCHED_IMAGE_FROM_CPU_PATCH 0x0000010c +#define NV03_STRETCHED_IMAGE_FROM_CPU_DMA_NOTIFY 0x00000180 +#define NV03_STRETCHED_IMAGE_FROM_CPU_COLOR_KEY 0x00000184 +#define NV03_STRETCHED_IMAGE_FROM_CPU_PATTERN 0x00000188 +#define NV03_STRETCHED_IMAGE_FROM_CPU_ROP 0x0000018c +#define NV03_STRETCHED_IMAGE_FROM_CPU_BETA1 0x00000190 +#define NV03_STRETCHED_IMAGE_FROM_CPU_SURFACE 0x00000194 +#define NV03_STRETCHED_IMAGE_FROM_CPU_OPERATION 0x000002fc +#define NV03_STRETCHED_IMAGE_FROM_CPU_COLOR_FORMAT 0x00000300 +#define NV03_STRETCHED_IMAGE_FROM_CPU_SIZE_IN 0x00000304 +#define NV03_STRETCHED_IMAGE_FROM_CPU_SIZE_IN_W_SHIFT 0 +#define NV03_STRETCHED_IMAGE_FROM_CPU_SIZE_IN_W_MASK 0x0000ffff +#define NV03_STRETCHED_IMAGE_FROM_CPU_SIZE_IN_H_SHIFT 16 +#define NV03_STRETCHED_IMAGE_FROM_CPU_SIZE_IN_H_MASK 0xffff0000 +#define NV03_STRETCHED_IMAGE_FROM_CPU_DX_DU 0x00000308 +#define NV03_STRETCHED_IMAGE_FROM_CPU_DY_DV 0x0000030c +#define NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_POINT 0x00000310 +#define NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_POINT_X_SHIFT 0 +#define NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_POINT_X_MASK 0x0000ffff +#define NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_POINT_Y_SHIFT 16 +#define NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_POINT_Y_MASK 0xffff0000 +#define NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_SIZE 0x00000314 +#define NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_SIZE_W_SHIFT 0 +#define 
NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_SIZE_W_MASK 0x0000ffff +#define NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_SIZE_H_SHIFT 16 +#define NV03_STRETCHED_IMAGE_FROM_CPU_CLIP_SIZE_H_MASK 0xffff0000 +#define NV03_STRETCHED_IMAGE_FROM_CPU_POINT12D4 0x00000318 +#define NV03_STRETCHED_IMAGE_FROM_CPU_POINT12D4_X_SHIFT 0 +#define NV03_STRETCHED_IMAGE_FROM_CPU_POINT12D4_X_MASK 0x0000ffff +#define NV03_STRETCHED_IMAGE_FROM_CPU_POINT12D4_Y_SHIFT 16 +#define NV03_STRETCHED_IMAGE_FROM_CPU_POINT12D4_Y_MASK 0xffff0000 +#define NV03_STRETCHED_IMAGE_FROM_CPU_COLOR(x) (0x00000400+((x)*4)) +#define NV03_STRETCHED_IMAGE_FROM_CPU_COLOR__SIZE 0x00000020 + + +#define NV03_SCALED_IMAGE_FROM_MEMORY 0x00000037 + +#define NV03_SCALED_IMAGE_FROM_MEMORY_NOP 0x00000100 +#define NV03_SCALED_IMAGE_FROM_MEMORY_NOTIFY 0x00000104 +#define NV03_SCALED_IMAGE_FROM_MEMORY_DMA_NOTIFY 0x00000180 +#define NV03_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE 0x00000184 +#define NV03_SCALED_IMAGE_FROM_MEMORY_PATTERN 0x00000188 +#define NV03_SCALED_IMAGE_FROM_MEMORY_ROP 0x0000018c +#define NV03_SCALED_IMAGE_FROM_MEMORY_BETA1 0x00000190 +#define NV03_SCALED_IMAGE_FROM_MEMORY_SURFACE 0x00000194 +#define NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT 0x00000300 +#define NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5 0x00000001 +#define NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X1R5G5B5 0x00000002 +#define NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8 0x00000003 +#define NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8 0x00000004 +#define NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_V8YB8U8YA8 0x00000005 +#define NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_YB8V8YA8U8 0x00000006 +#define NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5 0x00000007 +#define NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8 0x00000008 +#define NV03_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_AY8 0x00000009 +#define NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION 0x00000304 +#define NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY_AND 0x00000000 +#define NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_ROP_AND 0x00000001 +#define NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_BLEND_AND 0x00000002 +#define NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY 0x00000003 +#define NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY_PREMULT 0x00000004 +#define NV03_SCALED_IMAGE_FROM_MEMORY_OPERATION_BLEND_PREMULT 0x00000005 +#define NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT 0x00000308 +#define NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_X_SHIFT 0 +#define NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_X_MASK 0x0000ffff +#define NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT 16 +#define NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_MASK 0xffff0000 +#define NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE 0x0000030c +#define NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_W_SHIFT 0 +#define NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_W_MASK 0x0000ffff +#define NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT 16 +#define NV03_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_MASK 0xffff0000 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_POINT 0x00000310 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_POINT_X_SHIFT 0 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_POINT_X_MASK 0x0000ffff +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_POINT_Y_SHIFT 16 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_POINT_Y_MASK 0xffff0000 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_SIZE 0x00000314 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_SIZE_W_SHIFT 0 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_SIZE_W_MASK 0x0000ffff +#define 
NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_SIZE_H_SHIFT 16 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_OUT_SIZE_H_MASK 0xffff0000 +#define NV03_SCALED_IMAGE_FROM_MEMORY_DELTA_DU_DX 0x00000318 +#define NV03_SCALED_IMAGE_FROM_MEMORY_DELTA_DV_DY 0x0000031c +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_SIZE 0x00000400 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_SIZE_W_SHIFT 0 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_SIZE_W_MASK 0x0000ffff +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_SIZE_H_SHIFT 16 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_SIZE_H_MASK 0xffff0000 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT 0x00000404 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_PITCH_SHIFT 0 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_PITCH_MASK 0x0000ffff +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_ORIGIN_SHIFT 16 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_ORIGIN_MASK 0x00ff0000 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_ORIGIN_CENTER 0x00010000 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_ORIGIN_CORNER 0x00020000 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_INTERPOLATOR_SHIFT 24 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_FORMAT_INTERPOLATOR_MASK 0xff000000 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_OFFSET 0x00000408 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_POINT 0x0000040c +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_POINT_U_SHIFT 0 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_POINT_U_MASK 0x0000ffff +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_POINT_V_SHIFT 16 +#define NV03_SCALED_IMAGE_FROM_MEMORY_IMAGE_IN_POINT_V_MASK 0xffff0000 + + +#define NV04_DVD_SUBPICTURE 0x00000038 + +#define NV04_DVD_SUBPICTURE_NOP 0x00000100 +#define NV04_DVD_SUBPICTURE_NOTIFY 0x00000104 +#define NV04_DVD_SUBPICTURE_WAIT_FOR_IDLE 0x00000108 +#define NV04_DVD_SUBPICTURE_DMA_NOTIFY 0x00000180 +#define NV04_DVD_SUBPICTURE_DMA_OVERLAY 0x00000184 +#define NV04_DVD_SUBPICTURE_DMA_IMAGEIN 0x00000188 +#define NV04_DVD_SUBPICTURE_DMA_IMAGEOUT 0x0000018c +#define NV04_DVD_SUBPICTURE_IMAGEOUT_POINT 0x00000300 +#define NV04_DVD_SUBPICTURE_IMAGEOUT_POINT_X_SHIFT 0 +#define NV04_DVD_SUBPICTURE_IMAGEOUT_POINT_X_MASK 0x0000ffff +#define NV04_DVD_SUBPICTURE_IMAGEOUT_POINT_Y_SHIFT 16 +#define NV04_DVD_SUBPICTURE_IMAGEOUT_POINT_Y_MASK 0xffff0000 +#define NV04_DVD_SUBPICTURE_IMAGEOUT_SIZE 0x00000304 +#define NV04_DVD_SUBPICTURE_IMAGEOUT_SIZE_W_SHIFT 0 +#define NV04_DVD_SUBPICTURE_IMAGEOUT_SIZE_W_MASK 0x0000ffff +#define NV04_DVD_SUBPICTURE_IMAGEOUT_SIZE_H_SHIFT 16 +#define NV04_DVD_SUBPICTURE_IMAGEOUT_SIZE_H_MASK 0xffff0000 +#define NV04_DVD_SUBPICTURE_IMAGEOUT_FORMAT 0x00000308 +#define NV04_DVD_SUBPICTURE_IMAGEOUT_FORMAT_PITCH_SHIFT 0 +#define NV04_DVD_SUBPICTURE_IMAGEOUT_FORMAT_PITCH_MASK 0x0000ffff +#define NV04_DVD_SUBPICTURE_IMAGEOUT_FORMAT_COLOR_SHIFT 16 +#define NV04_DVD_SUBPICTURE_IMAGEOUT_FORMAT_COLOR_MASK 0xffff0000 +#define NV04_DVD_SUBPICTURE_IMAGEOUT_OFFSET 0x0000030c +#define NV04_DVD_SUBPICTURE_IMAGEIN_DELTA_DU_DX 0x00000310 +#define NV04_DVD_SUBPICTURE_IMAGEIN_DELTA_DV_DY 0x00000314 +#define NV04_DVD_SUBPICTURE_IMAGEIN_SIZE 0x00000318 +#define NV04_DVD_SUBPICTURE_IMAGEIN_SIZE_W_SHIFT 0 +#define NV04_DVD_SUBPICTURE_IMAGEIN_SIZE_W_MASK 0x0000ffff +#define NV04_DVD_SUBPICTURE_IMAGEIN_SIZE_H_SHIFT 16 +#define NV04_DVD_SUBPICTURE_IMAGEIN_SIZE_H_MASK 0xffff0000 +#define NV04_DVD_SUBPICTURE_IMAGEIN_FORMAT 0x0000031c +#define NV04_DVD_SUBPICTURE_IMAGEIN_FORMAT_PITCH_SHIFT 0 +#define 
NV04_DVD_SUBPICTURE_IMAGEIN_FORMAT_PITCH_MASK 0x0000ffff +#define NV04_DVD_SUBPICTURE_IMAGEIN_FORMAT_COLOR_SHIFT 16 +#define NV04_DVD_SUBPICTURE_IMAGEIN_FORMAT_COLOR_MASK 0xffff0000 +#define NV04_DVD_SUBPICTURE_IMAGEIN_OFFSET 0x00000320 +#define NV04_DVD_SUBPICTURE_IMAGEIN_POINT 0x00000324 +#define NV04_DVD_SUBPICTURE_IMAGEIN_POINT_U_SHIFT 0 +#define NV04_DVD_SUBPICTURE_IMAGEIN_POINT_U_MASK 0x0000ffff +#define NV04_DVD_SUBPICTURE_IMAGEIN_POINT_V_SHIFT 16 +#define NV04_DVD_SUBPICTURE_IMAGEIN_POINT_V_MASK 0xffff0000 +#define NV04_DVD_SUBPICTURE_OVERLAY_DELTA_DU_DX 0x00000328 +#define NV04_DVD_SUBPICTURE_OVERLAY_DELTA_DV_DY 0x0000032c +#define NV04_DVD_SUBPICTURE_OVERLAY_SIZE 0x00000330 +#define NV04_DVD_SUBPICTURE_OVERLAY_SIZE_W_SHIFT 0 +#define NV04_DVD_SUBPICTURE_OVERLAY_SIZE_W_MASK 0x0000ffff +#define NV04_DVD_SUBPICTURE_OVERLAY_SIZE_H_SHIFT 16 +#define NV04_DVD_SUBPICTURE_OVERLAY_SIZE_H_MASK 0xffff0000 +#define NV04_DVD_SUBPICTURE_OVERLAY_FORMAT 0x00000334 +#define NV04_DVD_SUBPICTURE_OVERLAY_FORMAT_PITCH_SHIFT 0 +#define NV04_DVD_SUBPICTURE_OVERLAY_FORMAT_PITCH_MASK 0x0000ffff +#define NV04_DVD_SUBPICTURE_OVERLAY_FORMAT_COLOR_SHIFT 16 +#define NV04_DVD_SUBPICTURE_OVERLAY_FORMAT_COLOR_MASK 0xffff0000 +#define NV04_DVD_SUBPICTURE_OVERLAY_OFFSET 0x00000338 +#define NV04_DVD_SUBPICTURE_OVERLAY_POINT 0x0000033c +#define NV04_DVD_SUBPICTURE_OVERLAY_POINT_U_SHIFT 0 +#define NV04_DVD_SUBPICTURE_OVERLAY_POINT_U_MASK 0x0000ffff +#define NV04_DVD_SUBPICTURE_OVERLAY_POINT_V_SHIFT 16 +#define NV04_DVD_SUBPICTURE_OVERLAY_POINT_V_MASK 0xffff0000 + + +#define NV04_MEMORY_TO_MEMORY_FORMAT 0x00000039 + +#define NV04_MEMORY_TO_MEMORY_FORMAT_NOP 0x00000100 +#define NV04_MEMORY_TO_MEMORY_FORMAT_NOTIFY 0x00000104 +#define NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY 0x00000180 +#define NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN 0x00000184 +#define NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_OUT 0x00000188 +#define NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN 0x0000030c +#define NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_OUT 0x00000310 +#define NV04_MEMORY_TO_MEMORY_FORMAT_PITCH_IN 0x00000314 +#define NV04_MEMORY_TO_MEMORY_FORMAT_PITCH_OUT 0x00000318 +#define NV04_MEMORY_TO_MEMORY_FORMAT_LINE_LENGTH_IN 0x0000031c +#define NV04_MEMORY_TO_MEMORY_FORMAT_LINE_COUNT 0x00000320 +#define NV04_MEMORY_TO_MEMORY_FORMAT_FORMAT 0x00000324 +#define NV04_MEMORY_TO_MEMORY_FORMAT_FORMAT_INPUT_INC_SHIFT 0 +#define NV04_MEMORY_TO_MEMORY_FORMAT_FORMAT_INPUT_INC_MASK 0x0000000f +#define NV04_MEMORY_TO_MEMORY_FORMAT_FORMAT_OUTPUT_INC_SHIFT 8 +#define NV04_MEMORY_TO_MEMORY_FORMAT_FORMAT_OUTPUT_INC_MASK 0x00000f00 +#define NV04_MEMORY_TO_MEMORY_FORMAT_BUF_NOTIFY 0x00000328 + + +#define NV01_MEMORY_LOCAL_BANKED 0x0000003d + + + +#define NV01_MAPPING_SYSTEM 0x0000003e + + + +#define NV03_MEMORY_LOCAL_CURSOR 0x0000003f + + + +#define NV01_MEMORY_LOCAL_LINEAR 0x00000040 + + + +#define NV01_MAPPING_LOCAL 0x00000041 + + + +#define NV04_CONTEXT_SURFACES_2D 0x00000042 + +#define NV04_CONTEXT_SURFACES_2D_NOP 0x00000100 +#define NV04_CONTEXT_SURFACES_2D_NOTIFY 0x00000104 +#define NV04_CONTEXT_SURFACES_2D_PM_TRIGGER 0x00000140 +#define NV04_CONTEXT_SURFACES_2D_DMA_NOTIFY 0x00000180 +#define NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE 0x00000184 +#define NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_DESTIN 0x00000188 +#define NV04_CONTEXT_SURFACES_2D_FORMAT 0x00000300 +#define NV04_CONTEXT_SURFACES_2D_FORMAT_Y8 0x00000001 +#define NV04_CONTEXT_SURFACES_2D_FORMAT_X1R5G5B5_Z1R5G5B5 0x00000002 +#define NV04_CONTEXT_SURFACES_2D_FORMAT_X1R5G5B5_X1R5G5B5 0x00000003 +#define 
NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5 0x00000004 +#define NV04_CONTEXT_SURFACES_2D_FORMAT_Y16 0x00000005 +#define NV04_CONTEXT_SURFACES_2D_FORMAT_X8R8G8B8_Z8R8G8B8 0x00000006 +#define NV04_CONTEXT_SURFACES_2D_FORMAT_X8R8G8B8_X8R8G8B8 0x00000007 +#define NV04_CONTEXT_SURFACES_2D_FORMAT_X1A7R8G8B8_Z1A7R8G8B8 0x00000008 +#define NV04_CONTEXT_SURFACES_2D_FORMAT_X1A7R8G8B8_X1A7R8G8B8 0x00000009 +#define NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8 0x0000000a +#define NV04_CONTEXT_SURFACES_2D_FORMAT_Y32 0x0000000b +#define NV04_CONTEXT_SURFACES_2D_PITCH 0x00000304 +#define NV04_CONTEXT_SURFACES_2D_PITCH_SOURCE_SHIFT 0 +#define NV04_CONTEXT_SURFACES_2D_PITCH_SOURCE_MASK 0x0000ffff +#define NV04_CONTEXT_SURFACES_2D_PITCH_DESTIN_SHIFT 16 +#define NV04_CONTEXT_SURFACES_2D_PITCH_DESTIN_MASK 0xffff0000 +#define NV04_CONTEXT_SURFACES_2D_OFFSET_SOURCE 0x00000308 +#define NV04_CONTEXT_SURFACES_2D_OFFSET_DESTIN 0x0000030c + + +#define NV03_CONTEXT_ROP 0x00000043 + +#define NV03_CONTEXT_ROP_NOP 0x00000100 +#define NV03_CONTEXT_ROP_NOTIFY 0x00000104 +#define NV03_CONTEXT_ROP_DMA_NOTIFY 0x00000180 +#define NV03_CONTEXT_ROP_ROP 0x00000300 +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_SHIFT 0 +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_MASK 0x0000000f +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_CLEAR 0x00000000 +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_NOR 0x00000001 +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_AND_INVERTED 0x00000002 +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_COPY_INVERTED 0x00000003 +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_AND_REVERSE 0x00000004 +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_INVERT 0x00000005 +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_XOR 0x00000006 +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_NAND 0x00000007 +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_AND 0x00000008 +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_EQUI 0x00000009 +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_NOOP 0x0000000a +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_OR_INVERTED 0x0000000b +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_COPY 0x0000000c +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_OR_REVERSE 0x0000000d +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_OR 0x0000000e +#define NV03_CONTEXT_ROP_ROP_DST_LOGIC_OP_SET 0x0000000f +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_SHIFT 4 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_MASK 0x000000f0 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_CLEAR 0x00000000 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_NOR 0x00000010 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_AND_INVERTED 0x00000020 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_COPY_INVERTED 0x00000030 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_AND_REVERSE 0x00000040 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_INVERT 0x00000050 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_XOR 0x00000060 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_NAND 0x00000070 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_AND 0x00000080 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_EQUI 0x00000090 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_NOOP 0x000000a0 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_OR_INVERTED 0x000000b0 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_COPY 0x000000c0 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_OR_REVERSE 0x000000d0 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_OR 0x000000e0 +#define NV03_CONTEXT_ROP_ROP_SRC_LOGIC_OP_SET 0x000000f0 + + +#define NV04_IMAGE_PATTERN 0x00000044 + +#define NV04_IMAGE_PATTERN_NOP 0x00000100 +#define NV04_IMAGE_PATTERN_NOTIFY 0x00000104 +#define NV04_IMAGE_PATTERN_DMA_NOTIFY 0x00000180 +#define NV04_IMAGE_PATTERN_COLOR_FORMAT 0x00000300 +#define 
NV04_IMAGE_PATTERN_COLOR_FORMAT_A16R5G6B5 0x00000001 +#define NV04_IMAGE_PATTERN_COLOR_FORMAT_X16A1R5G5B5 0x00000002 +#define NV04_IMAGE_PATTERN_COLOR_FORMAT_A8R8G8B8 0x00000003 +#define NV04_IMAGE_PATTERN_MONOCHROME_FORMAT 0x00000304 +#define NV04_IMAGE_PATTERN_MONOCHROME_FORMAT_CGA6 0x00000001 +#define NV04_IMAGE_PATTERN_MONOCHROME_FORMAT_LE 0x00000002 +#define NV04_IMAGE_PATTERN_MONOCHROME_SHAPE 0x00000308 +#define NV04_IMAGE_PATTERN_MONOCHROME_SHAPE_8X8 0x00000000 +#define NV04_IMAGE_PATTERN_MONOCHROME_SHAPE_64X1 0x00000001 +#define NV04_IMAGE_PATTERN_MONOCHROME_SHAPE_1X64 0x00000002 +#define NV04_IMAGE_PATTERN_PATTERN_SELECT 0x0000030c +#define NV04_IMAGE_PATTERN_PATTERN_SELECT_MONO 0x00000001 +#define NV04_IMAGE_PATTERN_PATTERN_SELECT_COLOR 0x00000002 +#define NV04_IMAGE_PATTERN_MONOCHROME_COLOR0 0x00000310 +#define NV04_IMAGE_PATTERN_MONOCHROME_COLOR1 0x00000314 +#define NV04_IMAGE_PATTERN_MONOCHROME_PATTERN0 0x00000318 +#define NV04_IMAGE_PATTERN_MONOCHROME_PATTERN1 0x0000031c +#define NV04_IMAGE_PATTERN_PATTERN_Y8(x) (0x00000400+((x)*4)) +#define NV04_IMAGE_PATTERN_PATTERN_Y8__SIZE 0x00000010 +#define NV04_IMAGE_PATTERN_PATTERN_Y8_Y0_SHIFT 0 +#define NV04_IMAGE_PATTERN_PATTERN_Y8_Y0_MASK 0x000000ff +#define NV04_IMAGE_PATTERN_PATTERN_Y8_Y1_SHIFT 8 +#define NV04_IMAGE_PATTERN_PATTERN_Y8_Y1_MASK 0x0000ff00 +#define NV04_IMAGE_PATTERN_PATTERN_Y8_Y2_SHIFT 16 +#define NV04_IMAGE_PATTERN_PATTERN_Y8_Y2_MASK 0x00ff0000 +#define NV04_IMAGE_PATTERN_PATTERN_Y8_Y3_SHIFT 24 +#define NV04_IMAGE_PATTERN_PATTERN_Y8_Y3_MASK 0xff000000 +#define NV04_IMAGE_PATTERN_PATTERN_R5G6B5(x) (0x00000500+((x)*4)) +#define NV04_IMAGE_PATTERN_PATTERN_R5G6B5__SIZE 0x00000020 +#define NV04_IMAGE_PATTERN_PATTERN_R5G6B5_B0_SHIFT 0 +#define NV04_IMAGE_PATTERN_PATTERN_R5G6B5_B0_MASK 0x0000001f +#define NV04_IMAGE_PATTERN_PATTERN_R5G6B5_G0_SHIFT 5 +#define NV04_IMAGE_PATTERN_PATTERN_R5G6B5_G0_MASK 0x000007e0 +#define NV04_IMAGE_PATTERN_PATTERN_R5G6B5_R0_SHIFT 11 +#define NV04_IMAGE_PATTERN_PATTERN_R5G6B5_R0_MASK 0x0000f800 +#define NV04_IMAGE_PATTERN_PATTERN_R5G6B5_B1_SHIFT 16 +#define NV04_IMAGE_PATTERN_PATTERN_R5G6B5_B1_MASK 0x001f0000 +#define NV04_IMAGE_PATTERN_PATTERN_R5G6B5_G1_SHIFT 21 +#define NV04_IMAGE_PATTERN_PATTERN_R5G6B5_G1_MASK 0x07e00000 +#define NV04_IMAGE_PATTERN_PATTERN_R5G6B5_R1_SHIFT 27 +#define NV04_IMAGE_PATTERN_PATTERN_R5G6B5_R1_MASK 0xf8000000 +#define NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5(x) (0x00000600+((x)*4)) +#define NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5__SIZE 0x00000020 +#define NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_B0_SHIFT 0 +#define NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_B0_MASK 0x0000001f +#define NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_G0_SHIFT 5 +#define NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_G0_MASK 0x000003e0 +#define NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_R0_SHIFT 10 +#define NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_R0_MASK 0x00007c00 +#define NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_B1_SHIFT 16 +#define NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_B1_MASK 0x001f0000 +#define NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_G1_SHIFT 21 +#define NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_G1_MASK 0x03e00000 +#define NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_R1_SHIFT 26 +#define NV04_IMAGE_PATTERN_PATTERN_X1R5G5B5_R1_MASK 0x7c000000 +#define NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8(x) (0x00000700+((x)*4)) +#define NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8__SIZE 0x00000040 +#define NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_B_SHIFT 0 +#define NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_B_MASK 0x000000ff +#define NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_G_SHIFT 8 +#define 
NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_G_MASK 0x0000ff00 +#define NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_R_SHIFT 16 +#define NV04_IMAGE_PATTERN_PATTERN_X8R8G8B8_R_MASK 0x00ff0000 + + +#define NV03_VIDEO_LUT_CURSOR_DAC 0x00000046 + +#define NV03_VIDEO_LUT_CURSOR_DAC_SYNCHRONIZE 0x00000100 +#define NV03_VIDEO_LUT_CURSOR_DAC_STOP_IMAGE 0x00000104 +#define NV03_VIDEO_LUT_CURSOR_DAC_STOP_CURSOR 0x00000108 +#define NV03_VIDEO_LUT_CURSOR_DAC_STOP_DAC 0x0000010c +#define NV03_VIDEO_LUT_CURSOR_DAC_DMA_NOTIFY 0x00000180 +#define NV03_VIDEO_LUT_CURSOR_DAC_DMA_IMAGE(x) (0x00000184+((x)*4)) +#define NV03_VIDEO_LUT_CURSOR_DAC_DMA_IMAGE__SIZE 0x00000002 +#define NV03_VIDEO_LUT_CURSOR_DAC_DMA_LUT(x) (0x0000018c+((x)*4)) +#define NV03_VIDEO_LUT_CURSOR_DAC_DMA_LUT__SIZE 0x00000002 +#define NV03_VIDEO_LUT_CURSOR_DAC_DMA_CURSOR(x) (0x00000194+((x)*4)) +#define NV03_VIDEO_LUT_CURSOR_DAC_DMA_CURSOR__SIZE 0x00000002 +#define NV03_VIDEO_LUT_CURSOR_DAC_GET 0x000002fc +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_OFFSET(x) (0x00000300+((x)*8)) +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_OFFSET__SIZE 0x00000002 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT(x) (0x00000304+((x)*8)) +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT__SIZE 0x00000002 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_PITCH_SHIFT 0 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_PITCH_MASK 0x0000ffff +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_COLOR_SHIFT 16 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_COLOR_MASK 0x0fff0000 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_NOTIFY_SHIFT 28 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_IMAGE_FORMAT_NOTIFY_MASK 0xf0000000 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_OFFSET(x) (0x00000340+((x)*12)) +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_OFFSET__SIZE 0x00000002 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT(x) (0x00000344+((x)*12)) +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT__SIZE 0x00000002 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_X_SHIFT 0 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_X_MASK 0x0000ffff +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_Y_SHIFT 16 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_Y_MASK 0xffff0000 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_FORMAT(x) (0x00000348+((x)*12)) +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_FORMAT__SIZE 0x00000002 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_A 0x00000358 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_A_X_SHIFT 0 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_A_X_MASK 0x0000ffff +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_A_Y_SHIFT 16 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_CURSOR_POINT_OUT_A_Y_MASK 0xffff0000 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE(x) (0x00000380+((x)*16)) +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE__SIZE 0x00000002 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE_W_SHIFT 0 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE_W_MASK 0x0000ffff +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE_H_SHIFT 16 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_IMAGE_SIZE_H_MASK 0xffff0000 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC(x) (0x00000384+((x)*16)) +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC__SIZE 0x00000002 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_START_SHIFT 0 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_START_MASK 0x0000ffff +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_WIDTH_SHIFT 16 +#define 
NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_WIDTH_MASK 0x0fff0000 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_POLARITY_SHIFT 28 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_HSYNC_POLARITY_MASK 0xf0000000 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC(x) (0x00000388+((x)*16)) +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC__SIZE 0x00000002 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_START_SHIFT 0 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_START_MASK 0x0000ffff +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_WIDTH_SHIFT 16 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_WIDTH_MASK 0x0fff0000 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_POLARITY_SHIFT 28 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_VSYNC_POLARITY_MASK 0xf0000000 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE(x) (0x0000038c+((x)*16)) +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE__SIZE 0x00000002 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_WIDTH_SHIFT 0 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_WIDTH_MASK 0x0000ffff +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_HEIGHT_SHIFT 16 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_HEIGHT_MASK 0x0fff0000 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_NOTIFY_SHIFT 28 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_DAC_TOTAL_SIZE_NOTIFY_MASK 0xf0000000 +#define NV03_VIDEO_LUT_CURSOR_DAC_SET_PIXEL_CLOCK 0x000003a0 + + +#define NV03_DX3_TEXTURED_TRIANGLE 0x00000048 + +#define NV03_DX3_TEXTURED_TRIANGLE_NOP 0x00000100 +#define NV03_DX3_TEXTURED_TRIANGLE_NOTIFY 0x00000104 +#define NV03_DX3_TEXTURED_TRIANGLE_PATCH 0x0000010c +#define NV03_DX3_TEXTURED_TRIANGLE_DMA_NOTIFY 0x00000180 +#define NV03_DX3_TEXTURED_TRIANGLE_DMA_TEXTURE 0x00000184 +#define NV03_DX3_TEXTURED_TRIANGLE_CLIP_RECTANGLE 0x00000188 +#define NV03_DX3_TEXTURED_TRIANGLE_SURFACE 0x0000018c +#define NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_OFFSET 0x00000304 +#define NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT 0x00000308 +#define NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_KEY_MASK_SHIFT 0 +#define NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_KEY_MASK_MASK 0x0000ffff +#define NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_KEY_ENABLE_SHIFT 16 +#define NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_KEY_ENABLE_MASK 0x000f0000 +#define NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_SHIFT 20 +#define NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_COLOR_MASK 0x00f00000 +#define NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_SIZE_MIN_SHIFT 24 +#define NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_SIZE_MIN_MASK 0x0f000000 +#define NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_SIZE_MAX_SHIFT 28 +#define NV03_DX3_TEXTURED_TRIANGLE_TEXTURE_FORMAT_SIZE_MAX_MASK 0xf0000000 +#define NV03_DX3_TEXTURED_TRIANGLE_FILTER 0x0000030c +#define NV03_DX3_TEXTURED_TRIANGLE_FILTER_SPREAD_X_SHIFT 0 +#define NV03_DX3_TEXTURED_TRIANGLE_FILTER_SPREAD_X_MASK 0x0000001f +#define NV03_DX3_TEXTURED_TRIANGLE_FILTER_SPREAD_Y_SHIFT 8 +#define NV03_DX3_TEXTURED_TRIANGLE_FILTER_SPREAD_Y_MASK 0x00001f00 +#define NV03_DX3_TEXTURED_TRIANGLE_FILTER_SIZE_ADJUST_SHIFT 16 +#define NV03_DX3_TEXTURED_TRIANGLE_FILTER_SIZE_ADJUST_MASK 0x00ff0000 +#define NV03_DX3_TEXTURED_TRIANGLE_FOG_COLOR 0x00000310 +#define NV03_DX3_TEXTURED_TRIANGLE_FOG_COLOR_B_SHIFT 0 +#define NV03_DX3_TEXTURED_TRIANGLE_FOG_COLOR_B_MASK 0x000000ff +#define NV03_DX3_TEXTURED_TRIANGLE_FOG_COLOR_G_SHIFT 8 +#define NV03_DX3_TEXTURED_TRIANGLE_FOG_COLOR_G_MASK 0x0000ff00 +#define NV03_DX3_TEXTURED_TRIANGLE_FOG_COLOR_R_SHIFT 16 +#define 
NV03_DX3_TEXTURED_TRIANGLE_FOG_COLOR_R_MASK 0x00ff0000 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT 0x00000314 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_INTERPOLATOR_SHIFT 0 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_INTERPOLATOR_MASK 0x0000000f +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_WRAP_U_SHIFT 4 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_WRAP_U_MASK 0x00000030 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_WRAP_V_SHIFT 6 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_WRAP_V_MASK 0x000000c0 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_SOURCE_COLOR_SHIFT 8 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_SOURCE_COLOR_MASK 0x00000f00 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_CULLING_SHIFT 12 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_CULLING_MASK 0x00007000 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_Z_PERSPECTIVE_ENABLE (1 << 15) +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_Z_FUNC_SHIFT 16 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_Z_FUNC_MASK 0x000f0000 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_Z_WRITE_ENABLE_SHIFT 20 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_Z_WRITE_ENABLE_MASK 0x00f00000 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_COLOR_WRITE_ENABLE_SHIFT 24 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_COLOR_WRITE_ENABLE_MASK 0x07000000 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_ROP_SHIFT 27 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_ROP_MASK 0x18000000 +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_BETA (1 << 29) +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_DST_BLEND (1 << 30) +#define NV03_DX3_TEXTURED_TRIANGLE_CONTROL_OUT_SRC_BLEND (1 << 31) +#define NV03_DX3_TEXTURED_TRIANGLE_ALPHA_CONTROL 0x00000318 +#define NV03_DX3_TEXTURED_TRIANGLE_ALPHA_CONTROL_ALPHA_REF_SHIFT 0 +#define NV03_DX3_TEXTURED_TRIANGLE_ALPHA_CONTROL_ALPHA_REF_MASK 0x000000ff +#define NV03_DX3_TEXTURED_TRIANGLE_ALPHA_CONTROL_ALPHA_FUNC_SHIFT 8 +#define NV03_DX3_TEXTURED_TRIANGLE_ALPHA_CONTROL_ALPHA_FUNC_MASK 0xffffff00 +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR(x) (0x00001000+((x)*32)) +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR__SIZE 0x00000040 +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I0_SHIFT 0 +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I0_MASK 0x0000000f +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I1_SHIFT 4 +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I1_MASK 0x000000f0 +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I2_SHIFT 8 +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I2_MASK 0x00000f00 +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I3_SHIFT 12 +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I3_MASK 0x0000f000 +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I4_SHIFT 16 +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I4_MASK 0x000f0000 +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I5_SHIFT 20 +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_I5_MASK 0x00f00000 +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_FOG_SHIFT 24 +#define NV03_DX3_TEXTURED_TRIANGLE_SPECULAR_FOG_MASK 0xff000000 +#define NV03_DX3_TEXTURED_TRIANGLE_COLOR(x) (0x00001004+((x)*32)) +#define NV03_DX3_TEXTURED_TRIANGLE_COLOR__SIZE 0x00000040 +#define NV03_DX3_TEXTURED_TRIANGLE_X(x) (0x00001008+((x)*32)) +#define NV03_DX3_TEXTURED_TRIANGLE_X__SIZE 0x00000040 +#define NV03_DX3_TEXTURED_TRIANGLE_Y(x) (0x0000100c+((x)*32)) +#define NV03_DX3_TEXTURED_TRIANGLE_Y__SIZE 0x00000040 +#define NV03_DX3_TEXTURED_TRIANGLE_Z(x) (0x00001010+((x)*32)) +#define NV03_DX3_TEXTURED_TRIANGLE_Z__SIZE 0x00000040 +#define NV03_DX3_TEXTURED_TRIANGLE_M(x) (0x00001014+((x)*32)) +#define 
NV03_DX3_TEXTURED_TRIANGLE_M__SIZE 0x00000040 +#define NV03_DX3_TEXTURED_TRIANGLE_U(x) (0x00001018+((x)*32)) +#define NV03_DX3_TEXTURED_TRIANGLE_U__SIZE 0x00000040 +#define NV03_DX3_TEXTURED_TRIANGLE_V(x) (0x0000101c+((x)*32)) +#define NV03_DX3_TEXTURED_TRIANGLE_V__SIZE 0x00000040 + + +#define NV04_GDI_RECTANGLE_TEXT 0x0000004a + +#define NV04_GDI_RECTANGLE_TEXT_NOP 0x00000100 +#define NV04_GDI_RECTANGLE_TEXT_NOTIFY 0x00000104 +#define NV04_GDI_RECTANGLE_TEXT_PATCH 0x0000010c +#define NV04_GDI_RECTANGLE_TEXT_PM_TRIGGER 0x00000140 +#define NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY 0x00000180 +#define NV04_GDI_RECTANGLE_TEXT_DMA_FONTS 0x00000184 +#define NV04_GDI_RECTANGLE_TEXT_PATTERN 0x00000188 +#define NV04_GDI_RECTANGLE_TEXT_ROP 0x0000018c +#define NV04_GDI_RECTANGLE_TEXT_BETA1 0x00000190 +#define NV04_GDI_RECTANGLE_TEXT_BETA4 0x00000194 +#define NV04_GDI_RECTANGLE_TEXT_SURFACE 0x00000198 +#define NV04_GDI_RECTANGLE_TEXT_OPERATION 0x000002fc +#define NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY_AND 0x00000000 +#define NV04_GDI_RECTANGLE_TEXT_OPERATION_ROP_AND 0x00000001 +#define NV04_GDI_RECTANGLE_TEXT_OPERATION_BLEND_AND 0x00000002 +#define NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY 0x00000003 +#define NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY_PREMULT 0x00000004 +#define NV04_GDI_RECTANGLE_TEXT_OPERATION_BLEND_PREMULT 0x00000005 +#define NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT 0x00000300 +#define NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5 0x00000001 +#define NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_X16A1R5G5B5 0x00000002 +#define NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8 0x00000003 +#define NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT 0x00000304 +#define NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_CGA6 0x00000001 +#define NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE 0x00000002 +#define NV04_GDI_RECTANGLE_TEXT_COLOR1_A 0x000003fc +#define NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(x) (0x00000400+((x)*8)) +#define NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT__SIZE 0x00000020 +#define NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_Y_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_Y_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_X_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_X_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE(x) (0x00000404+((x)*8)) +#define NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE__SIZE 0x00000020 +#define NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_H_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_H_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_W_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_W_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0 0x000005f4 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_L_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_L_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_T_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_T_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT1 0x000005f8 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT1_R_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT1_R_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT1_B_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT1_B_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_COLOR1_B 0x000005fc +#define NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0(x) (0x00000600+((x)*8)) +#define NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0__SIZE 
0x00000020 +#define NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_L_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_L_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_T_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_T_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1(x) (0x00000604+((x)*8)) +#define NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1__SIZE 0x00000020 +#define NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_R_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_R_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_B_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_B_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT0 0x000007ec +#define NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_L_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_L_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_T_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_T_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT1 0x000007f0 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_R_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_R_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_B_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_B_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_COLOR1_C 0x000007f4 +#define NV04_GDI_RECTANGLE_TEXT_SIZE_C 0x000007f8 +#define NV04_GDI_RECTANGLE_TEXT_SIZE_C_W_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_SIZE_C_W_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_SIZE_C_H_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_SIZE_C_H_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_POINT_C 0x000007fc +#define NV04_GDI_RECTANGLE_TEXT_POINT_C_X_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_POINT_C_X_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_POINT_C_Y_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_POINT_C_Y_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_C(x) (0x00000800+((x)*4)) +#define NV04_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_C__SIZE 0x00000080 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT0 0x00000be4 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_L_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_L_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_T_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_T_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT1 0x00000be8 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_R_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_R_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_B_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_B_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_COLOR0_E 0x00000bec +#define NV04_GDI_RECTANGLE_TEXT_COLOR1_E 0x00000bf0 +#define NV04_GDI_RECTANGLE_TEXT_SIZE_IN_E 0x00000bf4 +#define NV04_GDI_RECTANGLE_TEXT_SIZE_IN_E_W_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_SIZE_IN_E_W_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_SIZE_IN_E_H_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_SIZE_IN_E_H_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_SIZE_OUT_E 0x00000bf8 +#define NV04_GDI_RECTANGLE_TEXT_SIZE_OUT_E_W_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_SIZE_OUT_E_W_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_SIZE_OUT_E_H_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_SIZE_OUT_E_H_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_POINT_E 0x00000bfc +#define NV04_GDI_RECTANGLE_TEXT_POINT_E_X_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_POINT_E_X_MASK 0x0000ffff 
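
Nearly every method above packs two 16-bit fields into one 32-bit word, described by matching *_SHIFT/*_MASK pairs. The sketch below shows how such a word could be composed, using the CLIP_B_POINT0 left/top fields listed above; the helper name and the main() driver are illustrative only and not part of the header.

#include <stdint.h>
#include <stdio.h>

/* Field layout copied from the listing above. */
#define NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_L_SHIFT 0
#define NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_L_MASK 0x0000ffff
#define NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_T_SHIFT 16
#define NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_T_MASK 0xffff0000

/* Hypothetical helper (not part of the header): pack a left/top pair
 * into the single 32-bit CLIP_B_POINT0 method word. */
static uint32_t pack_clip_b_point0(uint16_t left, uint16_t top)
{
	return (((uint32_t)left << NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_L_SHIFT) &
		NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_L_MASK) |
	       (((uint32_t)top << NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_T_SHIFT) &
		NV04_GDI_RECTANGLE_TEXT_CLIP_B_POINT0_T_MASK);
}

int main(void)
{
	printf("0x%08x\n", (unsigned)pack_clip_b_point0(16, 32)); /* 0x00200010 */
	return 0;
}
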
+#define NV04_GDI_RECTANGLE_TEXT_POINT_E_Y_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_POINT_E_Y_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR01_E(x) (0x00000c00+((x)*4)) +#define NV04_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR01_E__SIZE 0x00000080 +#define NV04_GDI_RECTANGLE_TEXT_FONT_F 0x00000ff0 +#define NV04_GDI_RECTANGLE_TEXT_FONT_F_OFFSET_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_FONT_F_OFFSET_MASK 0x0fffffff +#define NV04_GDI_RECTANGLE_TEXT_FONT_F_PITCH_SHIFT 28 +#define NV04_GDI_RECTANGLE_TEXT_FONT_F_PITCH_MASK 0xf0000000 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT0 0x00000ff4 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT0_L_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT0_L_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT0_T_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT0_T_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT1 0x00000ff8 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT1_R_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT1_R_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT1_B_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_F_POINT1_B_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_COLOR1_F 0x00000ffc +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F(x) (0x00001000+((x)*4)) +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F__SIZE 0x00000100 +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_INDEX_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_INDEX_MASK 0x000000ff +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_X_SHIFT 8 +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_X_MASK 0x000fff00 +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_Y_SHIFT 20 +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_F_Y_MASK 0xfff00000 +#define NV04_GDI_RECTANGLE_TEXT_FONT_G 0x000017f0 +#define NV04_GDI_RECTANGLE_TEXT_FONT_G_OFFSET_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_FONT_G_OFFSET_MASK 0x0fffffff +#define NV04_GDI_RECTANGLE_TEXT_FONT_G_PITCH_SHIFT 28 +#define NV04_GDI_RECTANGLE_TEXT_FONT_G_PITCH_MASK 0xf0000000 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT0 0x000017f4 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT0_L_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT0_L_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT0_T_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT0_T_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT1 0x000017f8 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT1_R_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT1_R_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT1_B_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_CLIP_G_POINT1_B_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_COLOR1_G 0x000017fc +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT(x) (0x00001800+((x)*8)) +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT__SIZE 0x00000100 +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT_X_SHIFT 0 +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT_X_MASK 0x0000ffff +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT_Y_SHIFT 16 +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_POINT_Y_MASK 0xffff0000 +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_INDEX(x) (0x00001804+((x)*8)) +#define NV04_GDI_RECTANGLE_TEXT_CHARACTER_COLOR1_G_INDEX__SIZE 0x00000100 + + +#define NV03_GDI_RECTANGLE_TEXT 0x0000004b + +#define NV03_GDI_RECTANGLE_TEXT_NOP 0x00000100 +#define NV03_GDI_RECTANGLE_TEXT_NOTIFY 0x00000104 +#define NV03_GDI_RECTANGLE_TEXT_DMA_NOTIFY 0x00000180 +#define NV03_GDI_RECTANGLE_TEXT_PATTERN 
0x00000184 +#define NV03_GDI_RECTANGLE_TEXT_ROP 0x00000188 +#define NV03_GDI_RECTANGLE_TEXT_BETA1 0x0000018c +#define NV03_GDI_RECTANGLE_TEXT_SURFACE 0x00000190 +#define NV03_GDI_RECTANGLE_TEXT_OPERATION 0x000002fc +#define NV03_GDI_RECTANGLE_TEXT_COLOR_FORMAT 0x00000300 +#define NV03_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT 0x00000304 +#define NV03_GDI_RECTANGLE_TEXT_COLOR1_A 0x000003fc +#define NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT 0x00000400 +#define NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_Y_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_Y_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_X_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT_X_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE 0x00000404 +#define NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_H_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_H_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_W_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_SIZE_W_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_POINT0_B 0x000007f4 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_POINT0_B_L_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_POINT0_B_L_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_CLIP_POINT0_B_T_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_POINT0_B_T_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_POINT1_B 0x000007f8 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_POINT1_B_R_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_POINT1_B_R_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_CLIP_POINT1_B_B_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_POINT1_B_B_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_COLOR1_B 0x000007fc +#define NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0 0x00000800 +#define NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_L_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_L_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_T_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_0_T_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1 0x00000804 +#define NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_R_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_R_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_B_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_CLIPPED_RECTANGLE_POINT_1_B_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT0 0x00000bec +#define NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_L_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_L_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_T_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT0_T_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT1 0x00000bf0 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_R_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_R_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_B_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_C_POINT1_B_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_COLOR1_C 0x00000bf4 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_C 0x00000bf8 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_C_W_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_C_W_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_SIZE_C_H_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_C_H_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_POINT_C 0x00000bfc +#define NV03_GDI_RECTANGLE_TEXT_POINT_C_X_SHIFT 0 +#define 
NV03_GDI_RECTANGLE_TEXT_POINT_C_X_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_POINT_C_Y_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_POINT_C_Y_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_C(x) (0x00000c00+((x)*4)) +#define NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_C__SIZE 0x00000020 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT0 0x00000fe8 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT0_L_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT0_L_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT0_T_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT0_T_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT1 0x00000fec +#define NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT1_R_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT1_R_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT1_B_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_D_POINT1_B_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_COLOR1_D 0x00000ff0 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_IN_D 0x00000ff4 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_IN_D_W_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_IN_D_W_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_SIZE_IN_D_H_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_IN_D_H_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_D 0x00000ff8 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_D_W_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_D_W_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_D_H_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_D_H_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_POINT_D 0x00000ffc +#define NV03_GDI_RECTANGLE_TEXT_POINT_D_X_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_POINT_D_X_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_POINT_D_Y_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_POINT_D_Y_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_D(x) (0x00001000+((x)*4)) +#define NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR1_D__SIZE 0x00000020 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT0 0x000013e4 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_L_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_L_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_T_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT0_T_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT1 0x000013e8 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_R_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_R_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_B_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_CLIP_E_POINT1_B_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_COLOR0_E 0x000013ec +#define NV03_GDI_RECTANGLE_TEXT_COLOR1_E 0x000013f0 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_IN_E 0x000013f4 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_IN_E_W_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_IN_E_W_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_SIZE_IN_E_H_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_IN_E_H_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_E 0x000013f8 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_E_W_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_E_W_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_E_H_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_SIZE_OUT_E_H_MASK 0xffff0000 +#define NV03_GDI_RECTANGLE_TEXT_POINT_E 0x000013fc +#define NV03_GDI_RECTANGLE_TEXT_POINT_E_X_SHIFT 0 +#define NV03_GDI_RECTANGLE_TEXT_POINT_E_X_MASK 0x0000ffff +#define NV03_GDI_RECTANGLE_TEXT_POINT_E_Y_SHIFT 16 +#define NV03_GDI_RECTANGLE_TEXT_POINT_E_Y_MASK 0xffff0000 +#define 
NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR01_E(x) (0x00001400+((x)*4)) +#define NV03_GDI_RECTANGLE_TEXT_MONOCHROME_COLOR01_E__SIZE 0x00000020 + + +#define NV04_SWIZZLED_SURFACE 0x00000052 + +#define NV04_SWIZZLED_SURFACE_NOP 0x00000100 +#define NV04_SWIZZLED_SURFACE_NOTIFY 0x00000104 +#define NV04_SWIZZLED_SURFACE_DMA_NOTIFY 0x00000180 +#define NV04_SWIZZLED_SURFACE_DMA_IMAGE 0x00000184 +#define NV04_SWIZZLED_SURFACE_FORMAT 0x00000300 +#define NV04_SWIZZLED_SURFACE_FORMAT_COLOR_SHIFT 0 +#define NV04_SWIZZLED_SURFACE_FORMAT_COLOR_MASK 0x000000ff +#define NV04_SWIZZLED_SURFACE_FORMAT_COLOR_Y8 0x00000001 +#define NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X1R5G5B5_Z1R5G5B5 0x00000002 +#define NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X1R5G5B5_X1R5G5B5 0x00000003 +#define NV04_SWIZZLED_SURFACE_FORMAT_COLOR_R5G6B5 0x00000004 +#define NV04_SWIZZLED_SURFACE_FORMAT_COLOR_Y16 0x00000005 +#define NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X8R8G8B8_Z8R8G8B8 0x00000006 +#define NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X8R8G8B8_X8R8G8B8 0x00000007 +#define NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X1A7R8G8B8_Z1A7R8G8B8 0x00000008 +#define NV04_SWIZZLED_SURFACE_FORMAT_COLOR_X1A7R8G8B8_X1A7R8G8B8 0x00000009 +#define NV04_SWIZZLED_SURFACE_FORMAT_COLOR_A8R8G8B8 0x0000000a +#define NV04_SWIZZLED_SURFACE_FORMAT_COLOR_Y32 0x0000000b +#define NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT 16 +#define NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_MASK 0x00ff0000 +#define NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT 24 +#define NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_MASK 0xff000000 +#define NV04_SWIZZLED_SURFACE_OFFSET 0x00000304 + + +#define NV04_CONTEXT_SURFACES_3D 0x00000053 + +#define NV04_CONTEXT_SURFACES_3D_NOP 0x00000100 +#define NV04_CONTEXT_SURFACES_3D_NOTIFY 0x00000104 +#define NV04_CONTEXT_SURFACES_3D_DMA_NOTIFY 0x00000180 +#define NV04_CONTEXT_SURFACES_3D_DMA_COLOR 0x00000184 +#define NV04_CONTEXT_SURFACES_3D_DMA_ZETA 0x00000188 +#define NV04_CONTEXT_SURFACES_3D_CLIP_HORIZONTAL 0x000002f8 +#define NV04_CONTEXT_SURFACES_3D_CLIP_HORIZONTAL_X_SHIFT 0 +#define NV04_CONTEXT_SURFACES_3D_CLIP_HORIZONTAL_X_MASK 0x0000ffff +#define NV04_CONTEXT_SURFACES_3D_CLIP_HORIZONTAL_W_SHIFT 16 +#define NV04_CONTEXT_SURFACES_3D_CLIP_HORIZONTAL_W_MASK 0xffff0000 +#define NV04_CONTEXT_SURFACES_3D_CLIP_VERTICAL 0x000002fc +#define NV04_CONTEXT_SURFACES_3D_CLIP_VERTICAL_Y_SHIFT 0 +#define NV04_CONTEXT_SURFACES_3D_CLIP_VERTICAL_Y_MASK 0x0000ffff +#define NV04_CONTEXT_SURFACES_3D_CLIP_VERTICAL_H_SHIFT 16 +#define NV04_CONTEXT_SURFACES_3D_CLIP_VERTICAL_H_MASK 0xffff0000 +#define NV04_CONTEXT_SURFACES_3D_FORMAT 0x00000300 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_SHIFT 0 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_MASK 0x000000ff +#define NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X1R5G5B5_Z1R5G5B5 0x00000001 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X1R5G5B5_X1R5G5B5 0x00000002 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_R5G6B5 0x00000003 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X8R8G8B8_Z8R8G8B8 0x00000004 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X8R8G8B8_X8R8G8B8 0x00000005 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X1A7R8G8B8_Z1A7R8G8B8 0x00000006 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_X1A7R8G8B8_X1A7R8G8B8 0x00000007 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_A8R8G8B8 0x00000008 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_TYPE_SHIFT 8 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_TYPE_MASK 0x0000ff00 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_TYPE_PITCH 0x00000100 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_TYPE_SWIZZLE 0x00000200 
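
The NV04_CONTEXT_SURFACES_3D FORMAT method mixes an enumerated color format with a surface-layout type (and, per the fields that follow, base-size exponents for swizzled surfaces). Below is a minimal sketch of composing that word, assuming only the values already listed above; the choice of A8R8G8B8 with a pitch layout is just an example, not a recommendation from the header.

#include <stdint.h>
#include <stdio.h>

/* Values copied from the listing above. */
#define NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_A8R8G8B8 0x00000008
#define NV04_CONTEXT_SURFACES_3D_FORMAT_TYPE_PITCH 0x00000100
#define NV04_CONTEXT_SURFACES_3D_FORMAT_TYPE_SWIZZLE 0x00000200

int main(void)
{
	/* Hypothetical FORMAT word for a linearly laid out (pitch) ARGB8888
	 * color buffer; a swizzled buffer would use TYPE_SWIZZLE and would
	 * also need the BASE_SIZE_U/V fields defined just after this point. */
	uint32_t fmt = NV04_CONTEXT_SURFACES_3D_FORMAT_COLOR_A8R8G8B8 |
		       NV04_CONTEXT_SURFACES_3D_FORMAT_TYPE_PITCH;

	printf("FORMAT = 0x%08x\n", (unsigned)fmt); /* 0x00000108 */
	return 0;
}
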
+#define NV04_CONTEXT_SURFACES_3D_FORMAT_BASE_SIZE_U_SHIFT 16 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_BASE_SIZE_U_MASK 0x00ff0000 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_BASE_SIZE_V_SHIFT 24 +#define NV04_CONTEXT_SURFACES_3D_FORMAT_BASE_SIZE_V_MASK 0xff000000 +#define NV04_CONTEXT_SURFACES_3D_CLIP_SIZE 0x00000304 +#define NV04_CONTEXT_SURFACES_3D_CLIP_SIZE_W_SHIFT 0 +#define NV04_CONTEXT_SURFACES_3D_CLIP_SIZE_W_MASK 0x0000ffff +#define NV04_CONTEXT_SURFACES_3D_CLIP_SIZE_H_SHIFT 16 +#define NV04_CONTEXT_SURFACES_3D_CLIP_SIZE_H_MASK 0xffff0000 +#define NV04_CONTEXT_SURFACES_3D_PITCH 0x00000308 +#define NV04_CONTEXT_SURFACES_3D_PITCH_COLOR_SHIFT 0 +#define NV04_CONTEXT_SURFACES_3D_PITCH_COLOR_MASK 0x0000ffff +#define NV04_CONTEXT_SURFACES_3D_PITCH_ZETA_SHIFT 16 +#define NV04_CONTEXT_SURFACES_3D_PITCH_ZETA_MASK 0xffff0000 +#define NV04_CONTEXT_SURFACES_3D_OFFSET_COLOR 0x0000030c +#define NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA 0x00000310 + + +#define NV04_DX5_TEXTURED_TRIANGLE 0x00000054 + +#define NV04_DX5_TEXTURED_TRIANGLE_NOP 0x00000100 +#define NV04_DX5_TEXTURED_TRIANGLE_NOTIFY 0x00000104 +#define NV04_DX5_TEXTURED_TRIANGLE_DMA_NOTIFY 0x00000180 +#define NV04_DX5_TEXTURED_TRIANGLE_DMA_A 0x00000184 +#define NV04_DX5_TEXTURED_TRIANGLE_DMA_B 0x00000188 +#define NV04_DX5_TEXTURED_TRIANGLE_SURFACE 0x0000018c +#define NV04_DX5_TEXTURED_TRIANGLE_COLORKEY 0x00000300 +#define NV04_DX5_TEXTURED_TRIANGLE_OFFSET 0x00000304 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT 0x00000308 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_DMA_SHIFT 0 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_DMA_MASK 0x00000003 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_KEY_MATCH_SHIFT 2 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_KEY_MATCH_MASK 0x0000000c +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_SHIFT 4 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_MASK 0x00000030 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_CENTER 0x00000010 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_CORNER 0x00000020 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_SHIFT 6 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_MASK 0x000000c0 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_CENTER 0x00000040 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_CORNER 0x00000080 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_SHIFT 8 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_MASK 0x00000f00 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_Y8 0x00000100 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_A1R5G5B5 0x00000200 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_X1R5G5B5 0x00000300 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_A4R4G4B4 0x00000400 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_R5G6B5 0x00000500 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_A8R8G8B8 0x00000600 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_X8R8G8B8 0x00000700 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_MIPMAP_LEVELS_SHIFT 12 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_MIPMAP_LEVELS_MASK 0x0000f000 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_U_SHIFT 16 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_U_MASK 0x000f0000 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_V_SHIFT 20 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_V_MASK 0x00f00000 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT 24 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_MASK 0x07000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_REPEAT 0x01000000 +#define 
NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_MIRRORED_REPEAT 0x02000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE 0x03000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_BORDER 0x04000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP 0x05000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_WRAPU (1 << 27) +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_SHIFT 28 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_MASK 0x70000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_REPEAT 0x10000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_MIRRORED_REPEAT 0x20000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP_TO_EDGE 0x30000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP_TO_BORDER 0x40000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP 0x50000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FORMAT_WRAPV (1 << 31) +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER 0x0000030c +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_KERNEL_SIZE_X_SHIFT 0 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_KERNEL_SIZE_X_MASK 0x000000ff +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_KERNEL_SIZE_Y_SHIFT 8 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_KERNEL_SIZE_Y_MASK 0x00007f00 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_MIPMAP_DITHER_ENABLE (1 << 15) +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_MIPMAP_LODBIAS_SHIFT 16 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_MIPMAP_LODBIAS_MASK 0x00ff0000 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_SHIFT 24 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_MASK 0x07000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST 0x01000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR 0x02000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST 0x03000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST 0x04000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR 0x05000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR 0x06000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MINIFY_ENABLE (1 << 27) +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_SHIFT 28 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_MASK 0x70000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_NEAREST 0x10000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_LINEAR 0x20000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MAGNIFY_ENABLE (1 << 31) +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND 0x00000310 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_TEXTURE_MAP_SHIFT 0 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_TEXTURE_MAP_MASK 0x0000000f +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_MASK_BIT_SHIFT 4 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_MASK_BIT_MASK 0x00000030 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_SHIFT 6 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_MASK 0x000000c0 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_FLAT 0x00000040 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_GOURAUD 0x00000080 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_PHONG 0x000000c0 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_TEXTURE_PERSPECTIVE_ENABLE_SHIFT 8 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_TEXTURE_PERSPECTIVE_ENABLE_MASK 0x00000f00 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_SPECULAR_ENABLE_SHIFT 12 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_SPECULAR_ENABLE_MASK 0x0000f000 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_FOG_ENABLE_SHIFT 16 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_FOG_ENABLE_MASK 
0x000f0000 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_ALPHA_ENABLE_SHIFT 20 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_ALPHA_ENABLE_MASK 0x00f00000 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_SRC_SHIFT 24 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_SRC_MASK 0x0f000000 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_DST_SHIFT 28 +#define NV04_DX5_TEXTURED_TRIANGLE_BLEND_DST_MASK 0xf0000000 +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL 0x00000314 +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_REF_SHIFT 0 +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_REF_MASK 0x000000ff +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_FUNC_SHIFT 8 +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_FUNC_MASK 0x00000f00 +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_TEST_ENABLE (1 << 12) +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ORIGIN (1 << 13) +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_ENABLE_SHIFT 14 +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_ENABLE_MASK 0x0000c000 +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FUNC_SHIFT 16 +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FUNC_MASK 0x000f0000 +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_SHIFT 20 +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_MASK 0x00300000 +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_DITHER_ENABLE (1 << 22) +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_PERSPECTIVE_ENABLE (1 << 23) +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_WRITE_ENABLE_SHIFT 24 +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_WRITE_ENABLE_MASK 0x3f000000 +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FORMAT_SHIFT 30 +#define NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FORMAT_MASK 0xc0000000 +#define NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR 0x00000318 +#define NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_B_SHIFT 0 +#define NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_B_MASK 0x000000ff +#define NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_G_SHIFT 8 +#define NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_G_MASK 0x0000ff00 +#define NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_R_SHIFT 16 +#define NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_R_MASK 0x00ff0000 +#define NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_A_SHIFT 24 +#define NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR_A_MASK 0xff000000 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(x) (0x00000400+((x)*32)) +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX__SIZE 0x00000010 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SY(x) (0x00000404+((x)*32)) +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SY__SIZE 0x00000010 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SZ(x) (0x00000408+((x)*32)) +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SZ__SIZE 0x00000010 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_RHW(x) (0x0000040c+((x)*32)) +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_RHW__SIZE 0x00000010 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR(x) (0x00000410+((x)*32)) +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR__SIZE 0x00000010 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_B_SHIFT 0 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_B_MASK 0x000000ff +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_G_SHIFT 8 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_G_MASK 0x0000ff00 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_R_SHIFT 16 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_R_MASK 0x00ff0000 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_A_SHIFT 24 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_COLOR_A_MASK 0xff000000 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR(x) (0x00000414+((x)*32)) +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR__SIZE 0x00000010 +#define 
NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_B_SHIFT 0 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_B_MASK 0x000000ff +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_G_SHIFT 8 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_G_MASK 0x0000ff00 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_R_SHIFT 16 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_R_MASK 0x00ff0000 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_FOG_SHIFT 24 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SPECULAR_FOG_MASK 0xff000000 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_TU(x) (0x00000418+((x)*32)) +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_TU__SIZE 0x00000010 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_TV(x) (0x0000041c+((x)*32)) +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_TV__SIZE 0x00000010 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE(x) (0x00000600+((x)*4)) +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE__SIZE 0x00000040 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I0_SHIFT 0 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I0_MASK 0x0000000f +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I1_SHIFT 4 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I1_MASK 0x000000f0 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I2_SHIFT 8 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I2_MASK 0x00000f00 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I3_SHIFT 12 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I3_MASK 0x0000f000 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I4_SHIFT 16 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I4_MASK 0x000f0000 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I5_SHIFT 20 +#define NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE_I5_MASK 0x00f00000 + + +#define NV04_DX6_MULTITEX_TRIANGLE 0x00000055 + +#define NV04_DX6_MULTITEX_TRIANGLE_NOP 0x00000100 +#define NV04_DX6_MULTITEX_TRIANGLE_NOTIFY 0x00000104 +#define NV04_DX6_MULTITEX_TRIANGLE_DMA_NOTIFY 0x00000180 +#define NV04_DX6_MULTITEX_TRIANGLE_DMA_A 0x00000184 +#define NV04_DX6_MULTITEX_TRIANGLE_DMA_B 0x00000188 +#define NV04_DX6_MULTITEX_TRIANGLE_SURFACE 0x0000018c +#define NV04_DX6_MULTITEX_TRIANGLE_OFFSET(x) (0x00000308+((x)*4)) +#define NV04_DX6_MULTITEX_TRIANGLE_OFFSET__SIZE 0x00000002 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT(x) (0x00000310+((x)*4)) +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT__SIZE 0x00000002 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_DMA_SHIFT 0 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_DMA_MASK 0x0000000f +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ORIGIN_ZOH_SHIFT 4 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ORIGIN_ZOH_MASK 0x00000030 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ORIGIN_FOH_SHIFT 6 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ORIGIN_FOH_MASK 0x000000c0 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_COLOR_SHIFT 8 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_COLOR_MASK 0x00000f00 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_MIPMAP_LEVELS_SHIFT 12 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_MIPMAP_LEVELS_MASK 0x0000f000 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_BASE_SIZE_U_SHIFT 16 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_BASE_SIZE_U_MASK 0x000f0000 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_BASE_SIZE_V_SHIFT 20 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_BASE_SIZE_V_MASK 0x00f00000 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ADDRESSU_SHIFT 24 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ADDRESSU_MASK 0x07000000 +#define 
NV04_DX6_MULTITEX_TRIANGLE_FORMAT_WRAPU (1 << 27) +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ADDRESSV_SHIFT 28 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_ADDRESSV_MASK 0x70000000 +#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT_WRAPV (1 << 31) +#define NV04_DX6_MULTITEX_TRIANGLE_FILTER(x) (0x00000318+((x)*4)) +#define NV04_DX6_MULTITEX_TRIANGLE_FILTER__SIZE 0x00000002 +#define NV04_DX6_MULTITEX_TRIANGLE_FILTER_KERNEL_SIZE_X_SHIFT 0 +#define NV04_DX6_MULTITEX_TRIANGLE_FILTER_KERNEL_SIZE_X_MASK 0x000000ff +#define NV04_DX6_MULTITEX_TRIANGLE_FILTER_KERNEL_SIZE_Y_SHIFT 8 +#define NV04_DX6_MULTITEX_TRIANGLE_FILTER_KERNEL_SIZE_Y_MASK 0x00007f00 +#define NV04_DX6_MULTITEX_TRIANGLE_FILTER_MIPMAP_DITHER_ENABLE (1 << 15) +#define NV04_DX6_MULTITEX_TRIANGLE_FILTER_MIPMAP_LODBIAS_SHIFT 16 +#define NV04_DX6_MULTITEX_TRIANGLE_FILTER_MIPMAP_LODBIAS_MASK 0x00ff0000 +#define NV04_DX6_MULTITEX_TRIANGLE_FILTER_MINIFY_SHIFT 24 +#define NV04_DX6_MULTITEX_TRIANGLE_FILTER_MINIFY_MASK 0x07000000 +#define NV04_DX6_MULTITEX_TRIANGLE_FILTER_ANISOTROPIC_MINIFY_ENABLE (1 << 27) +#define NV04_DX6_MULTITEX_TRIANGLE_FILTER_MAGNIFY_SHIFT 28 +#define NV04_DX6_MULTITEX_TRIANGLE_FILTER_MAGNIFY_MASK 0x70000000 +#define NV04_DX6_MULTITEX_TRIANGLE_FILTER_ANISOTROPIC_MAGNIFY_ENABLE (1 << 31) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA 0x00000320 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_INVERSE0 (1 << 0) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ALPHA0 (1 << 1) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT0_SHIFT 2 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT0_MASK 0x000000fc +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_INVERSE1 (1 << 8) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ALPHA1 (1 << 9) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT1_SHIFT 10 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT1_MASK 0x0000fc00 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_INVERSE2 (1 << 16) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ALPHA2 (1 << 17) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT2_SHIFT 18 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT2_MASK 0x00fc0000 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_INVERSE3 (1 << 24) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ALPHA3 (1 << 25) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT3_SHIFT 26 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_ARGUMENT3_MASK 0x1c000000 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_OPERATION_SHIFT 29 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_ALPHA_OPERATION_MASK 0xe0000000 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR 0x00000324 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_INVERSE0 (1 << 0) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ALPHA0 (1 << 1) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT0_SHIFT 2 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT0_MASK 0x000000fc +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_INVERSE1 (1 << 8) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ALPHA1 (1 << 9) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT1_SHIFT 10 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT1_MASK 0x0000fc00 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_INVERSE2 (1 << 16) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ALPHA2 (1 << 17) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT2_SHIFT 18 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT2_MASK 0x00fc0000 
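
Per-texture-unit methods in this class are expressed as parameterised address macros with a companion __SIZE constant giving the number of valid indices. A small sketch, using only the macros listed above, of how the per-unit method offsets fall out; the loop and printout are illustrative only.

#include <stdio.h>

/* Macros copied from the listing above. */
#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT(x) (0x00000310+((x)*4))
#define NV04_DX6_MULTITEX_TRIANGLE_FORMAT__SIZE 0x00000002
#define NV04_DX6_MULTITEX_TRIANGLE_FILTER(x) (0x00000318+((x)*4))

int main(void)
{
	/* __SIZE gives the number of valid indices (two texture units);
	 * the parameterised macro yields the method offset for each unit. */
	for (int i = 0; i < NV04_DX6_MULTITEX_TRIANGLE_FORMAT__SIZE; i++)
		printf("unit %d: FORMAT at 0x%04x, FILTER at 0x%04x\n", i,
		       (unsigned)NV04_DX6_MULTITEX_TRIANGLE_FORMAT(i),
		       (unsigned)NV04_DX6_MULTITEX_TRIANGLE_FILTER(i));
	return 0;
}
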
+#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_INVERSE3 (1 << 24) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ALPHA3 (1 << 25) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT3_SHIFT 26 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_ARGUMENT3_MASK 0x1c000000 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_OPERATION_SHIFT 29 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_0_COLOR_OPERATION_MASK 0xe0000000 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA 0x0000032c +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_INVERSE0 (1 << 0) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ALPHA0 (1 << 1) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT0_SHIFT 2 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT0_MASK 0x000000fc +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_INVERSE1 (1 << 8) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ALPHA1 (1 << 9) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT1_SHIFT 10 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT1_MASK 0x0000fc00 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_INVERSE2 (1 << 16) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ALPHA2 (1 << 17) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT2_SHIFT 18 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT2_MASK 0x00fc0000 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_INVERSE3 (1 << 24) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ALPHA3 (1 << 25) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT3_SHIFT 26 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_ARGUMENT3_MASK 0x1c000000 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_OPERATION_SHIFT 29 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_ALPHA_OPERATION_MASK 0xe0000000 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR 0x00000330 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_INVERSE0 (1 << 0) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ALPHA0 (1 << 1) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT0_SHIFT 2 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT0_MASK 0x000000fc +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_INVERSE1 (1 << 8) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ALPHA1 (1 << 9) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT1_SHIFT 10 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT1_MASK 0x0000fc00 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_INVERSE2 (1 << 16) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ALPHA2 (1 << 17) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT2_SHIFT 18 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT2_MASK 0x00fc0000 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_INVERSE3 (1 << 24) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ALPHA3 (1 << 25) +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT3_SHIFT 26 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_ARGUMENT3_MASK 0x1c000000 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_OPERATION_SHIFT 29 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_1_COLOR_OPERATION_MASK 0xe0000000 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR 0x00000334 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_B_SHIFT 0 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_B_MASK 0x000000ff +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_G_SHIFT 8 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_G_MASK 0x0000ff00 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_R_SHIFT 16 
+#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_R_MASK 0x00ff0000 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_A_SHIFT 24 +#define NV04_DX6_MULTITEX_TRIANGLE_COMBINE_FACTOR_A_MASK 0xff000000 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND 0x00000338 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_MASK_BIT_SHIFT 4 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_MASK_BIT_MASK 0x00000030 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_SHADE_MODE_SHIFT 6 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_SHADE_MODE_MASK 0x000000c0 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_TEXTURE_PERSPECTIVE_ENABLE_SHIFT 8 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_TEXTURE_PERSPECTIVE_ENABLE_MASK 0x00000f00 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_SPECULAR_ENABLE_SHIFT 12 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_SPECULAR_ENABLE_MASK 0x0000f000 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_FOG_ENABLE_SHIFT 16 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_FOG_ENABLE_MASK 0x000f0000 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_ALPHA_ENABLE_SHIFT 20 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_ALPHA_ENABLE_MASK 0x00f00000 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_SRC_SHIFT 24 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_SRC_MASK 0x0f000000 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_DST_SHIFT 28 +#define NV04_DX6_MULTITEX_TRIANGLE_BLEND_DST_MASK 0xf0000000 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0 0x0000033c +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_ALPHA_REF_SHIFT 0 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_ALPHA_REF_MASK 0x000000ff +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_ALPHA_FUNC_SHIFT 8 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_ALPHA_FUNC_MASK 0x00000f00 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_ALPHA_TEST_ENABLE (1 << 12) +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_ORIGIN (1 << 13) +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_ENABLE_SHIFT 14 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_ENABLE_MASK 0x0000c000 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_FUNC_SHIFT 16 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_FUNC_MASK 0x000f0000 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_CULL_MODE_SHIFT 20 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_CULL_MODE_MASK 0x00300000 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_DITHER_ENABLE (1 << 22) +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_PERSPECTIVE_ENABLE (1 << 23) +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_WRITE_ENABLE (1 << 24) +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_STENCIL_WRITE_ENABLE (1 << 25) +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_ALPHA_WRITE_ENABLE (1 << 26) +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_RED_WRITE_ENABLE (1 << 27) +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_GREEN_WRITE_ENABLE (1 << 28) +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_BLUE_WRITE_ENABLE (1 << 29) +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_FORMAT_SHIFT 30 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL0_Z_FORMAT_MASK 0xc0000000 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL1 0x00000340 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_TEST_ENABLE_SHIFT 0 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_TEST_ENABLE_MASK 0x0000000f +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_FUNC_SHIFT 4 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_FUNC_MASK 0x000000f0 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_REF_SHIFT 8 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_REF_MASK 0x0000ff00 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_MASK_READ_SHIFT 16 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_MASK_READ_MASK 0x00ff0000 +#define 
NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_MASK_WRITE_SHIFT 24 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL1_STENCIL_MASK_WRITE_MASK 0xff000000 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL2 0x00000344 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_FAIL_SHIFT 0 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_FAIL_MASK 0x0000000f +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_ZFAIL_SHIFT 4 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_ZFAIL_MASK 0x000000f0 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_ZPASS_SHIFT 8 +#define NV04_DX6_MULTITEX_TRIANGLE_CONTROL2_STENCIL_OP_ZPASS_MASK 0x00000f00 +#define NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR 0x00000348 +#define NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_B_SHIFT 0 +#define NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_B_MASK 0x000000ff +#define NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_G_SHIFT 8 +#define NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_G_MASK 0x0000ff00 +#define NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_R_SHIFT 16 +#define NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_R_MASK 0x00ff0000 +#define NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_A_SHIFT 24 +#define NV04_DX6_MULTITEX_TRIANGLE_FOGCOLOR_A_MASK 0xff000000 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SX(x) (0x00000400+((x)*40)) +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SX__SIZE 0x00000008 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SY(x) (0x00000404+((x)*40)) +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SY__SIZE 0x00000008 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SZ(x) (0x00000408+((x)*40)) +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SZ__SIZE 0x00000008 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_RHW(x) (0x0000040c+((x)*40)) +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_RHW__SIZE 0x00000008 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR(x) (0x00000410+((x)*40)) +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR__SIZE 0x00000008 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_B_SHIFT 0 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_B_MASK 0x000000ff +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_G_SHIFT 8 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_G_MASK 0x0000ff00 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_R_SHIFT 16 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_R_MASK 0x00ff0000 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_A_SHIFT 24 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_COLOR_A_MASK 0xff000000 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR(x) (0x00000414+((x)*40)) +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR__SIZE 0x00000008 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_B_SHIFT 0 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_B_MASK 0x000000ff +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_G_SHIFT 8 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_G_MASK 0x0000ff00 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_R_SHIFT 16 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_R_MASK 0x00ff0000 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_FOG_SHIFT 24 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_SPECULAR_FOG_MASK 0xff000000 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TU0(x) (0x00000418+((x)*40)) +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TU0__SIZE 0x00000008 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TV0(x) (0x0000041c+((x)*40)) +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TV0__SIZE 0x00000008 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TU1(x) (0x00000420+((x)*40)) +#define 
NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TU1__SIZE 0x00000008 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TV1(x) (0x00000424+((x)*40)) +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_TV1__SIZE 0x00000008 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE(x) (0x00000540+((x)*4)) +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE__SIZE 0x00000030 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I0_SHIFT 0 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I0_MASK 0x0000000f +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I1_SHIFT 4 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I1_MASK 0x000000f0 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I2_SHIFT 8 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I2_MASK 0x00000f00 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I3_SHIFT 12 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I3_MASK 0x0000f000 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I4_SHIFT 16 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I4_MASK 0x000f0000 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I5_SHIFT 20 +#define NV04_DX6_MULTITEX_TRIANGLE_TLMTVERTEX_DRAWPRIMITIVE_I5_MASK 0x00f00000 + + +#define NV10_DX5_TEXTURED_TRIANGLE 0x00000094 + + + +#define NV10TCL 0x00000056 + +#define NV10TCL_NOP 0x00000100 +#define NV10TCL_NOTIFY 0x00000104 +#define NV10TCL_DMA_NOTIFY 0x00000180 +#define NV10TCL_DMA_IN_MEMORY0 0x00000184 +#define NV10TCL_DMA_IN_MEMORY1 0x00000188 +#define NV10TCL_DMA_VTXBUF0 0x0000018c +#define NV10TCL_DMA_IN_MEMORY2 0x00000194 +#define NV10TCL_DMA_IN_MEMORY3 0x00000198 +#define NV10TCL_RT_HORIZ 0x00000200 +#define NV10TCL_RT_HORIZ_X_SHIFT 0 +#define NV10TCL_RT_HORIZ_X_MASK 0x0000ffff +#define NV10TCL_RT_HORIZ_W_SHIFT 16 +#define NV10TCL_RT_HORIZ_W_MASK 0xffff0000 +#define NV10TCL_RT_VERT 0x00000204 +#define NV10TCL_RT_VERT_Y_SHIFT 0 +#define NV10TCL_RT_VERT_Y_MASK 0x0000ffff +#define NV10TCL_RT_VERT_H_SHIFT 16 +#define NV10TCL_RT_VERT_H_MASK 0xffff0000 +#define NV10TCL_RT_FORMAT 0x00000208 +#define NV10TCL_RT_FORMAT_TYPE_SHIFT 8 +#define NV10TCL_RT_FORMAT_TYPE_MASK 0x00000f00 +#define NV10TCL_RT_FORMAT_TYPE_LINEAR 0x00000100 +#define NV10TCL_RT_FORMAT_TYPE_SWIZZLED 0x00000200 +#define NV10TCL_RT_FORMAT_COLOR_SHIFT 0 +#define NV10TCL_RT_FORMAT_COLOR_MASK 0x0000001f +#define NV10TCL_RT_FORMAT_COLOR_R5G6B5 0x00000003 +#define NV10TCL_RT_FORMAT_COLOR_X8R8G8B8 0x00000005 +#define NV10TCL_RT_FORMAT_COLOR_A8R8G8B8 0x00000008 +#define NV10TCL_RT_FORMAT_COLOR_B8 0x00000009 +#define NV10TCL_RT_FORMAT_COLOR_UNKNOWN 0x0000000d +#define NV10TCL_RT_FORMAT_COLOR_X8B8G8R8 0x0000000f +#define NV10TCL_RT_FORMAT_COLOR_A8B8G8R8 0x00000010 +#define NV10TCL_RT_PITCH 0x0000020c +#define NV10TCL_RT_PITCH_COLOR_PITCH_SHIFT 0 +#define NV10TCL_RT_PITCH_COLOR_PITCH_MASK 0x0000ffff +#define NV10TCL_RT_PITCH_ZETA_PITCH_SHIFT 16 +#define NV10TCL_RT_PITCH_ZETA_PITCH_MASK 0xffff0000 +#define NV10TCL_COLOR_OFFSET 0x00000210 +#define NV10TCL_ZETA_OFFSET 0x00000214 +#define NV10TCL_TX_OFFSET(x) (0x00000218+((x)*4)) +#define NV10TCL_TX_OFFSET__SIZE 0x00000002 +#define NV10TCL_TX_FORMAT(x) (0x00000220+((x)*4)) +#define NV10TCL_TX_FORMAT__SIZE 0x00000002 +#define NV10TCL_TX_FORMAT_DMA0 (1 << 0) +#define NV10TCL_TX_FORMAT_DMA1 (1 << 1) +#define NV10TCL_TX_FORMAT_CUBE_MAP (1 << 2) +#define NV10TCL_TX_FORMAT_FORMAT_SHIFT 7 +#define NV10TCL_TX_FORMAT_FORMAT_MASK 0x00000780 +#define NV10TCL_TX_FORMAT_FORMAT_L8 0x00000000 +#define 
NV10TCL_TX_FORMAT_FORMAT_A8 0x00000080 +#define NV10TCL_TX_FORMAT_FORMAT_A1R5G5B5 0x00000100 +#define NV10TCL_TX_FORMAT_FORMAT_A8_RECT 0x00000180 +#define NV10TCL_TX_FORMAT_FORMAT_A4R4G4B4 0x00000200 +#define NV10TCL_TX_FORMAT_FORMAT_R5G6B5 0x00000280 +#define NV10TCL_TX_FORMAT_FORMAT_A8R8G8B8 0x00000300 +#define NV10TCL_TX_FORMAT_FORMAT_X8R8G8B8 0x00000380 +#define NV10TCL_TX_FORMAT_FORMAT_INDEX8 0x00000580 +#define NV10TCL_TX_FORMAT_FORMAT_DXT1 0x00000600 +#define NV10TCL_TX_FORMAT_FORMAT_DXT3 0x00000700 +#define NV10TCL_TX_FORMAT_FORMAT_DXT5 0x00000780 +#define NV10TCL_TX_FORMAT_FORMAT_A1R5G5B5_RECT 0x00000800 +#define NV10TCL_TX_FORMAT_FORMAT_R5G6B5_RECT 0x00000880 +#define NV10TCL_TX_FORMAT_FORMAT_A8R8G8B8_RECT 0x00000900 +#define NV10TCL_TX_FORMAT_FORMAT_L8_RECT 0x00000980 +#define NV10TCL_TX_FORMAT_FORMAT_A8L8 0x00000d00 +#define NV10TCL_TX_FORMAT_FORMAT_A8_RECT2 0x00000d80 +#define NV10TCL_TX_FORMAT_FORMAT_A4R4G4B4_RECT 0x00000e80 +#define NV10TCL_TX_FORMAT_FORMAT_R8G8B8_RECT 0x00000f00 +#define NV10TCL_TX_FORMAT_FORMAT_L8A8_RECT 0x00001000 +#define NV10TCL_TX_FORMAT_FORMAT_DSDT 0x00001400 +#define NV10TCL_TX_FORMAT_FORMAT_A16 0x00001900 +#define NV10TCL_TX_FORMAT_FORMAT_HILO16 0x00001980 +#define NV10TCL_TX_FORMAT_FORMAT_A16_RECT 0x00001a80 +#define NV10TCL_TX_FORMAT_FORMAT_HILO16_RECT 0x00001b00 +#define NV10TCL_TX_FORMAT_FORMAT_HILO8 0x00002200 +#define NV10TCL_TX_FORMAT_FORMAT_SIGNED_HILO8 0x00002280 +#define NV10TCL_TX_FORMAT_FORMAT_HILO8_RECT 0x00002300 +#define NV10TCL_TX_FORMAT_FORMAT_SIGNED_HILO8_RECT 0x00002380 +#define NV10TCL_TX_FORMAT_FORMAT_FLOAT_RGBA16_NV 0x00002500 +#define NV10TCL_TX_FORMAT_FORMAT_FLOAT_RGBA32_NV 0x00002580 +#define NV10TCL_TX_FORMAT_FORMAT_FLOAT_R32_NV 0x00002600 +#define NV10TCL_TX_FORMAT_NPOT (1 << 11) +#define NV10TCL_TX_FORMAT_MIPMAP (1 << 15) +#define NV10TCL_TX_FORMAT_BASE_SIZE_U_SHIFT 16 +#define NV10TCL_TX_FORMAT_BASE_SIZE_U_MASK 0x000f0000 +#define NV10TCL_TX_FORMAT_BASE_SIZE_V_SHIFT 20 +#define NV10TCL_TX_FORMAT_BASE_SIZE_V_MASK 0x00f00000 +#define NV10TCL_TX_FORMAT_WRAP_S_SHIFT 24 +#define NV10TCL_TX_FORMAT_WRAP_S_MASK 0x0f000000 +#define NV10TCL_TX_FORMAT_WRAP_S_REPEAT 0x01000000 +#define NV10TCL_TX_FORMAT_WRAP_S_MIRRORED_REPEAT 0x02000000 +#define NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_EDGE 0x03000000 +#define NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_BORDER 0x04000000 +#define NV10TCL_TX_FORMAT_WRAP_S_CLAMP 0x05000000 +#define NV10TCL_TX_FORMAT_WRAP_T_SHIFT 28 +#define NV10TCL_TX_FORMAT_WRAP_T_MASK 0xf0000000 +#define NV10TCL_TX_FORMAT_WRAP_T_REPEAT 0x10000000 +#define NV10TCL_TX_FORMAT_WRAP_T_MIRRORED_REPEAT 0x20000000 +#define NV10TCL_TX_FORMAT_WRAP_T_CLAMP_TO_EDGE 0x30000000 +#define NV10TCL_TX_FORMAT_WRAP_T_CLAMP_TO_BORDER 0x40000000 +#define NV10TCL_TX_FORMAT_WRAP_T_CLAMP 0x50000000 +#define NV10TCL_TX_ENABLE(x) (0x00000228+((x)*4)) +#define NV10TCL_TX_ENABLE__SIZE 0x00000002 +#define NV10TCL_TX_ENABLE_ANISOTROPY_SHIFT 4 +#define NV10TCL_TX_ENABLE_ANISOTROPY_MASK 0x00000030 +#define NV10TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT 14 +#define NV10TCL_TX_ENABLE_MIPMAP_MAX_LOD_MASK 0x0003c000 +#define NV10TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT 26 +#define NV10TCL_TX_ENABLE_MIPMAP_MIN_LOD_MASK 0x3c000000 +#define NV10TCL_TX_ENABLE_ENABLE (1 << 30) +#define NV10TCL_TX_NPOT_PITCH(x) (0x00000230+((x)*4)) +#define NV10TCL_TX_NPOT_PITCH__SIZE 0x00000002 +#define NV10TCL_TX_NPOT_PITCH_PITCH_SHIFT 16 +#define NV10TCL_TX_NPOT_PITCH_PITCH_MASK 0xffff0000 +#define NV10TCL_TX_NPOT_SIZE(x) (0x00000240+((x)*4)) +#define NV10TCL_TX_NPOT_SIZE__SIZE 0x00000002 +#define 
NV10TCL_TX_NPOT_SIZE_H_SHIFT 0 +#define NV10TCL_TX_NPOT_SIZE_H_MASK 0x0000ffff +#define NV10TCL_TX_NPOT_SIZE_W_SHIFT 16 +#define NV10TCL_TX_NPOT_SIZE_W_MASK 0xffff0000 +#define NV10TCL_TX_FILTER(x) (0x00000248+((x)*4)) +#define NV10TCL_TX_FILTER__SIZE 0x00000002 +#define NV10TCL_TX_FILTER_LOD_BIAS_SHIFT 8 +#define NV10TCL_TX_FILTER_LOD_BIAS_MASK 0x00000f00 +#define NV10TCL_TX_FILTER_MINIFY_SHIFT 24 +#define NV10TCL_TX_FILTER_MINIFY_MASK 0x0f000000 +#define NV10TCL_TX_FILTER_MINIFY_NEAREST 0x01000000 +#define NV10TCL_TX_FILTER_MINIFY_LINEAR 0x02000000 +#define NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST 0x03000000 +#define NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST 0x04000000 +#define NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR 0x05000000 +#define NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR 0x06000000 +#define NV10TCL_TX_FILTER_MAGNIFY_SHIFT 28 +#define NV10TCL_TX_FILTER_MAGNIFY_MASK 0xf0000000 +#define NV10TCL_TX_FILTER_MAGNIFY_NEAREST 0x10000000 +#define NV10TCL_TX_FILTER_MAGNIFY_LINEAR 0x20000000 +#define NV10TCL_TX_PALETTE_OFFSET(x) (0x00000250+((x)*4)) +#define NV10TCL_TX_PALETTE_OFFSET__SIZE 0x00000002 +#define NV10TCL_RC_IN_ALPHA(x) (0x00000260+((x)*4)) +#define NV10TCL_RC_IN_ALPHA__SIZE 0x00000002 +#define NV10TCL_RC_IN_ALPHA_D_INPUT_SHIFT 0 +#define NV10TCL_RC_IN_ALPHA_D_INPUT_MASK 0x0000000f +#define NV10TCL_RC_IN_ALPHA_D_INPUT_ZERO 0x00000000 +#define NV10TCL_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR0_NV 0x00000001 +#define NV10TCL_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR1_NV 0x00000002 +#define NV10TCL_RC_IN_ALPHA_D_INPUT_FOG 0x00000003 +#define NV10TCL_RC_IN_ALPHA_D_INPUT_PRIMARY_COLOR_NV 0x00000004 +#define NV10TCL_RC_IN_ALPHA_D_INPUT_SECONDARY_COLOR_NV 0x00000005 +#define NV10TCL_RC_IN_ALPHA_D_INPUT_TEXTURE0_ARB 0x00000008 +#define NV10TCL_RC_IN_ALPHA_D_INPUT_TEXTURE1_ARB 0x00000009 +#define NV10TCL_RC_IN_ALPHA_D_INPUT_SPARE0_NV 0x0000000c +#define NV10TCL_RC_IN_ALPHA_D_INPUT_SPARE1_NV 0x0000000d +#define NV10TCL_RC_IN_ALPHA_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0000000e +#define NV10TCL_RC_IN_ALPHA_D_INPUT_E_TIMES_F_NV 0x0000000f +#define NV10TCL_RC_IN_ALPHA_D_COMPONENT_USAGE (1 << 4) +#define NV10TCL_RC_IN_ALPHA_D_COMPONENT_USAGE_BLUE 0x00000000 +#define NV10TCL_RC_IN_ALPHA_D_COMPONENT_USAGE_ALPHA 0x00000010 +#define NV10TCL_RC_IN_ALPHA_D_MAPPING_SHIFT 5 +#define NV10TCL_RC_IN_ALPHA_D_MAPPING_MASK 0x000000e0 +#define NV10TCL_RC_IN_ALPHA_D_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV10TCL_RC_IN_ALPHA_D_MAPPING_UNSIGNED_INVERT_NV 0x00000020 +#define NV10TCL_RC_IN_ALPHA_D_MAPPING_EXPAND_NORMAL_NV 0x00000040 +#define NV10TCL_RC_IN_ALPHA_D_MAPPING_EXPAND_NEGATE_NV 0x00000060 +#define NV10TCL_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NORMAL_NV 0x00000080 +#define NV10TCL_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NEGATE_NV 0x000000a0 +#define NV10TCL_RC_IN_ALPHA_D_MAPPING_SIGNED_IDENTITY_NV 0x000000c0 +#define NV10TCL_RC_IN_ALPHA_D_MAPPING_SIGNED_NEGATE_NV 0x000000e0 +#define NV10TCL_RC_IN_ALPHA_C_INPUT_SHIFT 8 +#define NV10TCL_RC_IN_ALPHA_C_INPUT_MASK 0x00000f00 +#define NV10TCL_RC_IN_ALPHA_C_INPUT_ZERO 0x00000000 +#define NV10TCL_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV10TCL_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV10TCL_RC_IN_ALPHA_C_INPUT_FOG 0x00000300 +#define NV10TCL_RC_IN_ALPHA_C_INPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV10TCL_RC_IN_ALPHA_C_INPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV10TCL_RC_IN_ALPHA_C_INPUT_TEXTURE0_ARB 0x00000800 +#define NV10TCL_RC_IN_ALPHA_C_INPUT_TEXTURE1_ARB 0x00000900 +#define 
NV10TCL_RC_IN_ALPHA_C_INPUT_SPARE0_NV 0x00000c00 +#define NV10TCL_RC_IN_ALPHA_C_INPUT_SPARE1_NV 0x00000d00 +#define NV10TCL_RC_IN_ALPHA_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV10TCL_RC_IN_ALPHA_C_INPUT_E_TIMES_F_NV 0x00000f00 +#define NV10TCL_RC_IN_ALPHA_C_COMPONENT_USAGE (1 << 12) +#define NV10TCL_RC_IN_ALPHA_C_COMPONENT_USAGE_BLUE 0x00000000 +#define NV10TCL_RC_IN_ALPHA_C_COMPONENT_USAGE_ALPHA 0x00001000 +#define NV10TCL_RC_IN_ALPHA_C_MAPPING_SHIFT 13 +#define NV10TCL_RC_IN_ALPHA_C_MAPPING_MASK 0x0000e000 +#define NV10TCL_RC_IN_ALPHA_C_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV10TCL_RC_IN_ALPHA_C_MAPPING_UNSIGNED_INVERT_NV 0x00002000 +#define NV10TCL_RC_IN_ALPHA_C_MAPPING_EXPAND_NORMAL_NV 0x00004000 +#define NV10TCL_RC_IN_ALPHA_C_MAPPING_EXPAND_NEGATE_NV 0x00006000 +#define NV10TCL_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NORMAL_NV 0x00008000 +#define NV10TCL_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NEGATE_NV 0x0000a000 +#define NV10TCL_RC_IN_ALPHA_C_MAPPING_SIGNED_IDENTITY_NV 0x0000c000 +#define NV10TCL_RC_IN_ALPHA_C_MAPPING_SIGNED_NEGATE_NV 0x0000e000 +#define NV10TCL_RC_IN_ALPHA_B_INPUT_SHIFT 16 +#define NV10TCL_RC_IN_ALPHA_B_INPUT_MASK 0x000f0000 +#define NV10TCL_RC_IN_ALPHA_B_INPUT_ZERO 0x00000000 +#define NV10TCL_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR0_NV 0x00010000 +#define NV10TCL_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR1_NV 0x00020000 +#define NV10TCL_RC_IN_ALPHA_B_INPUT_FOG 0x00030000 +#define NV10TCL_RC_IN_ALPHA_B_INPUT_PRIMARY_COLOR_NV 0x00040000 +#define NV10TCL_RC_IN_ALPHA_B_INPUT_SECONDARY_COLOR_NV 0x00050000 +#define NV10TCL_RC_IN_ALPHA_B_INPUT_TEXTURE0_ARB 0x00080000 +#define NV10TCL_RC_IN_ALPHA_B_INPUT_TEXTURE1_ARB 0x00090000 +#define NV10TCL_RC_IN_ALPHA_B_INPUT_SPARE0_NV 0x000c0000 +#define NV10TCL_RC_IN_ALPHA_B_INPUT_SPARE1_NV 0x000d0000 +#define NV10TCL_RC_IN_ALPHA_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000e0000 +#define NV10TCL_RC_IN_ALPHA_B_INPUT_E_TIMES_F_NV 0x000f0000 +#define NV10TCL_RC_IN_ALPHA_B_COMPONENT_USAGE (1 << 20) +#define NV10TCL_RC_IN_ALPHA_B_COMPONENT_USAGE_BLUE 0x00000000 +#define NV10TCL_RC_IN_ALPHA_B_COMPONENT_USAGE_ALPHA 0x00100000 +#define NV10TCL_RC_IN_ALPHA_B_MAPPING_SHIFT 21 +#define NV10TCL_RC_IN_ALPHA_B_MAPPING_MASK 0x00e00000 +#define NV10TCL_RC_IN_ALPHA_B_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV10TCL_RC_IN_ALPHA_B_MAPPING_UNSIGNED_INVERT_NV 0x00200000 +#define NV10TCL_RC_IN_ALPHA_B_MAPPING_EXPAND_NORMAL_NV 0x00400000 +#define NV10TCL_RC_IN_ALPHA_B_MAPPING_EXPAND_NEGATE_NV 0x00600000 +#define NV10TCL_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NORMAL_NV 0x00800000 +#define NV10TCL_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NEGATE_NV 0x00a00000 +#define NV10TCL_RC_IN_ALPHA_B_MAPPING_SIGNED_IDENTITY_NV 0x00c00000 +#define NV10TCL_RC_IN_ALPHA_B_MAPPING_SIGNED_NEGATE_NV 0x00e00000 +#define NV10TCL_RC_IN_ALPHA_A_INPUT_SHIFT 24 +#define NV10TCL_RC_IN_ALPHA_A_INPUT_MASK 0x0f000000 +#define NV10TCL_RC_IN_ALPHA_A_INPUT_ZERO 0x00000000 +#define NV10TCL_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR0_NV 0x01000000 +#define NV10TCL_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR1_NV 0x02000000 +#define NV10TCL_RC_IN_ALPHA_A_INPUT_FOG 0x03000000 +#define NV10TCL_RC_IN_ALPHA_A_INPUT_PRIMARY_COLOR_NV 0x04000000 +#define NV10TCL_RC_IN_ALPHA_A_INPUT_SECONDARY_COLOR_NV 0x05000000 +#define NV10TCL_RC_IN_ALPHA_A_INPUT_TEXTURE0_ARB 0x08000000 +#define NV10TCL_RC_IN_ALPHA_A_INPUT_TEXTURE1_ARB 0x09000000 +#define NV10TCL_RC_IN_ALPHA_A_INPUT_SPARE0_NV 0x0c000000 +#define NV10TCL_RC_IN_ALPHA_A_INPUT_SPARE1_NV 0x0d000000 +#define NV10TCL_RC_IN_ALPHA_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 
0x0e000000 +#define NV10TCL_RC_IN_ALPHA_A_INPUT_E_TIMES_F_NV 0x0f000000 +#define NV10TCL_RC_IN_ALPHA_A_COMPONENT_USAGE (1 << 28) +#define NV10TCL_RC_IN_ALPHA_A_COMPONENT_USAGE_BLUE 0x00000000 +#define NV10TCL_RC_IN_ALPHA_A_COMPONENT_USAGE_ALPHA 0x10000000 +#define NV10TCL_RC_IN_ALPHA_A_MAPPING_SHIFT 29 +#define NV10TCL_RC_IN_ALPHA_A_MAPPING_MASK 0xe0000000 +#define NV10TCL_RC_IN_ALPHA_A_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV10TCL_RC_IN_ALPHA_A_MAPPING_UNSIGNED_INVERT_NV 0x20000000 +#define NV10TCL_RC_IN_ALPHA_A_MAPPING_EXPAND_NORMAL_NV 0x40000000 +#define NV10TCL_RC_IN_ALPHA_A_MAPPING_EXPAND_NEGATE_NV 0x60000000 +#define NV10TCL_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NORMAL_NV 0x80000000 +#define NV10TCL_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NEGATE_NV 0xa0000000 +#define NV10TCL_RC_IN_ALPHA_A_MAPPING_SIGNED_IDENTITY_NV 0xc0000000 +#define NV10TCL_RC_IN_ALPHA_A_MAPPING_SIGNED_NEGATE_NV 0xe0000000 +#define NV10TCL_RC_IN_RGB(x) (0x00000268+((x)*4)) +#define NV10TCL_RC_IN_RGB__SIZE 0x00000002 +#define NV10TCL_RC_IN_RGB_D_INPUT_SHIFT 0 +#define NV10TCL_RC_IN_RGB_D_INPUT_MASK 0x0000000f +#define NV10TCL_RC_IN_RGB_D_INPUT_ZERO 0x00000000 +#define NV10TCL_RC_IN_RGB_D_INPUT_CONSTANT_COLOR0_NV 0x00000001 +#define NV10TCL_RC_IN_RGB_D_INPUT_CONSTANT_COLOR1_NV 0x00000002 +#define NV10TCL_RC_IN_RGB_D_INPUT_FOG 0x00000003 +#define NV10TCL_RC_IN_RGB_D_INPUT_PRIMARY_COLOR_NV 0x00000004 +#define NV10TCL_RC_IN_RGB_D_INPUT_SECONDARY_COLOR_NV 0x00000005 +#define NV10TCL_RC_IN_RGB_D_INPUT_TEXTURE0_ARB 0x00000008 +#define NV10TCL_RC_IN_RGB_D_INPUT_TEXTURE1_ARB 0x00000009 +#define NV10TCL_RC_IN_RGB_D_INPUT_SPARE0_NV 0x0000000c +#define NV10TCL_RC_IN_RGB_D_INPUT_SPARE1_NV 0x0000000d +#define NV10TCL_RC_IN_RGB_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0000000e +#define NV10TCL_RC_IN_RGB_D_INPUT_E_TIMES_F_NV 0x0000000f +#define NV10TCL_RC_IN_RGB_D_COMPONENT_USAGE (1 << 4) +#define NV10TCL_RC_IN_RGB_D_COMPONENT_USAGE_RGB 0x00000000 +#define NV10TCL_RC_IN_RGB_D_COMPONENT_USAGE_ALPHA 0x00000010 +#define NV10TCL_RC_IN_RGB_D_MAPPING_SHIFT 5 +#define NV10TCL_RC_IN_RGB_D_MAPPING_MASK 0x000000e0 +#define NV10TCL_RC_IN_RGB_D_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV10TCL_RC_IN_RGB_D_MAPPING_UNSIGNED_INVERT_NV 0x00000020 +#define NV10TCL_RC_IN_RGB_D_MAPPING_EXPAND_NORMAL_NV 0x00000040 +#define NV10TCL_RC_IN_RGB_D_MAPPING_EXPAND_NEGATE_NV 0x00000060 +#define NV10TCL_RC_IN_RGB_D_MAPPING_HALF_BIAS_NORMAL_NV 0x00000080 +#define NV10TCL_RC_IN_RGB_D_MAPPING_HALF_BIAS_NEGATE_NV 0x000000a0 +#define NV10TCL_RC_IN_RGB_D_MAPPING_SIGNED_IDENTITY_NV 0x000000c0 +#define NV10TCL_RC_IN_RGB_D_MAPPING_SIGNED_NEGATE_NV 0x000000e0 +#define NV10TCL_RC_IN_RGB_C_INPUT_SHIFT 8 +#define NV10TCL_RC_IN_RGB_C_INPUT_MASK 0x00000f00 +#define NV10TCL_RC_IN_RGB_C_INPUT_ZERO 0x00000000 +#define NV10TCL_RC_IN_RGB_C_INPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV10TCL_RC_IN_RGB_C_INPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV10TCL_RC_IN_RGB_C_INPUT_FOG 0x00000300 +#define NV10TCL_RC_IN_RGB_C_INPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV10TCL_RC_IN_RGB_C_INPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV10TCL_RC_IN_RGB_C_INPUT_TEXTURE0_ARB 0x00000800 +#define NV10TCL_RC_IN_RGB_C_INPUT_TEXTURE1_ARB 0x00000900 +#define NV10TCL_RC_IN_RGB_C_INPUT_SPARE0_NV 0x00000c00 +#define NV10TCL_RC_IN_RGB_C_INPUT_SPARE1_NV 0x00000d00 +#define NV10TCL_RC_IN_RGB_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV10TCL_RC_IN_RGB_C_INPUT_E_TIMES_F_NV 0x00000f00 +#define NV10TCL_RC_IN_RGB_C_COMPONENT_USAGE (1 << 12) +#define NV10TCL_RC_IN_RGB_C_COMPONENT_USAGE_RGB 
0x00000000 +#define NV10TCL_RC_IN_RGB_C_COMPONENT_USAGE_ALPHA 0x00001000 +#define NV10TCL_RC_IN_RGB_C_MAPPING_SHIFT 13 +#define NV10TCL_RC_IN_RGB_C_MAPPING_MASK 0x0000e000 +#define NV10TCL_RC_IN_RGB_C_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV10TCL_RC_IN_RGB_C_MAPPING_UNSIGNED_INVERT_NV 0x00002000 +#define NV10TCL_RC_IN_RGB_C_MAPPING_EXPAND_NORMAL_NV 0x00004000 +#define NV10TCL_RC_IN_RGB_C_MAPPING_EXPAND_NEGATE_NV 0x00006000 +#define NV10TCL_RC_IN_RGB_C_MAPPING_HALF_BIAS_NORMAL_NV 0x00008000 +#define NV10TCL_RC_IN_RGB_C_MAPPING_HALF_BIAS_NEGATE_NV 0x0000a000 +#define NV10TCL_RC_IN_RGB_C_MAPPING_SIGNED_IDENTITY_NV 0x0000c000 +#define NV10TCL_RC_IN_RGB_C_MAPPING_SIGNED_NEGATE_NV 0x0000e000 +#define NV10TCL_RC_IN_RGB_B_INPUT_SHIFT 16 +#define NV10TCL_RC_IN_RGB_B_INPUT_MASK 0x000f0000 +#define NV10TCL_RC_IN_RGB_B_INPUT_ZERO 0x00000000 +#define NV10TCL_RC_IN_RGB_B_INPUT_CONSTANT_COLOR0_NV 0x00010000 +#define NV10TCL_RC_IN_RGB_B_INPUT_CONSTANT_COLOR1_NV 0x00020000 +#define NV10TCL_RC_IN_RGB_B_INPUT_FOG 0x00030000 +#define NV10TCL_RC_IN_RGB_B_INPUT_PRIMARY_COLOR_NV 0x00040000 +#define NV10TCL_RC_IN_RGB_B_INPUT_SECONDARY_COLOR_NV 0x00050000 +#define NV10TCL_RC_IN_RGB_B_INPUT_TEXTURE0_ARB 0x00080000 +#define NV10TCL_RC_IN_RGB_B_INPUT_TEXTURE1_ARB 0x00090000 +#define NV10TCL_RC_IN_RGB_B_INPUT_SPARE0_NV 0x000c0000 +#define NV10TCL_RC_IN_RGB_B_INPUT_SPARE1_NV 0x000d0000 +#define NV10TCL_RC_IN_RGB_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000e0000 +#define NV10TCL_RC_IN_RGB_B_INPUT_E_TIMES_F_NV 0x000f0000 +#define NV10TCL_RC_IN_RGB_B_COMPONENT_USAGE (1 << 20) +#define NV10TCL_RC_IN_RGB_B_COMPONENT_USAGE_RGB 0x00000000 +#define NV10TCL_RC_IN_RGB_B_COMPONENT_USAGE_ALPHA 0x00100000 +#define NV10TCL_RC_IN_RGB_B_MAPPING_SHIFT 21 +#define NV10TCL_RC_IN_RGB_B_MAPPING_MASK 0x00e00000 +#define NV10TCL_RC_IN_RGB_B_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV10TCL_RC_IN_RGB_B_MAPPING_UNSIGNED_INVERT_NV 0x00200000 +#define NV10TCL_RC_IN_RGB_B_MAPPING_EXPAND_NORMAL_NV 0x00400000 +#define NV10TCL_RC_IN_RGB_B_MAPPING_EXPAND_NEGATE_NV 0x00600000 +#define NV10TCL_RC_IN_RGB_B_MAPPING_HALF_BIAS_NORMAL_NV 0x00800000 +#define NV10TCL_RC_IN_RGB_B_MAPPING_HALF_BIAS_NEGATE_NV 0x00a00000 +#define NV10TCL_RC_IN_RGB_B_MAPPING_SIGNED_IDENTITY_NV 0x00c00000 +#define NV10TCL_RC_IN_RGB_B_MAPPING_SIGNED_NEGATE_NV 0x00e00000 +#define NV10TCL_RC_IN_RGB_A_INPUT_SHIFT 24 +#define NV10TCL_RC_IN_RGB_A_INPUT_MASK 0x0f000000 +#define NV10TCL_RC_IN_RGB_A_INPUT_ZERO 0x00000000 +#define NV10TCL_RC_IN_RGB_A_INPUT_CONSTANT_COLOR0_NV 0x01000000 +#define NV10TCL_RC_IN_RGB_A_INPUT_CONSTANT_COLOR1_NV 0x02000000 +#define NV10TCL_RC_IN_RGB_A_INPUT_FOG 0x03000000 +#define NV10TCL_RC_IN_RGB_A_INPUT_PRIMARY_COLOR_NV 0x04000000 +#define NV10TCL_RC_IN_RGB_A_INPUT_SECONDARY_COLOR_NV 0x05000000 +#define NV10TCL_RC_IN_RGB_A_INPUT_TEXTURE0_ARB 0x08000000 +#define NV10TCL_RC_IN_RGB_A_INPUT_TEXTURE1_ARB 0x09000000 +#define NV10TCL_RC_IN_RGB_A_INPUT_SPARE0_NV 0x0c000000 +#define NV10TCL_RC_IN_RGB_A_INPUT_SPARE1_NV 0x0d000000 +#define NV10TCL_RC_IN_RGB_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0e000000 +#define NV10TCL_RC_IN_RGB_A_INPUT_E_TIMES_F_NV 0x0f000000 +#define NV10TCL_RC_IN_RGB_A_COMPONENT_USAGE (1 << 28) +#define NV10TCL_RC_IN_RGB_A_COMPONENT_USAGE_RGB 0x00000000 +#define NV10TCL_RC_IN_RGB_A_COMPONENT_USAGE_ALPHA 0x10000000 +#define NV10TCL_RC_IN_RGB_A_MAPPING_SHIFT 29 +#define NV10TCL_RC_IN_RGB_A_MAPPING_MASK 0xe0000000 +#define NV10TCL_RC_IN_RGB_A_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define 
NV10TCL_RC_IN_RGB_A_MAPPING_UNSIGNED_INVERT_NV 0x20000000 +#define NV10TCL_RC_IN_RGB_A_MAPPING_EXPAND_NORMAL_NV 0x40000000 +#define NV10TCL_RC_IN_RGB_A_MAPPING_EXPAND_NEGATE_NV 0x60000000 +#define NV10TCL_RC_IN_RGB_A_MAPPING_HALF_BIAS_NORMAL_NV 0x80000000 +#define NV10TCL_RC_IN_RGB_A_MAPPING_HALF_BIAS_NEGATE_NV 0xa0000000 +#define NV10TCL_RC_IN_RGB_A_MAPPING_SIGNED_IDENTITY_NV 0xc0000000 +#define NV10TCL_RC_IN_RGB_A_MAPPING_SIGNED_NEGATE_NV 0xe0000000 +#define NV10TCL_RC_COLOR(x) (0x00000270+((x)*4)) +#define NV10TCL_RC_COLOR__SIZE 0x00000002 +#define NV10TCL_RC_COLOR_B_SHIFT 0 +#define NV10TCL_RC_COLOR_B_MASK 0x000000ff +#define NV10TCL_RC_COLOR_G_SHIFT 8 +#define NV10TCL_RC_COLOR_G_MASK 0x0000ff00 +#define NV10TCL_RC_COLOR_R_SHIFT 16 +#define NV10TCL_RC_COLOR_R_MASK 0x00ff0000 +#define NV10TCL_RC_COLOR_A_SHIFT 24 +#define NV10TCL_RC_COLOR_A_MASK 0xff000000 +#define NV10TCL_RC_OUT_ALPHA(x) (0x00000278+((x)*4)) +#define NV10TCL_RC_OUT_ALPHA__SIZE 0x00000002 +#define NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_SHIFT 0 +#define NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_MASK 0x0000000f +#define NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_ZERO 0x00000000 +#define NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR0_NV 0x00000001 +#define NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR1_NV 0x00000002 +#define NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_FOG 0x00000003 +#define NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_PRIMARY_COLOR_NV 0x00000004 +#define NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_SECONDARY_COLOR_NV 0x00000005 +#define NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE0_ARB 0x00000008 +#define NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE1_ARB 0x00000009 +#define NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE0_NV 0x0000000c +#define NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE1_NV 0x0000000d +#define NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0000000e +#define NV10TCL_RC_OUT_ALPHA_CD_OUTPUT_E_TIMES_F_NV 0x0000000f +#define NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_SHIFT 4 +#define NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_MASK 0x000000f0 +#define NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_ZERO 0x00000000 +#define NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR0_NV 0x00000010 +#define NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR1_NV 0x00000020 +#define NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_FOG 0x00000030 +#define NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_PRIMARY_COLOR_NV 0x00000040 +#define NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_SECONDARY_COLOR_NV 0x00000050 +#define NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE0_ARB 0x00000080 +#define NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE1_ARB 0x00000090 +#define NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE0_NV 0x000000c0 +#define NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE1_NV 0x000000d0 +#define NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000000e0 +#define NV10TCL_RC_OUT_ALPHA_AB_OUTPUT_E_TIMES_F_NV 0x000000f0 +#define NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_SHIFT 8 +#define NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_MASK 0x00000f00 +#define NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_ZERO 0x00000000 +#define NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_FOG 0x00000300 +#define NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE0_ARB 0x00000800 +#define NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE1_ARB 0x00000900 +#define NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0_NV 0x00000c00 +#define NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE1_NV 0x00000d00 +#define 
NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV10TCL_RC_OUT_ALPHA_SUM_OUTPUT_E_TIMES_F_NV 0x00000f00 +#define NV10TCL_RC_OUT_ALPHA_CD_DOT_PRODUCT (1 << 12) +#define NV10TCL_RC_OUT_ALPHA_AB_DOT_PRODUCT (1 << 13) +#define NV10TCL_RC_OUT_ALPHA_MUX_SUM (1 << 14) +#define NV10TCL_RC_OUT_ALPHA_BIAS (1 << 15) +#define NV10TCL_RC_OUT_ALPHA_BIAS_NONE 0x00000000 +#define NV10TCL_RC_OUT_ALPHA_BIAS_BIAS_BY_NEGATIVE_ONE_HALF_NV 0x00008000 +#define NV10TCL_RC_OUT_ALPHA_SCALE_SHIFT 17 +#define NV10TCL_RC_OUT_ALPHA_SCALE_MASK 0x00000000 +#define NV10TCL_RC_OUT_ALPHA_SCALE_NONE 0x00000000 +#define NV10TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_TWO_NV 0x00020000 +#define NV10TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_FOUR_NV 0x00040000 +#define NV10TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_ONE_HALF_NV 0x00060000 +#define NV10TCL_RC_OUT_RGB(x) (0x00000280+((x)*4)) +#define NV10TCL_RC_OUT_RGB__SIZE 0x00000002 +#define NV10TCL_RC_OUT_RGB_CD_OUTPUT_SHIFT 0 +#define NV10TCL_RC_OUT_RGB_CD_OUTPUT_MASK 0x0000000f +#define NV10TCL_RC_OUT_RGB_CD_OUTPUT_ZERO 0x00000000 +#define NV10TCL_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR0_NV 0x00000001 +#define NV10TCL_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR1_NV 0x00000002 +#define NV10TCL_RC_OUT_RGB_CD_OUTPUT_FOG 0x00000003 +#define NV10TCL_RC_OUT_RGB_CD_OUTPUT_PRIMARY_COLOR_NV 0x00000004 +#define NV10TCL_RC_OUT_RGB_CD_OUTPUT_SECONDARY_COLOR_NV 0x00000005 +#define NV10TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE0_ARB 0x00000008 +#define NV10TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE1_ARB 0x00000009 +#define NV10TCL_RC_OUT_RGB_CD_OUTPUT_SPARE0_NV 0x0000000c +#define NV10TCL_RC_OUT_RGB_CD_OUTPUT_SPARE1_NV 0x0000000d +#define NV10TCL_RC_OUT_RGB_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0000000e +#define NV10TCL_RC_OUT_RGB_CD_OUTPUT_E_TIMES_F_NV 0x0000000f +#define NV10TCL_RC_OUT_RGB_AB_OUTPUT_SHIFT 4 +#define NV10TCL_RC_OUT_RGB_AB_OUTPUT_MASK 0x000000f0 +#define NV10TCL_RC_OUT_RGB_AB_OUTPUT_ZERO 0x00000000 +#define NV10TCL_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR0_NV 0x00000010 +#define NV10TCL_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR1_NV 0x00000020 +#define NV10TCL_RC_OUT_RGB_AB_OUTPUT_FOG 0x00000030 +#define NV10TCL_RC_OUT_RGB_AB_OUTPUT_PRIMARY_COLOR_NV 0x00000040 +#define NV10TCL_RC_OUT_RGB_AB_OUTPUT_SECONDARY_COLOR_NV 0x00000050 +#define NV10TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE0_ARB 0x00000080 +#define NV10TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE1_ARB 0x00000090 +#define NV10TCL_RC_OUT_RGB_AB_OUTPUT_SPARE0_NV 0x000000c0 +#define NV10TCL_RC_OUT_RGB_AB_OUTPUT_SPARE1_NV 0x000000d0 +#define NV10TCL_RC_OUT_RGB_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000000e0 +#define NV10TCL_RC_OUT_RGB_AB_OUTPUT_E_TIMES_F_NV 0x000000f0 +#define NV10TCL_RC_OUT_RGB_SUM_OUTPUT_SHIFT 8 +#define NV10TCL_RC_OUT_RGB_SUM_OUTPUT_MASK 0x00000f00 +#define NV10TCL_RC_OUT_RGB_SUM_OUTPUT_ZERO 0x00000000 +#define NV10TCL_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV10TCL_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV10TCL_RC_OUT_RGB_SUM_OUTPUT_FOG 0x00000300 +#define NV10TCL_RC_OUT_RGB_SUM_OUTPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV10TCL_RC_OUT_RGB_SUM_OUTPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV10TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE0_ARB 0x00000800 +#define NV10TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE1_ARB 0x00000900 +#define NV10TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE0_NV 0x00000c00 +#define NV10TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE1_NV 0x00000d00 +#define NV10TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV10TCL_RC_OUT_RGB_SUM_OUTPUT_E_TIMES_F_NV 0x00000f00 +#define NV10TCL_RC_OUT_RGB_CD_DOT_PRODUCT (1 << 
12) +#define NV10TCL_RC_OUT_RGB_AB_DOT_PRODUCT (1 << 13) +#define NV10TCL_RC_OUT_RGB_MUX_SUM (1 << 14) +#define NV10TCL_RC_OUT_RGB_BIAS (1 << 15) +#define NV10TCL_RC_OUT_RGB_BIAS_NONE 0x00000000 +#define NV10TCL_RC_OUT_RGB_BIAS_BIAS_BY_NEGATIVE_ONE_HALF_NV 0x00008000 +#define NV10TCL_RC_OUT_RGB_SCALE_SHIFT 17 +#define NV10TCL_RC_OUT_RGB_SCALE_MASK 0x00000000 +#define NV10TCL_RC_OUT_RGB_SCALE_NONE 0x00000000 +#define NV10TCL_RC_OUT_RGB_SCALE_SCALE_BY_TWO_NV 0x00020000 +#define NV10TCL_RC_OUT_RGB_SCALE_SCALE_BY_FOUR_NV 0x00040000 +#define NV10TCL_RC_OUT_RGB_SCALE_SCALE_BY_ONE_HALF_NV 0x00060000 +#define NV10TCL_RC_OUT_RGB_OPERATION_SHIFT 27 +#define NV10TCL_RC_OUT_RGB_OPERATION_MASK 0x38000000 +#define NV10TCL_RC_FINAL0 0x00000288 +#define NV10TCL_RC_FINAL0_D_INPUT_SHIFT 0 +#define NV10TCL_RC_FINAL0_D_INPUT_MASK 0x0000000f +#define NV10TCL_RC_FINAL0_D_INPUT_ZERO 0x00000000 +#define NV10TCL_RC_FINAL0_D_INPUT_CONSTANT_COLOR0_NV 0x00000001 +#define NV10TCL_RC_FINAL0_D_INPUT_CONSTANT_COLOR1_NV 0x00000002 +#define NV10TCL_RC_FINAL0_D_INPUT_FOG 0x00000003 +#define NV10TCL_RC_FINAL0_D_INPUT_PRIMARY_COLOR_NV 0x00000004 +#define NV10TCL_RC_FINAL0_D_INPUT_SECONDARY_COLOR_NV 0x00000005 +#define NV10TCL_RC_FINAL0_D_INPUT_TEXTURE0_ARB 0x00000008 +#define NV10TCL_RC_FINAL0_D_INPUT_TEXTURE1_ARB 0x00000009 +#define NV10TCL_RC_FINAL0_D_INPUT_SPARE0_NV 0x0000000c +#define NV10TCL_RC_FINAL0_D_INPUT_SPARE1_NV 0x0000000d +#define NV10TCL_RC_FINAL0_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0000000e +#define NV10TCL_RC_FINAL0_D_INPUT_E_TIMES_F_NV 0x0000000f +#define NV10TCL_RC_FINAL0_D_COMPONENT_USAGE (1 << 4) +#define NV10TCL_RC_FINAL0_D_COMPONENT_USAGE_RGB 0x00000000 +#define NV10TCL_RC_FINAL0_D_COMPONENT_USAGE_ALPHA 0x00000010 +#define NV10TCL_RC_FINAL0_D_MAPPING_SHIFT 5 +#define NV10TCL_RC_FINAL0_D_MAPPING_MASK 0x000000e0 +#define NV10TCL_RC_FINAL0_D_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV10TCL_RC_FINAL0_D_MAPPING_UNSIGNED_INVERT_NV 0x00000020 +#define NV10TCL_RC_FINAL0_D_MAPPING_EXPAND_NORMAL_NV 0x00000040 +#define NV10TCL_RC_FINAL0_D_MAPPING_EXPAND_NEGATE_NV 0x00000060 +#define NV10TCL_RC_FINAL0_D_MAPPING_HALF_BIAS_NORMAL_NV 0x00000080 +#define NV10TCL_RC_FINAL0_D_MAPPING_HALF_BIAS_NEGATE_NV 0x000000a0 +#define NV10TCL_RC_FINAL0_D_MAPPING_SIGNED_IDENTITY_NV 0x000000c0 +#define NV10TCL_RC_FINAL0_D_MAPPING_SIGNED_NEGATE_NV 0x000000e0 +#define NV10TCL_RC_FINAL0_C_INPUT_SHIFT 8 +#define NV10TCL_RC_FINAL0_C_INPUT_MASK 0x00000f00 +#define NV10TCL_RC_FINAL0_C_INPUT_ZERO 0x00000000 +#define NV10TCL_RC_FINAL0_C_INPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV10TCL_RC_FINAL0_C_INPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV10TCL_RC_FINAL0_C_INPUT_FOG 0x00000300 +#define NV10TCL_RC_FINAL0_C_INPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV10TCL_RC_FINAL0_C_INPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV10TCL_RC_FINAL0_C_INPUT_TEXTURE0_ARB 0x00000800 +#define NV10TCL_RC_FINAL0_C_INPUT_TEXTURE1_ARB 0x00000900 +#define NV10TCL_RC_FINAL0_C_INPUT_SPARE0_NV 0x00000c00 +#define NV10TCL_RC_FINAL0_C_INPUT_SPARE1_NV 0x00000d00 +#define NV10TCL_RC_FINAL0_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV10TCL_RC_FINAL0_C_INPUT_E_TIMES_F_NV 0x00000f00 +#define NV10TCL_RC_FINAL0_C_COMPONENT_USAGE (1 << 12) +#define NV10TCL_RC_FINAL0_C_COMPONENT_USAGE_RGB 0x00000000 +#define NV10TCL_RC_FINAL0_C_COMPONENT_USAGE_ALPHA 0x00001000 +#define NV10TCL_RC_FINAL0_C_MAPPING_SHIFT 13 +#define NV10TCL_RC_FINAL0_C_MAPPING_MASK 0x0000e000 +#define NV10TCL_RC_FINAL0_C_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define 
NV10TCL_RC_FINAL0_C_MAPPING_UNSIGNED_INVERT_NV 0x00002000 +#define NV10TCL_RC_FINAL0_C_MAPPING_EXPAND_NORMAL_NV 0x00004000 +#define NV10TCL_RC_FINAL0_C_MAPPING_EXPAND_NEGATE_NV 0x00006000 +#define NV10TCL_RC_FINAL0_C_MAPPING_HALF_BIAS_NORMAL_NV 0x00008000 +#define NV10TCL_RC_FINAL0_C_MAPPING_HALF_BIAS_NEGATE_NV 0x0000a000 +#define NV10TCL_RC_FINAL0_C_MAPPING_SIGNED_IDENTITY_NV 0x0000c000 +#define NV10TCL_RC_FINAL0_C_MAPPING_SIGNED_NEGATE_NV 0x0000e000 +#define NV10TCL_RC_FINAL0_B_INPUT_SHIFT 16 +#define NV10TCL_RC_FINAL0_B_INPUT_MASK 0x000f0000 +#define NV10TCL_RC_FINAL0_B_INPUT_ZERO 0x00000000 +#define NV10TCL_RC_FINAL0_B_INPUT_CONSTANT_COLOR0_NV 0x00010000 +#define NV10TCL_RC_FINAL0_B_INPUT_CONSTANT_COLOR1_NV 0x00020000 +#define NV10TCL_RC_FINAL0_B_INPUT_FOG 0x00030000 +#define NV10TCL_RC_FINAL0_B_INPUT_PRIMARY_COLOR_NV 0x00040000 +#define NV10TCL_RC_FINAL0_B_INPUT_SECONDARY_COLOR_NV 0x00050000 +#define NV10TCL_RC_FINAL0_B_INPUT_TEXTURE0_ARB 0x00080000 +#define NV10TCL_RC_FINAL0_B_INPUT_TEXTURE1_ARB 0x00090000 +#define NV10TCL_RC_FINAL0_B_INPUT_SPARE0_NV 0x000c0000 +#define NV10TCL_RC_FINAL0_B_INPUT_SPARE1_NV 0x000d0000 +#define NV10TCL_RC_FINAL0_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000e0000 +#define NV10TCL_RC_FINAL0_B_INPUT_E_TIMES_F_NV 0x000f0000 +#define NV10TCL_RC_FINAL0_B_COMPONENT_USAGE (1 << 20) +#define NV10TCL_RC_FINAL0_B_COMPONENT_USAGE_RGB 0x00000000 +#define NV10TCL_RC_FINAL0_B_COMPONENT_USAGE_ALPHA 0x00100000 +#define NV10TCL_RC_FINAL0_B_MAPPING_SHIFT 21 +#define NV10TCL_RC_FINAL0_B_MAPPING_MASK 0x00e00000 +#define NV10TCL_RC_FINAL0_B_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV10TCL_RC_FINAL0_B_MAPPING_UNSIGNED_INVERT_NV 0x00200000 +#define NV10TCL_RC_FINAL0_B_MAPPING_EXPAND_NORMAL_NV 0x00400000 +#define NV10TCL_RC_FINAL0_B_MAPPING_EXPAND_NEGATE_NV 0x00600000 +#define NV10TCL_RC_FINAL0_B_MAPPING_HALF_BIAS_NORMAL_NV 0x00800000 +#define NV10TCL_RC_FINAL0_B_MAPPING_HALF_BIAS_NEGATE_NV 0x00a00000 +#define NV10TCL_RC_FINAL0_B_MAPPING_SIGNED_IDENTITY_NV 0x00c00000 +#define NV10TCL_RC_FINAL0_B_MAPPING_SIGNED_NEGATE_NV 0x00e00000 +#define NV10TCL_RC_FINAL0_A_INPUT_SHIFT 24 +#define NV10TCL_RC_FINAL0_A_INPUT_MASK 0x0f000000 +#define NV10TCL_RC_FINAL0_A_INPUT_ZERO 0x00000000 +#define NV10TCL_RC_FINAL0_A_INPUT_CONSTANT_COLOR0_NV 0x01000000 +#define NV10TCL_RC_FINAL0_A_INPUT_CONSTANT_COLOR1_NV 0x02000000 +#define NV10TCL_RC_FINAL0_A_INPUT_FOG 0x03000000 +#define NV10TCL_RC_FINAL0_A_INPUT_PRIMARY_COLOR_NV 0x04000000 +#define NV10TCL_RC_FINAL0_A_INPUT_SECONDARY_COLOR_NV 0x05000000 +#define NV10TCL_RC_FINAL0_A_INPUT_TEXTURE0_ARB 0x08000000 +#define NV10TCL_RC_FINAL0_A_INPUT_TEXTURE1_ARB 0x09000000 +#define NV10TCL_RC_FINAL0_A_INPUT_SPARE0_NV 0x0c000000 +#define NV10TCL_RC_FINAL0_A_INPUT_SPARE1_NV 0x0d000000 +#define NV10TCL_RC_FINAL0_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0e000000 +#define NV10TCL_RC_FINAL0_A_INPUT_E_TIMES_F_NV 0x0f000000 +#define NV10TCL_RC_FINAL0_A_COMPONENT_USAGE (1 << 28) +#define NV10TCL_RC_FINAL0_A_COMPONENT_USAGE_RGB 0x00000000 +#define NV10TCL_RC_FINAL0_A_COMPONENT_USAGE_ALPHA 0x10000000 +#define NV10TCL_RC_FINAL0_A_MAPPING_SHIFT 29 +#define NV10TCL_RC_FINAL0_A_MAPPING_MASK 0xe0000000 +#define NV10TCL_RC_FINAL0_A_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV10TCL_RC_FINAL0_A_MAPPING_UNSIGNED_INVERT_NV 0x20000000 +#define NV10TCL_RC_FINAL0_A_MAPPING_EXPAND_NORMAL_NV 0x40000000 +#define NV10TCL_RC_FINAL0_A_MAPPING_EXPAND_NEGATE_NV 0x60000000 +#define NV10TCL_RC_FINAL0_A_MAPPING_HALF_BIAS_NORMAL_NV 0x80000000 +#define 
NV10TCL_RC_FINAL0_A_MAPPING_HALF_BIAS_NEGATE_NV 0xa0000000 +#define NV10TCL_RC_FINAL0_A_MAPPING_SIGNED_IDENTITY_NV 0xc0000000 +#define NV10TCL_RC_FINAL0_A_MAPPING_SIGNED_NEGATE_NV 0xe0000000 +#define NV10TCL_RC_FINAL1 0x0000028c +#define NV10TCL_RC_FINAL1_COLOR_SUM_CLAMP (1 << 7) +#define NV10TCL_RC_FINAL1_G_INPUT_SHIFT 8 +#define NV10TCL_RC_FINAL1_G_INPUT_MASK 0x00000f00 +#define NV10TCL_RC_FINAL1_G_INPUT_ZERO 0x00000000 +#define NV10TCL_RC_FINAL1_G_INPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV10TCL_RC_FINAL1_G_INPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV10TCL_RC_FINAL1_G_INPUT_FOG 0x00000300 +#define NV10TCL_RC_FINAL1_G_INPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV10TCL_RC_FINAL1_G_INPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV10TCL_RC_FINAL1_G_INPUT_TEXTURE0_ARB 0x00000800 +#define NV10TCL_RC_FINAL1_G_INPUT_TEXTURE1_ARB 0x00000900 +#define NV10TCL_RC_FINAL1_G_INPUT_SPARE0_NV 0x00000c00 +#define NV10TCL_RC_FINAL1_G_INPUT_SPARE1_NV 0x00000d00 +#define NV10TCL_RC_FINAL1_G_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV10TCL_RC_FINAL1_G_INPUT_E_TIMES_F_NV 0x00000f00 +#define NV10TCL_RC_FINAL1_G_COMPONENT_USAGE (1 << 12) +#define NV10TCL_RC_FINAL1_G_COMPONENT_USAGE_RGB 0x00000000 +#define NV10TCL_RC_FINAL1_G_COMPONENT_USAGE_ALPHA 0x00001000 +#define NV10TCL_RC_FINAL1_G_MAPPING_SHIFT 13 +#define NV10TCL_RC_FINAL1_G_MAPPING_MASK 0x0000e000 +#define NV10TCL_RC_FINAL1_G_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV10TCL_RC_FINAL1_G_MAPPING_UNSIGNED_INVERT_NV 0x00002000 +#define NV10TCL_RC_FINAL1_G_MAPPING_EXPAND_NORMAL_NV 0x00004000 +#define NV10TCL_RC_FINAL1_G_MAPPING_EXPAND_NEGATE_NV 0x00006000 +#define NV10TCL_RC_FINAL1_G_MAPPING_HALF_BIAS_NORMAL_NV 0x00008000 +#define NV10TCL_RC_FINAL1_G_MAPPING_HALF_BIAS_NEGATE_NV 0x0000a000 +#define NV10TCL_RC_FINAL1_G_MAPPING_SIGNED_IDENTITY_NV 0x0000c000 +#define NV10TCL_RC_FINAL1_G_MAPPING_SIGNED_NEGATE_NV 0x0000e000 +#define NV10TCL_RC_FINAL1_F_INPUT_SHIFT 16 +#define NV10TCL_RC_FINAL1_F_INPUT_MASK 0x000f0000 +#define NV10TCL_RC_FINAL1_F_INPUT_ZERO 0x00000000 +#define NV10TCL_RC_FINAL1_F_INPUT_CONSTANT_COLOR0_NV 0x00010000 +#define NV10TCL_RC_FINAL1_F_INPUT_CONSTANT_COLOR1_NV 0x00020000 +#define NV10TCL_RC_FINAL1_F_INPUT_FOG 0x00030000 +#define NV10TCL_RC_FINAL1_F_INPUT_PRIMARY_COLOR_NV 0x00040000 +#define NV10TCL_RC_FINAL1_F_INPUT_SECONDARY_COLOR_NV 0x00050000 +#define NV10TCL_RC_FINAL1_F_INPUT_TEXTURE0_ARB 0x00080000 +#define NV10TCL_RC_FINAL1_F_INPUT_TEXTURE1_ARB 0x00090000 +#define NV10TCL_RC_FINAL1_F_INPUT_SPARE0_NV 0x000c0000 +#define NV10TCL_RC_FINAL1_F_INPUT_SPARE1_NV 0x000d0000 +#define NV10TCL_RC_FINAL1_F_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000e0000 +#define NV10TCL_RC_FINAL1_F_INPUT_E_TIMES_F_NV 0x000f0000 +#define NV10TCL_RC_FINAL1_F_COMPONENT_USAGE (1 << 20) +#define NV10TCL_RC_FINAL1_F_COMPONENT_USAGE_RGB 0x00000000 +#define NV10TCL_RC_FINAL1_F_COMPONENT_USAGE_ALPHA 0x00100000 +#define NV10TCL_RC_FINAL1_F_MAPPING_SHIFT 21 +#define NV10TCL_RC_FINAL1_F_MAPPING_MASK 0x00e00000 +#define NV10TCL_RC_FINAL1_F_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV10TCL_RC_FINAL1_F_MAPPING_UNSIGNED_INVERT_NV 0x00200000 +#define NV10TCL_RC_FINAL1_F_MAPPING_EXPAND_NORMAL_NV 0x00400000 +#define NV10TCL_RC_FINAL1_F_MAPPING_EXPAND_NEGATE_NV 0x00600000 +#define NV10TCL_RC_FINAL1_F_MAPPING_HALF_BIAS_NORMAL_NV 0x00800000 +#define NV10TCL_RC_FINAL1_F_MAPPING_HALF_BIAS_NEGATE_NV 0x00a00000 +#define NV10TCL_RC_FINAL1_F_MAPPING_SIGNED_IDENTITY_NV 0x00c00000 +#define NV10TCL_RC_FINAL1_F_MAPPING_SIGNED_NEGATE_NV 0x00e00000 +#define 
NV10TCL_RC_FINAL1_E_INPUT_SHIFT 24 +#define NV10TCL_RC_FINAL1_E_INPUT_MASK 0x0f000000 +#define NV10TCL_RC_FINAL1_E_INPUT_ZERO 0x00000000 +#define NV10TCL_RC_FINAL1_E_INPUT_CONSTANT_COLOR0_NV 0x01000000 +#define NV10TCL_RC_FINAL1_E_INPUT_CONSTANT_COLOR1_NV 0x02000000 +#define NV10TCL_RC_FINAL1_E_INPUT_FOG 0x03000000 +#define NV10TCL_RC_FINAL1_E_INPUT_PRIMARY_COLOR_NV 0x04000000 +#define NV10TCL_RC_FINAL1_E_INPUT_SECONDARY_COLOR_NV 0x05000000 +#define NV10TCL_RC_FINAL1_E_INPUT_TEXTURE0_ARB 0x08000000 +#define NV10TCL_RC_FINAL1_E_INPUT_TEXTURE1_ARB 0x09000000 +#define NV10TCL_RC_FINAL1_E_INPUT_SPARE0_NV 0x0c000000 +#define NV10TCL_RC_FINAL1_E_INPUT_SPARE1_NV 0x0d000000 +#define NV10TCL_RC_FINAL1_E_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0e000000 +#define NV10TCL_RC_FINAL1_E_INPUT_E_TIMES_F_NV 0x0f000000 +#define NV10TCL_RC_FINAL1_E_COMPONENT_USAGE (1 << 28) +#define NV10TCL_RC_FINAL1_E_COMPONENT_USAGE_RGB 0x00000000 +#define NV10TCL_RC_FINAL1_E_COMPONENT_USAGE_ALPHA 0x10000000 +#define NV10TCL_RC_FINAL1_E_MAPPING_SHIFT 29 +#define NV10TCL_RC_FINAL1_E_MAPPING_MASK 0xe0000000 +#define NV10TCL_RC_FINAL1_E_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV10TCL_RC_FINAL1_E_MAPPING_UNSIGNED_INVERT_NV 0x20000000 +#define NV10TCL_RC_FINAL1_E_MAPPING_EXPAND_NORMAL_NV 0x40000000 +#define NV10TCL_RC_FINAL1_E_MAPPING_EXPAND_NEGATE_NV 0x60000000 +#define NV10TCL_RC_FINAL1_E_MAPPING_HALF_BIAS_NORMAL_NV 0x80000000 +#define NV10TCL_RC_FINAL1_E_MAPPING_HALF_BIAS_NEGATE_NV 0xa0000000 +#define NV10TCL_RC_FINAL1_E_MAPPING_SIGNED_IDENTITY_NV 0xc0000000 +#define NV10TCL_RC_FINAL1_E_MAPPING_SIGNED_NEGATE_NV 0xe0000000 +#define NV10TCL_LIGHT_MODEL 0x00000294 +#define NV10TCL_LIGHT_MODEL_COLOR_CONTROL (1 << 1) +#define NV10TCL_LIGHT_MODEL_LOCAL_VIEWER (1 << 16) +#define NV10TCL_COLOR_MATERIAL_ENABLE 0x00000298 +#define NV10TCL_COLOR_MATERIAL_ENABLE_SPECULAR (1 << 0) +#define NV10TCL_COLOR_MATERIAL_ENABLE_DIFFUSE (1 << 1) +#define NV10TCL_COLOR_MATERIAL_ENABLE_AMBIENT (1 << 2) +#define NV10TCL_COLOR_MATERIAL_ENABLE_EMISSION (1 << 3) +#define NV10TCL_FOG_MODE 0x0000029c +#define NV10TCL_FOG_MODE_EXP 0x00000800 +#define NV10TCL_FOG_MODE_EXP_2 0x00000802 +#define NV10TCL_FOG_MODE_EXP2 0x00000803 +#define NV10TCL_FOG_MODE_LINEAR 0x00000804 +#define NV10TCL_FOG_MODE_LINEAR_2 0x00002601 +#define NV10TCL_FOG_COORD_DIST 0x000002a0 +#define NV10TCL_FOG_COORD_DIST_COORD_FALSE 0x00000000 +#define NV10TCL_FOG_COORD_DIST_COORD_FRAGMENT_DEPTH_DISTANCE_EYE_RADIAL_NV 0x00000001 +#define NV10TCL_FOG_COORD_DIST_COORD_FRAGMENT_DEPTH_DISTANCE_EYE_PLANE_ABSOLUTE_NV 0x00000002 +#define NV10TCL_FOG_COORD_DIST_COORD_FOG 0x00000003 +#define NV10TCL_FOG_ENABLE 0x000002a4 +#define NV10TCL_FOG_COLOR 0x000002a8 +#define NV10TCL_FOG_COLOR_R_SHIFT 0 +#define NV10TCL_FOG_COLOR_R_MASK 0x000000ff +#define NV10TCL_FOG_COLOR_G_SHIFT 8 +#define NV10TCL_FOG_COLOR_G_MASK 0x0000ff00 +#define NV10TCL_FOG_COLOR_B_SHIFT 16 +#define NV10TCL_FOG_COLOR_B_MASK 0x00ff0000 +#define NV10TCL_FOG_COLOR_A_SHIFT 24 +#define NV10TCL_FOG_COLOR_A_MASK 0xff000000 +#define NV10TCL_VIEWPORT_CLIP_MODE 0x000002b4 +#define NV10TCL_VIEWPORT_CLIP_HORIZ(x) (0x000002c0+((x)*4)) +#define NV10TCL_VIEWPORT_CLIP_HORIZ__SIZE 0x00000008 +#define NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_L_SHIFT 0 +#define NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_L_MASK 0x000007ff +#define NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_LEFT_ENABLE (1 << 11) +#define NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_R_SHIFT 16 +#define NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_R_MASK 0x07ff0000 +#define NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_RIGHT_ENABLE (1 << 27) 
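The *_SHIFT/*_MASK pairs above describe bit fields packed into a single 32-bit method word. A minimal sketch (not part of the patch; the helper name pack_clip_horiz is hypothetical, the defines are copied from the header above) of how such a field pair is typically combined:

#include <stdio.h>

#define NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_L_SHIFT 0
#define NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_L_MASK 0x000007ff
#define NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_R_SHIFT 16
#define NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_R_MASK 0x07ff0000

/* Pack a left/right clip pair into one NV10TCL_VIEWPORT_CLIP_HORIZ word.
 * Each value is shifted into place and masked to its field width. */
static unsigned int
pack_clip_horiz(unsigned int left, unsigned int right)
{
   unsigned int v = 0;
   v |= (left  << NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_L_SHIFT) &
        NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_L_MASK;
   v |= (right << NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_R_SHIFT) &
        NV10TCL_VIEWPORT_CLIP_HORIZ_CLIP_R_MASK;
   return v;
}

int main(void)
{
   /* e.g. a clip rectangle spanning x = 0..639 */
   printf("0x%08x\n", pack_clip_horiz(0, 639));
   return 0;
}

The same shift-then-mask pattern applies to every other SHIFT/MASK pair in this header; single-bit flags such as the *_ENABLE defines are simply ORed in directly.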
+#define NV10TCL_VIEWPORT_CLIP_VERT(x) (0x000002e0+((x)*4)) +#define NV10TCL_VIEWPORT_CLIP_VERT__SIZE 0x00000008 +#define NV10TCL_VIEWPORT_CLIP_VERT_CLIP_T_SHIFT 0 +#define NV10TCL_VIEWPORT_CLIP_VERT_CLIP_T_MASK 0x000007ff +#define NV10TCL_VIEWPORT_CLIP_VERT_CLIP_TOP_ENABLE (1 << 11) +#define NV10TCL_VIEWPORT_CLIP_VERT_CLIP_B_SHIFT 16 +#define NV10TCL_VIEWPORT_CLIP_VERT_CLIP_B_MASK 0x07ff0000 +#define NV10TCL_VIEWPORT_CLIP_VERT_CLIP_BOTTOM_ENABLE (1 << 27) +#define NV10TCL_ALPHA_FUNC_ENABLE 0x00000300 +#define NV10TCL_BLEND_FUNC_ENABLE 0x00000304 +#define NV10TCL_CULL_FACE_ENABLE 0x00000308 +#define NV10TCL_DEPTH_TEST_ENABLE 0x0000030c +#define NV10TCL_DITHER_ENABLE 0x00000310 +#define NV10TCL_LIGHTING_ENABLE 0x00000314 +#define NV10TCL_POINT_PARAMETERS_ENABLE 0x00000318 +#define NV10TCL_POINT_SMOOTH_ENABLE 0x0000031c +#define NV10TCL_LINE_SMOOTH_ENABLE 0x00000320 +#define NV10TCL_POLYGON_SMOOTH_ENABLE 0x00000324 +#define NV10TCL_VERTEX_WEIGHT_ENABLE 0x00000328 +#define NV10TCL_STENCIL_ENABLE 0x0000032c +#define NV10TCL_POLYGON_OFFSET_POINT_ENABLE 0x00000330 +#define NV10TCL_POLYGON_OFFSET_LINE_ENABLE 0x00000334 +#define NV10TCL_POLYGON_OFFSET_FILL_ENABLE 0x00000338 +#define NV10TCL_ALPHA_FUNC_FUNC 0x0000033c +#define NV10TCL_ALPHA_FUNC_FUNC_NEVER 0x00000200 +#define NV10TCL_ALPHA_FUNC_FUNC_LESS 0x00000201 +#define NV10TCL_ALPHA_FUNC_FUNC_EQUAL 0x00000202 +#define NV10TCL_ALPHA_FUNC_FUNC_LEQUAL 0x00000203 +#define NV10TCL_ALPHA_FUNC_FUNC_GREATER 0x00000204 +#define NV10TCL_ALPHA_FUNC_FUNC_GREATER 0x00000204 +#define NV10TCL_ALPHA_FUNC_FUNC_NOTEQUAL 0x00000205 +#define NV10TCL_ALPHA_FUNC_FUNC_GEQUAL 0x00000206 +#define NV10TCL_ALPHA_FUNC_FUNC_ALWAYS 0x00000207 +#define NV10TCL_ALPHA_FUNC_REF 0x00000340 +#define NV10TCL_BLEND_FUNC_SRC 0x00000344 +#define NV10TCL_BLEND_FUNC_SRC_ZERO 0x00000000 +#define NV10TCL_BLEND_FUNC_SRC_ONE 0x00000001 +#define NV10TCL_BLEND_FUNC_SRC_SRC_COLOR 0x00000300 +#define NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_SRC_COLOR 0x00000301 +#define NV10TCL_BLEND_FUNC_SRC_SRC_ALPHA 0x00000302 +#define NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_SRC_ALPHA 0x00000303 +#define NV10TCL_BLEND_FUNC_SRC_DST_ALPHA 0x00000304 +#define NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_DST_ALPHA 0x00000305 +#define NV10TCL_BLEND_FUNC_SRC_DST_COLOR 0x00000306 +#define NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_DST_COLOR 0x00000307 +#define NV10TCL_BLEND_FUNC_SRC_SRC_ALPHA_SATURATE 0x00000308 +#define NV10TCL_BLEND_FUNC_SRC_CONSTANT_COLOR 0x00008001 +#define NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_CONSTANT_COLOR 0x00008002 +#define NV10TCL_BLEND_FUNC_SRC_CONSTANT_ALPHA 0x00008003 +#define NV10TCL_BLEND_FUNC_SRC_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV10TCL_BLEND_FUNC_DST 0x00000348 +#define NV10TCL_BLEND_FUNC_DST_ZERO 0x00000000 +#define NV10TCL_BLEND_FUNC_DST_ONE 0x00000001 +#define NV10TCL_BLEND_FUNC_DST_SRC_COLOR 0x00000300 +#define NV10TCL_BLEND_FUNC_DST_ONE_MINUS_SRC_COLOR 0x00000301 +#define NV10TCL_BLEND_FUNC_DST_SRC_ALPHA 0x00000302 +#define NV10TCL_BLEND_FUNC_DST_ONE_MINUS_SRC_ALPHA 0x00000303 +#define NV10TCL_BLEND_FUNC_DST_DST_ALPHA 0x00000304 +#define NV10TCL_BLEND_FUNC_DST_ONE_MINUS_DST_ALPHA 0x00000305 +#define NV10TCL_BLEND_FUNC_DST_DST_COLOR 0x00000306 +#define NV10TCL_BLEND_FUNC_DST_ONE_MINUS_DST_COLOR 0x00000307 +#define NV10TCL_BLEND_FUNC_DST_SRC_ALPHA_SATURATE 0x00000308 +#define NV10TCL_BLEND_FUNC_DST_CONSTANT_COLOR 0x00008001 +#define NV10TCL_BLEND_FUNC_DST_ONE_MINUS_CONSTANT_COLOR 0x00008002 +#define NV10TCL_BLEND_FUNC_DST_CONSTANT_ALPHA 0x00008003 +#define NV10TCL_BLEND_FUNC_DST_ONE_MINUS_CONSTANT_ALPHA 
0x00008004 +#define NV10TCL_BLEND_COLOR 0x0000034c +#define NV10TCL_BLEND_COLOR_B_SHIFT 0 +#define NV10TCL_BLEND_COLOR_B_MASK 0x000000ff +#define NV10TCL_BLEND_COLOR_G_SHIFT 8 +#define NV10TCL_BLEND_COLOR_G_MASK 0x0000ff00 +#define NV10TCL_BLEND_COLOR_R_SHIFT 16 +#define NV10TCL_BLEND_COLOR_R_MASK 0x00ff0000 +#define NV10TCL_BLEND_COLOR_A_SHIFT 24 +#define NV10TCL_BLEND_COLOR_A_MASK 0xff000000 +#define NV10TCL_BLEND_EQUATION 0x00000350 +#define NV10TCL_BLEND_EQUATION_FUNC_ADD 0x00008006 +#define NV10TCL_BLEND_EQUATION_MIN 0x00008007 +#define NV10TCL_BLEND_EQUATION_MAX 0x00008008 +#define NV10TCL_BLEND_EQUATION_FUNC_SUBTRACT 0x0000800a +#define NV10TCL_BLEND_EQUATION_FUNC_REVERSE_SUBTRACT 0x0000800b +#define NV10TCL_DEPTH_FUNC 0x00000354 +#define NV10TCL_DEPTH_FUNC_NEVER 0x00000200 +#define NV10TCL_DEPTH_FUNC_LESS 0x00000201 +#define NV10TCL_DEPTH_FUNC_EQUAL 0x00000202 +#define NV10TCL_DEPTH_FUNC_LEQUAL 0x00000203 +#define NV10TCL_DEPTH_FUNC_GREATER 0x00000204 +#define NV10TCL_DEPTH_FUNC_GREATER 0x00000204 +#define NV10TCL_DEPTH_FUNC_NOTEQUAL 0x00000205 +#define NV10TCL_DEPTH_FUNC_GEQUAL 0x00000206 +#define NV10TCL_DEPTH_FUNC_ALWAYS 0x00000207 +#define NV10TCL_COLOR_MASK 0x00000358 +#define NV10TCL_COLOR_MASK_B (1 << 0) +#define NV10TCL_COLOR_MASK_G (1 << 8) +#define NV10TCL_COLOR_MASK_R (1 << 16) +#define NV10TCL_COLOR_MASK_A (1 << 24) +#define NV10TCL_DEPTH_WRITE_ENABLE 0x0000035c +#define NV10TCL_STENCIL_MASK 0x00000360 +#define NV10TCL_STENCIL_FUNC_FUNC 0x00000364 +#define NV10TCL_STENCIL_FUNC_FUNC_NEVER 0x00000200 +#define NV10TCL_STENCIL_FUNC_FUNC_LESS 0x00000201 +#define NV10TCL_STENCIL_FUNC_FUNC_EQUAL 0x00000202 +#define NV10TCL_STENCIL_FUNC_FUNC_LEQUAL 0x00000203 +#define NV10TCL_STENCIL_FUNC_FUNC_GREATER 0x00000204 +#define NV10TCL_STENCIL_FUNC_FUNC_GREATER 0x00000204 +#define NV10TCL_STENCIL_FUNC_FUNC_NOTEQUAL 0x00000205 +#define NV10TCL_STENCIL_FUNC_FUNC_GEQUAL 0x00000206 +#define NV10TCL_STENCIL_FUNC_FUNC_ALWAYS 0x00000207 +#define NV10TCL_STENCIL_FUNC_REF 0x00000368 +#define NV10TCL_STENCIL_FUNC_MASK 0x0000036c +#define NV10TCL_STENCIL_OP_FAIL 0x00000370 +#define NV10TCL_STENCIL_OP_FAIL_ZERO 0x00000000 +#define NV10TCL_STENCIL_OP_FAIL_INVERT 0x0000150a +#define NV10TCL_STENCIL_OP_FAIL_KEEP 0x00001e00 +#define NV10TCL_STENCIL_OP_FAIL_REPLACE 0x00001e01 +#define NV10TCL_STENCIL_OP_FAIL_INCR 0x00001e02 +#define NV10TCL_STENCIL_OP_FAIL_DECR 0x00001e03 +#define NV10TCL_STENCIL_OP_FAIL_INCR_WRAP 0x00008507 +#define NV10TCL_STENCIL_OP_FAIL_DECR_WRAP 0x00008508 +#define NV10TCL_STENCIL_OP_ZFAIL 0x00000374 +#define NV10TCL_STENCIL_OP_ZFAIL_ZERO 0x00000000 +#define NV10TCL_STENCIL_OP_ZFAIL_INVERT 0x0000150a +#define NV10TCL_STENCIL_OP_ZFAIL_KEEP 0x00001e00 +#define NV10TCL_STENCIL_OP_ZFAIL_REPLACE 0x00001e01 +#define NV10TCL_STENCIL_OP_ZFAIL_INCR 0x00001e02 +#define NV10TCL_STENCIL_OP_ZFAIL_DECR 0x00001e03 +#define NV10TCL_STENCIL_OP_ZFAIL_INCR_WRAP 0x00008507 +#define NV10TCL_STENCIL_OP_ZFAIL_DECR_WRAP 0x00008508 +#define NV10TCL_STENCIL_OP_ZPASS 0x00000378 +#define NV10TCL_STENCIL_OP_ZPASS_ZERO 0x00000000 +#define NV10TCL_STENCIL_OP_ZPASS_INVERT 0x0000150a +#define NV10TCL_STENCIL_OP_ZPASS_KEEP 0x00001e00 +#define NV10TCL_STENCIL_OP_ZPASS_REPLACE 0x00001e01 +#define NV10TCL_STENCIL_OP_ZPASS_INCR 0x00001e02 +#define NV10TCL_STENCIL_OP_ZPASS_DECR 0x00001e03 +#define NV10TCL_STENCIL_OP_ZPASS_INCR_WRAP 0x00008507 +#define NV10TCL_STENCIL_OP_ZPASS_DECR_WRAP 0x00008508 +#define NV10TCL_SHADE_MODEL 0x0000037c +#define NV10TCL_SHADE_MODEL_FLAT 0x00001d00 +#define NV10TCL_SHADE_MODEL_SMOOTH 
0x00001d01 +#define NV10TCL_LINE_WIDTH 0x00000380 +#define NV10TCL_POLYGON_OFFSET_FACTOR 0x00000384 +#define NV10TCL_POLYGON_OFFSET_UNITS 0x00000388 +#define NV10TCL_POLYGON_MODE_FRONT 0x0000038c +#define NV10TCL_POLYGON_MODE_FRONT_POINT 0x00001b00 +#define NV10TCL_POLYGON_MODE_FRONT_LINE 0x00001b01 +#define NV10TCL_POLYGON_MODE_FRONT_FILL 0x00001b02 +#define NV10TCL_POLYGON_MODE_BACK 0x00000390 +#define NV10TCL_POLYGON_MODE_BACK_POINT 0x00001b00 +#define NV10TCL_POLYGON_MODE_BACK_LINE 0x00001b01 +#define NV10TCL_POLYGON_MODE_BACK_FILL 0x00001b02 +#define NV10TCL_DEPTH_RANGE_NEAR 0x00000394 +#define NV10TCL_DEPTH_RANGE_FAR 0x00000398 +#define NV10TCL_CULL_FACE 0x0000039c +#define NV10TCL_CULL_FACE_FRONT 0x00000404 +#define NV10TCL_CULL_FACE_BACK 0x00000405 +#define NV10TCL_CULL_FACE_FRONT_AND_BACK 0x00000408 +#define NV10TCL_FRONT_FACE 0x000003a0 +#define NV10TCL_FRONT_FACE_CW 0x00000900 +#define NV10TCL_FRONT_FACE_CCW 0x00000901 +#define NV10TCL_NORMALIZE_ENABLE 0x000003a4 +#define NV10TCL_COLOR_MATERIAL_R 0x000003a8 +#define NV10TCL_COLOR_MATERIAL_G 0x000003ac +#define NV10TCL_COLOR_MATERIAL_B 0x000003b0 +#define NV10TCL_COLOR_MATERIAL_A 0x000003b4 +#define NV10TCL_COLOR_CONTROL 0x000003b8 +#define NV10TCL_ENABLED_LIGHTS 0x000003bc +#define NV10TCL_ENABLED_LIGHTS_LIGHT0 (1 << 0) +#define NV10TCL_ENABLED_LIGHTS_LIGHT1 (1 << 2) +#define NV10TCL_ENABLED_LIGHTS_LIGHT2 (1 << 4) +#define NV10TCL_ENABLED_LIGHTS_LIGHT3 (1 << 6) +#define NV10TCL_ENABLED_LIGHTS_LIGHT4 (1 << 8) +#define NV10TCL_ENABLED_LIGHTS_LIGHT5 (1 << 10) +#define NV10TCL_ENABLED_LIGHTS_LIGHT6 (1 << 12) +#define NV10TCL_ENABLED_LIGHTS_LIGHT7 (1 << 14) +#define NV10TCL_TX_GEN_S(x) (0x000003c0+((x)*16)) +#define NV10TCL_TX_GEN_S__SIZE 0x00000002 +#define NV10TCL_TX_GEN_S_FALSE 0x00000000 +#define NV10TCL_TX_GEN_S_EYE_LINEAR 0x00002400 +#define NV10TCL_TX_GEN_S_OBJECT_LINEAR 0x00002401 +#define NV10TCL_TX_GEN_S_SPHERE_MAP 0x00002402 +#define NV10TCL_TX_GEN_S_NORMAL_MAP 0x00008511 +#define NV10TCL_TX_GEN_S_REFLECTION_MAP 0x00008512 +#define NV10TCL_TX_GEN_T(x) (0x000003c4+((x)*16)) +#define NV10TCL_TX_GEN_T__SIZE 0x00000002 +#define NV10TCL_TX_GEN_T_FALSE 0x00000000 +#define NV10TCL_TX_GEN_T_EYE_LINEAR 0x00002400 +#define NV10TCL_TX_GEN_T_OBJECT_LINEAR 0x00002401 +#define NV10TCL_TX_GEN_T_SPHERE_MAP 0x00002402 +#define NV10TCL_TX_GEN_T_NORMAL_MAP 0x00008511 +#define NV10TCL_TX_GEN_T_REFLECTION_MAP 0x00008512 +#define NV10TCL_TX_GEN_R(x) (0x000003c8+((x)*16)) +#define NV10TCL_TX_GEN_R__SIZE 0x00000002 +#define NV10TCL_TX_GEN_R_FALSE 0x00000000 +#define NV10TCL_TX_GEN_R_EYE_LINEAR 0x00002400 +#define NV10TCL_TX_GEN_R_OBJECT_LINEAR 0x00002401 +#define NV10TCL_TX_GEN_R_SPHERE_MAP 0x00002402 +#define NV10TCL_TX_GEN_R_NORMAL_MAP 0x00008511 +#define NV10TCL_TX_GEN_R_REFLECTION_MAP 0x00008512 +#define NV10TCL_TX_GEN_Q(x) (0x000003cc+((x)*16)) +#define NV10TCL_TX_GEN_Q__SIZE 0x00000002 +#define NV10TCL_TX_GEN_Q_FALSE 0x00000000 +#define NV10TCL_TX_GEN_Q_EYE_LINEAR 0x00002400 +#define NV10TCL_TX_GEN_Q_OBJECT_LINEAR 0x00002401 +#define NV10TCL_TX_GEN_Q_SPHERE_MAP 0x00002402 +#define NV10TCL_TX_GEN_Q_NORMAL_MAP 0x00008511 +#define NV10TCL_TX_GEN_Q_REFLECTION_MAP 0x00008512 +#define NV10TCL_TX_MATRIX_ENABLE(x) (0x000003e0+((x)*4)) +#define NV10TCL_TX_MATRIX_ENABLE__SIZE 0x00000002 +#define NV10TCL_VIEW_MATRIX_ENABLE 0x000003e8 +#define NV10TCL_VIEW_MATRIX_ENABLE_MODELVIEW1 (1 << 0) +#define NV10TCL_VIEW_MATRIX_ENABLE_MODELVIEW0 (1 << 1) +#define NV10TCL_VIEW_MATRIX_ENABLE_PROJECTION (1 << 2) +#define NV10TCL_POINT_SIZE 0x000003ec +#define 
NV10TCL_MODELVIEW0_MATRIX(x) (0x00000400+((x)*4)) +#define NV10TCL_MODELVIEW0_MATRIX__SIZE 0x00000010 +#define NV10TCL_MODELVIEW1_MATRIX(x) (0x00000440+((x)*4)) +#define NV10TCL_MODELVIEW1_MATRIX__SIZE 0x00000010 +#define NV10TCL_INVERSE_MODELVIEW0_MATRIX(x) (0x00000480+((x)*4)) +#define NV10TCL_INVERSE_MODELVIEW0_MATRIX__SIZE 0x00000010 +#define NV10TCL_INVERSE_MODELVIEW1_MATRIX(x) (0x000004c0+((x)*4)) +#define NV10TCL_INVERSE_MODELVIEW1_MATRIX__SIZE 0x00000010 +#define NV10TCL_PROJECTION_MATRIX(x) (0x00000500+((x)*4)) +#define NV10TCL_PROJECTION_MATRIX__SIZE 0x00000010 +#define NV10TCL_TX0_MATRIX(x) (0x00000540+((x)*4)) +#define NV10TCL_TX0_MATRIX__SIZE 0x00000010 +#define NV10TCL_TX1_MATRIX(x) (0x00000580+((x)*4)) +#define NV10TCL_TX1_MATRIX__SIZE 0x00000010 +#define NV10TCL_CLIP_PLANE_A(x) (0x00000600+((x)*16)) +#define NV10TCL_CLIP_PLANE_A__SIZE 0x00000008 +#define NV10TCL_CLIP_PLANE_B(x) (0x00000604+((x)*16)) +#define NV10TCL_CLIP_PLANE_B__SIZE 0x00000008 +#define NV10TCL_CLIP_PLANE_C(x) (0x00000608+((x)*16)) +#define NV10TCL_CLIP_PLANE_C__SIZE 0x00000008 +#define NV10TCL_CLIP_PLANE_D(x) (0x0000060c+((x)*16)) +#define NV10TCL_CLIP_PLANE_D__SIZE 0x00000008 +#define NV10TCL_FOG_EQUATION_CONSTANT 0x00000680 +#define NV10TCL_FOG_EQUATION_LINEAR 0x00000684 +#define NV10TCL_FOG_EQUATION_QUADRATIC 0x00000688 +#define NV10TCL_FRONT_MATERIAL_SHININESS(x) (0x000006a0+((x)*4)) +#define NV10TCL_FRONT_MATERIAL_SHININESS__SIZE 0x00000006 +#define NV10TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_R 0x000006c4 +#define NV10TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_G 0x000006c8 +#define NV10TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_B 0x000006cc +#define NV10TCL_VIEWPORT_SCALE_X 0x000006e8 +#define NV10TCL_VIEWPORT_SCALE_Y 0x000006ec +#define NV10TCL_VIEWPORT_SCALE_Z 0x000006f0 +#define NV10TCL_VIEWPORT_SCALE_W 0x000006f4 +#define NV10TCL_POINT_PARAMETER(x) (0x000006f8+((x)*4)) +#define NV10TCL_POINT_PARAMETER__SIZE 0x00000008 +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_R(x) (0x00000800+((x)*128)) +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_R__SIZE 0x00000008 +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_G(x) (0x00000804+((x)*128)) +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_G__SIZE 0x00000008 +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_B(x) (0x00000808+((x)*128)) +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_B__SIZE 0x00000008 +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_R(x) (0x0000080c+((x)*128)) +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_R__SIZE 0x00000008 +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_G(x) (0x00000810+((x)*128)) +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_G__SIZE 0x00000008 +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_B(x) (0x00000814+((x)*128)) +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_B__SIZE 0x00000008 +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_R(x) (0x00000818+((x)*128)) +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_R__SIZE 0x00000008 +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_G(x) (0x0000081c+((x)*128)) +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_G__SIZE 0x00000008 +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_B(x) (0x00000820+((x)*128)) +#define NV10TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_B__SIZE 0x00000008 +#define NV10TCL_LIGHT_HALF_VECTOR_X(x) (0x00000828+((x)*128)) +#define NV10TCL_LIGHT_HALF_VECTOR_X__SIZE 0x00000008 +#define NV10TCL_LIGHT_HALF_VECTOR_Y(x) (0x0000082c+((x)*128)) +#define NV10TCL_LIGHT_HALF_VECTOR_Y__SIZE 0x00000008 
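The parameterized macros above (NV10TCL_MODELVIEW0_MATRIX(x), NV10TCL_PROJECTION_MATRIX(x), NV10TCL_CLIP_PLANE_*(x), the per-light products, ...) fold an array index into a method offset, and the matching __SIZE constant gives the number of cells in the range. As an illustration only, a minimal sketch of how a driver typically walks such a range; it assumes a hypothetical emit_method() submission helper, which is not part of this header:

#include <stdint.h>
#include <string.h>

/* Hypothetical command-submission helper; stands in for whatever the
 * winsys actually provides and is not part of this header. */
extern void emit_method(uint32_t method, uint32_t value);

/* Upload a 4x4 projection matrix, one 32-bit cell per method slot.
 * NV10TCL_PROJECTION_MATRIX(i) advances 4 bytes per cell and
 * NV10TCL_PROJECTION_MATRIX__SIZE (0x10) gives the cell count. */
static void
nv10_load_projection(const float m[16])
{
   unsigned i;
   uint32_t bits;

   for (i = 0; i < NV10TCL_PROJECTION_MATRIX__SIZE; i++) {
      memcpy(&bits, &m[i], sizeof bits);   /* reinterpret the float's bits */
      emit_method(NV10TCL_PROJECTION_MATRIX(i), bits);
   }
}

The same loop shape applies to any of the strided ranges above; only the macro and its __SIZE change.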
+#define NV10TCL_LIGHT_HALF_VECTOR_Z(x) (0x00000830+((x)*128))
+#define NV10TCL_LIGHT_HALF_VECTOR_Z__SIZE 0x00000008
+#define NV10TCL_LIGHT_DIRECTION_X(x) (0x00000834+((x)*128))
+#define NV10TCL_LIGHT_DIRECTION_X__SIZE 0x00000008
+#define NV10TCL_LIGHT_DIRECTION_Y(x) (0x00000838+((x)*128))
+#define NV10TCL_LIGHT_DIRECTION_Y__SIZE 0x00000008
+#define NV10TCL_LIGHT_DIRECTION_Z(x) (0x0000083c+((x)*128))
+#define NV10TCL_LIGHT_DIRECTION_Z__SIZE 0x00000008
+#define NV10TCL_LIGHT_SPOT_CUTOFF_A(x) (0x00000840+((x)*128))
+#define NV10TCL_LIGHT_SPOT_CUTOFF_A__SIZE 0x00000008
+#define NV10TCL_LIGHT_SPOT_CUTOFF_B(x) (0x00000844+((x)*128))
+#define NV10TCL_LIGHT_SPOT_CUTOFF_B__SIZE 0x00000008
+#define NV10TCL_LIGHT_SPOT_CUTOFF_C(x) (0x00000848+((x)*128))
+#define NV10TCL_LIGHT_SPOT_CUTOFF_C__SIZE 0x00000008
+#define NV10TCL_LIGHT_SPOT_DIR_X(x) (0x0000084c+((x)*128))
+#define NV10TCL_LIGHT_SPOT_DIR_X__SIZE 0x00000008
+#define NV10TCL_LIGHT_SPOT_DIR_Y(x) (0x00000850+((x)*128))
+#define NV10TCL_LIGHT_SPOT_DIR_Y__SIZE 0x00000008
+#define NV10TCL_LIGHT_SPOT_DIR_Z(x) (0x00000854+((x)*128))
+#define NV10TCL_LIGHT_SPOT_DIR_Z__SIZE 0x00000008
+#define NV10TCL_LIGHT_SPOT_CUTOFF_D(x) (0x00000858+((x)*128))
+#define NV10TCL_LIGHT_SPOT_CUTOFF_D__SIZE 0x00000008
+#define NV10TCL_LIGHT_POSITION_X(x) (0x0000085c+((x)*128))
+#define NV10TCL_LIGHT_POSITION_X__SIZE 0x00000008
+#define NV10TCL_LIGHT_POSITION_Y(x) (0x00000860+((x)*128))
+#define NV10TCL_LIGHT_POSITION_Y__SIZE 0x00000008
+#define NV10TCL_LIGHT_POSITION_Z(x) (0x00000864+((x)*128))
+#define NV10TCL_LIGHT_POSITION_Z__SIZE 0x00000008
+#define NV10TCL_LIGHT_ATTENUATION_CONSTANT(x) (0x00000868+((x)*128))
+#define NV10TCL_LIGHT_ATTENUATION_CONSTANT__SIZE 0x00000008
+#define NV10TCL_LIGHT_ATTENUATION_LINEAR(x) (0x0000086c+((x)*128))
+#define NV10TCL_LIGHT_ATTENUATION_LINEAR__SIZE 0x00000008
+#define NV10TCL_LIGHT_ATTENUATION_QUADRATIC(x) (0x00000870+((x)*128))
+#define NV10TCL_LIGHT_ATTENUATION_QUADRATIC__SIZE 0x00000008
+#define NV10TCL_VERTEX_POS_3F_X 0x00000c00
+#define NV10TCL_VERTEX_POS_3F_Y 0x00000c04
+#define NV10TCL_VERTEX_POS_3F_Z 0x00000c08
+#define NV10TCL_VERTEX_POS_4F_X 0x00000c18
+#define NV10TCL_VERTEX_POS_4F_Y 0x00000c1c
+#define NV10TCL_VERTEX_POS_4F_Z 0x00000c20
+#define NV10TCL_VERTEX_POS_4F_W 0x00000c24
+#define NV10TCL_VERTEX_NOR_3F_X 0x00000c30
+#define NV10TCL_VERTEX_NOR_3F_Y 0x00000c34
+#define NV10TCL_VERTEX_NOR_3F_Z 0x00000c38
+#define NV10TCL_VERTEX_NOR_3I_XY 0x00000c40
+#define NV10TCL_VERTEX_NOR_3I_XY_X_SHIFT 0
+#define NV10TCL_VERTEX_NOR_3I_XY_X_MASK 0x0000ffff
+#define NV10TCL_VERTEX_NOR_3I_XY_Y_SHIFT 16
+#define NV10TCL_VERTEX_NOR_3I_XY_Y_MASK 0xffff0000
+#define NV10TCL_VERTEX_NOR_3I_Z 0x00000c44
+#define NV10TCL_VERTEX_NOR_3I_Z_Z_SHIFT 0
+#define NV10TCL_VERTEX_NOR_3I_Z_Z_MASK 0x0000ffff
+#define NV10TCL_VERTEX_COL_4F_R 0x00000c50
+#define NV10TCL_VERTEX_COL_4F_G 0x00000c54
+#define NV10TCL_VERTEX_COL_4F_B 0x00000c58
+#define NV10TCL_VERTEX_COL_4F_A 0x00000c5c
+#define NV10TCL_VERTEX_COL_3F_R 0x00000c60
+#define NV10TCL_VERTEX_COL_3F_G 0x00000c64
+#define NV10TCL_VERTEX_COL_3F_B 0x00000c68
+#define NV10TCL_VERTEX_COL_4I 0x00000c6c
+#define NV10TCL_VERTEX_COL_4I_R_SHIFT 0
+#define NV10TCL_VERTEX_COL_4I_R_MASK 0x000000ff
+#define NV10TCL_VERTEX_COL_4I_G_SHIFT 8
+#define NV10TCL_VERTEX_COL_4I_G_MASK 0x0000ff00
+#define NV10TCL_VERTEX_COL_4I_B_SHIFT 16
+#define NV10TCL_VERTEX_COL_4I_B_MASK 0x00ff0000
+#define NV10TCL_VERTEX_COL_4I_A_SHIFT 24
+#define NV10TCL_VERTEX_COL_4I_A_MASK 0xff000000
+#define NV10TCL_VERTEX_COL2_3F_R 0x00000c80
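Single-word methods that pack several fields are described by _SHIFT/_MASK pairs, as with NV10TCL_VERTEX_COL_4I above. A minimal sketch of the usual packing idiom, reusing the hypothetical emit_method() helper assumed in the previous sketch:

#include <stdint.h>

extern void emit_method(uint32_t method, uint32_t value);   /* hypothetical, as above */

/* Pack an 8-bit-per-channel RGBA color into the single packed color method.
 * The inputs already fit their masks, so only the shifts are needed here. */
static void
nv10_emit_color_4ub(uint8_t r, uint8_t g, uint8_t b, uint8_t a)
{
   uint32_t col;

   col = ((uint32_t)r << NV10TCL_VERTEX_COL_4I_R_SHIFT) |
         ((uint32_t)g << NV10TCL_VERTEX_COL_4I_G_SHIFT) |
         ((uint32_t)b << NV10TCL_VERTEX_COL_4I_B_SHIFT) |
         ((uint32_t)a << NV10TCL_VERTEX_COL_4I_A_SHIFT);
   emit_method(NV10TCL_VERTEX_COL_4I, col);
}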
+#define NV10TCL_VERTEX_COL2_3F_G 0x00000c84 +#define NV10TCL_VERTEX_COL2_3F_B 0x00000c88 +#define NV10TCL_VERTEX_COL2_3I 0x00000c8c +#define NV10TCL_VERTEX_COL2_3I_R_SHIFT 0 +#define NV10TCL_VERTEX_COL2_3I_R_MASK 0x000000ff +#define NV10TCL_VERTEX_COL2_3I_G_SHIFT 8 +#define NV10TCL_VERTEX_COL2_3I_G_MASK 0x0000ff00 +#define NV10TCL_VERTEX_COL2_3I_B_SHIFT 16 +#define NV10TCL_VERTEX_COL2_3I_B_MASK 0x00ff0000 +#define NV10TCL_VERTEX_TX0_2F_S 0x00000c90 +#define NV10TCL_VERTEX_TX0_2F_T 0x00000c94 +#define NV10TCL_VERTEX_TX0_2I 0x00000c98 +#define NV10TCL_VERTEX_TX0_2I_S_SHIFT 0 +#define NV10TCL_VERTEX_TX0_2I_S_MASK 0x0000ffff +#define NV10TCL_VERTEX_TX0_2I_T_SHIFT 16 +#define NV10TCL_VERTEX_TX0_2I_T_MASK 0xffff0000 +#define NV10TCL_VERTEX_TX0_4F_S 0x00000ca0 +#define NV10TCL_VERTEX_TX0_4F_T 0x00000ca4 +#define NV10TCL_VERTEX_TX0_4F_R 0x00000ca8 +#define NV10TCL_VERTEX_TX0_4F_Q 0x00000cac +#define NV10TCL_VERTEX_TX0_4I_ST 0x00000cb0 +#define NV10TCL_VERTEX_TX0_4I_ST_S_SHIFT 0 +#define NV10TCL_VERTEX_TX0_4I_ST_S_MASK 0x0000ffff +#define NV10TCL_VERTEX_TX0_4I_ST_T_SHIFT 16 +#define NV10TCL_VERTEX_TX0_4I_ST_T_MASK 0xffff0000 +#define NV10TCL_VERTEX_TX0_4I_RQ 0x00000cb4 +#define NV10TCL_VERTEX_TX0_4I_RQ_R_SHIFT 0 +#define NV10TCL_VERTEX_TX0_4I_RQ_R_MASK 0x0000ffff +#define NV10TCL_VERTEX_TX0_4I_RQ_Q_SHIFT 16 +#define NV10TCL_VERTEX_TX0_4I_RQ_Q_MASK 0xffff0000 +#define NV10TCL_VERTEX_TX1_2F_S 0x00000cb8 +#define NV10TCL_VERTEX_TX1_2F_T 0x00000cbc +#define NV10TCL_VERTEX_TX1_2I 0x00000cc0 +#define NV10TCL_VERTEX_TX1_2I_S_SHIFT 0 +#define NV10TCL_VERTEX_TX1_2I_S_MASK 0x0000ffff +#define NV10TCL_VERTEX_TX1_2I_T_SHIFT 16 +#define NV10TCL_VERTEX_TX1_2I_T_MASK 0xffff0000 +#define NV10TCL_VERTEX_TX1_4F_S 0x00000cc8 +#define NV10TCL_VERTEX_TX1_4F_T 0x00000ccc +#define NV10TCL_VERTEX_TX1_4F_R 0x00000cd0 +#define NV10TCL_VERTEX_TX1_4F_Q 0x00000cd4 +#define NV10TCL_VERTEX_TX1_4I_ST 0x00000cd8 +#define NV10TCL_VERTEX_TX1_4I_ST_S_SHIFT 0 +#define NV10TCL_VERTEX_TX1_4I_ST_S_MASK 0x0000ffff +#define NV10TCL_VERTEX_TX1_4I_ST_T_SHIFT 16 +#define NV10TCL_VERTEX_TX1_4I_ST_T_MASK 0xffff0000 +#define NV10TCL_VERTEX_TX1_4I_RQ 0x00000cdc +#define NV10TCL_VERTEX_TX1_4I_RQ_R_SHIFT 0 +#define NV10TCL_VERTEX_TX1_4I_RQ_R_MASK 0x0000ffff +#define NV10TCL_VERTEX_TX1_4I_RQ_Q_SHIFT 16 +#define NV10TCL_VERTEX_TX1_4I_RQ_Q_MASK 0xffff0000 +#define NV10TCL_VERTEX_FOG_1F 0x00000ce0 +#define NV10TCL_VERTEX_WGH_1F 0x00000ce4 +#define NV10TCL_EDGEFLAG_ENABLE 0x00000cec +#define NV10TCL_VERTEX_ARRAY_VALIDATE 0x00000cf0 +#define NV10TCL_VERTEX_ARRAY_ATTRIB_OFFSET(x) (0x00000d00+((x)*8)) +#define NV10TCL_VERTEX_ARRAY_ATTRIB_OFFSET__SIZE 0x00000008 +#define NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT(x) (0x00000d04+((x)*8)) +#define NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT__SIZE 0x00000008 +#define NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT_TYPE_SHIFT 0 +#define NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT_TYPE_MASK 0x0000000f +#define NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT_FIELDS_SHIFT 4 +#define NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT_FIELDS_MASK 0x000000f0 +#define NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT_STRIDE_SHIFT 8 +#define NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT_STRIDE_MASK 0x0000ff00 +#define NV10TCL_VERTEX_ARRAY_OFFSET_POS 0x00000d00 +#define NV10TCL_VERTEX_ARRAY_FORMAT_POS 0x00000d04 +#define NV10TCL_VERTEX_ARRAY_FORMAT_POS_TYPE_SHIFT 0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_POS_TYPE_MASK 0x0000000f +#define NV10TCL_VERTEX_ARRAY_FORMAT_POS_FIELDS_SHIFT 4 +#define NV10TCL_VERTEX_ARRAY_FORMAT_POS_FIELDS_MASK 0x000000f0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_POS_STRIDE_SHIFT 8 +#define 
NV10TCL_VERTEX_ARRAY_FORMAT_POS_STRIDE_MASK 0x0000ff00 +#define NV10TCL_VERTEX_ARRAY_OFFSET_COL 0x00000d08 +#define NV10TCL_VERTEX_ARRAY_FORMAT_COL 0x00000d0c +#define NV10TCL_VERTEX_ARRAY_FORMAT_COL_TYPE_SHIFT 0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_COL_TYPE_MASK 0x0000000f +#define NV10TCL_VERTEX_ARRAY_FORMAT_COL_FIELDS_SHIFT 4 +#define NV10TCL_VERTEX_ARRAY_FORMAT_COL_FIELDS_MASK 0x000000f0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_COL_STRIDE_SHIFT 8 +#define NV10TCL_VERTEX_ARRAY_FORMAT_COL_STRIDE_MASK 0x0000ff00 +#define NV10TCL_VERTEX_ARRAY_OFFSET_COL2 0x00000d10 +#define NV10TCL_VERTEX_ARRAY_FORMAT_COL2 0x00000d14 +#define NV10TCL_VERTEX_ARRAY_FORMAT_COL2_TYPE_SHIFT 0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_COL2_TYPE_MASK 0x0000000f +#define NV10TCL_VERTEX_ARRAY_FORMAT_COL2_FIELDS_SHIFT 4 +#define NV10TCL_VERTEX_ARRAY_FORMAT_COL2_FIELDS_MASK 0x000000f0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_COL2_STRIDE_SHIFT 8 +#define NV10TCL_VERTEX_ARRAY_FORMAT_COL2_STRIDE_MASK 0x0000ff00 +#define NV10TCL_VERTEX_ARRAY_OFFSET_TX0 0x00000d18 +#define NV10TCL_VERTEX_ARRAY_FORMAT_TX0 0x00000d1c +#define NV10TCL_VERTEX_ARRAY_FORMAT_TX0_TYPE_SHIFT 0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_TX0_TYPE_MASK 0x0000000f +#define NV10TCL_VERTEX_ARRAY_FORMAT_TX0_FIELDS_SHIFT 4 +#define NV10TCL_VERTEX_ARRAY_FORMAT_TX0_FIELDS_MASK 0x000000f0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_TX0_STRIDE_SHIFT 8 +#define NV10TCL_VERTEX_ARRAY_FORMAT_TX0_STRIDE_MASK 0x0000ff00 +#define NV10TCL_VERTEX_ARRAY_OFFSET_TX1 0x00000d20 +#define NV10TCL_VERTEX_ARRAY_FORMAT_TX1 0x00000d24 +#define NV10TCL_VERTEX_ARRAY_FORMAT_TX1_TYPE_SHIFT 0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_TX1_TYPE_MASK 0x0000000f +#define NV10TCL_VERTEX_ARRAY_FORMAT_TX1_FIELDS_SHIFT 4 +#define NV10TCL_VERTEX_ARRAY_FORMAT_TX1_FIELDS_MASK 0x000000f0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_TX1_STRIDE_SHIFT 8 +#define NV10TCL_VERTEX_ARRAY_FORMAT_TX1_STRIDE_MASK 0x0000ff00 +#define NV10TCL_VERTEX_ARRAY_OFFSET_NOR 0x00000d28 +#define NV10TCL_VERTEX_ARRAY_FORMAT_NOR 0x00000d2c +#define NV10TCL_VERTEX_ARRAY_FORMAT_NOR_TYPE_SHIFT 0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_NOR_TYPE_MASK 0x0000000f +#define NV10TCL_VERTEX_ARRAY_FORMAT_NOR_FIELDS_SHIFT 4 +#define NV10TCL_VERTEX_ARRAY_FORMAT_NOR_FIELDS_MASK 0x000000f0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_NOR_STRIDE_SHIFT 8 +#define NV10TCL_VERTEX_ARRAY_FORMAT_NOR_STRIDE_MASK 0x0000ff00 +#define NV10TCL_VERTEX_ARRAY_OFFSET_WGH 0x00000d30 +#define NV10TCL_VERTEX_ARRAY_FORMAT_WGH 0x00000d34 +#define NV10TCL_VERTEX_ARRAY_FORMAT_WGH_TYPE_SHIFT 0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_WGH_TYPE_MASK 0x0000000f +#define NV10TCL_VERTEX_ARRAY_FORMAT_WGH_FIELDS_SHIFT 4 +#define NV10TCL_VERTEX_ARRAY_FORMAT_WGH_FIELDS_MASK 0x000000f0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_WGH_STRIDE_SHIFT 8 +#define NV10TCL_VERTEX_ARRAY_FORMAT_WGH_STRIDE_MASK 0x0000ff00 +#define NV10TCL_VERTEX_ARRAY_OFFSET_FOG 0x00000d38 +#define NV10TCL_VERTEX_ARRAY_FORMAT_FOG 0x00000d3c +#define NV10TCL_VERTEX_ARRAY_FORMAT_FOG_TYPE_SHIFT 0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_FOG_TYPE_MASK 0x0000000f +#define NV10TCL_VERTEX_ARRAY_FORMAT_FOG_FIELDS_SHIFT 4 +#define NV10TCL_VERTEX_ARRAY_FORMAT_FOG_FIELDS_MASK 0x000000f0 +#define NV10TCL_VERTEX_ARRAY_FORMAT_FOG_STRIDE_SHIFT 8 +#define NV10TCL_VERTEX_ARRAY_FORMAT_FOG_STRIDE_MASK 0x0000ff00 +#define NV10TCL_VERTEX_BEGIN_END 0x00000dfc +#define NV10TCL_VERTEX_BEGIN_END_STOP 0x00000000 +#define NV10TCL_VERTEX_BEGIN_END_POINTS 0x00000001 +#define NV10TCL_VERTEX_BEGIN_END_LINES 0x00000002 +#define NV10TCL_VERTEX_BEGIN_END_LINE_LOOP 0x00000003 +#define 
NV10TCL_VERTEX_BEGIN_END_LINE_STRIP 0x00000004 +#define NV10TCL_VERTEX_BEGIN_END_TRIANGLES 0x00000005 +#define NV10TCL_VERTEX_BEGIN_END_TRIANGLE_STRIP 0x00000006 +#define NV10TCL_VERTEX_BEGIN_END_TRIANGLE_FAN 0x00000007 +#define NV10TCL_VERTEX_BEGIN_END_QUADS 0x00000008 +#define NV10TCL_VERTEX_BEGIN_END_QUAD_STRIP 0x00000009 +#define NV10TCL_VERTEX_BEGIN_END_POLYGON 0x0000000a +#define NV10TCL_VB_ELEMENT_U16 0x00000e00 +#define NV10TCL_VB_ELEMENT_U16_I0_SHIFT 0 +#define NV10TCL_VB_ELEMENT_U16_I0_MASK 0x0000ffff +#define NV10TCL_VB_ELEMENT_U16_I1_SHIFT 16 +#define NV10TCL_VB_ELEMENT_U16_I1_MASK 0xffff0000 +#define NV10TCL_VB_ELEMENT_U32 0x00001100 +#define NV10TCL_VERTEX_BUFFER_BEGIN_END 0x000013fc +#define NV10TCL_VERTEX_BUFFER_BEGIN_END_STOP 0x00000000 +#define NV10TCL_VERTEX_BUFFER_BEGIN_END_POINTS 0x00000001 +#define NV10TCL_VERTEX_BUFFER_BEGIN_END_LINES 0x00000002 +#define NV10TCL_VERTEX_BUFFER_BEGIN_END_LINE_LOOP 0x00000003 +#define NV10TCL_VERTEX_BUFFER_BEGIN_END_LINE_STRIP 0x00000004 +#define NV10TCL_VERTEX_BUFFER_BEGIN_END_TRIANGLES 0x00000005 +#define NV10TCL_VERTEX_BUFFER_BEGIN_END_TRIANGLE_STRIP 0x00000006 +#define NV10TCL_VERTEX_BUFFER_BEGIN_END_TRIANGLE_FAN 0x00000007 +#define NV10TCL_VERTEX_BUFFER_BEGIN_END_QUADS 0x00000008 +#define NV10TCL_VERTEX_BUFFER_BEGIN_END_QUAD_STRIP 0x00000009 +#define NV10TCL_VERTEX_BUFFER_BEGIN_END_POLYGON 0x0000000a +#define NV10TCL_VERTEX_BUFFER_DRAW_ARRAYS 0x00001400 +#define NV10TCL_VERTEX_BUFFER_DRAW_ARRAYS_FIRST_SHIFT 0 +#define NV10TCL_VERTEX_BUFFER_DRAW_ARRAYS_FIRST_MASK 0x0000ffff +#define NV10TCL_VERTEX_BUFFER_DRAW_ARRAYS_LAST_SHIFT 24 +#define NV10TCL_VERTEX_BUFFER_DRAW_ARRAYS_LAST_MASK 0xff000000 +#define NV10TCL_VERTEX_ARRAY_DATA 0x00001800 + + +#define NV04_CONTEXT_COLOR_KEY 0x00000057 + + + +#define NV03_CONTEXT_SURFACES_2D 0x00000058 + +#define NV03_CONTEXT_SURFACES_2D_SYNCHRONIZE 0x00000100 +#define NV03_CONTEXT_SURFACES_2D_DMA_NOTIFY 0x00000180 +#define NV03_CONTEXT_SURFACES_2D_DMA_SOURCE 0x00000184 +#define NV03_CONTEXT_SURFACES_2D_DMA_DESTIN 0x00000188 +#define NV03_CONTEXT_SURFACES_2D_COLOR_FORMAT 0x00000300 +#define NV03_CONTEXT_SURFACES_2D_PITCH 0x00000304 +#define NV03_CONTEXT_SURFACES_2D_PITCH_SOURCE_SHIFT 0 +#define NV03_CONTEXT_SURFACES_2D_PITCH_SOURCE_MASK 0x0000ffff +#define NV03_CONTEXT_SURFACES_2D_PITCH_DESTIN_SHIFT 16 +#define NV03_CONTEXT_SURFACES_2D_PITCH_DESTIN_MASK 0xffff0000 +#define NV03_CONTEXT_SURFACES_2D_OFFSET_SOURCE 0x00000308 +#define NV03_CONTEXT_SURFACES_2D_OFFSET_DESTIN 0x0000030c + + +#define NV03_CONTEXT_SURFACES_3D 0x0000005a + +#define NV03_CONTEXT_SURFACES_3D_SYNCHRONIZE 0x00000100 +#define NV03_CONTEXT_SURFACES_3D_DMA_NOTIFY 0x00000180 +#define NV03_CONTEXT_SURFACES_3D_DMA_SURFACE 0x00000184 +#define NV03_CONTEXT_SURFACES_3D_PITCH 0x00000300 +#define NV03_CONTEXT_SURFACES_3D_OFFSET_COLOR 0x00000304 +#define NV03_CONTEXT_SURFACES_3D_OFFSET_ZETA 0x00000308 + + +#define NV04_RENDER_SOLID_LINE 0x0000005c + +#define NV04_RENDER_SOLID_LINE_SURFACE 0x00000198 + + +#define NV04_RENDER_SOLID_TRIANGLE 0x0000005d + + + +#define NV04_RENDER_SOLID_RECTANGLE 0x0000005e + +#define NV04_RENDER_SOLID_RECTANGLE_SURFACE 0x00000198 + + +#define NV04_IMAGE_BLIT 0x0000005f + +#define NV04_IMAGE_BLIT_NOP 0x00000100 +#define NV04_IMAGE_BLIT_NOTIFY 0x00000104 +#define NV04_IMAGE_BLIT_DMA_NOTIFY 0x00000180 +#define NV04_IMAGE_BLIT_COLOR_KEY 0x00000184 +#define NV04_IMAGE_BLIT_CLIP_RECTANGLE 0x00000188 +#define NV04_IMAGE_BLIT_PATTERN 0x0000018c +#define NV04_IMAGE_BLIT_ROP 0x00000190 +#define NV04_IMAGE_BLIT_BETA4 
0x00000198 +#define NV04_IMAGE_BLIT_SURFACE 0x0000019c +#define NV04_IMAGE_BLIT_OPERATION 0x000002fc +#define NV04_IMAGE_BLIT_OPERATION_SRCCOPY_AND 0x00000000 +#define NV04_IMAGE_BLIT_OPERATION_ROP_AND 0x00000001 +#define NV04_IMAGE_BLIT_OPERATION_BLEND_AND 0x00000002 +#define NV04_IMAGE_BLIT_OPERATION_SRCCOPY 0x00000003 +#define NV04_IMAGE_BLIT_OPERATION_SRCCOPY_PREMULT 0x00000004 +#define NV04_IMAGE_BLIT_OPERATION_BLEND_PREMULT 0x00000005 + + +#define NV04_INDEXED_IMAGE_FROM_CPU 0x00000060 + +#define NV04_INDEXED_IMAGE_FROM_CPU_NOP 0x00000100 +#define NV04_INDEXED_IMAGE_FROM_CPU_NOTIFY 0x00000104 +#define NV04_INDEXED_IMAGE_FROM_CPU_PATCH 0x0000010c +#define NV04_INDEXED_IMAGE_FROM_CPU_DMA_NOTIFY 0x00000180 +#define NV04_INDEXED_IMAGE_FROM_CPU_DMA_LUT 0x00000184 +#define NV04_INDEXED_IMAGE_FROM_CPU_COLOR_FORMAT 0x000003e8 +#define NV04_INDEXED_IMAGE_FROM_CPU_INDEX_FORMAT 0x000003ec +#define NV04_INDEXED_IMAGE_FROM_CPU_LUT_OFFSET 0x000003f0 +#define NV04_INDEXED_IMAGE_FROM_CPU_POINT 0x000003f4 +#define NV04_INDEXED_IMAGE_FROM_CPU_SIZE_OUT 0x000003f8 +#define NV04_INDEXED_IMAGE_FROM_CPU_SIZE_IN 0x000003fc +#define NV04_INDEXED_IMAGE_FROM_CPU_COLOR 0x00000400 + + +#define NV04_IMAGE_FROM_CPU 0x00000061 + +#define NV04_IMAGE_FROM_CPU_BETA4 0x00000198 +#define NV04_IMAGE_FROM_CPU_SURFACE 0x0000019c + + +#define NV10_CONTEXT_SURFACES_2D 0x00000062 + + + +#define NV05_SCALED_IMAGE_FROM_MEMORY 0x00000063 + +#define NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION 0x000002fc +#define NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_DITHER 0x00000000 +#define NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE 0x00000001 +#define NV05_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_SUBTR_TRUNCATE 0x00000002 + + +#define NV01_IMAGE_SRCCOPY_AND 0x00000064 + +#define NV01_IMAGE_SRCCOPY_AND_NOTIFY 0x00000104 +#define NV01_IMAGE_SRCCOPY_AND_DMA_NOTIFY 0x00000180 +#define NV01_IMAGE_SRCCOPY_AND_IMAGE_OUTPUT 0x00000200 +#define NV01_IMAGE_SRCCOPY_AND_IMAGE_INPUT 0x00000204 + + +#define NV05_INDEXED_IMAGE_FROM_CPU 0x00000064 + +#define NV05_INDEXED_IMAGE_FROM_CPU_COLOR_KEY 0x00000188 +#define NV05_INDEXED_IMAGE_FROM_CPU_CLIP_RECTANGLE 0x0000018c +#define NV05_INDEXED_IMAGE_FROM_CPU_PATTERN 0x00000190 +#define NV05_INDEXED_IMAGE_FROM_CPU_ROP 0x00000194 +#define NV05_INDEXED_IMAGE_FROM_CPU_BETA1 0x00000198 +#define NV05_INDEXED_IMAGE_FROM_CPU_BETA4 0x0000019c +#define NV05_INDEXED_IMAGE_FROM_CPU_SURFACE 0x000001a0 +#define NV05_INDEXED_IMAGE_FROM_CPU_COLOR_CONVERSION 0x000003e0 +#define NV05_INDEXED_IMAGE_FROM_CPU_OPERATION 0x000003e4 +#define NV05_INDEXED_IMAGE_FROM_CPU_INDICES 0x00000400 + + +#define NV05_IMAGE_FROM_CPU 0x00000065 + +#define NV05_IMAGE_FROM_CPU_BETA4 0x00000198 +#define NV05_IMAGE_FROM_CPU_SURFACE 0x0000019c + + +#define NV05_STRETCHED_IMAGE_FROM_CPU 0x00000066 + +#define NV05_STRETCHED_IMAGE_FROM_CPU_BETA4 0x00000194 +#define NV05_STRETCHED_IMAGE_FROM_CPU_SURFACE 0x00000198 +#define NV05_STRETCHED_IMAGE_FROM_CPU_COLOR_CONVERSION 0x000002f8 + + +#define NV04_IMAGE_BLEND_PREMULT 0x00000067 + +#define NV04_IMAGE_BLEND_PREMULT_NOP 0x00000100 +#define NV04_IMAGE_BLEND_PREMULT_NOTIFY 0x00000104 +#define NV04_IMAGE_BLEND_PREMULT_DMA_NOTIFY 0x00000180 +#define NV04_IMAGE_BLEND_PREMULT_IMAGE_OUTPUT 0x00000200 +#define NV04_IMAGE_BLEND_PREMULT_BETA_INPUT 0x00000204 +#define NV04_IMAGE_BLEND_PREMULT_IMAGE_INPUT 0x00000208 + + +#define NV03_CHANNEL_PIO 0x0000006a + + + +#define NV03_CHANNEL_DMA 0x0000006b + + + +#define NV04_BETA_SOLID 0x00000072 + +#define NV04_BETA_SOLID_NOP 0x00000100 +#define 
NV04_BETA_SOLID_NOTIFY 0x00000104 +#define NV04_BETA_SOLID_DMA_NOTIFY 0x00000180 +#define NV04_BETA_SOLID_BETA_OUTPUT 0x00000200 +#define NV04_BETA_SOLID_BETA_FACTOR 0x00000300 + + +#define NV04_STRETCHED_IMAGE_FROM_CPU 0x00000076 + + + +#define NV04_SCALED_IMAGE_FROM_MEMORY 0x00000077 + +#define NV04_SCALED_IMAGE_FROM_MEMORY_NOP 0x00000100 +#define NV04_SCALED_IMAGE_FROM_MEMORY_NOTIFY 0x00000104 +#define NV04_SCALED_IMAGE_FROM_MEMORY_DMA_NOTIFY 0x00000180 +#define NV04_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE 0x00000184 +#define NV04_SCALED_IMAGE_FROM_MEMORY_PATTERN 0x00000188 +#define NV04_SCALED_IMAGE_FROM_MEMORY_ROP 0x0000018c +#define NV04_SCALED_IMAGE_FROM_MEMORY_BETA1 0x00000190 +#define NV04_SCALED_IMAGE_FROM_MEMORY_BETA4 0x00000194 +#define NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE 0x00000198 +#define NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION 0x000002fc +#define NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_DITHER 0x00000000 +#define NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE 0x00000001 +#define NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_SUBTR_TRUNCATE 0x00000002 +#define NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT 0x00000300 +#define NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5 0x00000001 +#define NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X1R5G5B5 0x00000002 +#define NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8 0x00000003 +#define NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8 0x00000004 +#define NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_V8YB8U8YA8 0x00000005 +#define NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_YB8V8YA8U8 0x00000006 +#define NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5 0x00000007 +#define NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_Y8 0x00000008 +#define NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_AY8 0x00000009 +#define NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION 0x00000304 +#define NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY_AND 0x00000000 +#define NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_ROP_AND 0x00000001 +#define NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_BLEND_AND 0x00000002 +#define NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY 0x00000003 +#define NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY_PREMULT 0x00000004 +#define NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_BLEND_PREMULT 0x00000005 +#define NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT 0x00000308 +#define NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_X_SHIFT 0 +#define NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_X_MASK 0x0000ffff +#define NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT 16 +#define NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_MASK 0xffff0000 +#define NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE 0x0000030c +#define NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_W_SHIFT 0 +#define NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_W_MASK 0x0000ffff +#define NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT 16 +#define NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_MASK 0xffff0000 +#define NV04_SCALED_IMAGE_FROM_MEMORY_OUT_POINT 0x00000310 +#define NV04_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_X_SHIFT 0 +#define NV04_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_X_MASK 0x0000ffff +#define NV04_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT 16 +#define NV04_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_MASK 0xffff0000 +#define NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE 0x00000314 +#define NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_W_SHIFT 0 +#define NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_W_MASK 0x0000ffff +#define NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT 16 +#define NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_MASK 0xffff0000 +#define 
NV04_SCALED_IMAGE_FROM_MEMORY_DU_DX 0x00000318 +#define NV04_SCALED_IMAGE_FROM_MEMORY_DV_DY 0x0000031c +#define NV04_SCALED_IMAGE_FROM_MEMORY_SIZE 0x00000400 +#define NV04_SCALED_IMAGE_FROM_MEMORY_SIZE_W_SHIFT 0 +#define NV04_SCALED_IMAGE_FROM_MEMORY_SIZE_W_MASK 0x0000ffff +#define NV04_SCALED_IMAGE_FROM_MEMORY_SIZE_H_SHIFT 16 +#define NV04_SCALED_IMAGE_FROM_MEMORY_SIZE_H_MASK 0xffff0000 +#define NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT 0x00000404 +#define NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_PITCH_SHIFT 0 +#define NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_PITCH_MASK 0x0000ffff +#define NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_SHIFT 16 +#define NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_MASK 0x00ff0000 +#define NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER 0x00010000 +#define NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CORNER 0x00020000 +#define NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_SHIFT 24 +#define NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_MASK 0xff000000 +#define NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE 0x00000000 +#define NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_BILINEAR 0x01000000 +#define NV04_SCALED_IMAGE_FROM_MEMORY_ADDRESS 0x00000408 +#define NV04_SCALED_IMAGE_FROM_MEMORY_POINT 0x0000040c +#define NV04_SCALED_IMAGE_FROM_MEMORY_POINT_X_SHIFT 0 +#define NV04_SCALED_IMAGE_FROM_MEMORY_POINT_X_MASK 0x0000ffff +#define NV04_SCALED_IMAGE_FROM_MEMORY_POINT_Y_SHIFT 16 +#define NV04_SCALED_IMAGE_FROM_MEMORY_POINT_Y_MASK 0xffff0000 + + +#define NV10_TEXTURE_FROM_CPU 0x0000007b + +#define NV10_TEXTURE_FROM_CPU_NOP 0x00000100 +#define NV10_TEXTURE_FROM_CPU_NOTIFY 0x00000104 +#define NV10_TEXTURE_FROM_CPU_WAIT_FOR_IDLE 0x00000108 +#define NV10_TEXTURE_FROM_CPU_PM_TRIGGER 0x00000140 +#define NV10_TEXTURE_FROM_CPU_DMA_NOTIFY 0x00000180 +#define NV10_TEXTURE_FROM_CPU_SURFACE 0x00000184 +#define NV10_TEXTURE_FROM_CPU_COLOR_FORMAT 0x00000300 +#define NV10_TEXTURE_FROM_CPU_POINT 0x00000304 +#define NV10_TEXTURE_FROM_CPU_POINT_X_SHIFT 0 +#define NV10_TEXTURE_FROM_CPU_POINT_X_MASK 0x0000ffff +#define NV10_TEXTURE_FROM_CPU_POINT_Y_SHIFT 16 +#define NV10_TEXTURE_FROM_CPU_POINT_Y_MASK 0xffff0000 +#define NV10_TEXTURE_FROM_CPU_SIZE 0x00000308 +#define NV10_TEXTURE_FROM_CPU_SIZE_W_SHIFT 0 +#define NV10_TEXTURE_FROM_CPU_SIZE_W_MASK 0x0000ffff +#define NV10_TEXTURE_FROM_CPU_SIZE_H_SHIFT 16 +#define NV10_TEXTURE_FROM_CPU_SIZE_H_MASK 0xffff0000 +#define NV10_TEXTURE_FROM_CPU_CLIP_HORIZONTAL 0x0000030c +#define NV10_TEXTURE_FROM_CPU_CLIP_HORIZONTAL_X_SHIFT 0 +#define NV10_TEXTURE_FROM_CPU_CLIP_HORIZONTAL_X_MASK 0x0000ffff +#define NV10_TEXTURE_FROM_CPU_CLIP_HORIZONTAL_W_SHIFT 16 +#define NV10_TEXTURE_FROM_CPU_CLIP_HORIZONTAL_W_MASK 0xffff0000 +#define NV10_TEXTURE_FROM_CPU_CLIP_VERTICAL 0x00000310 +#define NV10_TEXTURE_FROM_CPU_CLIP_VERTICAL_Y_SHIFT 0 +#define NV10_TEXTURE_FROM_CPU_CLIP_VERTICAL_Y_MASK 0x0000ffff +#define NV10_TEXTURE_FROM_CPU_CLIP_VERTICAL_H_SHIFT 16 +#define NV10_TEXTURE_FROM_CPU_CLIP_VERTICAL_H_MASK 0xffff0000 +#define NV10_TEXTURE_FROM_CPU_COLOR(x) (0x00000400+((x)*4)) +#define NV10_TEXTURE_FROM_CPU_COLOR__SIZE 0x00000700 + + +#define NV10_VIDEO_DISPLAY 0x0000007c + + + +#define NV10_DVD_SUBPICTURE 0x00000088 + + + +#define NV10_SCALED_IMAGE_FROM_MEMORY 0x00000089 + +#define NV10_SCALED_IMAGE_FROM_MEMORY_WAIT_FOR_IDLE 0x00000108 + + +#define NV10_IMAGE_FROM_CPU 0x0000008a + +#define NV10_IMAGE_FROM_CPU_COLOR_CONVERSION 0x000002f8 + + +#define NV10_CONTEXT_SURFACES_3D 0x00000093 + + + +#define NV10_DX5_TEXTURE_TRIANGLE 0x00000094 + + + +#define 
NV10_DX6_MULTI_TEXTURE_TRIANGLE 0x00000095 + + + +#define NV11TCL 0x00000096 + +#define NV11TCL_COLOR_LOGIC_OP_ENABLE 0x00000d40 +#define NV11TCL_COLOR_LOGIC_OP_OP 0x00000d44 +#define NV11TCL_COLOR_LOGIC_OP_OP_CLEAR 0x00001500 +#define NV11TCL_COLOR_LOGIC_OP_OP_AND 0x00001501 +#define NV11TCL_COLOR_LOGIC_OP_OP_AND_REVERSE 0x00001502 +#define NV11TCL_COLOR_LOGIC_OP_OP_COPY 0x00001503 +#define NV11TCL_COLOR_LOGIC_OP_OP_AND_INVERTED 0x00001504 +#define NV11TCL_COLOR_LOGIC_OP_OP_NOOP 0x00001505 +#define NV11TCL_COLOR_LOGIC_OP_OP_XOR 0x00001506 +#define NV11TCL_COLOR_LOGIC_OP_OP_OR 0x00001507 +#define NV11TCL_COLOR_LOGIC_OP_OP_NOR 0x00001508 +#define NV11TCL_COLOR_LOGIC_OP_OP_EQUIV 0x00001509 +#define NV11TCL_COLOR_LOGIC_OP_OP_INVERT 0x0000150a +#define NV11TCL_COLOR_LOGIC_OP_OP_OR_REVERSE 0x0000150b +#define NV11TCL_COLOR_LOGIC_OP_OP_COPY_INVERTED 0x0000150c +#define NV11TCL_COLOR_LOGIC_OP_OP_OR_INVERTED 0x0000150d +#define NV11TCL_COLOR_LOGIC_OP_OP_NAND 0x0000150e +#define NV11TCL_COLOR_LOGIC_OP_OP_SET 0x0000150f + + +#define NV20TCL 0x00000097 + +#define NV20TCL_NOP 0x00000100 +#define NV20TCL_NOTIFY 0x00000104 +#define NV20TCL_DMA_NOTIFY 0x00000180 +#define NV20TCL_DMA_TEXTURE0 0x00000184 +#define NV20TCL_DMA_TEXTURE1 0x00000188 +#define NV20TCL_DMA_COLOR 0x00000194 +#define NV20TCL_DMA_ZETA 0x00000198 +#define NV20TCL_DMA_VTXBUF0 0x0000019c +#define NV20TCL_DMA_VTXBUF1 0x000001a0 +#define NV20TCL_DMA_FENCE 0x000001a4 +#define NV20TCL_DMA_QUERY 0x000001a8 +#define NV20TCL_RT_HORIZ 0x00000200 +#define NV20TCL_RT_HORIZ_X_SHIFT 0 +#define NV20TCL_RT_HORIZ_X_MASK 0x0000ffff +#define NV20TCL_RT_HORIZ_W_SHIFT 16 +#define NV20TCL_RT_HORIZ_W_MASK 0xffff0000 +#define NV20TCL_RT_VERT 0x00000204 +#define NV20TCL_RT_VERT_Y_SHIFT 0 +#define NV20TCL_RT_VERT_Y_MASK 0x0000ffff +#define NV20TCL_RT_VERT_H_SHIFT 16 +#define NV20TCL_RT_VERT_H_MASK 0xffff0000 +#define NV20TCL_RT_FORMAT 0x00000208 +#define NV20TCL_RT_FORMAT_TYPE_SHIFT 8 +#define NV20TCL_RT_FORMAT_TYPE_MASK 0x00000f00 +#define NV20TCL_RT_FORMAT_TYPE_LINEAR 0x00000100 +#define NV20TCL_RT_FORMAT_TYPE_SWIZZLED 0x00000200 +#define NV20TCL_RT_FORMAT_COLOR_SHIFT 0 +#define NV20TCL_RT_FORMAT_COLOR_MASK 0x0000001f +#define NV20TCL_RT_FORMAT_COLOR_R5G6B5 0x00000003 +#define NV20TCL_RT_FORMAT_COLOR_X8R8G8B8 0x00000005 +#define NV20TCL_RT_FORMAT_COLOR_A8R8G8B8 0x00000008 +#define NV20TCL_RT_FORMAT_COLOR_B8 0x00000009 +#define NV20TCL_RT_FORMAT_COLOR_UNKNOWN 0x0000000d +#define NV20TCL_RT_FORMAT_COLOR_X8B8G8R8 0x0000000f +#define NV20TCL_RT_FORMAT_COLOR_A8B8G8R8 0x00000010 +#define NV20TCL_RT_PITCH 0x0000020c +#define NV20TCL_RT_PITCH_COLOR_PITCH_SHIFT 0 +#define NV20TCL_RT_PITCH_COLOR_PITCH_MASK 0x0000ffff +#define NV20TCL_RT_PITCH_ZETA_PITCH_SHIFT 16 +#define NV20TCL_RT_PITCH_ZETA_PITCH_MASK 0xffff0000 +#define NV20TCL_COLOR_OFFSET 0x00000210 +#define NV20TCL_ZETA_OFFSET 0x00000214 +#define NV20TCL_RC_IN_ALPHA(x) (0x00000260+((x)*4)) +#define NV20TCL_RC_IN_ALPHA__SIZE 0x00000008 +#define NV20TCL_RC_IN_ALPHA_D_INPUT_SHIFT 0 +#define NV20TCL_RC_IN_ALPHA_D_INPUT_MASK 0x0000000f +#define NV20TCL_RC_IN_ALPHA_D_INPUT_ZERO 0x00000000 +#define NV20TCL_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR0_NV 0x00000001 +#define NV20TCL_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR1_NV 0x00000002 +#define NV20TCL_RC_IN_ALPHA_D_INPUT_FOG 0x00000003 +#define NV20TCL_RC_IN_ALPHA_D_INPUT_PRIMARY_COLOR_NV 0x00000004 +#define NV20TCL_RC_IN_ALPHA_D_INPUT_SECONDARY_COLOR_NV 0x00000005 +#define NV20TCL_RC_IN_ALPHA_D_INPUT_TEXTURE0_ARB 0x00000008 +#define NV20TCL_RC_IN_ALPHA_D_INPUT_TEXTURE1_ARB 
0x00000009 +#define NV20TCL_RC_IN_ALPHA_D_INPUT_SPARE0_NV 0x0000000c +#define NV20TCL_RC_IN_ALPHA_D_INPUT_SPARE1_NV 0x0000000d +#define NV20TCL_RC_IN_ALPHA_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0000000e +#define NV20TCL_RC_IN_ALPHA_D_INPUT_E_TIMES_F_NV 0x0000000f +#define NV20TCL_RC_IN_ALPHA_D_COMPONENT_USAGE (1 << 4) +#define NV20TCL_RC_IN_ALPHA_D_COMPONENT_USAGE_BLUE 0x00000000 +#define NV20TCL_RC_IN_ALPHA_D_COMPONENT_USAGE_ALPHA 0x00000010 +#define NV20TCL_RC_IN_ALPHA_D_MAPPING_SHIFT 5 +#define NV20TCL_RC_IN_ALPHA_D_MAPPING_MASK 0x000000e0 +#define NV20TCL_RC_IN_ALPHA_D_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV20TCL_RC_IN_ALPHA_D_MAPPING_UNSIGNED_INVERT_NV 0x00000020 +#define NV20TCL_RC_IN_ALPHA_D_MAPPING_EXPAND_NORMAL_NV 0x00000040 +#define NV20TCL_RC_IN_ALPHA_D_MAPPING_EXPAND_NEGATE_NV 0x00000060 +#define NV20TCL_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NORMAL_NV 0x00000080 +#define NV20TCL_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NEGATE_NV 0x000000a0 +#define NV20TCL_RC_IN_ALPHA_D_MAPPING_SIGNED_IDENTITY_NV 0x000000c0 +#define NV20TCL_RC_IN_ALPHA_D_MAPPING_SIGNED_NEGATE_NV 0x000000e0 +#define NV20TCL_RC_IN_ALPHA_C_INPUT_SHIFT 8 +#define NV20TCL_RC_IN_ALPHA_C_INPUT_MASK 0x00000f00 +#define NV20TCL_RC_IN_ALPHA_C_INPUT_ZERO 0x00000000 +#define NV20TCL_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV20TCL_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV20TCL_RC_IN_ALPHA_C_INPUT_FOG 0x00000300 +#define NV20TCL_RC_IN_ALPHA_C_INPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV20TCL_RC_IN_ALPHA_C_INPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV20TCL_RC_IN_ALPHA_C_INPUT_TEXTURE0_ARB 0x00000800 +#define NV20TCL_RC_IN_ALPHA_C_INPUT_TEXTURE1_ARB 0x00000900 +#define NV20TCL_RC_IN_ALPHA_C_INPUT_SPARE0_NV 0x00000c00 +#define NV20TCL_RC_IN_ALPHA_C_INPUT_SPARE1_NV 0x00000d00 +#define NV20TCL_RC_IN_ALPHA_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV20TCL_RC_IN_ALPHA_C_INPUT_E_TIMES_F_NV 0x00000f00 +#define NV20TCL_RC_IN_ALPHA_C_COMPONENT_USAGE (1 << 12) +#define NV20TCL_RC_IN_ALPHA_C_COMPONENT_USAGE_BLUE 0x00000000 +#define NV20TCL_RC_IN_ALPHA_C_COMPONENT_USAGE_ALPHA 0x00001000 +#define NV20TCL_RC_IN_ALPHA_C_MAPPING_SHIFT 13 +#define NV20TCL_RC_IN_ALPHA_C_MAPPING_MASK 0x0000e000 +#define NV20TCL_RC_IN_ALPHA_C_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV20TCL_RC_IN_ALPHA_C_MAPPING_UNSIGNED_INVERT_NV 0x00002000 +#define NV20TCL_RC_IN_ALPHA_C_MAPPING_EXPAND_NORMAL_NV 0x00004000 +#define NV20TCL_RC_IN_ALPHA_C_MAPPING_EXPAND_NEGATE_NV 0x00006000 +#define NV20TCL_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NORMAL_NV 0x00008000 +#define NV20TCL_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NEGATE_NV 0x0000a000 +#define NV20TCL_RC_IN_ALPHA_C_MAPPING_SIGNED_IDENTITY_NV 0x0000c000 +#define NV20TCL_RC_IN_ALPHA_C_MAPPING_SIGNED_NEGATE_NV 0x0000e000 +#define NV20TCL_RC_IN_ALPHA_B_INPUT_SHIFT 16 +#define NV20TCL_RC_IN_ALPHA_B_INPUT_MASK 0x000f0000 +#define NV20TCL_RC_IN_ALPHA_B_INPUT_ZERO 0x00000000 +#define NV20TCL_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR0_NV 0x00010000 +#define NV20TCL_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR1_NV 0x00020000 +#define NV20TCL_RC_IN_ALPHA_B_INPUT_FOG 0x00030000 +#define NV20TCL_RC_IN_ALPHA_B_INPUT_PRIMARY_COLOR_NV 0x00040000 +#define NV20TCL_RC_IN_ALPHA_B_INPUT_SECONDARY_COLOR_NV 0x00050000 +#define NV20TCL_RC_IN_ALPHA_B_INPUT_TEXTURE0_ARB 0x00080000 +#define NV20TCL_RC_IN_ALPHA_B_INPUT_TEXTURE1_ARB 0x00090000 +#define NV20TCL_RC_IN_ALPHA_B_INPUT_SPARE0_NV 0x000c0000 +#define NV20TCL_RC_IN_ALPHA_B_INPUT_SPARE1_NV 0x000d0000 +#define 
NV20TCL_RC_IN_ALPHA_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000e0000 +#define NV20TCL_RC_IN_ALPHA_B_INPUT_E_TIMES_F_NV 0x000f0000 +#define NV20TCL_RC_IN_ALPHA_B_COMPONENT_USAGE (1 << 20) +#define NV20TCL_RC_IN_ALPHA_B_COMPONENT_USAGE_BLUE 0x00000000 +#define NV20TCL_RC_IN_ALPHA_B_COMPONENT_USAGE_ALPHA 0x00100000 +#define NV20TCL_RC_IN_ALPHA_B_MAPPING_SHIFT 21 +#define NV20TCL_RC_IN_ALPHA_B_MAPPING_MASK 0x00e00000 +#define NV20TCL_RC_IN_ALPHA_B_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV20TCL_RC_IN_ALPHA_B_MAPPING_UNSIGNED_INVERT_NV 0x00200000 +#define NV20TCL_RC_IN_ALPHA_B_MAPPING_EXPAND_NORMAL_NV 0x00400000 +#define NV20TCL_RC_IN_ALPHA_B_MAPPING_EXPAND_NEGATE_NV 0x00600000 +#define NV20TCL_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NORMAL_NV 0x00800000 +#define NV20TCL_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NEGATE_NV 0x00a00000 +#define NV20TCL_RC_IN_ALPHA_B_MAPPING_SIGNED_IDENTITY_NV 0x00c00000 +#define NV20TCL_RC_IN_ALPHA_B_MAPPING_SIGNED_NEGATE_NV 0x00e00000 +#define NV20TCL_RC_IN_ALPHA_A_INPUT_SHIFT 24 +#define NV20TCL_RC_IN_ALPHA_A_INPUT_MASK 0x0f000000 +#define NV20TCL_RC_IN_ALPHA_A_INPUT_ZERO 0x00000000 +#define NV20TCL_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR0_NV 0x01000000 +#define NV20TCL_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR1_NV 0x02000000 +#define NV20TCL_RC_IN_ALPHA_A_INPUT_FOG 0x03000000 +#define NV20TCL_RC_IN_ALPHA_A_INPUT_PRIMARY_COLOR_NV 0x04000000 +#define NV20TCL_RC_IN_ALPHA_A_INPUT_SECONDARY_COLOR_NV 0x05000000 +#define NV20TCL_RC_IN_ALPHA_A_INPUT_TEXTURE0_ARB 0x08000000 +#define NV20TCL_RC_IN_ALPHA_A_INPUT_TEXTURE1_ARB 0x09000000 +#define NV20TCL_RC_IN_ALPHA_A_INPUT_SPARE0_NV 0x0c000000 +#define NV20TCL_RC_IN_ALPHA_A_INPUT_SPARE1_NV 0x0d000000 +#define NV20TCL_RC_IN_ALPHA_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0e000000 +#define NV20TCL_RC_IN_ALPHA_A_INPUT_E_TIMES_F_NV 0x0f000000 +#define NV20TCL_RC_IN_ALPHA_A_COMPONENT_USAGE (1 << 28) +#define NV20TCL_RC_IN_ALPHA_A_COMPONENT_USAGE_BLUE 0x00000000 +#define NV20TCL_RC_IN_ALPHA_A_COMPONENT_USAGE_ALPHA 0x10000000 +#define NV20TCL_RC_IN_ALPHA_A_MAPPING_SHIFT 29 +#define NV20TCL_RC_IN_ALPHA_A_MAPPING_MASK 0xe0000000 +#define NV20TCL_RC_IN_ALPHA_A_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV20TCL_RC_IN_ALPHA_A_MAPPING_UNSIGNED_INVERT_NV 0x20000000 +#define NV20TCL_RC_IN_ALPHA_A_MAPPING_EXPAND_NORMAL_NV 0x40000000 +#define NV20TCL_RC_IN_ALPHA_A_MAPPING_EXPAND_NEGATE_NV 0x60000000 +#define NV20TCL_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NORMAL_NV 0x80000000 +#define NV20TCL_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NEGATE_NV 0xa0000000 +#define NV20TCL_RC_IN_ALPHA_A_MAPPING_SIGNED_IDENTITY_NV 0xc0000000 +#define NV20TCL_RC_IN_ALPHA_A_MAPPING_SIGNED_NEGATE_NV 0xe0000000 +#define NV20TCL_RC_FINAL0 0x00000288 +#define NV20TCL_RC_FINAL0_D_INPUT_SHIFT 0 +#define NV20TCL_RC_FINAL0_D_INPUT_MASK 0x0000000f +#define NV20TCL_RC_FINAL0_D_INPUT_ZERO 0x00000000 +#define NV20TCL_RC_FINAL0_D_INPUT_CONSTANT_COLOR0_NV 0x00000001 +#define NV20TCL_RC_FINAL0_D_INPUT_CONSTANT_COLOR1_NV 0x00000002 +#define NV20TCL_RC_FINAL0_D_INPUT_FOG 0x00000003 +#define NV20TCL_RC_FINAL0_D_INPUT_PRIMARY_COLOR_NV 0x00000004 +#define NV20TCL_RC_FINAL0_D_INPUT_SECONDARY_COLOR_NV 0x00000005 +#define NV20TCL_RC_FINAL0_D_INPUT_TEXTURE0_ARB 0x00000008 +#define NV20TCL_RC_FINAL0_D_INPUT_TEXTURE1_ARB 0x00000009 +#define NV20TCL_RC_FINAL0_D_INPUT_SPARE0_NV 0x0000000c +#define NV20TCL_RC_FINAL0_D_INPUT_SPARE1_NV 0x0000000d +#define NV20TCL_RC_FINAL0_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0000000e +#define NV20TCL_RC_FINAL0_D_INPUT_E_TIMES_F_NV 0x0000000f +#define NV20TCL_RC_FINAL0_D_COMPONENT_USAGE 
(1 << 4) +#define NV20TCL_RC_FINAL0_D_COMPONENT_USAGE_RGB 0x00000000 +#define NV20TCL_RC_FINAL0_D_COMPONENT_USAGE_ALPHA 0x00000010 +#define NV20TCL_RC_FINAL0_D_MAPPING_SHIFT 5 +#define NV20TCL_RC_FINAL0_D_MAPPING_MASK 0x000000e0 +#define NV20TCL_RC_FINAL0_D_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV20TCL_RC_FINAL0_D_MAPPING_UNSIGNED_INVERT_NV 0x00000020 +#define NV20TCL_RC_FINAL0_D_MAPPING_EXPAND_NORMAL_NV 0x00000040 +#define NV20TCL_RC_FINAL0_D_MAPPING_EXPAND_NEGATE_NV 0x00000060 +#define NV20TCL_RC_FINAL0_D_MAPPING_HALF_BIAS_NORMAL_NV 0x00000080 +#define NV20TCL_RC_FINAL0_D_MAPPING_HALF_BIAS_NEGATE_NV 0x000000a0 +#define NV20TCL_RC_FINAL0_D_MAPPING_SIGNED_IDENTITY_NV 0x000000c0 +#define NV20TCL_RC_FINAL0_D_MAPPING_SIGNED_NEGATE_NV 0x000000e0 +#define NV20TCL_RC_FINAL0_C_INPUT_SHIFT 8 +#define NV20TCL_RC_FINAL0_C_INPUT_MASK 0x00000f00 +#define NV20TCL_RC_FINAL0_C_INPUT_ZERO 0x00000000 +#define NV20TCL_RC_FINAL0_C_INPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV20TCL_RC_FINAL0_C_INPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV20TCL_RC_FINAL0_C_INPUT_FOG 0x00000300 +#define NV20TCL_RC_FINAL0_C_INPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV20TCL_RC_FINAL0_C_INPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV20TCL_RC_FINAL0_C_INPUT_TEXTURE0_ARB 0x00000800 +#define NV20TCL_RC_FINAL0_C_INPUT_TEXTURE1_ARB 0x00000900 +#define NV20TCL_RC_FINAL0_C_INPUT_SPARE0_NV 0x00000c00 +#define NV20TCL_RC_FINAL0_C_INPUT_SPARE1_NV 0x00000d00 +#define NV20TCL_RC_FINAL0_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV20TCL_RC_FINAL0_C_INPUT_E_TIMES_F_NV 0x00000f00 +#define NV20TCL_RC_FINAL0_C_COMPONENT_USAGE (1 << 12) +#define NV20TCL_RC_FINAL0_C_COMPONENT_USAGE_RGB 0x00000000 +#define NV20TCL_RC_FINAL0_C_COMPONENT_USAGE_ALPHA 0x00001000 +#define NV20TCL_RC_FINAL0_C_MAPPING_SHIFT 13 +#define NV20TCL_RC_FINAL0_C_MAPPING_MASK 0x0000e000 +#define NV20TCL_RC_FINAL0_C_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV20TCL_RC_FINAL0_C_MAPPING_UNSIGNED_INVERT_NV 0x00002000 +#define NV20TCL_RC_FINAL0_C_MAPPING_EXPAND_NORMAL_NV 0x00004000 +#define NV20TCL_RC_FINAL0_C_MAPPING_EXPAND_NEGATE_NV 0x00006000 +#define NV20TCL_RC_FINAL0_C_MAPPING_HALF_BIAS_NORMAL_NV 0x00008000 +#define NV20TCL_RC_FINAL0_C_MAPPING_HALF_BIAS_NEGATE_NV 0x0000a000 +#define NV20TCL_RC_FINAL0_C_MAPPING_SIGNED_IDENTITY_NV 0x0000c000 +#define NV20TCL_RC_FINAL0_C_MAPPING_SIGNED_NEGATE_NV 0x0000e000 +#define NV20TCL_RC_FINAL0_B_INPUT_SHIFT 16 +#define NV20TCL_RC_FINAL0_B_INPUT_MASK 0x000f0000 +#define NV20TCL_RC_FINAL0_B_INPUT_ZERO 0x00000000 +#define NV20TCL_RC_FINAL0_B_INPUT_CONSTANT_COLOR0_NV 0x00010000 +#define NV20TCL_RC_FINAL0_B_INPUT_CONSTANT_COLOR1_NV 0x00020000 +#define NV20TCL_RC_FINAL0_B_INPUT_FOG 0x00030000 +#define NV20TCL_RC_FINAL0_B_INPUT_PRIMARY_COLOR_NV 0x00040000 +#define NV20TCL_RC_FINAL0_B_INPUT_SECONDARY_COLOR_NV 0x00050000 +#define NV20TCL_RC_FINAL0_B_INPUT_TEXTURE0_ARB 0x00080000 +#define NV20TCL_RC_FINAL0_B_INPUT_TEXTURE1_ARB 0x00090000 +#define NV20TCL_RC_FINAL0_B_INPUT_SPARE0_NV 0x000c0000 +#define NV20TCL_RC_FINAL0_B_INPUT_SPARE1_NV 0x000d0000 +#define NV20TCL_RC_FINAL0_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000e0000 +#define NV20TCL_RC_FINAL0_B_INPUT_E_TIMES_F_NV 0x000f0000 +#define NV20TCL_RC_FINAL0_B_COMPONENT_USAGE (1 << 20) +#define NV20TCL_RC_FINAL0_B_COMPONENT_USAGE_RGB 0x00000000 +#define NV20TCL_RC_FINAL0_B_COMPONENT_USAGE_ALPHA 0x00100000 +#define NV20TCL_RC_FINAL0_B_MAPPING_SHIFT 21 +#define NV20TCL_RC_FINAL0_B_MAPPING_MASK 0x00e00000 +#define NV20TCL_RC_FINAL0_B_MAPPING_UNSIGNED_IDENTITY_NV 
0x00000000 +#define NV20TCL_RC_FINAL0_B_MAPPING_UNSIGNED_INVERT_NV 0x00200000 +#define NV20TCL_RC_FINAL0_B_MAPPING_EXPAND_NORMAL_NV 0x00400000 +#define NV20TCL_RC_FINAL0_B_MAPPING_EXPAND_NEGATE_NV 0x00600000 +#define NV20TCL_RC_FINAL0_B_MAPPING_HALF_BIAS_NORMAL_NV 0x00800000 +#define NV20TCL_RC_FINAL0_B_MAPPING_HALF_BIAS_NEGATE_NV 0x00a00000 +#define NV20TCL_RC_FINAL0_B_MAPPING_SIGNED_IDENTITY_NV 0x00c00000 +#define NV20TCL_RC_FINAL0_B_MAPPING_SIGNED_NEGATE_NV 0x00e00000 +#define NV20TCL_RC_FINAL0_A_INPUT_SHIFT 24 +#define NV20TCL_RC_FINAL0_A_INPUT_MASK 0x0f000000 +#define NV20TCL_RC_FINAL0_A_INPUT_ZERO 0x00000000 +#define NV20TCL_RC_FINAL0_A_INPUT_CONSTANT_COLOR0_NV 0x01000000 +#define NV20TCL_RC_FINAL0_A_INPUT_CONSTANT_COLOR1_NV 0x02000000 +#define NV20TCL_RC_FINAL0_A_INPUT_FOG 0x03000000 +#define NV20TCL_RC_FINAL0_A_INPUT_PRIMARY_COLOR_NV 0x04000000 +#define NV20TCL_RC_FINAL0_A_INPUT_SECONDARY_COLOR_NV 0x05000000 +#define NV20TCL_RC_FINAL0_A_INPUT_TEXTURE0_ARB 0x08000000 +#define NV20TCL_RC_FINAL0_A_INPUT_TEXTURE1_ARB 0x09000000 +#define NV20TCL_RC_FINAL0_A_INPUT_SPARE0_NV 0x0c000000 +#define NV20TCL_RC_FINAL0_A_INPUT_SPARE1_NV 0x0d000000 +#define NV20TCL_RC_FINAL0_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0e000000 +#define NV20TCL_RC_FINAL0_A_INPUT_E_TIMES_F_NV 0x0f000000 +#define NV20TCL_RC_FINAL0_A_COMPONENT_USAGE (1 << 28) +#define NV20TCL_RC_FINAL0_A_COMPONENT_USAGE_RGB 0x00000000 +#define NV20TCL_RC_FINAL0_A_COMPONENT_USAGE_ALPHA 0x10000000 +#define NV20TCL_RC_FINAL0_A_MAPPING_SHIFT 29 +#define NV20TCL_RC_FINAL0_A_MAPPING_MASK 0xe0000000 +#define NV20TCL_RC_FINAL0_A_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV20TCL_RC_FINAL0_A_MAPPING_UNSIGNED_INVERT_NV 0x20000000 +#define NV20TCL_RC_FINAL0_A_MAPPING_EXPAND_NORMAL_NV 0x40000000 +#define NV20TCL_RC_FINAL0_A_MAPPING_EXPAND_NEGATE_NV 0x60000000 +#define NV20TCL_RC_FINAL0_A_MAPPING_HALF_BIAS_NORMAL_NV 0x80000000 +#define NV20TCL_RC_FINAL0_A_MAPPING_HALF_BIAS_NEGATE_NV 0xa0000000 +#define NV20TCL_RC_FINAL0_A_MAPPING_SIGNED_IDENTITY_NV 0xc0000000 +#define NV20TCL_RC_FINAL0_A_MAPPING_SIGNED_NEGATE_NV 0xe0000000 +#define NV20TCL_RC_FINAL1 0x0000028c +#define NV20TCL_RC_FINAL1_COLOR_SUM_CLAMP (1 << 7) +#define NV20TCL_RC_FINAL1_G_INPUT_SHIFT 8 +#define NV20TCL_RC_FINAL1_G_INPUT_MASK 0x00000f00 +#define NV20TCL_RC_FINAL1_G_INPUT_ZERO 0x00000000 +#define NV20TCL_RC_FINAL1_G_INPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV20TCL_RC_FINAL1_G_INPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV20TCL_RC_FINAL1_G_INPUT_FOG 0x00000300 +#define NV20TCL_RC_FINAL1_G_INPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV20TCL_RC_FINAL1_G_INPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV20TCL_RC_FINAL1_G_INPUT_TEXTURE0_ARB 0x00000800 +#define NV20TCL_RC_FINAL1_G_INPUT_TEXTURE1_ARB 0x00000900 +#define NV20TCL_RC_FINAL1_G_INPUT_SPARE0_NV 0x00000c00 +#define NV20TCL_RC_FINAL1_G_INPUT_SPARE1_NV 0x00000d00 +#define NV20TCL_RC_FINAL1_G_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV20TCL_RC_FINAL1_G_INPUT_E_TIMES_F_NV 0x00000f00 +#define NV20TCL_RC_FINAL1_G_COMPONENT_USAGE (1 << 12) +#define NV20TCL_RC_FINAL1_G_COMPONENT_USAGE_RGB 0x00000000 +#define NV20TCL_RC_FINAL1_G_COMPONENT_USAGE_ALPHA 0x00001000 +#define NV20TCL_RC_FINAL1_G_MAPPING_SHIFT 13 +#define NV20TCL_RC_FINAL1_G_MAPPING_MASK 0x0000e000 +#define NV20TCL_RC_FINAL1_G_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV20TCL_RC_FINAL1_G_MAPPING_UNSIGNED_INVERT_NV 0x00002000 +#define NV20TCL_RC_FINAL1_G_MAPPING_EXPAND_NORMAL_NV 0x00004000 +#define NV20TCL_RC_FINAL1_G_MAPPING_EXPAND_NEGATE_NV 
0x00006000 +#define NV20TCL_RC_FINAL1_G_MAPPING_HALF_BIAS_NORMAL_NV 0x00008000 +#define NV20TCL_RC_FINAL1_G_MAPPING_HALF_BIAS_NEGATE_NV 0x0000a000 +#define NV20TCL_RC_FINAL1_G_MAPPING_SIGNED_IDENTITY_NV 0x0000c000 +#define NV20TCL_RC_FINAL1_G_MAPPING_SIGNED_NEGATE_NV 0x0000e000 +#define NV20TCL_RC_FINAL1_F_INPUT_SHIFT 16 +#define NV20TCL_RC_FINAL1_F_INPUT_MASK 0x000f0000 +#define NV20TCL_RC_FINAL1_F_INPUT_ZERO 0x00000000 +#define NV20TCL_RC_FINAL1_F_INPUT_CONSTANT_COLOR0_NV 0x00010000 +#define NV20TCL_RC_FINAL1_F_INPUT_CONSTANT_COLOR1_NV 0x00020000 +#define NV20TCL_RC_FINAL1_F_INPUT_FOG 0x00030000 +#define NV20TCL_RC_FINAL1_F_INPUT_PRIMARY_COLOR_NV 0x00040000 +#define NV20TCL_RC_FINAL1_F_INPUT_SECONDARY_COLOR_NV 0x00050000 +#define NV20TCL_RC_FINAL1_F_INPUT_TEXTURE0_ARB 0x00080000 +#define NV20TCL_RC_FINAL1_F_INPUT_TEXTURE1_ARB 0x00090000 +#define NV20TCL_RC_FINAL1_F_INPUT_SPARE0_NV 0x000c0000 +#define NV20TCL_RC_FINAL1_F_INPUT_SPARE1_NV 0x000d0000 +#define NV20TCL_RC_FINAL1_F_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000e0000 +#define NV20TCL_RC_FINAL1_F_INPUT_E_TIMES_F_NV 0x000f0000 +#define NV20TCL_RC_FINAL1_F_COMPONENT_USAGE (1 << 20) +#define NV20TCL_RC_FINAL1_F_COMPONENT_USAGE_RGB 0x00000000 +#define NV20TCL_RC_FINAL1_F_COMPONENT_USAGE_ALPHA 0x00100000 +#define NV20TCL_RC_FINAL1_F_MAPPING_SHIFT 21 +#define NV20TCL_RC_FINAL1_F_MAPPING_MASK 0x00e00000 +#define NV20TCL_RC_FINAL1_F_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV20TCL_RC_FINAL1_F_MAPPING_UNSIGNED_INVERT_NV 0x00200000 +#define NV20TCL_RC_FINAL1_F_MAPPING_EXPAND_NORMAL_NV 0x00400000 +#define NV20TCL_RC_FINAL1_F_MAPPING_EXPAND_NEGATE_NV 0x00600000 +#define NV20TCL_RC_FINAL1_F_MAPPING_HALF_BIAS_NORMAL_NV 0x00800000 +#define NV20TCL_RC_FINAL1_F_MAPPING_HALF_BIAS_NEGATE_NV 0x00a00000 +#define NV20TCL_RC_FINAL1_F_MAPPING_SIGNED_IDENTITY_NV 0x00c00000 +#define NV20TCL_RC_FINAL1_F_MAPPING_SIGNED_NEGATE_NV 0x00e00000 +#define NV20TCL_RC_FINAL1_E_INPUT_SHIFT 24 +#define NV20TCL_RC_FINAL1_E_INPUT_MASK 0x0f000000 +#define NV20TCL_RC_FINAL1_E_INPUT_ZERO 0x00000000 +#define NV20TCL_RC_FINAL1_E_INPUT_CONSTANT_COLOR0_NV 0x01000000 +#define NV20TCL_RC_FINAL1_E_INPUT_CONSTANT_COLOR1_NV 0x02000000 +#define NV20TCL_RC_FINAL1_E_INPUT_FOG 0x03000000 +#define NV20TCL_RC_FINAL1_E_INPUT_PRIMARY_COLOR_NV 0x04000000 +#define NV20TCL_RC_FINAL1_E_INPUT_SECONDARY_COLOR_NV 0x05000000 +#define NV20TCL_RC_FINAL1_E_INPUT_TEXTURE0_ARB 0x08000000 +#define NV20TCL_RC_FINAL1_E_INPUT_TEXTURE1_ARB 0x09000000 +#define NV20TCL_RC_FINAL1_E_INPUT_SPARE0_NV 0x0c000000 +#define NV20TCL_RC_FINAL1_E_INPUT_SPARE1_NV 0x0d000000 +#define NV20TCL_RC_FINAL1_E_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0e000000 +#define NV20TCL_RC_FINAL1_E_INPUT_E_TIMES_F_NV 0x0f000000 +#define NV20TCL_RC_FINAL1_E_COMPONENT_USAGE (1 << 28) +#define NV20TCL_RC_FINAL1_E_COMPONENT_USAGE_RGB 0x00000000 +#define NV20TCL_RC_FINAL1_E_COMPONENT_USAGE_ALPHA 0x10000000 +#define NV20TCL_RC_FINAL1_E_MAPPING_SHIFT 29 +#define NV20TCL_RC_FINAL1_E_MAPPING_MASK 0xe0000000 +#define NV20TCL_RC_FINAL1_E_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV20TCL_RC_FINAL1_E_MAPPING_UNSIGNED_INVERT_NV 0x20000000 +#define NV20TCL_RC_FINAL1_E_MAPPING_EXPAND_NORMAL_NV 0x40000000 +#define NV20TCL_RC_FINAL1_E_MAPPING_EXPAND_NEGATE_NV 0x60000000 +#define NV20TCL_RC_FINAL1_E_MAPPING_HALF_BIAS_NORMAL_NV 0x80000000 +#define NV20TCL_RC_FINAL1_E_MAPPING_HALF_BIAS_NEGATE_NV 0xa0000000 +#define NV20TCL_RC_FINAL1_E_MAPPING_SIGNED_IDENTITY_NV 0xc0000000 +#define NV20TCL_RC_FINAL1_E_MAPPING_SIGNED_NEGATE_NV 0xe0000000 +#define 
NV20TCL_LIGHT_CONTROL 0x00000294 +#define NV20TCL_FOG_MODE 0x0000029c +#define NV20TCL_FOG_MODE_EXP 0x00000800 +#define NV20TCL_FOG_MODE_EXP_2 0x00000802 +#define NV20TCL_FOG_MODE_EXP2 0x00000803 +#define NV20TCL_FOG_MODE_LINEAR 0x00000804 +#define NV20TCL_FOG_MODE_LINEAR_2 0x00002601 +#define NV20TCL_FOG_COORD_DIST 0x000002a0 +#define NV20TCL_FOG_COORD_DIST_COORD_FALSE 0x00000000 +#define NV20TCL_FOG_COORD_DIST_COORD_FRAGMENT_DEPTH_DISTANCE_EYE_RADIAL_NV 0x00000001 +#define NV20TCL_FOG_COORD_DIST_COORD_FRAGMENT_DEPTH_DISTANCE_EYE_PLANE_ABSOLUTE_NV 0x00000002 +#define NV20TCL_FOG_COORD_DIST_COORD_FOG 0x00000003 +#define NV20TCL_FOG_ENABLE 0x000002a4 +#define NV20TCL_FOG_COLOR 0x000002a8 +#define NV20TCL_FOG_COLOR_R_SHIFT 0 +#define NV20TCL_FOG_COLOR_R_MASK 0x000000ff +#define NV20TCL_FOG_COLOR_G_SHIFT 8 +#define NV20TCL_FOG_COLOR_G_MASK 0x0000ff00 +#define NV20TCL_FOG_COLOR_B_SHIFT 16 +#define NV20TCL_FOG_COLOR_B_MASK 0x00ff0000 +#define NV20TCL_FOG_COLOR_A_SHIFT 24 +#define NV20TCL_FOG_COLOR_A_MASK 0xff000000 +#define NV20TCL_VIEWPORT_CLIP_MODE 0x000002b4 +#define NV20TCL_VIEWPORT_CLIP_HORIZ(x) (0x000002c0+((x)*4)) +#define NV20TCL_VIEWPORT_CLIP_HORIZ__SIZE 0x00000008 +#define NV20TCL_VIEWPORT_CLIP_VERT(x) (0x000002e0+((x)*4)) +#define NV20TCL_VIEWPORT_CLIP_VERT__SIZE 0x00000008 +#define NV20TCL_ALPHA_FUNC_ENABLE 0x00000300 +#define NV20TCL_BLEND_FUNC_ENABLE 0x00000304 +#define NV20TCL_CULL_FACE_ENABLE 0x00000308 +#define NV20TCL_DEPTH_TEST_ENABLE 0x0000030c +#define NV20TCL_DITHER_ENABLE 0x00000310 +#define NV20TCL_LIGHTING_ENABLE 0x00000314 +#define NV20TCL_POINT_PARAMETERS_ENABLE 0x00000318 +#define NV20TCL_POINT_SMOOTH_ENABLE 0x0000031c +#define NV20TCL_LINE_SMOOTH_ENABLE 0x00000320 +#define NV20TCL_POLYGON_SMOOTH_ENABLE 0x00000324 +#define NV20TCL_STENCIL_ENABLE 0x0000032c +#define NV20TCL_POLYGON_OFFSET_POINT_ENABLE 0x00000330 +#define NV20TCL_POLYGON_OFFSET_LINE_ENABLE 0x00000334 +#define NV20TCL_POLYGON_OFFSET_FILL_ENABLE 0x00000338 +#define NV20TCL_ALPHA_FUNC_FUNC 0x0000033c +#define NV20TCL_ALPHA_FUNC_FUNC_NEVER 0x00000200 +#define NV20TCL_ALPHA_FUNC_FUNC_LESS 0x00000201 +#define NV20TCL_ALPHA_FUNC_FUNC_EQUAL 0x00000202 +#define NV20TCL_ALPHA_FUNC_FUNC_LEQUAL 0x00000203 +#define NV20TCL_ALPHA_FUNC_FUNC_GREATER 0x00000204 +#define NV20TCL_ALPHA_FUNC_FUNC_GREATER 0x00000204 +#define NV20TCL_ALPHA_FUNC_FUNC_NOTEQUAL 0x00000205 +#define NV20TCL_ALPHA_FUNC_FUNC_GEQUAL 0x00000206 +#define NV20TCL_ALPHA_FUNC_FUNC_ALWAYS 0x00000207 +#define NV20TCL_ALPHA_FUNC_REF 0x00000340 +#define NV20TCL_BLEND_FUNC_SRC 0x00000344 +#define NV20TCL_BLEND_FUNC_SRC_ZERO 0x00000000 +#define NV20TCL_BLEND_FUNC_SRC_ONE 0x00000001 +#define NV20TCL_BLEND_FUNC_SRC_SRC_COLOR 0x00000300 +#define NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_SRC_COLOR 0x00000301 +#define NV20TCL_BLEND_FUNC_SRC_SRC_ALPHA 0x00000302 +#define NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_SRC_ALPHA 0x00000303 +#define NV20TCL_BLEND_FUNC_SRC_DST_ALPHA 0x00000304 +#define NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_DST_ALPHA 0x00000305 +#define NV20TCL_BLEND_FUNC_SRC_DST_COLOR 0x00000306 +#define NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_DST_COLOR 0x00000307 +#define NV20TCL_BLEND_FUNC_SRC_SRC_ALPHA_SATURATE 0x00000308 +#define NV20TCL_BLEND_FUNC_SRC_CONSTANT_COLOR 0x00008001 +#define NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_CONSTANT_COLOR 0x00008002 +#define NV20TCL_BLEND_FUNC_SRC_CONSTANT_ALPHA 0x00008003 +#define NV20TCL_BLEND_FUNC_SRC_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV20TCL_BLEND_FUNC_DST 0x00000348 +#define NV20TCL_BLEND_FUNC_DST_ZERO 0x00000000 +#define 
NV20TCL_BLEND_FUNC_DST_ONE 0x00000001 +#define NV20TCL_BLEND_FUNC_DST_SRC_COLOR 0x00000300 +#define NV20TCL_BLEND_FUNC_DST_ONE_MINUS_SRC_COLOR 0x00000301 +#define NV20TCL_BLEND_FUNC_DST_SRC_ALPHA 0x00000302 +#define NV20TCL_BLEND_FUNC_DST_ONE_MINUS_SRC_ALPHA 0x00000303 +#define NV20TCL_BLEND_FUNC_DST_DST_ALPHA 0x00000304 +#define NV20TCL_BLEND_FUNC_DST_ONE_MINUS_DST_ALPHA 0x00000305 +#define NV20TCL_BLEND_FUNC_DST_DST_COLOR 0x00000306 +#define NV20TCL_BLEND_FUNC_DST_ONE_MINUS_DST_COLOR 0x00000307 +#define NV20TCL_BLEND_FUNC_DST_SRC_ALPHA_SATURATE 0x00000308 +#define NV20TCL_BLEND_FUNC_DST_CONSTANT_COLOR 0x00008001 +#define NV20TCL_BLEND_FUNC_DST_ONE_MINUS_CONSTANT_COLOR 0x00008002 +#define NV20TCL_BLEND_FUNC_DST_CONSTANT_ALPHA 0x00008003 +#define NV20TCL_BLEND_FUNC_DST_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV20TCL_BLEND_COLOR 0x0000034c +#define NV20TCL_BLEND_COLOR_B_SHIFT 0 +#define NV20TCL_BLEND_COLOR_B_MASK 0x000000ff +#define NV20TCL_BLEND_COLOR_G_SHIFT 8 +#define NV20TCL_BLEND_COLOR_G_MASK 0x0000ff00 +#define NV20TCL_BLEND_COLOR_R_SHIFT 16 +#define NV20TCL_BLEND_COLOR_R_MASK 0x00ff0000 +#define NV20TCL_BLEND_COLOR_A_SHIFT 24 +#define NV20TCL_BLEND_COLOR_A_MASK 0xff000000 +#define NV20TCL_BLEND_EQUATION 0x00000350 +#define NV20TCL_BLEND_EQUATION_FUNC_ADD 0x00008006 +#define NV20TCL_BLEND_EQUATION_MIN 0x00008007 +#define NV20TCL_BLEND_EQUATION_MAX 0x00008008 +#define NV20TCL_BLEND_EQUATION_FUNC_SUBTRACT 0x0000800a +#define NV20TCL_BLEND_EQUATION_FUNC_REVERSE_SUBTRACT 0x0000800b +#define NV20TCL_DEPTH_FUNC 0x00000354 +#define NV20TCL_DEPTH_FUNC_NEVER 0x00000200 +#define NV20TCL_DEPTH_FUNC_LESS 0x00000201 +#define NV20TCL_DEPTH_FUNC_EQUAL 0x00000202 +#define NV20TCL_DEPTH_FUNC_LEQUAL 0x00000203 +#define NV20TCL_DEPTH_FUNC_GREATER 0x00000204 +#define NV20TCL_DEPTH_FUNC_GREATER 0x00000204 +#define NV20TCL_DEPTH_FUNC_NOTEQUAL 0x00000205 +#define NV20TCL_DEPTH_FUNC_GEQUAL 0x00000206 +#define NV20TCL_DEPTH_FUNC_ALWAYS 0x00000207 +#define NV20TCL_COLOR_MASK 0x00000358 +#define NV20TCL_COLOR_MASK_B (1 << 0) +#define NV20TCL_COLOR_MASK_G (1 << 8) +#define NV20TCL_COLOR_MASK_R (1 << 16) +#define NV20TCL_COLOR_MASK_A (1 << 24) +#define NV20TCL_DEPTH_WRITE_ENABLE 0x0000035c +#define NV20TCL_STENCIL_MASK 0x00000360 +#define NV20TCL_STENCIL_FUNC_FUNC 0x00000364 +#define NV20TCL_STENCIL_FUNC_FUNC_NEVER 0x00000200 +#define NV20TCL_STENCIL_FUNC_FUNC_LESS 0x00000201 +#define NV20TCL_STENCIL_FUNC_FUNC_EQUAL 0x00000202 +#define NV20TCL_STENCIL_FUNC_FUNC_LEQUAL 0x00000203 +#define NV20TCL_STENCIL_FUNC_FUNC_GREATER 0x00000204 +#define NV20TCL_STENCIL_FUNC_FUNC_GREATER 0x00000204 +#define NV20TCL_STENCIL_FUNC_FUNC_NOTEQUAL 0x00000205 +#define NV20TCL_STENCIL_FUNC_FUNC_GEQUAL 0x00000206 +#define NV20TCL_STENCIL_FUNC_FUNC_ALWAYS 0x00000207 +#define NV20TCL_STENCIL_FUNC_REF 0x00000368 +#define NV20TCL_STENCIL_FUNC_MASK 0x0000036c +#define NV20TCL_STENCIL_OP_FAIL 0x00000370 +#define NV20TCL_STENCIL_OP_FAIL_ZERO 0x00000000 +#define NV20TCL_STENCIL_OP_FAIL_INVERT 0x0000150a +#define NV20TCL_STENCIL_OP_FAIL_KEEP 0x00001e00 +#define NV20TCL_STENCIL_OP_FAIL_REPLACE 0x00001e01 +#define NV20TCL_STENCIL_OP_FAIL_INCR 0x00001e02 +#define NV20TCL_STENCIL_OP_FAIL_DECR 0x00001e03 +#define NV20TCL_STENCIL_OP_FAIL_INCR_WRAP 0x00008507 +#define NV20TCL_STENCIL_OP_FAIL_DECR_WRAP 0x00008508 +#define NV20TCL_STENCIL_OP_ZFAIL 0x00000374 +#define NV20TCL_STENCIL_OP_ZFAIL_ZERO 0x00000000 +#define NV20TCL_STENCIL_OP_ZFAIL_INVERT 0x0000150a +#define NV20TCL_STENCIL_OP_ZFAIL_KEEP 0x00001e00 +#define NV20TCL_STENCIL_OP_ZFAIL_REPLACE 
0x00001e01 +#define NV20TCL_STENCIL_OP_ZFAIL_INCR 0x00001e02 +#define NV20TCL_STENCIL_OP_ZFAIL_DECR 0x00001e03 +#define NV20TCL_STENCIL_OP_ZFAIL_INCR_WRAP 0x00008507 +#define NV20TCL_STENCIL_OP_ZFAIL_DECR_WRAP 0x00008508 +#define NV20TCL_STENCIL_OP_ZPASS 0x00000378 +#define NV20TCL_STENCIL_OP_ZPASS_ZERO 0x00000000 +#define NV20TCL_STENCIL_OP_ZPASS_INVERT 0x0000150a +#define NV20TCL_STENCIL_OP_ZPASS_KEEP 0x00001e00 +#define NV20TCL_STENCIL_OP_ZPASS_REPLACE 0x00001e01 +#define NV20TCL_STENCIL_OP_ZPASS_INCR 0x00001e02 +#define NV20TCL_STENCIL_OP_ZPASS_DECR 0x00001e03 +#define NV20TCL_STENCIL_OP_ZPASS_INCR_WRAP 0x00008507 +#define NV20TCL_STENCIL_OP_ZPASS_DECR_WRAP 0x00008508 +#define NV20TCL_SHADE_MODEL 0x0000037c +#define NV20TCL_SHADE_MODEL_FLAT 0x00001d00 +#define NV20TCL_SHADE_MODEL_SMOOTH 0x00001d01 +#define NV20TCL_LINE_WIDTH 0x00000380 +#define NV20TCL_POLYGON_OFFSET_FACTOR 0x00000384 +#define NV20TCL_POLYGON_OFFSET_UNITS 0x00000388 +#define NV20TCL_POLYGON_MODE_FRONT 0x0000038c +#define NV20TCL_POLYGON_MODE_FRONT_POINT 0x00001b00 +#define NV20TCL_POLYGON_MODE_FRONT_LINE 0x00001b01 +#define NV20TCL_POLYGON_MODE_FRONT_FILL 0x00001b02 +#define NV20TCL_POLYGON_MODE_BACK 0x00000390 +#define NV20TCL_POLYGON_MODE_BACK_POINT 0x00001b00 +#define NV20TCL_POLYGON_MODE_BACK_LINE 0x00001b01 +#define NV20TCL_POLYGON_MODE_BACK_FILL 0x00001b02 +#define NV20TCL_DEPTH_RANGE_NEAR 0x00000394 +#define NV20TCL_DEPTH_RANGE_FAR 0x00000398 +#define NV20TCL_CULL_FACE 0x0000039c +#define NV20TCL_CULL_FACE_FRONT 0x00000404 +#define NV20TCL_CULL_FACE_BACK 0x00000405 +#define NV20TCL_CULL_FACE_FRONT_AND_BACK 0x00000408 +#define NV20TCL_FRONT_FACE 0x000003a0 +#define NV20TCL_FRONT_FACE_CW 0x00000900 +#define NV20TCL_FRONT_FACE_CCW 0x00000901 +#define NV20TCL_NORMALIZE_ENABLE 0x000003a4 +#define NV20TCL_COLOR_MATERIAL_FRONT_R 0x000003a8 +#define NV20TCL_COLOR_MATERIAL_FRONT_G 0x000003ac +#define NV20TCL_COLOR_MATERIAL_FRONT_B 0x000003b0 +#define NV20TCL_COLOR_MATERIAL_FRONT_A 0x000003b4 +#define NV20TCL_SEPARATE_SPECULAR_ENABLE 0x000003b8 +#define NV20TCL_ENABLED_LIGHTS 0x000003bc +#define NV20TCL_TX_GEN_S(x) (0x000003c0+((x)*16)) +#define NV20TCL_TX_GEN_S__SIZE 0x00000004 +#define NV20TCL_TX_GEN_S_FALSE 0x00000000 +#define NV20TCL_TX_GEN_S_EYE_LINEAR 0x00002400 +#define NV20TCL_TX_GEN_S_OBJECT_LINEAR 0x00002401 +#define NV20TCL_TX_GEN_S_SPHERE_MAP 0x00002402 +#define NV20TCL_TX_GEN_S_NORMAL_MAP 0x00008511 +#define NV20TCL_TX_GEN_S_REFLECTION_MAP 0x00008512 +#define NV20TCL_TX_GEN_T(x) (0x000003c4+((x)*16)) +#define NV20TCL_TX_GEN_T__SIZE 0x00000004 +#define NV20TCL_TX_GEN_T_FALSE 0x00000000 +#define NV20TCL_TX_GEN_T_EYE_LINEAR 0x00002400 +#define NV20TCL_TX_GEN_T_OBJECT_LINEAR 0x00002401 +#define NV20TCL_TX_GEN_T_SPHERE_MAP 0x00002402 +#define NV20TCL_TX_GEN_T_NORMAL_MAP 0x00008511 +#define NV20TCL_TX_GEN_T_REFLECTION_MAP 0x00008512 +#define NV20TCL_TX_GEN_R(x) (0x000003c8+((x)*16)) +#define NV20TCL_TX_GEN_R__SIZE 0x00000004 +#define NV20TCL_TX_GEN_R_FALSE 0x00000000 +#define NV20TCL_TX_GEN_R_EYE_LINEAR 0x00002400 +#define NV20TCL_TX_GEN_R_OBJECT_LINEAR 0x00002401 +#define NV20TCL_TX_GEN_R_SPHERE_MAP 0x00002402 +#define NV20TCL_TX_GEN_R_NORMAL_MAP 0x00008511 +#define NV20TCL_TX_GEN_R_REFLECTION_MAP 0x00008512 +#define NV20TCL_TX_GEN_Q(x) (0x000003cc+((x)*16)) +#define NV20TCL_TX_GEN_Q__SIZE 0x00000004 +#define NV20TCL_TX_GEN_Q_FALSE 0x00000000 +#define NV20TCL_TX_GEN_Q_EYE_LINEAR 0x00002400 +#define NV20TCL_TX_GEN_Q_OBJECT_LINEAR 0x00002401 +#define NV20TCL_TX_GEN_Q_SPHERE_MAP 0x00002402 +#define 
NV20TCL_TX_GEN_Q_NORMAL_MAP 0x00008511 +#define NV20TCL_TX_GEN_Q_REFLECTION_MAP 0x00008512 +#define NV20TCL_TX_MATRIX_ENABLE(x) (0x00000420+((x)*4)) +#define NV20TCL_TX_MATRIX_ENABLE__SIZE 0x00000004 +#define NV20TCL_POINT_SIZE 0x0000043c +#define NV20TCL_MODELVIEW0_MATRIX(x) (0x00000480+((x)*4)) +#define NV20TCL_MODELVIEW0_MATRIX__SIZE 0x00000010 +#define NV20TCL_MODELVIEW1_MATRIX(x) (0x000004c0+((x)*4)) +#define NV20TCL_MODELVIEW1_MATRIX__SIZE 0x00000010 +#define NV20TCL_MODELVIEW2_MATRIX(x) (0x00000500+((x)*4)) +#define NV20TCL_MODELVIEW2_MATRIX__SIZE 0x00000010 +#define NV20TCL_MODELVIEW3_MATRIX(x) (0x00000540+((x)*4)) +#define NV20TCL_MODELVIEW3_MATRIX__SIZE 0x00000010 +#define NV20TCL_INVERSE_MODELVIEW0_MATRIX(x) (0x00000580+((x)*4)) +#define NV20TCL_INVERSE_MODELVIEW0_MATRIX__SIZE 0x00000010 +#define NV20TCL_INVERSE_MODELVIEW1_MATRIX(x) (0x000005c0+((x)*4)) +#define NV20TCL_INVERSE_MODELVIEW1_MATRIX__SIZE 0x00000010 +#define NV20TCL_INVERSE_MODELVIEW2_MATRIX(x) (0x00000600+((x)*4)) +#define NV20TCL_INVERSE_MODELVIEW2_MATRIX__SIZE 0x00000010 +#define NV20TCL_INVERSE_MODELVIEW3_MATRIX(x) (0x00000640+((x)*4)) +#define NV20TCL_INVERSE_MODELVIEW3_MATRIX__SIZE 0x00000010 +#define NV20TCL_PROJECTION_MATRIX(x) (0x00000680+((x)*4)) +#define NV20TCL_PROJECTION_MATRIX__SIZE 0x00000010 +#define NV20TCL_TX0_MATRIX(x) (0x000006c0+((x)*4)) +#define NV20TCL_TX0_MATRIX__SIZE 0x00000010 +#define NV20TCL_TX1_MATRIX(x) (0x00000700+((x)*4)) +#define NV20TCL_TX1_MATRIX__SIZE 0x00000010 +#define NV20TCL_TX2_MATRIX(x) (0x00000740+((x)*4)) +#define NV20TCL_TX2_MATRIX__SIZE 0x00000010 +#define NV20TCL_TX3_MATRIX(x) (0x00000780+((x)*4)) +#define NV20TCL_TX3_MATRIX__SIZE 0x00000010 +#define NV20TCL_TX0_CLIP_PLANE_A(x) (0x00000840+((x)*16)) +#define NV20TCL_TX0_CLIP_PLANE_A__SIZE 0x00000004 +#define NV20TCL_TX0_CLIP_PLANE_B(x) (0x00000844+((x)*16)) +#define NV20TCL_TX0_CLIP_PLANE_B__SIZE 0x00000004 +#define NV20TCL_TX0_CLIP_PLANE_C(x) (0x00000848+((x)*16)) +#define NV20TCL_TX0_CLIP_PLANE_C__SIZE 0x00000004 +#define NV20TCL_TX0_CLIP_PLANE_D(x) (0x0000084c+((x)*16)) +#define NV20TCL_TX0_CLIP_PLANE_D__SIZE 0x00000004 +#define NV20TCL_TX1_CLIP_PLANE_A(x) (0x00000880+((x)*16)) +#define NV20TCL_TX1_CLIP_PLANE_A__SIZE 0x00000004 +#define NV20TCL_TX1_CLIP_PLANE_B(x) (0x00000884+((x)*16)) +#define NV20TCL_TX1_CLIP_PLANE_B__SIZE 0x00000004 +#define NV20TCL_TX1_CLIP_PLANE_C(x) (0x00000888+((x)*16)) +#define NV20TCL_TX1_CLIP_PLANE_C__SIZE 0x00000004 +#define NV20TCL_TX1_CLIP_PLANE_D(x) (0x0000088c+((x)*16)) +#define NV20TCL_TX1_CLIP_PLANE_D__SIZE 0x00000004 +#define NV20TCL_TX2_CLIP_PLANE_A(x) (0x000008c0+((x)*16)) +#define NV20TCL_TX2_CLIP_PLANE_A__SIZE 0x00000004 +#define NV20TCL_TX2_CLIP_PLANE_B(x) (0x000008c4+((x)*16)) +#define NV20TCL_TX2_CLIP_PLANE_B__SIZE 0x00000004 +#define NV20TCL_TX2_CLIP_PLANE_C(x) (0x000008c8+((x)*16)) +#define NV20TCL_TX2_CLIP_PLANE_C__SIZE 0x00000004 +#define NV20TCL_TX2_CLIP_PLANE_D(x) (0x000008cc+((x)*16)) +#define NV20TCL_TX2_CLIP_PLANE_D__SIZE 0x00000004 +#define NV20TCL_TX3_CLIP_PLANE_A(x) (0x00000900+((x)*16)) +#define NV20TCL_TX3_CLIP_PLANE_A__SIZE 0x00000004 +#define NV20TCL_TX3_CLIP_PLANE_B(x) (0x00000904+((x)*16)) +#define NV20TCL_TX3_CLIP_PLANE_B__SIZE 0x00000004 +#define NV20TCL_TX3_CLIP_PLANE_C(x) (0x00000908+((x)*16)) +#define NV20TCL_TX3_CLIP_PLANE_C__SIZE 0x00000004 +#define NV20TCL_TX3_CLIP_PLANE_D(x) (0x0000090c+((x)*16)) +#define NV20TCL_TX3_CLIP_PLANE_D__SIZE 0x00000004 +#define NV20TCL_FOG_EQUATION_CONSTANT 0x000009c0 +#define NV20TCL_FOG_EQUATION_LINEAR 0x000009c4 
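A brief illustrative aside (not part of the patch): the `(x)`-parameterized macros above, such as NV20TCL_PROJECTION_MATRIX(x), give the method offset of element x of a register array, and the matching __SIZE constant gives the element count. The sketch below shows how such an array would typically be walked; emit_method() is a hypothetical helper standing in for whatever push-buffer mechanism the driver actually uses, and is not defined anywhere in this patch.

#include <stdint.h>

/* Hypothetical helper: pushes one (method, data) pair to the hardware
 * channel.  Stands in for the driver's real FIFO/push-buffer code. */
extern void emit_method(uint32_t method, uint32_t data);

/* Upload a 4x4 projection matrix, one 32-bit float per element. */
static void
upload_projection_matrix(const float m[16])
{
   unsigned i;
   for (i = 0; i < NV20TCL_PROJECTION_MATRIX__SIZE; i++) {
      union { float f; uint32_t u; } v;
      v.f = m[i];
      emit_method(NV20TCL_PROJECTION_MATRIX(i), v.u);
   }
}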
+#define NV20TCL_FOG_EQUATION_QUADRATIC 0x000009c8 +#define NV20TCL_FRONT_MATERIAL_SHININESS(x) (0x000009e0+((x)*4)) +#define NV20TCL_FRONT_MATERIAL_SHININESS__SIZE 0x00000006 +#define NV20TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_R 0x00000a10 +#define NV20TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_G 0x00000a14 +#define NV20TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_B 0x00000a18 +#define NV20TCL_VIEWPORT_SCALE0_X 0x00000a20 +#define NV20TCL_VIEWPORT_SCALE0_Y 0x00000a24 +#define NV20TCL_VIEWPORT_SCALE0_Z 0x00000a28 +#define NV20TCL_VIEWPORT_SCALE0_W 0x00000a2c +#define NV20TCL_POINT_PARAMETER(x) (0x00000a30+((x)*4)) +#define NV20TCL_POINT_PARAMETER__SIZE 0x00000008 +#define NV20TCL_RC_CONSTANT_COLOR0(x) (0x00000a60+((x)*4)) +#define NV20TCL_RC_CONSTANT_COLOR0__SIZE 0x00000008 +#define NV20TCL_RC_CONSTANT_COLOR0_B_SHIFT 0 +#define NV20TCL_RC_CONSTANT_COLOR0_B_MASK 0x000000ff +#define NV20TCL_RC_CONSTANT_COLOR0_G_SHIFT 8 +#define NV20TCL_RC_CONSTANT_COLOR0_G_MASK 0x0000ff00 +#define NV20TCL_RC_CONSTANT_COLOR0_R_SHIFT 16 +#define NV20TCL_RC_CONSTANT_COLOR0_R_MASK 0x00ff0000 +#define NV20TCL_RC_CONSTANT_COLOR0_A_SHIFT 24 +#define NV20TCL_RC_CONSTANT_COLOR0_A_MASK 0xff000000 +#define NV20TCL_RC_CONSTANT_COLOR1(x) (0x00000a80+((x)*4)) +#define NV20TCL_RC_CONSTANT_COLOR1__SIZE 0x00000008 +#define NV20TCL_RC_CONSTANT_COLOR1_B_SHIFT 0 +#define NV20TCL_RC_CONSTANT_COLOR1_B_MASK 0x000000ff +#define NV20TCL_RC_CONSTANT_COLOR1_G_SHIFT 8 +#define NV20TCL_RC_CONSTANT_COLOR1_G_MASK 0x0000ff00 +#define NV20TCL_RC_CONSTANT_COLOR1_R_SHIFT 16 +#define NV20TCL_RC_CONSTANT_COLOR1_R_MASK 0x00ff0000 +#define NV20TCL_RC_CONSTANT_COLOR1_A_SHIFT 24 +#define NV20TCL_RC_CONSTANT_COLOR1_A_MASK 0xff000000 +#define NV20TCL_RC_OUT_ALPHA(x) (0x00000aa0+((x)*4)) +#define NV20TCL_RC_OUT_ALPHA__SIZE 0x00000008 +#define NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_SHIFT 0 +#define NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_MASK 0x0000000f +#define NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_ZERO 0x00000000 +#define NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR0_NV 0x00000001 +#define NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR1_NV 0x00000002 +#define NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_FOG 0x00000003 +#define NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_PRIMARY_COLOR_NV 0x00000004 +#define NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_SECONDARY_COLOR_NV 0x00000005 +#define NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE0_ARB 0x00000008 +#define NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE1_ARB 0x00000009 +#define NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE0_NV 0x0000000c +#define NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE1_NV 0x0000000d +#define NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0000000e +#define NV20TCL_RC_OUT_ALPHA_CD_OUTPUT_E_TIMES_F_NV 0x0000000f +#define NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_SHIFT 4 +#define NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_MASK 0x000000f0 +#define NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_ZERO 0x00000000 +#define NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR0_NV 0x00000010 +#define NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR1_NV 0x00000020 +#define NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_FOG 0x00000030 +#define NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_PRIMARY_COLOR_NV 0x00000040 +#define NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_SECONDARY_COLOR_NV 0x00000050 +#define NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE0_ARB 0x00000080 +#define NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE1_ARB 0x00000090 +#define NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE0_NV 0x000000c0 +#define NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE1_NV 0x000000d0 +#define NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 
0x000000e0 +#define NV20TCL_RC_OUT_ALPHA_AB_OUTPUT_E_TIMES_F_NV 0x000000f0 +#define NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_SHIFT 8 +#define NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_MASK 0x00000f00 +#define NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_ZERO 0x00000000 +#define NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_FOG 0x00000300 +#define NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE0_ARB 0x00000800 +#define NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE1_ARB 0x00000900 +#define NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0_NV 0x00000c00 +#define NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE1_NV 0x00000d00 +#define NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV20TCL_RC_OUT_ALPHA_SUM_OUTPUT_E_TIMES_F_NV 0x00000f00 +#define NV20TCL_RC_OUT_ALPHA_CD_DOT_PRODUCT (1 << 12) +#define NV20TCL_RC_OUT_ALPHA_AB_DOT_PRODUCT (1 << 13) +#define NV20TCL_RC_OUT_ALPHA_MUX_SUM (1 << 14) +#define NV20TCL_RC_OUT_ALPHA_BIAS (1 << 15) +#define NV20TCL_RC_OUT_ALPHA_BIAS_NONE 0x00000000 +#define NV20TCL_RC_OUT_ALPHA_BIAS_BIAS_BY_NEGATIVE_ONE_HALF_NV 0x00008000 +#define NV20TCL_RC_OUT_ALPHA_SCALE_SHIFT 17 +#define NV20TCL_RC_OUT_ALPHA_SCALE_MASK 0x00000000 +#define NV20TCL_RC_OUT_ALPHA_SCALE_NONE 0x00000000 +#define NV20TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_TWO_NV 0x00020000 +#define NV20TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_FOUR_NV 0x00040000 +#define NV20TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_ONE_HALF_NV 0x00060000 +#define NV20TCL_RC_IN_RGB(x) (0x00000ac0+((x)*4)) +#define NV20TCL_RC_IN_RGB__SIZE 0x00000008 +#define NV20TCL_RC_IN_RGB_D_INPUT_SHIFT 0 +#define NV20TCL_RC_IN_RGB_D_INPUT_MASK 0x0000000f +#define NV20TCL_RC_IN_RGB_D_INPUT_ZERO 0x00000000 +#define NV20TCL_RC_IN_RGB_D_INPUT_CONSTANT_COLOR0_NV 0x00000001 +#define NV20TCL_RC_IN_RGB_D_INPUT_CONSTANT_COLOR1_NV 0x00000002 +#define NV20TCL_RC_IN_RGB_D_INPUT_FOG 0x00000003 +#define NV20TCL_RC_IN_RGB_D_INPUT_PRIMARY_COLOR_NV 0x00000004 +#define NV20TCL_RC_IN_RGB_D_INPUT_SECONDARY_COLOR_NV 0x00000005 +#define NV20TCL_RC_IN_RGB_D_INPUT_TEXTURE0_ARB 0x00000008 +#define NV20TCL_RC_IN_RGB_D_INPUT_TEXTURE1_ARB 0x00000009 +#define NV20TCL_RC_IN_RGB_D_INPUT_SPARE0_NV 0x0000000c +#define NV20TCL_RC_IN_RGB_D_INPUT_SPARE1_NV 0x0000000d +#define NV20TCL_RC_IN_RGB_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0000000e +#define NV20TCL_RC_IN_RGB_D_INPUT_E_TIMES_F_NV 0x0000000f +#define NV20TCL_RC_IN_RGB_D_COMPONENT_USAGE (1 << 4) +#define NV20TCL_RC_IN_RGB_D_COMPONENT_USAGE_RGB 0x00000000 +#define NV20TCL_RC_IN_RGB_D_COMPONENT_USAGE_ALPHA 0x00000010 +#define NV20TCL_RC_IN_RGB_D_MAPPING_SHIFT 5 +#define NV20TCL_RC_IN_RGB_D_MAPPING_MASK 0x000000e0 +#define NV20TCL_RC_IN_RGB_D_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV20TCL_RC_IN_RGB_D_MAPPING_UNSIGNED_INVERT_NV 0x00000020 +#define NV20TCL_RC_IN_RGB_D_MAPPING_EXPAND_NORMAL_NV 0x00000040 +#define NV20TCL_RC_IN_RGB_D_MAPPING_EXPAND_NEGATE_NV 0x00000060 +#define NV20TCL_RC_IN_RGB_D_MAPPING_HALF_BIAS_NORMAL_NV 0x00000080 +#define NV20TCL_RC_IN_RGB_D_MAPPING_HALF_BIAS_NEGATE_NV 0x000000a0 +#define NV20TCL_RC_IN_RGB_D_MAPPING_SIGNED_IDENTITY_NV 0x000000c0 +#define NV20TCL_RC_IN_RGB_D_MAPPING_SIGNED_NEGATE_NV 0x000000e0 +#define NV20TCL_RC_IN_RGB_C_INPUT_SHIFT 8 +#define NV20TCL_RC_IN_RGB_C_INPUT_MASK 0x00000f00 +#define NV20TCL_RC_IN_RGB_C_INPUT_ZERO 0x00000000 +#define 
NV20TCL_RC_IN_RGB_C_INPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV20TCL_RC_IN_RGB_C_INPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV20TCL_RC_IN_RGB_C_INPUT_FOG 0x00000300 +#define NV20TCL_RC_IN_RGB_C_INPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV20TCL_RC_IN_RGB_C_INPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV20TCL_RC_IN_RGB_C_INPUT_TEXTURE0_ARB 0x00000800 +#define NV20TCL_RC_IN_RGB_C_INPUT_TEXTURE1_ARB 0x00000900 +#define NV20TCL_RC_IN_RGB_C_INPUT_SPARE0_NV 0x00000c00 +#define NV20TCL_RC_IN_RGB_C_INPUT_SPARE1_NV 0x00000d00 +#define NV20TCL_RC_IN_RGB_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV20TCL_RC_IN_RGB_C_INPUT_E_TIMES_F_NV 0x00000f00 +#define NV20TCL_RC_IN_RGB_C_COMPONENT_USAGE (1 << 12) +#define NV20TCL_RC_IN_RGB_C_COMPONENT_USAGE_RGB 0x00000000 +#define NV20TCL_RC_IN_RGB_C_COMPONENT_USAGE_ALPHA 0x00001000 +#define NV20TCL_RC_IN_RGB_C_MAPPING_SHIFT 13 +#define NV20TCL_RC_IN_RGB_C_MAPPING_MASK 0x0000e000 +#define NV20TCL_RC_IN_RGB_C_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV20TCL_RC_IN_RGB_C_MAPPING_UNSIGNED_INVERT_NV 0x00002000 +#define NV20TCL_RC_IN_RGB_C_MAPPING_EXPAND_NORMAL_NV 0x00004000 +#define NV20TCL_RC_IN_RGB_C_MAPPING_EXPAND_NEGATE_NV 0x00006000 +#define NV20TCL_RC_IN_RGB_C_MAPPING_HALF_BIAS_NORMAL_NV 0x00008000 +#define NV20TCL_RC_IN_RGB_C_MAPPING_HALF_BIAS_NEGATE_NV 0x0000a000 +#define NV20TCL_RC_IN_RGB_C_MAPPING_SIGNED_IDENTITY_NV 0x0000c000 +#define NV20TCL_RC_IN_RGB_C_MAPPING_SIGNED_NEGATE_NV 0x0000e000 +#define NV20TCL_RC_IN_RGB_B_INPUT_SHIFT 16 +#define NV20TCL_RC_IN_RGB_B_INPUT_MASK 0x000f0000 +#define NV20TCL_RC_IN_RGB_B_INPUT_ZERO 0x00000000 +#define NV20TCL_RC_IN_RGB_B_INPUT_CONSTANT_COLOR0_NV 0x00010000 +#define NV20TCL_RC_IN_RGB_B_INPUT_CONSTANT_COLOR1_NV 0x00020000 +#define NV20TCL_RC_IN_RGB_B_INPUT_FOG 0x00030000 +#define NV20TCL_RC_IN_RGB_B_INPUT_PRIMARY_COLOR_NV 0x00040000 +#define NV20TCL_RC_IN_RGB_B_INPUT_SECONDARY_COLOR_NV 0x00050000 +#define NV20TCL_RC_IN_RGB_B_INPUT_TEXTURE0_ARB 0x00080000 +#define NV20TCL_RC_IN_RGB_B_INPUT_TEXTURE1_ARB 0x00090000 +#define NV20TCL_RC_IN_RGB_B_INPUT_SPARE0_NV 0x000c0000 +#define NV20TCL_RC_IN_RGB_B_INPUT_SPARE1_NV 0x000d0000 +#define NV20TCL_RC_IN_RGB_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000e0000 +#define NV20TCL_RC_IN_RGB_B_INPUT_E_TIMES_F_NV 0x000f0000 +#define NV20TCL_RC_IN_RGB_B_COMPONENT_USAGE (1 << 20) +#define NV20TCL_RC_IN_RGB_B_COMPONENT_USAGE_RGB 0x00000000 +#define NV20TCL_RC_IN_RGB_B_COMPONENT_USAGE_ALPHA 0x00100000 +#define NV20TCL_RC_IN_RGB_B_MAPPING_SHIFT 21 +#define NV20TCL_RC_IN_RGB_B_MAPPING_MASK 0x00e00000 +#define NV20TCL_RC_IN_RGB_B_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV20TCL_RC_IN_RGB_B_MAPPING_UNSIGNED_INVERT_NV 0x00200000 +#define NV20TCL_RC_IN_RGB_B_MAPPING_EXPAND_NORMAL_NV 0x00400000 +#define NV20TCL_RC_IN_RGB_B_MAPPING_EXPAND_NEGATE_NV 0x00600000 +#define NV20TCL_RC_IN_RGB_B_MAPPING_HALF_BIAS_NORMAL_NV 0x00800000 +#define NV20TCL_RC_IN_RGB_B_MAPPING_HALF_BIAS_NEGATE_NV 0x00a00000 +#define NV20TCL_RC_IN_RGB_B_MAPPING_SIGNED_IDENTITY_NV 0x00c00000 +#define NV20TCL_RC_IN_RGB_B_MAPPING_SIGNED_NEGATE_NV 0x00e00000 +#define NV20TCL_RC_IN_RGB_A_INPUT_SHIFT 24 +#define NV20TCL_RC_IN_RGB_A_INPUT_MASK 0x0f000000 +#define NV20TCL_RC_IN_RGB_A_INPUT_ZERO 0x00000000 +#define NV20TCL_RC_IN_RGB_A_INPUT_CONSTANT_COLOR0_NV 0x01000000 +#define NV20TCL_RC_IN_RGB_A_INPUT_CONSTANT_COLOR1_NV 0x02000000 +#define NV20TCL_RC_IN_RGB_A_INPUT_FOG 0x03000000 +#define NV20TCL_RC_IN_RGB_A_INPUT_PRIMARY_COLOR_NV 0x04000000 +#define NV20TCL_RC_IN_RGB_A_INPUT_SECONDARY_COLOR_NV 
0x05000000 +#define NV20TCL_RC_IN_RGB_A_INPUT_TEXTURE0_ARB 0x08000000 +#define NV20TCL_RC_IN_RGB_A_INPUT_TEXTURE1_ARB 0x09000000 +#define NV20TCL_RC_IN_RGB_A_INPUT_SPARE0_NV 0x0c000000 +#define NV20TCL_RC_IN_RGB_A_INPUT_SPARE1_NV 0x0d000000 +#define NV20TCL_RC_IN_RGB_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0e000000 +#define NV20TCL_RC_IN_RGB_A_INPUT_E_TIMES_F_NV 0x0f000000 +#define NV20TCL_RC_IN_RGB_A_COMPONENT_USAGE (1 << 28) +#define NV20TCL_RC_IN_RGB_A_COMPONENT_USAGE_RGB 0x00000000 +#define NV20TCL_RC_IN_RGB_A_COMPONENT_USAGE_ALPHA 0x10000000 +#define NV20TCL_RC_IN_RGB_A_MAPPING_SHIFT 29 +#define NV20TCL_RC_IN_RGB_A_MAPPING_MASK 0xe0000000 +#define NV20TCL_RC_IN_RGB_A_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV20TCL_RC_IN_RGB_A_MAPPING_UNSIGNED_INVERT_NV 0x20000000 +#define NV20TCL_RC_IN_RGB_A_MAPPING_EXPAND_NORMAL_NV 0x40000000 +#define NV20TCL_RC_IN_RGB_A_MAPPING_EXPAND_NEGATE_NV 0x60000000 +#define NV20TCL_RC_IN_RGB_A_MAPPING_HALF_BIAS_NORMAL_NV 0x80000000 +#define NV20TCL_RC_IN_RGB_A_MAPPING_HALF_BIAS_NEGATE_NV 0xa0000000 +#define NV20TCL_RC_IN_RGB_A_MAPPING_SIGNED_IDENTITY_NV 0xc0000000 +#define NV20TCL_RC_IN_RGB_A_MAPPING_SIGNED_NEGATE_NV 0xe0000000 +#define NV20TCL_VIEWPORT_SCALE1_X 0x00000af0 +#define NV20TCL_VIEWPORT_SCALE1_Y 0x00000af4 +#define NV20TCL_VIEWPORT_SCALE1_Z 0x00000af8 +#define NV20TCL_VIEWPORT_SCALE1_W 0x00000afc +#define NV20TCL_VP_UPLOAD_INST(x) (0x00000b00+((x)*4)) +#define NV20TCL_VP_UPLOAD_INST__SIZE 0x00000004 +#define NV20TCL_VP_UPLOAD_CONST(x) (0x00000b80+((x)*4)) +#define NV20TCL_VP_UPLOAD_CONST__SIZE 0x00000004 +#define NV20TCL_LIGHT_BACK_SIDE_PRODUCT_AMBIENT_R(x) (0x00000c00+((x)*64)) +#define NV20TCL_LIGHT_BACK_SIDE_PRODUCT_AMBIENT_R__SIZE 0x00000008 +#define NV20TCL_LIGHT_BACK_SIDE_PRODUCT_AMBIENT_G(x) (0x00000c04+((x)*64)) +#define NV20TCL_LIGHT_BACK_SIDE_PRODUCT_AMBIENT_G__SIZE 0x00000008 +#define NV20TCL_LIGHT_BACK_SIDE_PRODUCT_AMBIENT_B(x) (0x00000c08+((x)*64)) +#define NV20TCL_LIGHT_BACK_SIDE_PRODUCT_AMBIENT_B__SIZE 0x00000008 +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_R(x) (0x00001000+((x)*128)) +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_R__SIZE 0x00000008 +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_G(x) (0x00001004+((x)*128)) +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_G__SIZE 0x00000008 +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_B(x) (0x00001008+((x)*128)) +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_B__SIZE 0x00000008 +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_R(x) (0x0000100c+((x)*128)) +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_R__SIZE 0x00000008 +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_G(x) (0x00001010+((x)*128)) +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_G__SIZE 0x00000008 +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_B(x) (0x00001014+((x)*128)) +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_B__SIZE 0x00000008 +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_R(x) (0x00001018+((x)*128)) +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_R__SIZE 0x00000008 +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_G(x) (0x0000101c+((x)*128)) +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_G__SIZE 0x00000008 +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_B(x) (0x00001020+((x)*128)) +#define NV20TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_B__SIZE 0x00000008 +#define NV20TCL_LIGHT_HALF_VECTOR_X(x) (0x00001028+((x)*128)) +#define NV20TCL_LIGHT_HALF_VECTOR_X__SIZE 0x00000008 +#define NV20TCL_LIGHT_HALF_VECTOR_Y(x) (0x0000102c+((x)*128)) +#define 
NV20TCL_LIGHT_HALF_VECTOR_Y__SIZE 0x00000008 +#define NV20TCL_LIGHT_HALF_VECTOR_Z(x) (0x00001030+((x)*128)) +#define NV20TCL_LIGHT_HALF_VECTOR_Z__SIZE 0x00000008 +#define NV20TCL_LIGHT_DIRECTION_X(x) (0x00001034+((x)*128)) +#define NV20TCL_LIGHT_DIRECTION_X__SIZE 0x00000008 +#define NV20TCL_LIGHT_DIRECTION_Y(x) (0x00001038+((x)*128)) +#define NV20TCL_LIGHT_DIRECTION_Y__SIZE 0x00000008 +#define NV20TCL_LIGHT_DIRECTION_Z(x) (0x0000103c+((x)*128)) +#define NV20TCL_LIGHT_DIRECTION_Z__SIZE 0x00000008 +#define NV20TCL_LIGHT_POSITION_X(x) (0x0000105c+((x)*128)) +#define NV20TCL_LIGHT_POSITION_X__SIZE 0x00000008 +#define NV20TCL_LIGHT_POSITION_Y(x) (0x00001060+((x)*128)) +#define NV20TCL_LIGHT_POSITION_Y__SIZE 0x00000008 +#define NV20TCL_LIGHT_POSITION_Z(x) (0x00001064+((x)*128)) +#define NV20TCL_LIGHT_POSITION_Z__SIZE 0x00000008 +#define NV20TCL_LIGHT_CONSTANT_ATTENUATION(x) (0x00001068+((x)*128)) +#define NV20TCL_LIGHT_CONSTANT_ATTENUATION__SIZE 0x00000008 +#define NV20TCL_LIGHT_LINEAR_ATTENUATION(x) (0x0000106c+((x)*128)) +#define NV20TCL_LIGHT_LINEAR_ATTENUATION__SIZE 0x00000008 +#define NV20TCL_LIGHT_QUADRATIC_ATTENUATION(x) (0x00001070+((x)*128)) +#define NV20TCL_LIGHT_QUADRATIC_ATTENUATION__SIZE 0x00000008 +#define NV20TCL_POLYGON_STIPPLE_ENABLE 0x0000147c +#define NV20TCL_POLYGON_STIPPLE_PATTERN(x) (0x00001480+((x)*4)) +#define NV20TCL_POLYGON_STIPPLE_PATTERN__SIZE 0x00000020 +#define NV20TCL_VERTEX_POS_3F_X 0x00001500 +#define NV20TCL_VERTEX_POS_3F_Y 0x00001504 +#define NV20TCL_VERTEX_POS_3F_Z 0x00001508 +#define NV20TCL_VERTEX_POS_4F_X 0x00001518 +#define NV20TCL_VERTEX_POS_4F_Y 0x0000151c +#define NV20TCL_VERTEX_POS_4F_Z 0x00001520 +#define NV20TCL_VERTEX_POS_3I_XY 0x00001528 +#define NV20TCL_VERTEX_POS_3I_XY_X_SHIFT 0 +#define NV20TCL_VERTEX_POS_3I_XY_X_MASK 0x0000ffff +#define NV20TCL_VERTEX_POS_3I_XY_Y_SHIFT 16 +#define NV20TCL_VERTEX_POS_3I_XY_Y_MASK 0xffff0000 +#define NV20TCL_VERTEX_POS_3I_Z 0x0000152c +#define NV20TCL_VERTEX_POS_3I_Z_Z_SHIFT 0 +#define NV20TCL_VERTEX_POS_3I_Z_Z_MASK 0x0000ffff +#define NV20TCL_VERTEX_NOR_3F_X 0x00001530 +#define NV20TCL_VERTEX_NOR_3F_Y 0x00001534 +#define NV20TCL_VERTEX_NOR_3F_Z 0x00001538 +#define NV20TCL_VERTEX_NOR_3I_XY 0x00001540 +#define NV20TCL_VERTEX_NOR_3I_XY_X_SHIFT 0 +#define NV20TCL_VERTEX_NOR_3I_XY_X_MASK 0x0000ffff +#define NV20TCL_VERTEX_NOR_3I_XY_Y_SHIFT 16 +#define NV20TCL_VERTEX_NOR_3I_XY_Y_MASK 0xffff0000 +#define NV20TCL_VERTEX_NOR_3I_Z 0x00001544 +#define NV20TCL_VERTEX_NOR_3I_Z_Z_SHIFT 0 +#define NV20TCL_VERTEX_NOR_3I_Z_Z_MASK 0x0000ffff +#define NV20TCL_VERTEX_COL_4F_X 0x00001550 +#define NV20TCL_VERTEX_COL_4F_Y 0x00001554 +#define NV20TCL_VERTEX_COL_4F_Z 0x00001558 +#define NV20TCL_VERTEX_COL_4F_W 0x0000155c +#define NV20TCL_VERTEX_COL_3F_X 0x00001560 +#define NV20TCL_VERTEX_COL_3F_Y 0x00001564 +#define NV20TCL_VERTEX_COL_3F_Z 0x00001568 +#define NV20TCL_VERTEX_COL_4I 0x0000156c +#define NV20TCL_VERTEX_COL_4I_R_SHIFT 0 +#define NV20TCL_VERTEX_COL_4I_R_MASK 0x000000ff +#define NV20TCL_VERTEX_COL_4I_G_SHIFT 8 +#define NV20TCL_VERTEX_COL_4I_G_MASK 0x0000ff00 +#define NV20TCL_VERTEX_COL_4I_B_SHIFT 16 +#define NV20TCL_VERTEX_COL_4I_B_MASK 0x00ff0000 +#define NV20TCL_VERTEX_COL_4I_A_SHIFT 24 +#define NV20TCL_VERTEX_COL_4I_A_MASK 0xff000000 +#define NV20TCL_VERTEX_COL2_3F_X 0x00001580 +#define NV20TCL_VERTEX_COL2_3F_Y 0x00001584 +#define NV20TCL_VERTEX_COL2_3F_Z 0x00001588 +#define NV20TCL_VERTEX_COL2_4I 0x0000158c +#define NV20TCL_VERTEX_COL2_4I_R_SHIFT 0 +#define NV20TCL_VERTEX_COL2_4I_R_MASK 0x000000ff +#define 
NV20TCL_VERTEX_COL2_4I_G_SHIFT 8 +#define NV20TCL_VERTEX_COL2_4I_G_MASK 0x0000ff00 +#define NV20TCL_VERTEX_COL2_4I_B_SHIFT 16 +#define NV20TCL_VERTEX_COL2_4I_B_MASK 0x00ff0000 +#define NV20TCL_VERTEX_COL2_4I_A_SHIFT 24 +#define NV20TCL_VERTEX_COL2_4I_A_MASK 0xff000000 +#define NV20TCL_VERTEX_TX0_2F_S 0x00001590 +#define NV20TCL_VERTEX_TX0_2F_T 0x00001594 +#define NV20TCL_VERTEX_TX0_2I 0x00001598 +#define NV20TCL_VERTEX_TX0_2I_S_SHIFT 0 +#define NV20TCL_VERTEX_TX0_2I_S_MASK 0x0000ffff +#define NV20TCL_VERTEX_TX0_2I_T_SHIFT 16 +#define NV20TCL_VERTEX_TX0_2I_T_MASK 0xffff0000 +#define NV20TCL_VERTEX_TX0_4F_S 0x000015a0 +#define NV20TCL_VERTEX_TX0_4F_T 0x000015a4 +#define NV20TCL_VERTEX_TX0_4F_R 0x000015a8 +#define NV20TCL_VERTEX_TX0_4F_Q 0x000015ac +#define NV20TCL_VERTEX_TX0_4I_ST 0x000015b0 +#define NV20TCL_VERTEX_TX0_4I_ST_S_SHIFT 0 +#define NV20TCL_VERTEX_TX0_4I_ST_S_MASK 0x0000ffff +#define NV20TCL_VERTEX_TX0_4I_ST_T_SHIFT 16 +#define NV20TCL_VERTEX_TX0_4I_ST_T_MASK 0xffff0000 +#define NV20TCL_VERTEX_TX0_4I_RQ 0x000015b4 +#define NV20TCL_VERTEX_TX0_4I_RQ_R_SHIFT 0 +#define NV20TCL_VERTEX_TX0_4I_RQ_R_MASK 0x0000ffff +#define NV20TCL_VERTEX_TX0_4I_RQ_Q_SHIFT 16 +#define NV20TCL_VERTEX_TX0_4I_RQ_Q_MASK 0xffff0000 +#define NV20TCL_VERTEX_TX1_2F_S 0x000015b8 +#define NV20TCL_VERTEX_TX1_2F_T 0x000015bc +#define NV20TCL_VERTEX_TX1_2I 0x000015c0 +#define NV20TCL_VERTEX_TX1_2I_S_SHIFT 0 +#define NV20TCL_VERTEX_TX1_2I_S_MASK 0x0000ffff +#define NV20TCL_VERTEX_TX1_2I_T_SHIFT 16 +#define NV20TCL_VERTEX_TX1_2I_T_MASK 0xffff0000 +#define NV20TCL_VERTEX_TX1_4F_S 0x000015c8 +#define NV20TCL_VERTEX_TX1_4F_T 0x000015cc +#define NV20TCL_VERTEX_TX1_4F_R 0x000015d0 +#define NV20TCL_VERTEX_TX1_4F_Q 0x000015d4 +#define NV20TCL_VERTEX_TX1_4I_ST 0x000015d8 +#define NV20TCL_VERTEX_TX1_4I_ST_S_SHIFT 0 +#define NV20TCL_VERTEX_TX1_4I_ST_S_MASK 0x0000ffff +#define NV20TCL_VERTEX_TX1_4I_ST_T_SHIFT 16 +#define NV20TCL_VERTEX_TX1_4I_ST_T_MASK 0xffff0000 +#define NV20TCL_VERTEX_TX1_4I_RQ 0x000015dc +#define NV20TCL_VERTEX_TX1_4I_RQ_R_SHIFT 0 +#define NV20TCL_VERTEX_TX1_4I_RQ_R_MASK 0x0000ffff +#define NV20TCL_VERTEX_TX1_4I_RQ_Q_SHIFT 16 +#define NV20TCL_VERTEX_TX1_4I_RQ_Q_MASK 0xffff0000 +#define NV20TCL_VERTEX_TX2_2F_S 0x000015e0 +#define NV20TCL_VERTEX_TX2_2F_T 0x000015e4 +#define NV20TCL_VERTEX_TX2_2I 0x000015e8 +#define NV20TCL_VERTEX_TX2_2I_S_SHIFT 0 +#define NV20TCL_VERTEX_TX2_2I_S_MASK 0x0000ffff +#define NV20TCL_VERTEX_TX2_2I_T_SHIFT 16 +#define NV20TCL_VERTEX_TX2_2I_T_MASK 0xffff0000 +#define NV20TCL_VERTEX_TX2_4F_S 0x000015f0 +#define NV20TCL_VERTEX_TX2_4F_T 0x000015f4 +#define NV20TCL_VERTEX_TX2_4F_R 0x000015f8 +#define NV20TCL_VERTEX_TX2_4F_Q 0x000015fc +#define NV20TCL_VERTEX_TX2_4I_ST 0x00001600 +#define NV20TCL_VERTEX_TX2_4I_ST_S_SHIFT 0 +#define NV20TCL_VERTEX_TX2_4I_ST_S_MASK 0x0000ffff +#define NV20TCL_VERTEX_TX2_4I_ST_T_SHIFT 16 +#define NV20TCL_VERTEX_TX2_4I_ST_T_MASK 0xffff0000 +#define NV20TCL_VERTEX_TX2_4I_RQ 0x00001604 +#define NV20TCL_VERTEX_TX2_4I_RQ_R_SHIFT 0 +#define NV20TCL_VERTEX_TX2_4I_RQ_R_MASK 0x0000ffff +#define NV20TCL_VERTEX_TX2_4I_RQ_Q_SHIFT 16 +#define NV20TCL_VERTEX_TX2_4I_RQ_Q_MASK 0xffff0000 +#define NV20TCL_VERTEX_TX3_2F_S 0x00001608 +#define NV20TCL_VERTEX_TX3_2F_T 0x0000160c +#define NV20TCL_VERTEX_TX3_2I 0x00001610 +#define NV20TCL_VERTEX_TX3_2I_S_SHIFT 0 +#define NV20TCL_VERTEX_TX3_2I_S_MASK 0x0000ffff +#define NV20TCL_VERTEX_TX3_2I_T_SHIFT 16 +#define NV20TCL_VERTEX_TX3_2I_T_MASK 0xffff0000 +#define NV20TCL_VERTEX_TX3_4F_S 0x00001620 +#define NV20TCL_VERTEX_TX3_4F_T 
0x00001624 +#define NV20TCL_VERTEX_TX3_4F_R 0x00001628 +#define NV20TCL_VERTEX_TX3_4F_Q 0x0000162c +#define NV20TCL_VERTEX_TX3_4I_ST 0x00001630 +#define NV20TCL_VERTEX_TX3_4I_ST_S_SHIFT 0 +#define NV20TCL_VERTEX_TX3_4I_ST_S_MASK 0x0000ffff +#define NV20TCL_VERTEX_TX3_4I_ST_T_SHIFT 16 +#define NV20TCL_VERTEX_TX3_4I_ST_T_MASK 0xffff0000 +#define NV20TCL_VERTEX_TX3_4I_RQ 0x00001634 +#define NV20TCL_VERTEX_TX3_4I_RQ_R_SHIFT 0 +#define NV20TCL_VERTEX_TX3_4I_RQ_R_MASK 0x0000ffff +#define NV20TCL_VERTEX_TX3_4I_RQ_Q_SHIFT 16 +#define NV20TCL_VERTEX_TX3_4I_RQ_Q_MASK 0xffff0000 +#define NV20TCL_VERTEX_FOG_1F 0x00001698 +#define NV20TCL_EDGEFLAG_ENABLE 0x000016bc +#define NV20TCL_VTXBUF_ADDRESS(x) (0x00001720+((x)*4)) +#define NV20TCL_VTXBUF_ADDRESS__SIZE 0x00000010 +#define NV20TCL_VTXBUF_ADDRESS_DMA1 (1 << 31) +#define NV20TCL_VTXBUF_ADDRESS_OFFSET_SHIFT 0 +#define NV20TCL_VTXBUF_ADDRESS_OFFSET_MASK 0x0fffffff +#define NV20TCL_VTXFMT(x) (0x00001760+((x)*4)) +#define NV20TCL_VTXFMT__SIZE 0x00000010 +#define NV20TCL_VTXFMT_TYPE_SHIFT 0 +#define NV20TCL_VTXFMT_TYPE_MASK 0x0000000f +#define NV20TCL_VTXFMT_TYPE_FLOAT 0x00000002 +#define NV20TCL_VTXFMT_TYPE_UBYTE 0x00000004 +#define NV20TCL_VTXFMT_TYPE_USHORT 0x00000005 +#define NV20TCL_VTXFMT_SIZE_SHIFT 4 +#define NV20TCL_VTXFMT_SIZE_MASK 0x000000f0 +#define NV20TCL_VTXFMT_STRIDE_SHIFT 8 +#define NV20TCL_VTXFMT_STRIDE_MASK 0x0000ff00 +#define NV20TCL_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_R 0x000017a0 +#define NV20TCL_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_G 0x000017a4 +#define NV20TCL_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_B 0x000017a8 +#define NV20TCL_COLOR_MATERIAL_BACK_A 0x000017ac +#define NV20TCL_COLOR_MATERIAL_BACK_R 0x000017b0 +#define NV20TCL_COLOR_MATERIAL_BACK_G 0x000017b4 +#define NV20TCL_COLOR_MATERIAL_BACK_B 0x000017b8 +#define NV20TCL_COLOR_LOGIC_OP_ENABLE 0x000017bc +#define NV20TCL_COLOR_LOGIC_OP_OP 0x000017c0 +#define NV20TCL_COLOR_LOGIC_OP_OP_CLEAR 0x00001500 +#define NV20TCL_COLOR_LOGIC_OP_OP_AND 0x00001501 +#define NV20TCL_COLOR_LOGIC_OP_OP_AND_REVERSE 0x00001502 +#define NV20TCL_COLOR_LOGIC_OP_OP_COPY 0x00001503 +#define NV20TCL_COLOR_LOGIC_OP_OP_AND_INVERTED 0x00001504 +#define NV20TCL_COLOR_LOGIC_OP_OP_NOOP 0x00001505 +#define NV20TCL_COLOR_LOGIC_OP_OP_XOR 0x00001506 +#define NV20TCL_COLOR_LOGIC_OP_OP_OR 0x00001507 +#define NV20TCL_COLOR_LOGIC_OP_OP_NOR 0x00001508 +#define NV20TCL_COLOR_LOGIC_OP_OP_EQUIV 0x00001509 +#define NV20TCL_COLOR_LOGIC_OP_OP_INVERT 0x0000150a +#define NV20TCL_COLOR_LOGIC_OP_OP_OR_REVERSE 0x0000150b +#define NV20TCL_COLOR_LOGIC_OP_OP_COPY_INVERTED 0x0000150c +#define NV20TCL_COLOR_LOGIC_OP_OP_OR_INVERTED 0x0000150d +#define NV20TCL_COLOR_LOGIC_OP_OP_NAND 0x0000150e +#define NV20TCL_COLOR_LOGIC_OP_OP_SET 0x0000150f +#define NV20TCL_LIGHT_MODEL_TWO_SIDE_ENABLE 0x000017c4 +#define NV20TCL_TX_SHADER_CULL_MODE 0x000017f8 +#define NV20TCL_TX_SHADER_CULL_MODE_TX0_S (1 << 0) +#define NV20TCL_TX_SHADER_CULL_MODE_TX0_S_GEQUAL 0x00000000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX0_S_LESS 0x00000001 +#define NV20TCL_TX_SHADER_CULL_MODE_TX0_T (1 << 1) +#define NV20TCL_TX_SHADER_CULL_MODE_TX0_T_GEQUAL 0x00000000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX0_T_LESS 0x00000002 +#define NV20TCL_TX_SHADER_CULL_MODE_TX0_R (1 << 2) +#define NV20TCL_TX_SHADER_CULL_MODE_TX0_R_GEQUAL 0x00000000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX0_R_LESS 0x00000004 +#define NV20TCL_TX_SHADER_CULL_MODE_TX0_Q (1 << 3) +#define NV20TCL_TX_SHADER_CULL_MODE_TX0_Q_GEQUAL 0x00000000 +#define 
NV20TCL_TX_SHADER_CULL_MODE_TX0_Q_LESS 0x00000008 +#define NV20TCL_TX_SHADER_CULL_MODE_TX1_S (1 << 4) +#define NV20TCL_TX_SHADER_CULL_MODE_TX1_S_GEQUAL 0x00000000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX1_S_LESS 0x00000010 +#define NV20TCL_TX_SHADER_CULL_MODE_TX1_T (1 << 5) +#define NV20TCL_TX_SHADER_CULL_MODE_TX1_T_GEQUAL 0x00000000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX1_T_LESS 0x00000020 +#define NV20TCL_TX_SHADER_CULL_MODE_TX1_R (1 << 6) +#define NV20TCL_TX_SHADER_CULL_MODE_TX1_R_GEQUAL 0x00000000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX1_R_LESS 0x00000040 +#define NV20TCL_TX_SHADER_CULL_MODE_TX1_Q (1 << 7) +#define NV20TCL_TX_SHADER_CULL_MODE_TX1_Q_GEQUAL 0x00000000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX1_Q_LESS 0x00000080 +#define NV20TCL_TX_SHADER_CULL_MODE_TX2_S (1 << 8) +#define NV20TCL_TX_SHADER_CULL_MODE_TX2_S_GEQUAL 0x00000000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX2_S_LESS 0x00000100 +#define NV20TCL_TX_SHADER_CULL_MODE_TX2_T (1 << 9) +#define NV20TCL_TX_SHADER_CULL_MODE_TX2_T_GEQUAL 0x00000000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX2_T_LESS 0x00000200 +#define NV20TCL_TX_SHADER_CULL_MODE_TX2_R (1 << 10) +#define NV20TCL_TX_SHADER_CULL_MODE_TX2_R_GEQUAL 0x00000000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX2_R_LESS 0x00000400 +#define NV20TCL_TX_SHADER_CULL_MODE_TX2_Q (1 << 11) +#define NV20TCL_TX_SHADER_CULL_MODE_TX2_Q_GEQUAL 0x00000000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX2_Q_LESS 0x00000800 +#define NV20TCL_TX_SHADER_CULL_MODE_TX3_S (1 << 12) +#define NV20TCL_TX_SHADER_CULL_MODE_TX3_S_GEQUAL 0x00000000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX3_S_LESS 0x00001000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX3_T (1 << 13) +#define NV20TCL_TX_SHADER_CULL_MODE_TX3_T_GEQUAL 0x00000000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX3_T_LESS 0x00002000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX3_R (1 << 14) +#define NV20TCL_TX_SHADER_CULL_MODE_TX3_R_GEQUAL 0x00000000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX3_R_LESS 0x00004000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX3_Q (1 << 15) +#define NV20TCL_TX_SHADER_CULL_MODE_TX3_Q_GEQUAL 0x00000000 +#define NV20TCL_TX_SHADER_CULL_MODE_TX3_Q_LESS 0x00008000 +#define NV20TCL_VERTEX_BEGIN_END 0x000017fc +#define NV20TCL_VERTEX_BEGIN_END_STOP 0x00000000 +#define NV20TCL_VERTEX_BEGIN_END_POINTS 0x00000001 +#define NV20TCL_VERTEX_BEGIN_END_LINES 0x00000002 +#define NV20TCL_VERTEX_BEGIN_END_LINE_LOOP 0x00000003 +#define NV20TCL_VERTEX_BEGIN_END_LINE_STRIP 0x00000004 +#define NV20TCL_VERTEX_BEGIN_END_TRIANGLES 0x00000005 +#define NV20TCL_VERTEX_BEGIN_END_TRIANGLE_STRIP 0x00000006 +#define NV20TCL_VERTEX_BEGIN_END_TRIANGLE_FAN 0x00000007 +#define NV20TCL_VERTEX_BEGIN_END_QUADS 0x00000008 +#define NV20TCL_VERTEX_BEGIN_END_QUAD_STRIP 0x00000009 +#define NV20TCL_VERTEX_BEGIN_END_POLYGON 0x0000000a +#define NV20TCL_VB_ELEMENT_U16 0x00001800 +#define NV20TCL_VB_ELEMENT_U16_I0_SHIFT 0 +#define NV20TCL_VB_ELEMENT_U16_I0_MASK 0x0000ffff +#define NV20TCL_VB_ELEMENT_U16_I1_SHIFT 16 +#define NV20TCL_VB_ELEMENT_U16_I1_MASK 0xffff0000 +#define NV20TCL_VB_VERTEX_BATCH 0x00001810 +#define NV20TCL_VB_VERTEX_BATCH_OFFSET_SHIFT 0 +#define NV20TCL_VB_VERTEX_BATCH_OFFSET_MASK 0x00ffffff +#define NV20TCL_VB_VERTEX_BATCH_COUNT_SHIFT 24 +#define NV20TCL_VB_VERTEX_BATCH_COUNT_MASK 0xff000000 +#define NV20TCL_VERTEX_DATA 0x00001818 +#define NV20TCL_TX_SHADER_CONST_EYE_X 0x0000181c +#define NV20TCL_TX_SHADER_CONST_EYE_Y 0x00001820 +#define NV20TCL_TX_SHADER_CONST_EYE_Z 0x00001824 +#define NV20TCL_VTX_ATTR_4F_X(x) (0x00001a00+((x)*16)) +#define NV20TCL_VTX_ATTR_4F_X__SIZE 0x00000010 +#define 
NV20TCL_VTX_ATTR_4F_Y(x) (0x00001a04+((x)*16)) +#define NV20TCL_VTX_ATTR_4F_Y__SIZE 0x00000010 +#define NV20TCL_VTX_ATTR_4F_Z(x) (0x00001a08+((x)*16)) +#define NV20TCL_VTX_ATTR_4F_Z__SIZE 0x00000010 +#define NV20TCL_VTX_ATTR_4F_W(x) (0x00001a0c+((x)*16)) +#define NV20TCL_VTX_ATTR_4F_W__SIZE 0x00000010 +#define NV20TCL_TX_OFFSET(x) (0x00001b00+((x)*64)) +#define NV20TCL_TX_OFFSET__SIZE 0x00000004 +#define NV20TCL_TX_FORMAT(x) (0x00001b04+((x)*64)) +#define NV20TCL_TX_FORMAT__SIZE 0x00000004 +#define NV20TCL_TX_FORMAT_DMA0 (1 << 0) +#define NV20TCL_TX_FORMAT_DMA1 (1 << 1) +#define NV20TCL_TX_FORMAT_CUBIC (1 << 2) +#define NV20TCL_TX_FORMAT_NO_BORDER (1 << 3) +#define NV20TCL_TX_FORMAT_DIMS_SHIFT 4 +#define NV20TCL_TX_FORMAT_DIMS_MASK 0x000000f0 +#define NV20TCL_TX_FORMAT_DIMS_1D 0x00000010 +#define NV20TCL_TX_FORMAT_DIMS_2D 0x00000020 +#define NV20TCL_TX_FORMAT_DIMS_3D 0x00000030 +#define NV20TCL_TX_FORMAT_FORMAT_SHIFT 8 +#define NV20TCL_TX_FORMAT_FORMAT_MASK 0x0000ff00 +#define NV20TCL_TX_FORMAT_FORMAT_L8 0x00000000 +#define NV20TCL_TX_FORMAT_FORMAT_A8 0x00000100 +#define NV20TCL_TX_FORMAT_FORMAT_A1R5G5B5 0x00000200 +#define NV20TCL_TX_FORMAT_FORMAT_A8_RECT 0x00000300 +#define NV20TCL_TX_FORMAT_FORMAT_A4R4G4B4 0x00000400 +#define NV20TCL_TX_FORMAT_FORMAT_R5G6B5 0x00000500 +#define NV20TCL_TX_FORMAT_FORMAT_A8R8G8B8 0x00000600 +#define NV20TCL_TX_FORMAT_FORMAT_X8R8G8B8 0x00000700 +#define NV20TCL_TX_FORMAT_FORMAT_INDEX8 0x00000b00 +#define NV20TCL_TX_FORMAT_FORMAT_DXT1 0x00000c00 +#define NV20TCL_TX_FORMAT_FORMAT_DXT3 0x00000e00 +#define NV20TCL_TX_FORMAT_FORMAT_DXT5 0x00000f00 +#define NV20TCL_TX_FORMAT_FORMAT_A1R5G5B5_RECT 0x00001000 +#define NV20TCL_TX_FORMAT_FORMAT_R5G6B5_RECT 0x00001100 +#define NV20TCL_TX_FORMAT_FORMAT_A8R8G8B8_RECT 0x00001200 +#define NV20TCL_TX_FORMAT_FORMAT_L8_RECT 0x00001300 +#define NV20TCL_TX_FORMAT_FORMAT_A8L8 0x00001a00 +#define NV20TCL_TX_FORMAT_FORMAT_A8_RECT2 0x00001b00 +#define NV20TCL_TX_FORMAT_FORMAT_A4R4G4B4_RECT 0x00001d00 +#define NV20TCL_TX_FORMAT_FORMAT_R8G8B8_RECT 0x00001e00 +#define NV20TCL_TX_FORMAT_FORMAT_L8A8_RECT 0x00002000 +#define NV20TCL_TX_FORMAT_FORMAT_DSDT 0x00002800 +#define NV20TCL_TX_FORMAT_FORMAT_A16 0x00003200 +#define NV20TCL_TX_FORMAT_FORMAT_HILO16 0x00003300 +#define NV20TCL_TX_FORMAT_FORMAT_A16_RECT 0x00003500 +#define NV20TCL_TX_FORMAT_FORMAT_HILO16_RECT 0x00003600 +#define NV20TCL_TX_FORMAT_FORMAT_HILO8 0x00004400 +#define NV20TCL_TX_FORMAT_FORMAT_SIGNED_HILO8 0x00004500 +#define NV20TCL_TX_FORMAT_FORMAT_HILO8_RECT 0x00004600 +#define NV20TCL_TX_FORMAT_FORMAT_SIGNED_HILO8_RECT 0x00004700 +#define NV20TCL_TX_FORMAT_FORMAT_FLOAT_RGBA16_NV 0x00004a00 +#define NV20TCL_TX_FORMAT_FORMAT_FLOAT_RGBA32_NV 0x00004b00 +#define NV20TCL_TX_FORMAT_FORMAT_FLOAT_R32_NV 0x00004c00 +#define NV20TCL_TX_FORMAT_MIPMAP (1 << 19) +#define NV20TCL_TX_FORMAT_BASE_SIZE_U_SHIFT 20 +#define NV20TCL_TX_FORMAT_BASE_SIZE_U_MASK 0x00f00000 +#define NV20TCL_TX_FORMAT_BASE_SIZE_V_SHIFT 24 +#define NV20TCL_TX_FORMAT_BASE_SIZE_V_MASK 0x0f000000 +#define NV20TCL_TX_FORMAT_BASE_SIZE_W_SHIFT 28 +#define NV20TCL_TX_FORMAT_BASE_SIZE_W_MASK 0xf0000000 +#define NV20TCL_TX_WRAP(x) (0x00001b08+((x)*64)) +#define NV20TCL_TX_WRAP__SIZE 0x00000004 +#define NV20TCL_TX_WRAP_S_SHIFT 0 +#define NV20TCL_TX_WRAP_S_MASK 0x000000ff +#define NV20TCL_TX_WRAP_S_REPEAT 0x00000001 +#define NV20TCL_TX_WRAP_S_MIRRORED_REPEAT 0x00000002 +#define NV20TCL_TX_WRAP_S_CLAMP_TO_EDGE 0x00000003 +#define NV20TCL_TX_WRAP_S_CLAMP_TO_BORDER 0x00000004 +#define NV20TCL_TX_WRAP_S_CLAMP 0x00000005 
+#define NV20TCL_TX_WRAP_T_SHIFT 8 +#define NV20TCL_TX_WRAP_T_MASK 0x00000f00 +#define NV20TCL_TX_WRAP_T_REPEAT 0x00000100 +#define NV20TCL_TX_WRAP_T_MIRRORED_REPEAT 0x00000200 +#define NV20TCL_TX_WRAP_T_CLAMP_TO_EDGE 0x00000300 +#define NV20TCL_TX_WRAP_T_CLAMP_TO_BORDER 0x00000400 +#define NV20TCL_TX_WRAP_T_CLAMP 0x00000500 +#define NV20TCL_TX_WRAP_R_SHIFT 16 +#define NV20TCL_TX_WRAP_R_MASK 0x000f0000 +#define NV20TCL_TX_WRAP_R_REPEAT 0x00010000 +#define NV20TCL_TX_WRAP_R_MIRRORED_REPEAT 0x00020000 +#define NV20TCL_TX_WRAP_R_CLAMP_TO_EDGE 0x00030000 +#define NV20TCL_TX_WRAP_R_CLAMP_TO_BORDER 0x00040000 +#define NV20TCL_TX_WRAP_R_CLAMP 0x00050000 +#define NV20TCL_TX_ENABLE(x) (0x00001b0c+((x)*64)) +#define NV20TCL_TX_ENABLE__SIZE 0x00000004 +#define NV20TCL_TX_ENABLE_ANISO_SHIFT 4 +#define NV20TCL_TX_ENABLE_ANISO_MASK 0x00000030 +#define NV20TCL_TX_ENABLE_ANISO_NONE 0x00000000 +#define NV20TCL_TX_ENABLE_ANISO_2X 0x00000010 +#define NV20TCL_TX_ENABLE_ANISO_4X 0x00000020 +#define NV20TCL_TX_ENABLE_ANISO_8X 0x00000030 +#define NV20TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT 14 +#define NV20TCL_TX_ENABLE_MIPMAP_MAX_LOD_MASK 0x0003c000 +#define NV20TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT 26 +#define NV20TCL_TX_ENABLE_MIPMAP_MIN_LOD_MASK 0x3c000000 +#define NV20TCL_TX_ENABLE_ENABLE (1 << 30) +#define NV20TCL_TX_SWIZZLE(x) (0x00001b10+((x)*64)) +#define NV20TCL_TX_SWIZZLE__SIZE 0x00000004 +#define NV20TCL_TX_SWIZZLE_RECT_PITCH_SHIFT 16 +#define NV20TCL_TX_SWIZZLE_RECT_PITCH_MASK 0xffff0000 +#define NV20TCL_TX_FILTER(x) (0x00001b14+((x)*64)) +#define NV20TCL_TX_FILTER__SIZE 0x00000004 +#define NV20TCL_TX_FILTER_LOD_BIAS_SHIFT 8 +#define NV20TCL_TX_FILTER_LOD_BIAS_MASK 0x00000f00 +#define NV20TCL_TX_FILTER_MINIFY_SHIFT 16 +#define NV20TCL_TX_FILTER_MINIFY_MASK 0x000f0000 +#define NV20TCL_TX_FILTER_MINIFY_NEAREST 0x00010000 +#define NV20TCL_TX_FILTER_MINIFY_LINEAR 0x00020000 +#define NV20TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST 0x00030000 +#define NV20TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST 0x00040000 +#define NV20TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR 0x00050000 +#define NV20TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR 0x00060000 +#define NV20TCL_TX_FILTER_MAGNIFY_SHIFT 24 +#define NV20TCL_TX_FILTER_MAGNIFY_MASK 0x0f000000 +#define NV20TCL_TX_FILTER_MAGNIFY_NEAREST 0x01000000 +#define NV20TCL_TX_FILTER_MAGNIFY_LINEAR 0x02000000 +#define NV20TCL_TX_NPOT_SIZE(x) (0x00001b1c+((x)*64)) +#define NV20TCL_TX_NPOT_SIZE__SIZE 0x00000004 +#define NV20TCL_TX_NPOT_SIZE_H_SHIFT 0 +#define NV20TCL_TX_NPOT_SIZE_H_MASK 0x0000ffff +#define NV20TCL_TX_NPOT_SIZE_W_SHIFT 16 +#define NV20TCL_TX_NPOT_SIZE_W_MASK 0xffff0000 +#define NV20TCL_TX_PALETTE_OFFSET(x) (0x00001b20+((x)*64)) +#define NV20TCL_TX_PALETTE_OFFSET__SIZE 0x00000004 +#define NV20TCL_TX_BORDER_COLOR(x) (0x00001b24+((x)*64)) +#define NV20TCL_TX_BORDER_COLOR__SIZE 0x00000004 +#define NV20TCL_TX_BORDER_COLOR_B_SHIFT 0 +#define NV20TCL_TX_BORDER_COLOR_B_MASK 0x000000ff +#define NV20TCL_TX_BORDER_COLOR_G_SHIFT 8 +#define NV20TCL_TX_BORDER_COLOR_G_MASK 0x0000ff00 +#define NV20TCL_TX_BORDER_COLOR_R_SHIFT 16 +#define NV20TCL_TX_BORDER_COLOR_R_MASK 0x00ff0000 +#define NV20TCL_TX_BORDER_COLOR_A_SHIFT 24 +#define NV20TCL_TX_BORDER_COLOR_A_MASK 0xff000000 +#define NV20TCL_TX_SHADER_OFFSET_MATRIX00(x) (0x00001b28+((x)*64)) +#define NV20TCL_TX_SHADER_OFFSET_MATRIX00__SIZE 0x00000004 +#define NV20TCL_TX_SHADER_OFFSET_MATRIX01(x) (0x00001b2c+((x)*64)) +#define NV20TCL_TX_SHADER_OFFSET_MATRIX01__SIZE 0x00000004 +#define NV20TCL_TX_SHADER_OFFSET_MATRIX11(x) (0x00001b30+((x)*64)) 
+#define NV20TCL_TX_SHADER_OFFSET_MATRIX11__SIZE 0x00000004 +#define NV20TCL_TX_SHADER_OFFSET_MATRIX10(x) (0x00001b34+((x)*64)) +#define NV20TCL_TX_SHADER_OFFSET_MATRIX10__SIZE 0x00000004 +#define NV20TCL_DEPTH_UNK17D8 0x00001d78 +#define NV20TCL_DEPTH_UNK17D8_CLAMP_SHIFT 4 +#define NV20TCL_DEPTH_UNK17D8_CLAMP_MASK 0x000000f0 +#define NV20TCL_MULTISAMPLE_CONTROL 0x00001d7c +#define NV20TCL_CLEAR_DEPTH_VALUE 0x00001d8c +#define NV20TCL_CLEAR_VALUE 0x00001d90 +#define NV20TCL_CLEAR_BUFFERS 0x00001d94 +#define NV20TCL_CLEAR_BUFFERS_COLOR_A (1 << 7) +#define NV20TCL_CLEAR_BUFFERS_COLOR_B (1 << 6) +#define NV20TCL_CLEAR_BUFFERS_COLOR_G (1 << 5) +#define NV20TCL_CLEAR_BUFFERS_COLOR_R (1 << 4) +#define NV20TCL_CLEAR_BUFFERS_STENCIL (1 << 1) +#define NV20TCL_CLEAR_BUFFERS_DEPTH (1 << 0) +#define NV20TCL_RC_COLOR0 0x00001e20 +#define NV20TCL_RC_COLOR0_B_SHIFT 0 +#define NV20TCL_RC_COLOR0_B_MASK 0x000000ff +#define NV20TCL_RC_COLOR0_G_SHIFT 8 +#define NV20TCL_RC_COLOR0_G_MASK 0x0000ff00 +#define NV20TCL_RC_COLOR0_R_SHIFT 16 +#define NV20TCL_RC_COLOR0_R_MASK 0x00ff0000 +#define NV20TCL_RC_COLOR0_A_SHIFT 24 +#define NV20TCL_RC_COLOR0_A_MASK 0xff000000 +#define NV20TCL_RC_COLOR1 0x00001e24 +#define NV20TCL_RC_COLOR1_B_SHIFT 0 +#define NV20TCL_RC_COLOR1_B_MASK 0x000000ff +#define NV20TCL_RC_COLOR1_G_SHIFT 8 +#define NV20TCL_RC_COLOR1_G_MASK 0x0000ff00 +#define NV20TCL_RC_COLOR1_R_SHIFT 16 +#define NV20TCL_RC_COLOR1_R_MASK 0x00ff0000 +#define NV20TCL_RC_COLOR1_A_SHIFT 24 +#define NV20TCL_RC_COLOR1_A_MASK 0xff000000 +#define NV20TCL_BACK_MATERIAL_SHININESS(x) (0x00001e28+((x)*4)) +#define NV20TCL_BACK_MATERIAL_SHININESS__SIZE 0x00000006 +#define NV20TCL_RC_OUT_RGB(x) (0x00001e40+((x)*4)) +#define NV20TCL_RC_OUT_RGB__SIZE 0x00000008 +#define NV20TCL_RC_OUT_RGB_CD_OUTPUT_SHIFT 0 +#define NV20TCL_RC_OUT_RGB_CD_OUTPUT_MASK 0x0000000f +#define NV20TCL_RC_OUT_RGB_CD_OUTPUT_ZERO 0x00000000 +#define NV20TCL_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR0_NV 0x00000001 +#define NV20TCL_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR1_NV 0x00000002 +#define NV20TCL_RC_OUT_RGB_CD_OUTPUT_FOG 0x00000003 +#define NV20TCL_RC_OUT_RGB_CD_OUTPUT_PRIMARY_COLOR_NV 0x00000004 +#define NV20TCL_RC_OUT_RGB_CD_OUTPUT_SECONDARY_COLOR_NV 0x00000005 +#define NV20TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE0_ARB 0x00000008 +#define NV20TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE1_ARB 0x00000009 +#define NV20TCL_RC_OUT_RGB_CD_OUTPUT_SPARE0_NV 0x0000000c +#define NV20TCL_RC_OUT_RGB_CD_OUTPUT_SPARE1_NV 0x0000000d +#define NV20TCL_RC_OUT_RGB_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0000000e +#define NV20TCL_RC_OUT_RGB_CD_OUTPUT_E_TIMES_F_NV 0x0000000f +#define NV20TCL_RC_OUT_RGB_AB_OUTPUT_SHIFT 4 +#define NV20TCL_RC_OUT_RGB_AB_OUTPUT_MASK 0x000000f0 +#define NV20TCL_RC_OUT_RGB_AB_OUTPUT_ZERO 0x00000000 +#define NV20TCL_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR0_NV 0x00000010 +#define NV20TCL_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR1_NV 0x00000020 +#define NV20TCL_RC_OUT_RGB_AB_OUTPUT_FOG 0x00000030 +#define NV20TCL_RC_OUT_RGB_AB_OUTPUT_PRIMARY_COLOR_NV 0x00000040 +#define NV20TCL_RC_OUT_RGB_AB_OUTPUT_SECONDARY_COLOR_NV 0x00000050 +#define NV20TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE0_ARB 0x00000080 +#define NV20TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE1_ARB 0x00000090 +#define NV20TCL_RC_OUT_RGB_AB_OUTPUT_SPARE0_NV 0x000000c0 +#define NV20TCL_RC_OUT_RGB_AB_OUTPUT_SPARE1_NV 0x000000d0 +#define NV20TCL_RC_OUT_RGB_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000000e0 +#define NV20TCL_RC_OUT_RGB_AB_OUTPUT_E_TIMES_F_NV 0x000000f0 +#define NV20TCL_RC_OUT_RGB_SUM_OUTPUT_SHIFT 8 +#define 
NV20TCL_RC_OUT_RGB_SUM_OUTPUT_MASK 0x00000f00 +#define NV20TCL_RC_OUT_RGB_SUM_OUTPUT_ZERO 0x00000000 +#define NV20TCL_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV20TCL_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV20TCL_RC_OUT_RGB_SUM_OUTPUT_FOG 0x00000300 +#define NV20TCL_RC_OUT_RGB_SUM_OUTPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV20TCL_RC_OUT_RGB_SUM_OUTPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV20TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE0_ARB 0x00000800 +#define NV20TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE1_ARB 0x00000900 +#define NV20TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE0_NV 0x00000c00 +#define NV20TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE1_NV 0x00000d00 +#define NV20TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV20TCL_RC_OUT_RGB_SUM_OUTPUT_E_TIMES_F_NV 0x00000f00 +#define NV20TCL_RC_OUT_RGB_CD_DOT_PRODUCT (1 << 12) +#define NV20TCL_RC_OUT_RGB_AB_DOT_PRODUCT (1 << 13) +#define NV20TCL_RC_OUT_RGB_MUX_SUM (1 << 14) +#define NV20TCL_RC_OUT_RGB_BIAS (1 << 15) +#define NV20TCL_RC_OUT_RGB_BIAS_NONE 0x00000000 +#define NV20TCL_RC_OUT_RGB_BIAS_BIAS_BY_NEGATIVE_ONE_HALF_NV 0x00008000 +#define NV20TCL_RC_OUT_RGB_SCALE_SHIFT 17 +#define NV20TCL_RC_OUT_RGB_SCALE_MASK 0x00000000 +#define NV20TCL_RC_OUT_RGB_SCALE_NONE 0x00000000 +#define NV20TCL_RC_OUT_RGB_SCALE_SCALE_BY_TWO_NV 0x00020000 +#define NV20TCL_RC_OUT_RGB_SCALE_SCALE_BY_FOUR_NV 0x00040000 +#define NV20TCL_RC_OUT_RGB_SCALE_SCALE_BY_ONE_HALF_NV 0x00060000 +#define NV20TCL_RC_ENABLE 0x00001e60 +#define NV20TCL_RC_ENABLE_NUM_COMBINERS_SHIFT 0 +#define NV20TCL_RC_ENABLE_NUM_COMBINERS_MASK 0x0000000f +#define NV20TCL_TX_RCOMP 0x00001e6c +#define NV20TCL_TX_RCOMP_NEVER 0x00000000 +#define NV20TCL_TX_RCOMP_GREATER 0x00000001 +#define NV20TCL_TX_RCOMP_EQUAL 0x00000002 +#define NV20TCL_TX_RCOMP_GEQUAL 0x00000003 +#define NV20TCL_TX_RCOMP_LESS 0x00000004 +#define NV20TCL_TX_RCOMP_NOTEQUAL 0x00000005 +#define NV20TCL_TX_RCOMP_LEQUAL 0x00000006 +#define NV20TCL_TX_RCOMP_ALWAYS 0x00000007 +#define NV20TCL_TX_SHADER_OP 0x00001e70 +#define NV20TCL_TX_SHADER_OP_TX0_SHIFT 0 +#define NV20TCL_TX_SHADER_OP_TX0_MASK 0x0000001f +#define NV20TCL_TX_SHADER_OP_TX0_NONE 0x00000000 +#define NV20TCL_TX_SHADER_OP_TX0_TEXTURE_2D 0x00000001 +#define NV20TCL_TX_SHADER_OP_TX0_PASS_THROUGH 0x00000004 +#define NV20TCL_TX_SHADER_OP_TX0_CULL_FRAGMENT 0x00000005 +#define NV20TCL_TX_SHADER_OP_TX0_OFFSET_TEXTURE_2D 0x00000006 +#define NV20TCL_TX_SHADER_OP_TX0_DOT_PRODUCT_TEXTURE_2D 0x00000009 +#define NV20TCL_TX_SHADER_OP_TX0_DOT_PRODUCT_DEPTH_REPLACE 0x0000000a +#define NV20TCL_TX_SHADER_OP_TX0_DEPENDANT_AR_TEXTURE_2D 0x0000000f +#define NV20TCL_TX_SHADER_OP_TX0_DEPENDANT_GB_TEXTURE_2D 0x00000010 +#define NV20TCL_TX_SHADER_OP_TX0_DOT_PRODUCT 0x00000011 +#define NV20TCL_TX_SHADER_OP_TX1_SHIFT 5 +#define NV20TCL_TX_SHADER_OP_TX1_MASK 0x000003e0 +#define NV20TCL_TX_SHADER_OP_TX1_NONE 0x00000000 +#define NV20TCL_TX_SHADER_OP_TX1_TEXTURE_2D 0x00000020 +#define NV20TCL_TX_SHADER_OP_TX1_PASS_THROUGH 0x00000080 +#define NV20TCL_TX_SHADER_OP_TX1_CULL_FRAGMENT 0x000000a0 +#define NV20TCL_TX_SHADER_OP_TX1_OFFSET_TEXTURE_2D 0x000000c0 +#define NV20TCL_TX_SHADER_OP_TX1_DOT_PRODUCT_TEXTURE_2D 0x00000120 +#define NV20TCL_TX_SHADER_OP_TX1_DOT_PRODUCT_DEPTH_REPLACE 0x00000140 +#define NV20TCL_TX_SHADER_OP_TX1_DEPENDANT_AR_TEXTURE_2D 0x000001e0 +#define NV20TCL_TX_SHADER_OP_TX1_DEPENDANT_GB_TEXTURE_2D 0x00000200 +#define NV20TCL_TX_SHADER_OP_TX1_DOT_PRODUCT 0x00000220 +#define NV20TCL_TX_SHADER_OP_TX2_SHIFT 10 +#define NV20TCL_TX_SHADER_OP_TX2_MASK 
0x00007c00 +#define NV20TCL_TX_SHADER_OP_TX2_NONE 0x00000000 +#define NV20TCL_TX_SHADER_OP_TX2_TEXTURE_2D 0x00000400 +#define NV20TCL_TX_SHADER_OP_TX2_PASS_THROUGH 0x00001000 +#define NV20TCL_TX_SHADER_OP_TX2_CULL_FRAGMENT 0x00001400 +#define NV20TCL_TX_SHADER_OP_TX2_OFFSET_TEXTURE_2D 0x00001800 +#define NV20TCL_TX_SHADER_OP_TX2_DOT_PRODUCT_TEXTURE_2D 0x00002400 +#define NV20TCL_TX_SHADER_OP_TX2_DOT_PRODUCT_DEPTH_REPLACE 0x00002800 +#define NV20TCL_TX_SHADER_OP_TX2_DEPENDANT_AR_TEXTURE_2D 0x00003c00 +#define NV20TCL_TX_SHADER_OP_TX2_DEPENDANT_GB_TEXTURE_2D 0x00004000 +#define NV20TCL_TX_SHADER_OP_TX2_DOT_PRODUCT 0x00004400 +#define NV20TCL_TX_SHADER_OP_TX3_SHIFT 15 +#define NV20TCL_TX_SHADER_OP_TX3_MASK 0x000f8000 +#define NV20TCL_TX_SHADER_OP_TX3_NONE 0x00000000 +#define NV20TCL_TX_SHADER_OP_TX3_TEXTURE_2D 0x00008000 +#define NV20TCL_TX_SHADER_OP_TX3_PASS_THROUGH 0x00020000 +#define NV20TCL_TX_SHADER_OP_TX3_CULL_FRAGMENT 0x00028000 +#define NV20TCL_TX_SHADER_OP_TX3_OFFSET_TEXTURE_2D 0x00030000 +#define NV20TCL_TX_SHADER_OP_TX3_DOT_PRODUCT_TEXTURE_2D 0x00048000 +#define NV20TCL_TX_SHADER_OP_TX3_DOT_PRODUCT_DEPTH_REPLACE 0x00050000 +#define NV20TCL_TX_SHADER_OP_TX3_DEPENDANT_AR_TEXTURE_2D 0x00078000 +#define NV20TCL_TX_SHADER_OP_TX3_DEPENDANT_GB_TEXTURE_2D 0x00080000 +#define NV20TCL_TX_SHADER_OP_TX3_DOT_PRODUCT 0x00088000 +#define NV20TCL_TX_SHADER_DOTMAPPING 0x00001e74 +#define NV20TCL_TX_SHADER_DOTMAPPING_TX0_SHIFT 0 +#define NV20TCL_TX_SHADER_DOTMAPPING_TX0_MASK 0x0000000f +#define NV20TCL_TX_SHADER_DOTMAPPING_TX1_SHIFT 4 +#define NV20TCL_TX_SHADER_DOTMAPPING_TX1_MASK 0x000000f0 +#define NV20TCL_TX_SHADER_DOTMAPPING_TX2_SHIFT 8 +#define NV20TCL_TX_SHADER_DOTMAPPING_TX2_MASK 0x00000f00 +#define NV20TCL_TX_SHADER_DOTMAPPING_TX3_SHIFT 12 +#define NV20TCL_TX_SHADER_DOTMAPPING_TX3_MASK 0x0000f000 +#define NV20TCL_TX_SHADER_PREVIOUS 0x00001e78 +#define NV20TCL_TX_SHADER_PREVIOUS_TX0_SHIFT 8 +#define NV20TCL_TX_SHADER_PREVIOUS_TX0_MASK 0x00000f00 +#define NV20TCL_TX_SHADER_PREVIOUS_TX1_SHIFT 12 +#define NV20TCL_TX_SHADER_PREVIOUS_TX1_MASK 0x0000f000 +#define NV20TCL_TX_SHADER_PREVIOUS_TX2_SHIFT 16 +#define NV20TCL_TX_SHADER_PREVIOUS_TX2_MASK 0x00030000 +#define NV20TCL_TX_SHADER_PREVIOUS_TX3_SHIFT 20 +#define NV20TCL_TX_SHADER_PREVIOUS_TX3_MASK 0x00300000 +#define NV20TCL_ENGINE 0x00001e94 +#define NV20TCL_ENGINE_VP (1 << 1) +#define NV20TCL_ENGINE_FIXED (1 << 2) +#define NV20TCL_VP_UPLOAD_FROM_ID 0x00001e9c +#define NV20TCL_VP_START_FROM_ID 0x00001ea0 +#define NV20TCL_VP_UPLOAD_CONST_ID 0x00001ea4 +#define NV20TCL_VIEWPORT_TRANSLATE_X 0x00001f00 +#define NV20TCL_VIEWPORT_TRANSLATE_Y 0x00001f04 +#define NV20TCL_VIEWPORT_TRANSLATE_Z 0x00001f08 +#define NV20TCL_VIEWPORT_TRANSLATE_W 0x00001f0c + + +#define NV17TCL 0x00000099 + +#define NV17TCL_DMA_IN_MEMORY4 0x000001ac +#define NV17TCL_DMA_IN_MEMORY5 0x000001b0 +#define NV17TCL_COLOR_MASK_ENABLE 0x000002bc +#define NV17TCL_LMA_DEPTH_BUFFER_PITCH 0x00000d5c +#define NV17TCL_LMA_DEPTH_BUFFER_OFFSET 0x00000d60 +#define NV17TCL_LMA_DEPTH_FILL_VALUE 0x00000d68 +#define NV17TCL_LMA_DEPTH_BUFFER_CLEAR 0x00000d6c +#define NV17TCL_LMA_DEPTH_ENABLE 0x00001658 + + +#define NV20_SWIZZLED_SURFACE 0x0000009e + + + +#define NV12_IMAGE_BLIT 0x0000009f + + + +#define NV30_CONTEXT_SURFACES_2D 0x00000362 + + + +#define NV30_STRETCHED_IMAGE_FROM_CPU 0x00000366 + + + +#define NV30_TEXTURE_FROM_CPU 0x0000037b + + + +#define NV30_SCALED_IMAGE_FROM_MEMORY 0x00000389 + + + +#define NV30_IMAGE_FROM_CPU 0x0000038a + + + +#define NV30TCL 0x00000397 + + + +#define 
NV30_SWIZZLED_SURFACE 0x0000039e + + + +#define NV35TCL 0x00000497 + + + +#define NV25TCL 0x00000597 + +#define NV25TCL_DMA_IN_MEMORY4 0x0000019c +#define NV25TCL_DMA_IN_MEMORY5 0x000001a0 +#define NV25TCL_DMA_IN_MEMORY8 0x000001ac +#define NV25TCL_DMA_IN_MEMORY9 0x000001b0 + + +#define NV34TCL 0x00000697 + +#define NV34TCL_NOP 0x00000100 +#define NV34TCL_NOTIFY 0x00000104 +#define NV34TCL_DMA_NOTIFY 0x00000180 +#define NV34TCL_DMA_TEXTURE0 0x00000184 +#define NV34TCL_DMA_TEXTURE1 0x00000188 +#define NV34TCL_DMA_COLOR1 0x0000018c +#define NV34TCL_DMA_COLOR0 0x00000194 +#define NV34TCL_DMA_ZETA 0x00000198 +#define NV34TCL_DMA_VTXBUF0 0x0000019c +#define NV34TCL_DMA_VTXBUF1 0x000001a0 +#define NV34TCL_DMA_FENCE 0x000001a4 +#define NV34TCL_DMA_QUERY 0x000001a8 +#define NV34TCL_DMA_IN_MEMORY7 0x000001ac +#define NV34TCL_DMA_IN_MEMORY8 0x000001b0 +#define NV34TCL_RT_HORIZ 0x00000200 +#define NV34TCL_RT_HORIZ_X_SHIFT 0 +#define NV34TCL_RT_HORIZ_X_MASK 0x0000ffff +#define NV34TCL_RT_HORIZ_W_SHIFT 16 +#define NV34TCL_RT_HORIZ_W_MASK 0xffff0000 +#define NV34TCL_RT_VERT 0x00000204 +#define NV34TCL_RT_VERT_Y_SHIFT 0 +#define NV34TCL_RT_VERT_Y_MASK 0x0000ffff +#define NV34TCL_RT_VERT_H_SHIFT 16 +#define NV34TCL_RT_VERT_H_MASK 0xffff0000 +#define NV34TCL_RT_FORMAT 0x00000208 +#define NV34TCL_RT_FORMAT_TYPE_SHIFT 8 +#define NV34TCL_RT_FORMAT_TYPE_MASK 0x00000f00 +#define NV34TCL_RT_FORMAT_TYPE_LINEAR 0x00000100 +#define NV34TCL_RT_FORMAT_TYPE_SWIZZLED 0x00000200 +#define NV34TCL_RT_FORMAT_ZETA_SHIFT 5 +#define NV34TCL_RT_FORMAT_ZETA_MASK 0x000000e0 +#define NV34TCL_RT_FORMAT_ZETA_Z16 0x00000020 +#define NV34TCL_RT_FORMAT_ZETA_Z24S8 0x00000040 +#define NV34TCL_RT_FORMAT_COLOR_SHIFT 0 +#define NV34TCL_RT_FORMAT_COLOR_MASK 0x0000001f +#define NV34TCL_RT_FORMAT_COLOR_R5G6B5 0x00000003 +#define NV34TCL_RT_FORMAT_COLOR_X8R8G8B8 0x00000005 +#define NV34TCL_RT_FORMAT_COLOR_A8R8G8B8 0x00000008 +#define NV34TCL_RT_FORMAT_COLOR_B8 0x00000009 +#define NV34TCL_RT_FORMAT_COLOR_UNKNOWN 0x0000000d +#define NV34TCL_RT_FORMAT_COLOR_X8B8G8R8 0x0000000f +#define NV34TCL_RT_FORMAT_COLOR_A8B8G8R8 0x00000010 +#define NV34TCL_COLOR0_PITCH 0x0000020c +#define NV34TCL_COLOR0_PITCH_COLOR0_SHIFT 0 +#define NV34TCL_COLOR0_PITCH_COLOR0_MASK 0x0000ffff +#define NV34TCL_COLOR0_PITCH_ZETA_SHIFT 16 +#define NV34TCL_COLOR0_PITCH_ZETA_MASK 0xffff0000 +#define NV34TCL_COLOR0_OFFSET 0x00000210 +#define NV34TCL_ZETA_OFFSET 0x00000214 +#define NV34TCL_COLOR1_OFFSET 0x00000218 +#define NV34TCL_COLOR1_PITCH 0x0000021c +#define NV34TCL_RT_ENABLE 0x00000220 +#define NV34TCL_RT_ENABLE_MRT (1 << 4) +#define NV34TCL_RT_ENABLE_COLOR1 (1 << 1) +#define NV34TCL_RT_ENABLE_COLOR0 (1 << 0) +#define NV34TCL_LMA_DEPTH_PITCH 0x0000022c +#define NV34TCL_LMA_DEPTH_OFFSET 0x00000230 +#define NV34TCL_TX_UNITS_ENABLE 0x0000023c +#define NV34TCL_TX_UNITS_ENABLE_TX0 (1 << 0) +#define NV34TCL_TX_UNITS_ENABLE_TX1 (1 << 1) +#define NV34TCL_TX_UNITS_ENABLE_TX2 (1 << 2) +#define NV34TCL_TX_UNITS_ENABLE_TX3 (1 << 3) +#define NV34TCL_TX_UNITS_ENABLE_TX4 (1 << 4) +#define NV34TCL_TX_UNITS_ENABLE_TX5 (1 << 5) +#define NV34TCL_TX_UNITS_ENABLE_TX6 (1 << 6) +#define NV34TCL_TX_UNITS_ENABLE_TX7 (1 << 7) +#define NV34TCL_TX_MATRIX_ENABLE(x) (0x00000240+((x)*4)) +#define NV34TCL_TX_MATRIX_ENABLE__SIZE 0x00000008 +#define NV34TCL_VIEWPORT_TX_ORIGIN 0x000002b8 +#define NV34TCL_VIEWPORT_TX_ORIGIN_X_SHIFT 0 +#define NV34TCL_VIEWPORT_TX_ORIGIN_X_MASK 0x0000ffff +#define NV34TCL_VIEWPORT_TX_ORIGIN_Y_SHIFT 16 +#define NV34TCL_VIEWPORT_TX_ORIGIN_Y_MASK 0xffff0000 +#define 
NV34TCL_VIEWPORT_CLIP_MODE 0x000002bc +#define NV34TCL_VIEWPORT_CLIP_HORIZ(x) (0x000002c0+((x)*8)) +#define NV34TCL_VIEWPORT_CLIP_HORIZ__SIZE 0x00000008 +#define NV34TCL_VIEWPORT_CLIP_HORIZ_L_SHIFT 0 +#define NV34TCL_VIEWPORT_CLIP_HORIZ_L_MASK 0x0000ffff +#define NV34TCL_VIEWPORT_CLIP_HORIZ_R_SHIFT 16 +#define NV34TCL_VIEWPORT_CLIP_HORIZ_R_MASK 0xffff0000 +#define NV34TCL_VIEWPORT_CLIP_VERT(x) (0x000002c4+((x)*8)) +#define NV34TCL_VIEWPORT_CLIP_VERT__SIZE 0x00000008 +#define NV34TCL_VIEWPORT_CLIP_VERT_T_SHIFT 0 +#define NV34TCL_VIEWPORT_CLIP_VERT_T_MASK 0x0000ffff +#define NV34TCL_VIEWPORT_CLIP_VERT_D_SHIFT 16 +#define NV34TCL_VIEWPORT_CLIP_VERT_D_MASK 0xffff0000 +#define NV34TCL_DITHER_ENABLE 0x00000300 +#define NV34TCL_ALPHA_FUNC_ENABLE 0x00000304 +#define NV34TCL_ALPHA_FUNC_FUNC 0x00000308 +#define NV34TCL_ALPHA_FUNC_FUNC_NEVER 0x00000200 +#define NV34TCL_ALPHA_FUNC_FUNC_LESS 0x00000201 +#define NV34TCL_ALPHA_FUNC_FUNC_EQUAL 0x00000202 +#define NV34TCL_ALPHA_FUNC_FUNC_LEQUAL 0x00000203 +#define NV34TCL_ALPHA_FUNC_FUNC_GREATER 0x00000204 +#define NV34TCL_ALPHA_FUNC_FUNC_GREATER 0x00000204 +#define NV34TCL_ALPHA_FUNC_FUNC_NOTEQUAL 0x00000205 +#define NV34TCL_ALPHA_FUNC_FUNC_GEQUAL 0x00000206 +#define NV34TCL_ALPHA_FUNC_FUNC_ALWAYS 0x00000207 +#define NV34TCL_ALPHA_FUNC_REF 0x0000030c +#define NV34TCL_BLEND_FUNC_ENABLE 0x00000310 +#define NV34TCL_BLEND_FUNC_SRC 0x00000314 +#define NV34TCL_BLEND_FUNC_SRC_RGB_SHIFT 0 +#define NV34TCL_BLEND_FUNC_SRC_RGB_MASK 0x0000ffff +#define NV34TCL_BLEND_FUNC_SRC_RGB_ZERO 0x00000000 +#define NV34TCL_BLEND_FUNC_SRC_RGB_ONE 0x00000001 +#define NV34TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR 0x00000300 +#define NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR 0x00000301 +#define NV34TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA 0x00000302 +#define NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA 0x00000303 +#define NV34TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA 0x00000304 +#define NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA 0x00000305 +#define NV34TCL_BLEND_FUNC_SRC_RGB_DST_COLOR 0x00000306 +#define NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR 0x00000307 +#define NV34TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE 0x00000308 +#define NV34TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR 0x00008001 +#define NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR 0x00008002 +#define NV34TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA 0x00008003 +#define NV34TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_SHIFT 16 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_MASK 0xffff0000 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_ZERO 0x00000000 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE 0x00010000 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR 0x03000000 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR 0x03010000 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA 0x03020000 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA 0x03030000 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA 0x03040000 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA 0x03050000 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR 0x03060000 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR 0x03070000 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE 0x03080000 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR 0x80010000 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x80020000 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA 0x80030000 +#define NV34TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x80040000 +#define NV34TCL_BLEND_FUNC_DST 0x00000318 +#define 
NV34TCL_BLEND_FUNC_DST_RGB_SHIFT 0 +#define NV34TCL_BLEND_FUNC_DST_RGB_MASK 0x0000ffff +#define NV34TCL_BLEND_FUNC_DST_RGB_ZERO 0x00000000 +#define NV34TCL_BLEND_FUNC_DST_RGB_ONE 0x00000001 +#define NV34TCL_BLEND_FUNC_DST_RGB_SRC_COLOR 0x00000300 +#define NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR 0x00000301 +#define NV34TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA 0x00000302 +#define NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA 0x00000303 +#define NV34TCL_BLEND_FUNC_DST_RGB_DST_ALPHA 0x00000304 +#define NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA 0x00000305 +#define NV34TCL_BLEND_FUNC_DST_RGB_DST_COLOR 0x00000306 +#define NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR 0x00000307 +#define NV34TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE 0x00000308 +#define NV34TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR 0x00008001 +#define NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR 0x00008002 +#define NV34TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA 0x00008003 +#define NV34TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_SHIFT 16 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_MASK 0xffff0000 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_ZERO 0x00000000 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_ONE 0x00010000 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR 0x03000000 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR 0x03010000 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA 0x03020000 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA 0x03030000 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA 0x03040000 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA 0x03050000 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR 0x03060000 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR 0x03070000 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE 0x03080000 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR 0x80010000 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x80020000 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA 0x80030000 +#define NV34TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x80040000 +#define NV34TCL_BLEND_COLOR 0x0000031c +#define NV34TCL_BLEND_COLOR_B_SHIFT 0 +#define NV34TCL_BLEND_COLOR_B_MASK 0x000000ff +#define NV34TCL_BLEND_COLOR_G_SHIFT 8 +#define NV34TCL_BLEND_COLOR_G_MASK 0x0000ff00 +#define NV34TCL_BLEND_COLOR_R_SHIFT 16 +#define NV34TCL_BLEND_COLOR_R_MASK 0x00ff0000 +#define NV34TCL_BLEND_COLOR_A_SHIFT 24 +#define NV34TCL_BLEND_COLOR_A_MASK 0xff000000 +#define NV34TCL_BLEND_EQUATION 0x00000320 +#define NV34TCL_BLEND_EQUATION_FUNC_ADD 0x00008006 +#define NV34TCL_BLEND_EQUATION_MIN 0x00008007 +#define NV34TCL_BLEND_EQUATION_MAX 0x00008008 +#define NV34TCL_BLEND_EQUATION_FUNC_SUBTRACT 0x0000800a +#define NV34TCL_BLEND_EQUATION_FUNC_REVERSE_SUBTRACT 0x0000800b +#define NV34TCL_COLOR_MASK 0x00000324 +#define NV34TCL_COLOR_MASK_B_SHIFT 0 +#define NV34TCL_COLOR_MASK_B_MASK 0x000000ff +#define NV34TCL_COLOR_MASK_G_SHIFT 8 +#define NV34TCL_COLOR_MASK_G_MASK 0x0000ff00 +#define NV34TCL_COLOR_MASK_R_SHIFT 16 +#define NV34TCL_COLOR_MASK_R_MASK 0x00ff0000 +#define NV34TCL_COLOR_MASK_A_SHIFT 24 +#define NV34TCL_COLOR_MASK_A_MASK 0xff000000 +#define NV34TCL_STENCIL_BACK_ENABLE 0x00000328 +#define NV34TCL_STENCIL_BACK_MASK 0x0000032c +#define NV34TCL_STENCIL_BACK_FUNC_FUNC 0x00000330 +#define NV34TCL_STENCIL_BACK_FUNC_FUNC_NEVER 0x00000200 +#define NV34TCL_STENCIL_BACK_FUNC_FUNC_LESS 0x00000201 +#define NV34TCL_STENCIL_BACK_FUNC_FUNC_EQUAL 0x00000202 +#define NV34TCL_STENCIL_BACK_FUNC_FUNC_LEQUAL 0x00000203 +#define 
NV34TCL_STENCIL_BACK_FUNC_FUNC_GREATER 0x00000204 +#define NV34TCL_STENCIL_BACK_FUNC_FUNC_GREATER 0x00000204 +#define NV34TCL_STENCIL_BACK_FUNC_FUNC_NOTEQUAL 0x00000205 +#define NV34TCL_STENCIL_BACK_FUNC_FUNC_GEQUAL 0x00000206 +#define NV34TCL_STENCIL_BACK_FUNC_FUNC_ALWAYS 0x00000207 +#define NV34TCL_STENCIL_BACK_FUNC_REF 0x00000334 +#define NV34TCL_STENCIL_BACK_FUNC_MASK 0x00000338 +#define NV34TCL_STENCIL_BACK_OP_FAIL 0x0000033c +#define NV34TCL_STENCIL_BACK_OP_FAIL_ZERO 0x00000000 +#define NV34TCL_STENCIL_BACK_OP_FAIL_INVERT 0x0000150a +#define NV34TCL_STENCIL_BACK_OP_FAIL_KEEP 0x00001e00 +#define NV34TCL_STENCIL_BACK_OP_FAIL_REPLACE 0x00001e01 +#define NV34TCL_STENCIL_BACK_OP_FAIL_INCR 0x00001e02 +#define NV34TCL_STENCIL_BACK_OP_FAIL_DECR 0x00001e03 +#define NV34TCL_STENCIL_BACK_OP_FAIL_INCR_WRAP 0x00008507 +#define NV34TCL_STENCIL_BACK_OP_FAIL_DECR_WRAP 0x00008508 +#define NV34TCL_STENCIL_BACK_OP_ZFAIL 0x00000340 +#define NV34TCL_STENCIL_BACK_OP_ZFAIL_ZERO 0x00000000 +#define NV34TCL_STENCIL_BACK_OP_ZFAIL_INVERT 0x0000150a +#define NV34TCL_STENCIL_BACK_OP_ZFAIL_KEEP 0x00001e00 +#define NV34TCL_STENCIL_BACK_OP_ZFAIL_REPLACE 0x00001e01 +#define NV34TCL_STENCIL_BACK_OP_ZFAIL_INCR 0x00001e02 +#define NV34TCL_STENCIL_BACK_OP_ZFAIL_DECR 0x00001e03 +#define NV34TCL_STENCIL_BACK_OP_ZFAIL_INCR_WRAP 0x00008507 +#define NV34TCL_STENCIL_BACK_OP_ZFAIL_DECR_WRAP 0x00008508 +#define NV34TCL_STENCIL_BACK_OP_ZPASS 0x00000344 +#define NV34TCL_STENCIL_BACK_OP_ZPASS_ZERO 0x00000000 +#define NV34TCL_STENCIL_BACK_OP_ZPASS_INVERT 0x0000150a +#define NV34TCL_STENCIL_BACK_OP_ZPASS_KEEP 0x00001e00 +#define NV34TCL_STENCIL_BACK_OP_ZPASS_REPLACE 0x00001e01 +#define NV34TCL_STENCIL_BACK_OP_ZPASS_INCR 0x00001e02 +#define NV34TCL_STENCIL_BACK_OP_ZPASS_DECR 0x00001e03 +#define NV34TCL_STENCIL_BACK_OP_ZPASS_INCR_WRAP 0x00008507 +#define NV34TCL_STENCIL_BACK_OP_ZPASS_DECR_WRAP 0x00008508 +#define NV34TCL_STENCIL_FRONT_ENABLE 0x00000348 +#define NV34TCL_STENCIL_FRONT_MASK 0x0000034c +#define NV34TCL_STENCIL_FRONT_FUNC_FUNC 0x00000350 +#define NV34TCL_STENCIL_FRONT_FUNC_FUNC_NEVER 0x00000200 +#define NV34TCL_STENCIL_FRONT_FUNC_FUNC_LESS 0x00000201 +#define NV34TCL_STENCIL_FRONT_FUNC_FUNC_EQUAL 0x00000202 +#define NV34TCL_STENCIL_FRONT_FUNC_FUNC_LEQUAL 0x00000203 +#define NV34TCL_STENCIL_FRONT_FUNC_FUNC_GREATER 0x00000204 +#define NV34TCL_STENCIL_FRONT_FUNC_FUNC_GREATER 0x00000204 +#define NV34TCL_STENCIL_FRONT_FUNC_FUNC_NOTEQUAL 0x00000205 +#define NV34TCL_STENCIL_FRONT_FUNC_FUNC_GEQUAL 0x00000206 +#define NV34TCL_STENCIL_FRONT_FUNC_FUNC_ALWAYS 0x00000207 +#define NV34TCL_STENCIL_FRONT_FUNC_REF 0x00000354 +#define NV34TCL_STENCIL_FRONT_FUNC_MASK 0x00000358 +#define NV34TCL_STENCIL_FRONT_OP_FAIL 0x0000035c +#define NV34TCL_STENCIL_FRONT_OP_FAIL_ZERO 0x00000000 +#define NV34TCL_STENCIL_FRONT_OP_FAIL_INVERT 0x0000150a +#define NV34TCL_STENCIL_FRONT_OP_FAIL_KEEP 0x00001e00 +#define NV34TCL_STENCIL_FRONT_OP_FAIL_REPLACE 0x00001e01 +#define NV34TCL_STENCIL_FRONT_OP_FAIL_INCR 0x00001e02 +#define NV34TCL_STENCIL_FRONT_OP_FAIL_DECR 0x00001e03 +#define NV34TCL_STENCIL_FRONT_OP_FAIL_INCR_WRAP 0x00008507 +#define NV34TCL_STENCIL_FRONT_OP_FAIL_DECR_WRAP 0x00008508 +#define NV34TCL_STENCIL_FRONT_OP_ZFAIL 0x00000360 +#define NV34TCL_STENCIL_FRONT_OP_ZFAIL_ZERO 0x00000000 +#define NV34TCL_STENCIL_FRONT_OP_ZFAIL_INVERT 0x0000150a +#define NV34TCL_STENCIL_FRONT_OP_ZFAIL_KEEP 0x00001e00 +#define NV34TCL_STENCIL_FRONT_OP_ZFAIL_REPLACE 0x00001e01 +#define NV34TCL_STENCIL_FRONT_OP_ZFAIL_INCR 0x00001e02 +#define 
NV34TCL_STENCIL_FRONT_OP_ZFAIL_DECR 0x00001e03 +#define NV34TCL_STENCIL_FRONT_OP_ZFAIL_INCR_WRAP 0x00008507 +#define NV34TCL_STENCIL_FRONT_OP_ZFAIL_DECR_WRAP 0x00008508 +#define NV34TCL_STENCIL_FRONT_OP_ZPASS 0x00000364 +#define NV34TCL_STENCIL_FRONT_OP_ZPASS_ZERO 0x00000000 +#define NV34TCL_STENCIL_FRONT_OP_ZPASS_INVERT 0x0000150a +#define NV34TCL_STENCIL_FRONT_OP_ZPASS_KEEP 0x00001e00 +#define NV34TCL_STENCIL_FRONT_OP_ZPASS_REPLACE 0x00001e01 +#define NV34TCL_STENCIL_FRONT_OP_ZPASS_INCR 0x00001e02 +#define NV34TCL_STENCIL_FRONT_OP_ZPASS_DECR 0x00001e03 +#define NV34TCL_STENCIL_FRONT_OP_ZPASS_INCR_WRAP 0x00008507 +#define NV34TCL_STENCIL_FRONT_OP_ZPASS_DECR_WRAP 0x00008508 +#define NV34TCL_SHADE_MODEL 0x00000368 +#define NV34TCL_SHADE_MODEL_FLAT 0x00001d00 +#define NV34TCL_SHADE_MODEL_SMOOTH 0x00001d01 +#define NV34TCL_FOG_ENABLE 0x0000036c +#define NV34TCL_FOG_COLOR 0x00000370 +#define NV34TCL_FOG_COLOR_R_SHIFT 0 +#define NV34TCL_FOG_COLOR_R_MASK 0x000000ff +#define NV34TCL_FOG_COLOR_G_SHIFT 8 +#define NV34TCL_FOG_COLOR_G_MASK 0x0000ff00 +#define NV34TCL_FOG_COLOR_B_SHIFT 16 +#define NV34TCL_FOG_COLOR_B_MASK 0x00ff0000 +#define NV34TCL_FOG_COLOR_A_SHIFT 24 +#define NV34TCL_FOG_COLOR_A_MASK 0xff000000 +#define NV34TCL_COLOR_LOGIC_OP_ENABLE 0x00000374 +#define NV34TCL_COLOR_LOGIC_OP_OP 0x00000378 +#define NV34TCL_COLOR_LOGIC_OP_OP_CLEAR 0x00001500 +#define NV34TCL_COLOR_LOGIC_OP_OP_AND 0x00001501 +#define NV34TCL_COLOR_LOGIC_OP_OP_AND_REVERSE 0x00001502 +#define NV34TCL_COLOR_LOGIC_OP_OP_COPY 0x00001503 +#define NV34TCL_COLOR_LOGIC_OP_OP_AND_INVERTED 0x00001504 +#define NV34TCL_COLOR_LOGIC_OP_OP_NOOP 0x00001505 +#define NV34TCL_COLOR_LOGIC_OP_OP_XOR 0x00001506 +#define NV34TCL_COLOR_LOGIC_OP_OP_OR 0x00001507 +#define NV34TCL_COLOR_LOGIC_OP_OP_NOR 0x00001508 +#define NV34TCL_COLOR_LOGIC_OP_OP_EQUIV 0x00001509 +#define NV34TCL_COLOR_LOGIC_OP_OP_INVERT 0x0000150a +#define NV34TCL_COLOR_LOGIC_OP_OP_OR_REVERSE 0x0000150b +#define NV34TCL_COLOR_LOGIC_OP_OP_COPY_INVERTED 0x0000150c +#define NV34TCL_COLOR_LOGIC_OP_OP_OR_INVERTED 0x0000150d +#define NV34TCL_COLOR_LOGIC_OP_OP_NAND 0x0000150e +#define NV34TCL_COLOR_LOGIC_OP_OP_SET 0x0000150f +#define NV34TCL_NORMALIZE_ENABLE 0x0000037c +#define NV34TCL_COLOR_MATERIAL 0x00000390 +#define NV34TCL_COLOR_MATERIAL_FRONT_EMISSION_ENABLE (1 << 0) +#define NV34TCL_COLOR_MATERIAL_FRONT_AMBIENT_ENABLE (1 << 2) +#define NV34TCL_COLOR_MATERIAL_FRONT_DIFFUSE_ENABLE (1 << 4) +#define NV34TCL_COLOR_MATERIAL_FRONT_SPECULAR_ENABLE (1 << 6) +#define NV34TCL_COLOR_MATERIAL_BACK_EMISSION_ENABLE (1 << 8) +#define NV34TCL_COLOR_MATERIAL_BACK_AMBIENT_ENABLE (1 << 10) +#define NV34TCL_COLOR_MATERIAL_BACK_DIFFUSE_ENABLE (1 << 12) +#define NV34TCL_COLOR_MATERIAL_BACK_SPECULAR_ENABLE (1 << 14) +#define NV34TCL_DEPTH_RANGE_NEAR 0x00000394 +#define NV34TCL_DEPTH_RANGE_FAR 0x00000398 +#define NV34TCL_COLOR_MATERIAL_FRONT_R 0x000003a0 +#define NV34TCL_COLOR_MATERIAL_FRONT_G 0x000003a4 +#define NV34TCL_COLOR_MATERIAL_FRONT_B 0x000003a8 +#define NV34TCL_COLOR_MATERIAL_FRONT_A 0x000003b4 +#define NV34TCL_LINE_WIDTH 0x000003b8 +#define NV34TCL_LINE_SMOOTH_ENABLE 0x000003bc +#define NV34TCL_TX_GEN_S(x) (0x00000400+((x)*16)) +#define NV34TCL_TX_GEN_S__SIZE 0x00000008 +#define NV34TCL_TX_GEN_S_FALSE 0x00000000 +#define NV34TCL_TX_GEN_S_EYE_LINEAR 0x00002400 +#define NV34TCL_TX_GEN_S_OBJECT_LINEAR 0x00002401 +#define NV34TCL_TX_GEN_S_SPHERE_MAP 0x00002402 +#define NV34TCL_TX_GEN_S_NORMAL_MAP 0x00008511 +#define NV34TCL_TX_GEN_S_REFLECTION_MAP 0x00008512 +#define NV34TCL_TX_GEN_T(x) 
(0x00000404+((x)*16)) +#define NV34TCL_TX_GEN_T__SIZE 0x00000008 +#define NV34TCL_TX_GEN_T_FALSE 0x00000000 +#define NV34TCL_TX_GEN_T_EYE_LINEAR 0x00002400 +#define NV34TCL_TX_GEN_T_OBJECT_LINEAR 0x00002401 +#define NV34TCL_TX_GEN_T_SPHERE_MAP 0x00002402 +#define NV34TCL_TX_GEN_T_NORMAL_MAP 0x00008511 +#define NV34TCL_TX_GEN_T_REFLECTION_MAP 0x00008512 +#define NV34TCL_TX_GEN_R(x) (0x00000408+((x)*16)) +#define NV34TCL_TX_GEN_R__SIZE 0x00000008 +#define NV34TCL_TX_GEN_R_FALSE 0x00000000 +#define NV34TCL_TX_GEN_R_EYE_LINEAR 0x00002400 +#define NV34TCL_TX_GEN_R_OBJECT_LINEAR 0x00002401 +#define NV34TCL_TX_GEN_R_SPHERE_MAP 0x00002402 +#define NV34TCL_TX_GEN_R_NORMAL_MAP 0x00008511 +#define NV34TCL_TX_GEN_R_REFLECTION_MAP 0x00008512 +#define NV34TCL_TX_GEN_Q(x) (0x0000040c+((x)*16)) +#define NV34TCL_TX_GEN_Q__SIZE 0x00000008 +#define NV34TCL_TX_GEN_Q_FALSE 0x00000000 +#define NV34TCL_TX_GEN_Q_EYE_LINEAR 0x00002400 +#define NV34TCL_TX_GEN_Q_OBJECT_LINEAR 0x00002401 +#define NV34TCL_TX_GEN_Q_SPHERE_MAP 0x00002402 +#define NV34TCL_TX_GEN_Q_NORMAL_MAP 0x00008511 +#define NV34TCL_TX_GEN_Q_REFLECTION_MAP 0x00008512 +#define NV34TCL_MODELVIEW_MATRIX(x) (0x00000480+((x)*4)) +#define NV34TCL_MODELVIEW_MATRIX__SIZE 0x00000010 +#define NV34TCL_INVERSE_MODELVIEW_MATRIX(x) (0x00000580+((x)*4)) +#define NV34TCL_INVERSE_MODELVIEW_MATRIX__SIZE 0x0000000c +#define NV34TCL_PROJECTION_MATRIX(x) (0x00000680+((x)*4)) +#define NV34TCL_PROJECTION_MATRIX__SIZE 0x00000010 +#define NV34TCL_TX0_MATRIX(x) (0x000006c0+((x)*4)) +#define NV34TCL_TX0_MATRIX__SIZE 0x00000010 +#define NV34TCL_TX1_MATRIX(x) (0x00000700+((x)*4)) +#define NV34TCL_TX1_MATRIX__SIZE 0x00000010 +#define NV34TCL_TX2_MATRIX(x) (0x00000740+((x)*4)) +#define NV34TCL_TX2_MATRIX__SIZE 0x00000010 +#define NV34TCL_TX3_MATRIX(x) (0x00000780+((x)*4)) +#define NV34TCL_TX3_MATRIX__SIZE 0x00000010 +#define NV34TCL_TX4_MATRIX(x) (0x000007c0+((x)*4)) +#define NV34TCL_TX4_MATRIX__SIZE 0x00000010 +#define NV34TCL_TX5_MATRIX(x) (0x00000800+((x)*4)) +#define NV34TCL_TX5_MATRIX__SIZE 0x00000010 +#define NV34TCL_TX6_MATRIX(x) (0x00000840+((x)*4)) +#define NV34TCL_TX6_MATRIX__SIZE 0x00000010 +#define NV34TCL_TX7_MATRIX(x) (0x00000880+((x)*4)) +#define NV34TCL_TX7_MATRIX__SIZE 0x00000010 +#define NV34TCL_SCISSOR_HORIZ 0x000008c0 +#define NV34TCL_SCISSOR_HORIZ_X_SHIFT 0 +#define NV34TCL_SCISSOR_HORIZ_X_MASK 0x0000ffff +#define NV34TCL_SCISSOR_HORIZ_W_SHIFT 16 +#define NV34TCL_SCISSOR_HORIZ_W_MASK 0xffff0000 +#define NV34TCL_SCISSOR_VERT 0x000008c4 +#define NV34TCL_SCISSOR_VERT_Y_SHIFT 0 +#define NV34TCL_SCISSOR_VERT_Y_MASK 0x0000ffff +#define NV34TCL_SCISSOR_VERT_H_SHIFT 16 +#define NV34TCL_SCISSOR_VERT_H_MASK 0xffff0000 +#define NV34TCL_FOG_COORD_DIST 0x000008c8 +#define NV34TCL_FOG_COORD_DIST_COORD_FALSE 0x00000000 +#define NV34TCL_FOG_COORD_DIST_COORD_FRAGMENT_DEPTH_DISTANCE_EYE_RADIAL_NV 0x00000001 +#define NV34TCL_FOG_COORD_DIST_COORD_FRAGMENT_DEPTH_DISTANCE_EYE_PLANE_ABSOLUTE_NV 0x00000002 +#define NV34TCL_FOG_COORD_DIST_COORD_FOG 0x00000003 +#define NV34TCL_FOG_MODE 0x000008cc +#define NV34TCL_FOG_MODE_EXP 0x00000800 +#define NV34TCL_FOG_MODE_EXP_2 0x00000802 +#define NV34TCL_FOG_MODE_EXP2 0x00000803 +#define NV34TCL_FOG_MODE_LINEAR 0x00000804 +#define NV34TCL_FOG_MODE_LINEAR_2 0x00002601 +#define NV34TCL_FOG_EQUATION_CONSTANT 0x000008d0 +#define NV34TCL_FOG_EQUATION_LINEAR 0x000008d4 +#define NV34TCL_FOG_EQUATION_QUADRATIC 0x000008d8 +#define NV34TCL_FP_ACTIVE_PROGRAM 0x000008e4 +#define NV34TCL_FP_ACTIVE_PROGRAM_DMA0 (1 << 0) +#define NV34TCL_FP_ACTIVE_PROGRAM_DMA1 
(1 << 1) +#define NV34TCL_FP_ACTIVE_PROGRAM_OFFSET_SHIFT 2 +#define NV34TCL_FP_ACTIVE_PROGRAM_OFFSET_MASK 0xfffffffc +#define NV34TCL_RC_COLOR0 0x000008ec +#define NV34TCL_RC_COLOR0_B_SHIFT 0 +#define NV34TCL_RC_COLOR0_B_MASK 0x000000ff +#define NV34TCL_RC_COLOR0_G_SHIFT 8 +#define NV34TCL_RC_COLOR0_G_MASK 0x0000ff00 +#define NV34TCL_RC_COLOR0_R_SHIFT 16 +#define NV34TCL_RC_COLOR0_R_MASK 0x00ff0000 +#define NV34TCL_RC_COLOR0_A_SHIFT 24 +#define NV34TCL_RC_COLOR0_A_MASK 0xff000000 +#define NV34TCL_RC_COLOR1 0x000008f0 +#define NV34TCL_RC_COLOR1_B_SHIFT 0 +#define NV34TCL_RC_COLOR1_B_MASK 0x000000ff +#define NV34TCL_RC_COLOR1_G_SHIFT 8 +#define NV34TCL_RC_COLOR1_G_MASK 0x0000ff00 +#define NV34TCL_RC_COLOR1_R_SHIFT 16 +#define NV34TCL_RC_COLOR1_R_MASK 0x00ff0000 +#define NV34TCL_RC_COLOR1_A_SHIFT 24 +#define NV34TCL_RC_COLOR1_A_MASK 0xff000000 +#define NV34TCL_RC_FINAL0 0x000008f4 +#define NV34TCL_RC_FINAL0_D_INPUT_SHIFT 0 +#define NV34TCL_RC_FINAL0_D_INPUT_MASK 0x0000000f +#define NV34TCL_RC_FINAL0_D_INPUT_ZERO 0x00000000 +#define NV34TCL_RC_FINAL0_D_INPUT_CONSTANT_COLOR0_NV 0x00000001 +#define NV34TCL_RC_FINAL0_D_INPUT_CONSTANT_COLOR1_NV 0x00000002 +#define NV34TCL_RC_FINAL0_D_INPUT_FOG 0x00000003 +#define NV34TCL_RC_FINAL0_D_INPUT_PRIMARY_COLOR_NV 0x00000004 +#define NV34TCL_RC_FINAL0_D_INPUT_SECONDARY_COLOR_NV 0x00000005 +#define NV34TCL_RC_FINAL0_D_INPUT_TEXTURE0_ARB 0x00000008 +#define NV34TCL_RC_FINAL0_D_INPUT_TEXTURE1_ARB 0x00000009 +#define NV34TCL_RC_FINAL0_D_INPUT_SPARE0_NV 0x0000000c +#define NV34TCL_RC_FINAL0_D_INPUT_SPARE1_NV 0x0000000d +#define NV34TCL_RC_FINAL0_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0000000e +#define NV34TCL_RC_FINAL0_D_INPUT_E_TIMES_F_NV 0x0000000f +#define NV34TCL_RC_FINAL0_D_COMPONENT_USAGE (1 << 4) +#define NV34TCL_RC_FINAL0_D_COMPONENT_USAGE_RGB 0x00000000 +#define NV34TCL_RC_FINAL0_D_COMPONENT_USAGE_ALPHA 0x00000010 +#define NV34TCL_RC_FINAL0_D_MAPPING_SHIFT 5 +#define NV34TCL_RC_FINAL0_D_MAPPING_MASK 0x000000e0 +#define NV34TCL_RC_FINAL0_D_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV34TCL_RC_FINAL0_D_MAPPING_UNSIGNED_INVERT_NV 0x00000020 +#define NV34TCL_RC_FINAL0_D_MAPPING_EXPAND_NORMAL_NV 0x00000040 +#define NV34TCL_RC_FINAL0_D_MAPPING_EXPAND_NEGATE_NV 0x00000060 +#define NV34TCL_RC_FINAL0_D_MAPPING_HALF_BIAS_NORMAL_NV 0x00000080 +#define NV34TCL_RC_FINAL0_D_MAPPING_HALF_BIAS_NEGATE_NV 0x000000a0 +#define NV34TCL_RC_FINAL0_D_MAPPING_SIGNED_IDENTITY_NV 0x000000c0 +#define NV34TCL_RC_FINAL0_D_MAPPING_SIGNED_NEGATE_NV 0x000000e0 +#define NV34TCL_RC_FINAL0_C_INPUT_SHIFT 8 +#define NV34TCL_RC_FINAL0_C_INPUT_MASK 0x00000f00 +#define NV34TCL_RC_FINAL0_C_INPUT_ZERO 0x00000000 +#define NV34TCL_RC_FINAL0_C_INPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV34TCL_RC_FINAL0_C_INPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV34TCL_RC_FINAL0_C_INPUT_FOG 0x00000300 +#define NV34TCL_RC_FINAL0_C_INPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV34TCL_RC_FINAL0_C_INPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV34TCL_RC_FINAL0_C_INPUT_TEXTURE0_ARB 0x00000800 +#define NV34TCL_RC_FINAL0_C_INPUT_TEXTURE1_ARB 0x00000900 +#define NV34TCL_RC_FINAL0_C_INPUT_SPARE0_NV 0x00000c00 +#define NV34TCL_RC_FINAL0_C_INPUT_SPARE1_NV 0x00000d00 +#define NV34TCL_RC_FINAL0_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV34TCL_RC_FINAL0_C_INPUT_E_TIMES_F_NV 0x00000f00 +#define NV34TCL_RC_FINAL0_C_COMPONENT_USAGE (1 << 12) +#define NV34TCL_RC_FINAL0_C_COMPONENT_USAGE_RGB 0x00000000 +#define NV34TCL_RC_FINAL0_C_COMPONENT_USAGE_ALPHA 0x00001000 +#define 
NV34TCL_RC_FINAL0_C_MAPPING_SHIFT 13 +#define NV34TCL_RC_FINAL0_C_MAPPING_MASK 0x0000e000 +#define NV34TCL_RC_FINAL0_C_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV34TCL_RC_FINAL0_C_MAPPING_UNSIGNED_INVERT_NV 0x00002000 +#define NV34TCL_RC_FINAL0_C_MAPPING_EXPAND_NORMAL_NV 0x00004000 +#define NV34TCL_RC_FINAL0_C_MAPPING_EXPAND_NEGATE_NV 0x00006000 +#define NV34TCL_RC_FINAL0_C_MAPPING_HALF_BIAS_NORMAL_NV 0x00008000 +#define NV34TCL_RC_FINAL0_C_MAPPING_HALF_BIAS_NEGATE_NV 0x0000a000 +#define NV34TCL_RC_FINAL0_C_MAPPING_SIGNED_IDENTITY_NV 0x0000c000 +#define NV34TCL_RC_FINAL0_C_MAPPING_SIGNED_NEGATE_NV 0x0000e000 +#define NV34TCL_RC_FINAL0_B_INPUT_SHIFT 16 +#define NV34TCL_RC_FINAL0_B_INPUT_MASK 0x000f0000 +#define NV34TCL_RC_FINAL0_B_INPUT_ZERO 0x00000000 +#define NV34TCL_RC_FINAL0_B_INPUT_CONSTANT_COLOR0_NV 0x00010000 +#define NV34TCL_RC_FINAL0_B_INPUT_CONSTANT_COLOR1_NV 0x00020000 +#define NV34TCL_RC_FINAL0_B_INPUT_FOG 0x00030000 +#define NV34TCL_RC_FINAL0_B_INPUT_PRIMARY_COLOR_NV 0x00040000 +#define NV34TCL_RC_FINAL0_B_INPUT_SECONDARY_COLOR_NV 0x00050000 +#define NV34TCL_RC_FINAL0_B_INPUT_TEXTURE0_ARB 0x00080000 +#define NV34TCL_RC_FINAL0_B_INPUT_TEXTURE1_ARB 0x00090000 +#define NV34TCL_RC_FINAL0_B_INPUT_SPARE0_NV 0x000c0000 +#define NV34TCL_RC_FINAL0_B_INPUT_SPARE1_NV 0x000d0000 +#define NV34TCL_RC_FINAL0_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000e0000 +#define NV34TCL_RC_FINAL0_B_INPUT_E_TIMES_F_NV 0x000f0000 +#define NV34TCL_RC_FINAL0_B_COMPONENT_USAGE (1 << 20) +#define NV34TCL_RC_FINAL0_B_COMPONENT_USAGE_RGB 0x00000000 +#define NV34TCL_RC_FINAL0_B_COMPONENT_USAGE_ALPHA 0x00100000 +#define NV34TCL_RC_FINAL0_B_MAPPING_SHIFT 21 +#define NV34TCL_RC_FINAL0_B_MAPPING_MASK 0x00e00000 +#define NV34TCL_RC_FINAL0_B_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV34TCL_RC_FINAL0_B_MAPPING_UNSIGNED_INVERT_NV 0x00200000 +#define NV34TCL_RC_FINAL0_B_MAPPING_EXPAND_NORMAL_NV 0x00400000 +#define NV34TCL_RC_FINAL0_B_MAPPING_EXPAND_NEGATE_NV 0x00600000 +#define NV34TCL_RC_FINAL0_B_MAPPING_HALF_BIAS_NORMAL_NV 0x00800000 +#define NV34TCL_RC_FINAL0_B_MAPPING_HALF_BIAS_NEGATE_NV 0x00a00000 +#define NV34TCL_RC_FINAL0_B_MAPPING_SIGNED_IDENTITY_NV 0x00c00000 +#define NV34TCL_RC_FINAL0_B_MAPPING_SIGNED_NEGATE_NV 0x00e00000 +#define NV34TCL_RC_FINAL0_A_INPUT_SHIFT 24 +#define NV34TCL_RC_FINAL0_A_INPUT_MASK 0x0f000000 +#define NV34TCL_RC_FINAL0_A_INPUT_ZERO 0x00000000 +#define NV34TCL_RC_FINAL0_A_INPUT_CONSTANT_COLOR0_NV 0x01000000 +#define NV34TCL_RC_FINAL0_A_INPUT_CONSTANT_COLOR1_NV 0x02000000 +#define NV34TCL_RC_FINAL0_A_INPUT_FOG 0x03000000 +#define NV34TCL_RC_FINAL0_A_INPUT_PRIMARY_COLOR_NV 0x04000000 +#define NV34TCL_RC_FINAL0_A_INPUT_SECONDARY_COLOR_NV 0x05000000 +#define NV34TCL_RC_FINAL0_A_INPUT_TEXTURE0_ARB 0x08000000 +#define NV34TCL_RC_FINAL0_A_INPUT_TEXTURE1_ARB 0x09000000 +#define NV34TCL_RC_FINAL0_A_INPUT_SPARE0_NV 0x0c000000 +#define NV34TCL_RC_FINAL0_A_INPUT_SPARE1_NV 0x0d000000 +#define NV34TCL_RC_FINAL0_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0e000000 +#define NV34TCL_RC_FINAL0_A_INPUT_E_TIMES_F_NV 0x0f000000 +#define NV34TCL_RC_FINAL0_A_COMPONENT_USAGE (1 << 28) +#define NV34TCL_RC_FINAL0_A_COMPONENT_USAGE_RGB 0x00000000 +#define NV34TCL_RC_FINAL0_A_COMPONENT_USAGE_ALPHA 0x10000000 +#define NV34TCL_RC_FINAL0_A_MAPPING_SHIFT 29 +#define NV34TCL_RC_FINAL0_A_MAPPING_MASK 0xe0000000 +#define NV34TCL_RC_FINAL0_A_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV34TCL_RC_FINAL0_A_MAPPING_UNSIGNED_INVERT_NV 0x20000000 +#define NV34TCL_RC_FINAL0_A_MAPPING_EXPAND_NORMAL_NV 0x40000000 
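(Editorial sketch, not part of the patch: the NV34TCL_RC_FINAL0_* fields above use pre-shifted value constants, so each of the A..D variables contributes one _INPUT, one _COMPONENT_USAGE and one _MAPPING constant and the 32-bit method data is just the OR of them. The constants below are copied verbatim from the listing so the example compiles on its own; the chosen inputs are arbitrary and illustrative, this is not the driver's actual state-emission code.)

#include <stdint.h>
#include <stdio.h>

#define NV34TCL_RC_FINAL0_D_INPUT_ZERO                 0x00000000
#define NV34TCL_RC_FINAL0_C_INPUT_FOG                  0x00000300
#define NV34TCL_RC_FINAL0_C_COMPONENT_USAGE_ALPHA      0x00001000
#define NV34TCL_RC_FINAL0_B_INPUT_SPARE0_NV            0x000c0000
#define NV34TCL_RC_FINAL0_A_INPUT_PRIMARY_COLOR_NV     0x04000000
#define NV34TCL_RC_FINAL0_A_MAPPING_UNSIGNED_INVERT_NV 0x20000000

int main(void)
{
   /* One pre-shifted value per field; composing the method data is a plain OR. */
   uint32_t rc_final0 = NV34TCL_RC_FINAL0_D_INPUT_ZERO |
                        NV34TCL_RC_FINAL0_C_INPUT_FOG |
                        NV34TCL_RC_FINAL0_C_COMPONENT_USAGE_ALPHA |
                        NV34TCL_RC_FINAL0_B_INPUT_SPARE0_NV |
                        NV34TCL_RC_FINAL0_A_INPUT_PRIMARY_COLOR_NV |
                        NV34TCL_RC_FINAL0_A_MAPPING_UNSIGNED_INVERT_NV;

   printf("NV34TCL_RC_FINAL0 data = 0x%08x\n", (unsigned)rc_final0);  /* 0x240c1300 */
   return 0;
}
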
+#define NV34TCL_RC_FINAL0_A_MAPPING_EXPAND_NEGATE_NV 0x60000000 +#define NV34TCL_RC_FINAL0_A_MAPPING_HALF_BIAS_NORMAL_NV 0x80000000 +#define NV34TCL_RC_FINAL0_A_MAPPING_HALF_BIAS_NEGATE_NV 0xa0000000 +#define NV34TCL_RC_FINAL0_A_MAPPING_SIGNED_IDENTITY_NV 0xc0000000 +#define NV34TCL_RC_FINAL0_A_MAPPING_SIGNED_NEGATE_NV 0xe0000000 +#define NV34TCL_RC_FINAL1 0x000008f8 +#define NV34TCL_RC_FINAL1_COLOR_SUM_CLAMP (1 << 7) +#define NV34TCL_RC_FINAL1_G_INPUT_SHIFT 8 +#define NV34TCL_RC_FINAL1_G_INPUT_MASK 0x00000f00 +#define NV34TCL_RC_FINAL1_G_INPUT_ZERO 0x00000000 +#define NV34TCL_RC_FINAL1_G_INPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV34TCL_RC_FINAL1_G_INPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV34TCL_RC_FINAL1_G_INPUT_FOG 0x00000300 +#define NV34TCL_RC_FINAL1_G_INPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV34TCL_RC_FINAL1_G_INPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV34TCL_RC_FINAL1_G_INPUT_TEXTURE0_ARB 0x00000800 +#define NV34TCL_RC_FINAL1_G_INPUT_TEXTURE1_ARB 0x00000900 +#define NV34TCL_RC_FINAL1_G_INPUT_SPARE0_NV 0x00000c00 +#define NV34TCL_RC_FINAL1_G_INPUT_SPARE1_NV 0x00000d00 +#define NV34TCL_RC_FINAL1_G_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV34TCL_RC_FINAL1_G_INPUT_E_TIMES_F_NV 0x00000f00 +#define NV34TCL_RC_FINAL1_G_COMPONENT_USAGE (1 << 12) +#define NV34TCL_RC_FINAL1_G_COMPONENT_USAGE_RGB 0x00000000 +#define NV34TCL_RC_FINAL1_G_COMPONENT_USAGE_ALPHA 0x00001000 +#define NV34TCL_RC_FINAL1_G_MAPPING_SHIFT 13 +#define NV34TCL_RC_FINAL1_G_MAPPING_MASK 0x0000e000 +#define NV34TCL_RC_FINAL1_G_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV34TCL_RC_FINAL1_G_MAPPING_UNSIGNED_INVERT_NV 0x00002000 +#define NV34TCL_RC_FINAL1_G_MAPPING_EXPAND_NORMAL_NV 0x00004000 +#define NV34TCL_RC_FINAL1_G_MAPPING_EXPAND_NEGATE_NV 0x00006000 +#define NV34TCL_RC_FINAL1_G_MAPPING_HALF_BIAS_NORMAL_NV 0x00008000 +#define NV34TCL_RC_FINAL1_G_MAPPING_HALF_BIAS_NEGATE_NV 0x0000a000 +#define NV34TCL_RC_FINAL1_G_MAPPING_SIGNED_IDENTITY_NV 0x0000c000 +#define NV34TCL_RC_FINAL1_G_MAPPING_SIGNED_NEGATE_NV 0x0000e000 +#define NV34TCL_RC_FINAL1_F_INPUT_SHIFT 16 +#define NV34TCL_RC_FINAL1_F_INPUT_MASK 0x000f0000 +#define NV34TCL_RC_FINAL1_F_INPUT_ZERO 0x00000000 +#define NV34TCL_RC_FINAL1_F_INPUT_CONSTANT_COLOR0_NV 0x00010000 +#define NV34TCL_RC_FINAL1_F_INPUT_CONSTANT_COLOR1_NV 0x00020000 +#define NV34TCL_RC_FINAL1_F_INPUT_FOG 0x00030000 +#define NV34TCL_RC_FINAL1_F_INPUT_PRIMARY_COLOR_NV 0x00040000 +#define NV34TCL_RC_FINAL1_F_INPUT_SECONDARY_COLOR_NV 0x00050000 +#define NV34TCL_RC_FINAL1_F_INPUT_TEXTURE0_ARB 0x00080000 +#define NV34TCL_RC_FINAL1_F_INPUT_TEXTURE1_ARB 0x00090000 +#define NV34TCL_RC_FINAL1_F_INPUT_SPARE0_NV 0x000c0000 +#define NV34TCL_RC_FINAL1_F_INPUT_SPARE1_NV 0x000d0000 +#define NV34TCL_RC_FINAL1_F_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000e0000 +#define NV34TCL_RC_FINAL1_F_INPUT_E_TIMES_F_NV 0x000f0000 +#define NV34TCL_RC_FINAL1_F_COMPONENT_USAGE (1 << 20) +#define NV34TCL_RC_FINAL1_F_COMPONENT_USAGE_RGB 0x00000000 +#define NV34TCL_RC_FINAL1_F_COMPONENT_USAGE_ALPHA 0x00100000 +#define NV34TCL_RC_FINAL1_F_MAPPING_SHIFT 21 +#define NV34TCL_RC_FINAL1_F_MAPPING_MASK 0x00e00000 +#define NV34TCL_RC_FINAL1_F_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV34TCL_RC_FINAL1_F_MAPPING_UNSIGNED_INVERT_NV 0x00200000 +#define NV34TCL_RC_FINAL1_F_MAPPING_EXPAND_NORMAL_NV 0x00400000 +#define NV34TCL_RC_FINAL1_F_MAPPING_EXPAND_NEGATE_NV 0x00600000 +#define NV34TCL_RC_FINAL1_F_MAPPING_HALF_BIAS_NORMAL_NV 0x00800000 +#define NV34TCL_RC_FINAL1_F_MAPPING_HALF_BIAS_NEGATE_NV 0x00a00000 
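(Editorial sketch, not part of the patch: numeric fields in this header come as *_SHIFT/*_MASK pairs, for example NV34TCL_RT_HORIZ_X/W earlier in this file, the RC_FINAL1 G/F input fields above, or VTXFMT further down; the raw value is shifted into place and masked. A minimal standalone illustration, with the RT_HORIZ constants copied from the listing; pack_field() is a hypothetical helper, not something the header provides.)

#include <stdint.h>
#include <stdio.h>

#define NV34TCL_RT_HORIZ_X_SHIFT 0
#define NV34TCL_RT_HORIZ_X_MASK  0x0000ffff
#define NV34TCL_RT_HORIZ_W_SHIFT 16
#define NV34TCL_RT_HORIZ_W_MASK  0xffff0000

/* Illustrative helper: place a raw value into the field given by (shift, mask). */
static uint32_t pack_field(uint32_t value, int shift, uint32_t mask)
{
   return (value << shift) & mask;
}

int main(void)
{
   /* A 640-pixel-wide render target starting at x = 0: data = 0x02800000. */
   uint32_t rt_horiz = pack_field(0,   NV34TCL_RT_HORIZ_X_SHIFT, NV34TCL_RT_HORIZ_X_MASK) |
                       pack_field(640, NV34TCL_RT_HORIZ_W_SHIFT, NV34TCL_RT_HORIZ_W_MASK);

   printf("NV34TCL_RT_HORIZ data = 0x%08x\n", (unsigned)rt_horiz);
   return 0;
}
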
+#define NV34TCL_RC_FINAL1_F_MAPPING_SIGNED_IDENTITY_NV 0x00c00000 +#define NV34TCL_RC_FINAL1_F_MAPPING_SIGNED_NEGATE_NV 0x00e00000 +#define NV34TCL_RC_FINAL1_E_INPUT_SHIFT 24 +#define NV34TCL_RC_FINAL1_E_INPUT_MASK 0x0f000000 +#define NV34TCL_RC_FINAL1_E_INPUT_ZERO 0x00000000 +#define NV34TCL_RC_FINAL1_E_INPUT_CONSTANT_COLOR0_NV 0x01000000 +#define NV34TCL_RC_FINAL1_E_INPUT_CONSTANT_COLOR1_NV 0x02000000 +#define NV34TCL_RC_FINAL1_E_INPUT_FOG 0x03000000 +#define NV34TCL_RC_FINAL1_E_INPUT_PRIMARY_COLOR_NV 0x04000000 +#define NV34TCL_RC_FINAL1_E_INPUT_SECONDARY_COLOR_NV 0x05000000 +#define NV34TCL_RC_FINAL1_E_INPUT_TEXTURE0_ARB 0x08000000 +#define NV34TCL_RC_FINAL1_E_INPUT_TEXTURE1_ARB 0x09000000 +#define NV34TCL_RC_FINAL1_E_INPUT_SPARE0_NV 0x0c000000 +#define NV34TCL_RC_FINAL1_E_INPUT_SPARE1_NV 0x0d000000 +#define NV34TCL_RC_FINAL1_E_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0e000000 +#define NV34TCL_RC_FINAL1_E_INPUT_E_TIMES_F_NV 0x0f000000 +#define NV34TCL_RC_FINAL1_E_COMPONENT_USAGE (1 << 28) +#define NV34TCL_RC_FINAL1_E_COMPONENT_USAGE_RGB 0x00000000 +#define NV34TCL_RC_FINAL1_E_COMPONENT_USAGE_ALPHA 0x10000000 +#define NV34TCL_RC_FINAL1_E_MAPPING_SHIFT 29 +#define NV34TCL_RC_FINAL1_E_MAPPING_MASK 0xe0000000 +#define NV34TCL_RC_FINAL1_E_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV34TCL_RC_FINAL1_E_MAPPING_UNSIGNED_INVERT_NV 0x20000000 +#define NV34TCL_RC_FINAL1_E_MAPPING_EXPAND_NORMAL_NV 0x40000000 +#define NV34TCL_RC_FINAL1_E_MAPPING_EXPAND_NEGATE_NV 0x60000000 +#define NV34TCL_RC_FINAL1_E_MAPPING_HALF_BIAS_NORMAL_NV 0x80000000 +#define NV34TCL_RC_FINAL1_E_MAPPING_HALF_BIAS_NEGATE_NV 0xa0000000 +#define NV34TCL_RC_FINAL1_E_MAPPING_SIGNED_IDENTITY_NV 0xc0000000 +#define NV34TCL_RC_FINAL1_E_MAPPING_SIGNED_NEGATE_NV 0xe0000000 +#define NV34TCL_RC_ENABLE 0x000008fc +#define NV34TCL_RC_ENABLE_NUM_COMBINERS_SHIFT 0 +#define NV34TCL_RC_ENABLE_NUM_COMBINERS_MASK 0x0000000f +#define NV34TCL_RC_ENABLE_STAGE_CONSTANT_COLOR0_SHIFT 12 +#define NV34TCL_RC_ENABLE_STAGE_CONSTANT_COLOR0_MASK 0x0000f000 +#define NV34TCL_RC_ENABLE_STAGE_CONSTANT_COLOR1_SHIFT 16 +#define NV34TCL_RC_ENABLE_STAGE_CONSTANT_COLOR1_MASK 0x000f0000 +#define NV34TCL_RC_IN_ALPHA(x) (0x00000900+((x)*32)) +#define NV34TCL_RC_IN_ALPHA__SIZE 0x00000008 +#define NV34TCL_RC_IN_ALPHA_D_INPUT_SHIFT 0 +#define NV34TCL_RC_IN_ALPHA_D_INPUT_MASK 0x0000000f +#define NV34TCL_RC_IN_ALPHA_D_INPUT_ZERO 0x00000000 +#define NV34TCL_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR0_NV 0x00000001 +#define NV34TCL_RC_IN_ALPHA_D_INPUT_CONSTANT_COLOR1_NV 0x00000002 +#define NV34TCL_RC_IN_ALPHA_D_INPUT_FOG 0x00000003 +#define NV34TCL_RC_IN_ALPHA_D_INPUT_PRIMARY_COLOR_NV 0x00000004 +#define NV34TCL_RC_IN_ALPHA_D_INPUT_SECONDARY_COLOR_NV 0x00000005 +#define NV34TCL_RC_IN_ALPHA_D_INPUT_TEXTURE0_ARB 0x00000008 +#define NV34TCL_RC_IN_ALPHA_D_INPUT_TEXTURE1_ARB 0x00000009 +#define NV34TCL_RC_IN_ALPHA_D_INPUT_SPARE0_NV 0x0000000c +#define NV34TCL_RC_IN_ALPHA_D_INPUT_SPARE1_NV 0x0000000d +#define NV34TCL_RC_IN_ALPHA_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0000000e +#define NV34TCL_RC_IN_ALPHA_D_INPUT_E_TIMES_F_NV 0x0000000f +#define NV34TCL_RC_IN_ALPHA_D_COMPONENT_USAGE (1 << 4) +#define NV34TCL_RC_IN_ALPHA_D_COMPONENT_USAGE_BLUE 0x00000000 +#define NV34TCL_RC_IN_ALPHA_D_COMPONENT_USAGE_ALPHA 0x00000010 +#define NV34TCL_RC_IN_ALPHA_D_MAPPING_SHIFT 5 +#define NV34TCL_RC_IN_ALPHA_D_MAPPING_MASK 0x000000e0 +#define NV34TCL_RC_IN_ALPHA_D_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV34TCL_RC_IN_ALPHA_D_MAPPING_UNSIGNED_INVERT_NV 0x00000020 +#define 
NV34TCL_RC_IN_ALPHA_D_MAPPING_EXPAND_NORMAL_NV 0x00000040 +#define NV34TCL_RC_IN_ALPHA_D_MAPPING_EXPAND_NEGATE_NV 0x00000060 +#define NV34TCL_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NORMAL_NV 0x00000080 +#define NV34TCL_RC_IN_ALPHA_D_MAPPING_HALF_BIAS_NEGATE_NV 0x000000a0 +#define NV34TCL_RC_IN_ALPHA_D_MAPPING_SIGNED_IDENTITY_NV 0x000000c0 +#define NV34TCL_RC_IN_ALPHA_D_MAPPING_SIGNED_NEGATE_NV 0x000000e0 +#define NV34TCL_RC_IN_ALPHA_C_INPUT_SHIFT 8 +#define NV34TCL_RC_IN_ALPHA_C_INPUT_MASK 0x00000f00 +#define NV34TCL_RC_IN_ALPHA_C_INPUT_ZERO 0x00000000 +#define NV34TCL_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV34TCL_RC_IN_ALPHA_C_INPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV34TCL_RC_IN_ALPHA_C_INPUT_FOG 0x00000300 +#define NV34TCL_RC_IN_ALPHA_C_INPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV34TCL_RC_IN_ALPHA_C_INPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV34TCL_RC_IN_ALPHA_C_INPUT_TEXTURE0_ARB 0x00000800 +#define NV34TCL_RC_IN_ALPHA_C_INPUT_TEXTURE1_ARB 0x00000900 +#define NV34TCL_RC_IN_ALPHA_C_INPUT_SPARE0_NV 0x00000c00 +#define NV34TCL_RC_IN_ALPHA_C_INPUT_SPARE1_NV 0x00000d00 +#define NV34TCL_RC_IN_ALPHA_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV34TCL_RC_IN_ALPHA_C_INPUT_E_TIMES_F_NV 0x00000f00 +#define NV34TCL_RC_IN_ALPHA_C_COMPONENT_USAGE (1 << 12) +#define NV34TCL_RC_IN_ALPHA_C_COMPONENT_USAGE_BLUE 0x00000000 +#define NV34TCL_RC_IN_ALPHA_C_COMPONENT_USAGE_ALPHA 0x00001000 +#define NV34TCL_RC_IN_ALPHA_C_MAPPING_SHIFT 13 +#define NV34TCL_RC_IN_ALPHA_C_MAPPING_MASK 0x0000e000 +#define NV34TCL_RC_IN_ALPHA_C_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV34TCL_RC_IN_ALPHA_C_MAPPING_UNSIGNED_INVERT_NV 0x00002000 +#define NV34TCL_RC_IN_ALPHA_C_MAPPING_EXPAND_NORMAL_NV 0x00004000 +#define NV34TCL_RC_IN_ALPHA_C_MAPPING_EXPAND_NEGATE_NV 0x00006000 +#define NV34TCL_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NORMAL_NV 0x00008000 +#define NV34TCL_RC_IN_ALPHA_C_MAPPING_HALF_BIAS_NEGATE_NV 0x0000a000 +#define NV34TCL_RC_IN_ALPHA_C_MAPPING_SIGNED_IDENTITY_NV 0x0000c000 +#define NV34TCL_RC_IN_ALPHA_C_MAPPING_SIGNED_NEGATE_NV 0x0000e000 +#define NV34TCL_RC_IN_ALPHA_B_INPUT_SHIFT 16 +#define NV34TCL_RC_IN_ALPHA_B_INPUT_MASK 0x000f0000 +#define NV34TCL_RC_IN_ALPHA_B_INPUT_ZERO 0x00000000 +#define NV34TCL_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR0_NV 0x00010000 +#define NV34TCL_RC_IN_ALPHA_B_INPUT_CONSTANT_COLOR1_NV 0x00020000 +#define NV34TCL_RC_IN_ALPHA_B_INPUT_FOG 0x00030000 +#define NV34TCL_RC_IN_ALPHA_B_INPUT_PRIMARY_COLOR_NV 0x00040000 +#define NV34TCL_RC_IN_ALPHA_B_INPUT_SECONDARY_COLOR_NV 0x00050000 +#define NV34TCL_RC_IN_ALPHA_B_INPUT_TEXTURE0_ARB 0x00080000 +#define NV34TCL_RC_IN_ALPHA_B_INPUT_TEXTURE1_ARB 0x00090000 +#define NV34TCL_RC_IN_ALPHA_B_INPUT_SPARE0_NV 0x000c0000 +#define NV34TCL_RC_IN_ALPHA_B_INPUT_SPARE1_NV 0x000d0000 +#define NV34TCL_RC_IN_ALPHA_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000e0000 +#define NV34TCL_RC_IN_ALPHA_B_INPUT_E_TIMES_F_NV 0x000f0000 +#define NV34TCL_RC_IN_ALPHA_B_COMPONENT_USAGE (1 << 20) +#define NV34TCL_RC_IN_ALPHA_B_COMPONENT_USAGE_BLUE 0x00000000 +#define NV34TCL_RC_IN_ALPHA_B_COMPONENT_USAGE_ALPHA 0x00100000 +#define NV34TCL_RC_IN_ALPHA_B_MAPPING_SHIFT 21 +#define NV34TCL_RC_IN_ALPHA_B_MAPPING_MASK 0x00e00000 +#define NV34TCL_RC_IN_ALPHA_B_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV34TCL_RC_IN_ALPHA_B_MAPPING_UNSIGNED_INVERT_NV 0x00200000 +#define NV34TCL_RC_IN_ALPHA_B_MAPPING_EXPAND_NORMAL_NV 0x00400000 +#define NV34TCL_RC_IN_ALPHA_B_MAPPING_EXPAND_NEGATE_NV 0x00600000 +#define NV34TCL_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NORMAL_NV 
0x00800000 +#define NV34TCL_RC_IN_ALPHA_B_MAPPING_HALF_BIAS_NEGATE_NV 0x00a00000 +#define NV34TCL_RC_IN_ALPHA_B_MAPPING_SIGNED_IDENTITY_NV 0x00c00000 +#define NV34TCL_RC_IN_ALPHA_B_MAPPING_SIGNED_NEGATE_NV 0x00e00000 +#define NV34TCL_RC_IN_ALPHA_A_INPUT_SHIFT 24 +#define NV34TCL_RC_IN_ALPHA_A_INPUT_MASK 0x0f000000 +#define NV34TCL_RC_IN_ALPHA_A_INPUT_ZERO 0x00000000 +#define NV34TCL_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR0_NV 0x01000000 +#define NV34TCL_RC_IN_ALPHA_A_INPUT_CONSTANT_COLOR1_NV 0x02000000 +#define NV34TCL_RC_IN_ALPHA_A_INPUT_FOG 0x03000000 +#define NV34TCL_RC_IN_ALPHA_A_INPUT_PRIMARY_COLOR_NV 0x04000000 +#define NV34TCL_RC_IN_ALPHA_A_INPUT_SECONDARY_COLOR_NV 0x05000000 +#define NV34TCL_RC_IN_ALPHA_A_INPUT_TEXTURE0_ARB 0x08000000 +#define NV34TCL_RC_IN_ALPHA_A_INPUT_TEXTURE1_ARB 0x09000000 +#define NV34TCL_RC_IN_ALPHA_A_INPUT_SPARE0_NV 0x0c000000 +#define NV34TCL_RC_IN_ALPHA_A_INPUT_SPARE1_NV 0x0d000000 +#define NV34TCL_RC_IN_ALPHA_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0e000000 +#define NV34TCL_RC_IN_ALPHA_A_INPUT_E_TIMES_F_NV 0x0f000000 +#define NV34TCL_RC_IN_ALPHA_A_COMPONENT_USAGE (1 << 28) +#define NV34TCL_RC_IN_ALPHA_A_COMPONENT_USAGE_BLUE 0x00000000 +#define NV34TCL_RC_IN_ALPHA_A_COMPONENT_USAGE_ALPHA 0x10000000 +#define NV34TCL_RC_IN_ALPHA_A_MAPPING_SHIFT 29 +#define NV34TCL_RC_IN_ALPHA_A_MAPPING_MASK 0xe0000000 +#define NV34TCL_RC_IN_ALPHA_A_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV34TCL_RC_IN_ALPHA_A_MAPPING_UNSIGNED_INVERT_NV 0x20000000 +#define NV34TCL_RC_IN_ALPHA_A_MAPPING_EXPAND_NORMAL_NV 0x40000000 +#define NV34TCL_RC_IN_ALPHA_A_MAPPING_EXPAND_NEGATE_NV 0x60000000 +#define NV34TCL_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NORMAL_NV 0x80000000 +#define NV34TCL_RC_IN_ALPHA_A_MAPPING_HALF_BIAS_NEGATE_NV 0xa0000000 +#define NV34TCL_RC_IN_ALPHA_A_MAPPING_SIGNED_IDENTITY_NV 0xc0000000 +#define NV34TCL_RC_IN_ALPHA_A_MAPPING_SIGNED_NEGATE_NV 0xe0000000 +#define NV34TCL_RC_IN_RGB(x) (0x00000904+((x)*32)) +#define NV34TCL_RC_IN_RGB__SIZE 0x00000008 +#define NV34TCL_RC_IN_RGB_D_INPUT_SHIFT 0 +#define NV34TCL_RC_IN_RGB_D_INPUT_MASK 0x0000000f +#define NV34TCL_RC_IN_RGB_D_INPUT_ZERO 0x00000000 +#define NV34TCL_RC_IN_RGB_D_INPUT_CONSTANT_COLOR0_NV 0x00000001 +#define NV34TCL_RC_IN_RGB_D_INPUT_CONSTANT_COLOR1_NV 0x00000002 +#define NV34TCL_RC_IN_RGB_D_INPUT_FOG 0x00000003 +#define NV34TCL_RC_IN_RGB_D_INPUT_PRIMARY_COLOR_NV 0x00000004 +#define NV34TCL_RC_IN_RGB_D_INPUT_SECONDARY_COLOR_NV 0x00000005 +#define NV34TCL_RC_IN_RGB_D_INPUT_TEXTURE0_ARB 0x00000008 +#define NV34TCL_RC_IN_RGB_D_INPUT_TEXTURE1_ARB 0x00000009 +#define NV34TCL_RC_IN_RGB_D_INPUT_SPARE0_NV 0x0000000c +#define NV34TCL_RC_IN_RGB_D_INPUT_SPARE1_NV 0x0000000d +#define NV34TCL_RC_IN_RGB_D_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0000000e +#define NV34TCL_RC_IN_RGB_D_INPUT_E_TIMES_F_NV 0x0000000f +#define NV34TCL_RC_IN_RGB_D_COMPONENT_USAGE (1 << 4) +#define NV34TCL_RC_IN_RGB_D_COMPONENT_USAGE_RGB 0x00000000 +#define NV34TCL_RC_IN_RGB_D_COMPONENT_USAGE_ALPHA 0x00000010 +#define NV34TCL_RC_IN_RGB_D_MAPPING_SHIFT 5 +#define NV34TCL_RC_IN_RGB_D_MAPPING_MASK 0x000000e0 +#define NV34TCL_RC_IN_RGB_D_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV34TCL_RC_IN_RGB_D_MAPPING_UNSIGNED_INVERT_NV 0x00000020 +#define NV34TCL_RC_IN_RGB_D_MAPPING_EXPAND_NORMAL_NV 0x00000040 +#define NV34TCL_RC_IN_RGB_D_MAPPING_EXPAND_NEGATE_NV 0x00000060 +#define NV34TCL_RC_IN_RGB_D_MAPPING_HALF_BIAS_NORMAL_NV 0x00000080 +#define NV34TCL_RC_IN_RGB_D_MAPPING_HALF_BIAS_NEGATE_NV 0x000000a0 +#define NV34TCL_RC_IN_RGB_D_MAPPING_SIGNED_IDENTITY_NV 
0x000000c0 +#define NV34TCL_RC_IN_RGB_D_MAPPING_SIGNED_NEGATE_NV 0x000000e0 +#define NV34TCL_RC_IN_RGB_C_INPUT_SHIFT 8 +#define NV34TCL_RC_IN_RGB_C_INPUT_MASK 0x00000f00 +#define NV34TCL_RC_IN_RGB_C_INPUT_ZERO 0x00000000 +#define NV34TCL_RC_IN_RGB_C_INPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV34TCL_RC_IN_RGB_C_INPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV34TCL_RC_IN_RGB_C_INPUT_FOG 0x00000300 +#define NV34TCL_RC_IN_RGB_C_INPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV34TCL_RC_IN_RGB_C_INPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV34TCL_RC_IN_RGB_C_INPUT_TEXTURE0_ARB 0x00000800 +#define NV34TCL_RC_IN_RGB_C_INPUT_TEXTURE1_ARB 0x00000900 +#define NV34TCL_RC_IN_RGB_C_INPUT_SPARE0_NV 0x00000c00 +#define NV34TCL_RC_IN_RGB_C_INPUT_SPARE1_NV 0x00000d00 +#define NV34TCL_RC_IN_RGB_C_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV34TCL_RC_IN_RGB_C_INPUT_E_TIMES_F_NV 0x00000f00 +#define NV34TCL_RC_IN_RGB_C_COMPONENT_USAGE (1 << 12) +#define NV34TCL_RC_IN_RGB_C_COMPONENT_USAGE_RGB 0x00000000 +#define NV34TCL_RC_IN_RGB_C_COMPONENT_USAGE_ALPHA 0x00001000 +#define NV34TCL_RC_IN_RGB_C_MAPPING_SHIFT 13 +#define NV34TCL_RC_IN_RGB_C_MAPPING_MASK 0x0000e000 +#define NV34TCL_RC_IN_RGB_C_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV34TCL_RC_IN_RGB_C_MAPPING_UNSIGNED_INVERT_NV 0x00002000 +#define NV34TCL_RC_IN_RGB_C_MAPPING_EXPAND_NORMAL_NV 0x00004000 +#define NV34TCL_RC_IN_RGB_C_MAPPING_EXPAND_NEGATE_NV 0x00006000 +#define NV34TCL_RC_IN_RGB_C_MAPPING_HALF_BIAS_NORMAL_NV 0x00008000 +#define NV34TCL_RC_IN_RGB_C_MAPPING_HALF_BIAS_NEGATE_NV 0x0000a000 +#define NV34TCL_RC_IN_RGB_C_MAPPING_SIGNED_IDENTITY_NV 0x0000c000 +#define NV34TCL_RC_IN_RGB_C_MAPPING_SIGNED_NEGATE_NV 0x0000e000 +#define NV34TCL_RC_IN_RGB_B_INPUT_SHIFT 16 +#define NV34TCL_RC_IN_RGB_B_INPUT_MASK 0x000f0000 +#define NV34TCL_RC_IN_RGB_B_INPUT_ZERO 0x00000000 +#define NV34TCL_RC_IN_RGB_B_INPUT_CONSTANT_COLOR0_NV 0x00010000 +#define NV34TCL_RC_IN_RGB_B_INPUT_CONSTANT_COLOR1_NV 0x00020000 +#define NV34TCL_RC_IN_RGB_B_INPUT_FOG 0x00030000 +#define NV34TCL_RC_IN_RGB_B_INPUT_PRIMARY_COLOR_NV 0x00040000 +#define NV34TCL_RC_IN_RGB_B_INPUT_SECONDARY_COLOR_NV 0x00050000 +#define NV34TCL_RC_IN_RGB_B_INPUT_TEXTURE0_ARB 0x00080000 +#define NV34TCL_RC_IN_RGB_B_INPUT_TEXTURE1_ARB 0x00090000 +#define NV34TCL_RC_IN_RGB_B_INPUT_SPARE0_NV 0x000c0000 +#define NV34TCL_RC_IN_RGB_B_INPUT_SPARE1_NV 0x000d0000 +#define NV34TCL_RC_IN_RGB_B_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000e0000 +#define NV34TCL_RC_IN_RGB_B_INPUT_E_TIMES_F_NV 0x000f0000 +#define NV34TCL_RC_IN_RGB_B_COMPONENT_USAGE (1 << 20) +#define NV34TCL_RC_IN_RGB_B_COMPONENT_USAGE_RGB 0x00000000 +#define NV34TCL_RC_IN_RGB_B_COMPONENT_USAGE_ALPHA 0x00100000 +#define NV34TCL_RC_IN_RGB_B_MAPPING_SHIFT 21 +#define NV34TCL_RC_IN_RGB_B_MAPPING_MASK 0x00e00000 +#define NV34TCL_RC_IN_RGB_B_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV34TCL_RC_IN_RGB_B_MAPPING_UNSIGNED_INVERT_NV 0x00200000 +#define NV34TCL_RC_IN_RGB_B_MAPPING_EXPAND_NORMAL_NV 0x00400000 +#define NV34TCL_RC_IN_RGB_B_MAPPING_EXPAND_NEGATE_NV 0x00600000 +#define NV34TCL_RC_IN_RGB_B_MAPPING_HALF_BIAS_NORMAL_NV 0x00800000 +#define NV34TCL_RC_IN_RGB_B_MAPPING_HALF_BIAS_NEGATE_NV 0x00a00000 +#define NV34TCL_RC_IN_RGB_B_MAPPING_SIGNED_IDENTITY_NV 0x00c00000 +#define NV34TCL_RC_IN_RGB_B_MAPPING_SIGNED_NEGATE_NV 0x00e00000 +#define NV34TCL_RC_IN_RGB_A_INPUT_SHIFT 24 +#define NV34TCL_RC_IN_RGB_A_INPUT_MASK 0x0f000000 +#define NV34TCL_RC_IN_RGB_A_INPUT_ZERO 0x00000000 +#define NV34TCL_RC_IN_RGB_A_INPUT_CONSTANT_COLOR0_NV 0x01000000 +#define 
NV34TCL_RC_IN_RGB_A_INPUT_CONSTANT_COLOR1_NV 0x02000000 +#define NV34TCL_RC_IN_RGB_A_INPUT_FOG 0x03000000 +#define NV34TCL_RC_IN_RGB_A_INPUT_PRIMARY_COLOR_NV 0x04000000 +#define NV34TCL_RC_IN_RGB_A_INPUT_SECONDARY_COLOR_NV 0x05000000 +#define NV34TCL_RC_IN_RGB_A_INPUT_TEXTURE0_ARB 0x08000000 +#define NV34TCL_RC_IN_RGB_A_INPUT_TEXTURE1_ARB 0x09000000 +#define NV34TCL_RC_IN_RGB_A_INPUT_SPARE0_NV 0x0c000000 +#define NV34TCL_RC_IN_RGB_A_INPUT_SPARE1_NV 0x0d000000 +#define NV34TCL_RC_IN_RGB_A_INPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0e000000 +#define NV34TCL_RC_IN_RGB_A_INPUT_E_TIMES_F_NV 0x0f000000 +#define NV34TCL_RC_IN_RGB_A_COMPONENT_USAGE (1 << 28) +#define NV34TCL_RC_IN_RGB_A_COMPONENT_USAGE_RGB 0x00000000 +#define NV34TCL_RC_IN_RGB_A_COMPONENT_USAGE_ALPHA 0x10000000 +#define NV34TCL_RC_IN_RGB_A_MAPPING_SHIFT 29 +#define NV34TCL_RC_IN_RGB_A_MAPPING_MASK 0xe0000000 +#define NV34TCL_RC_IN_RGB_A_MAPPING_UNSIGNED_IDENTITY_NV 0x00000000 +#define NV34TCL_RC_IN_RGB_A_MAPPING_UNSIGNED_INVERT_NV 0x20000000 +#define NV34TCL_RC_IN_RGB_A_MAPPING_EXPAND_NORMAL_NV 0x40000000 +#define NV34TCL_RC_IN_RGB_A_MAPPING_EXPAND_NEGATE_NV 0x60000000 +#define NV34TCL_RC_IN_RGB_A_MAPPING_HALF_BIAS_NORMAL_NV 0x80000000 +#define NV34TCL_RC_IN_RGB_A_MAPPING_HALF_BIAS_NEGATE_NV 0xa0000000 +#define NV34TCL_RC_IN_RGB_A_MAPPING_SIGNED_IDENTITY_NV 0xc0000000 +#define NV34TCL_RC_IN_RGB_A_MAPPING_SIGNED_NEGATE_NV 0xe0000000 +#define NV34TCL_RC_CONSTANT_COLOR0(x) (0x00000908+((x)*32)) +#define NV34TCL_RC_CONSTANT_COLOR0__SIZE 0x00000008 +#define NV34TCL_RC_CONSTANT_COLOR0_B_SHIFT 0 +#define NV34TCL_RC_CONSTANT_COLOR0_B_MASK 0x000000ff +#define NV34TCL_RC_CONSTANT_COLOR0_G_SHIFT 8 +#define NV34TCL_RC_CONSTANT_COLOR0_G_MASK 0x0000ff00 +#define NV34TCL_RC_CONSTANT_COLOR0_R_SHIFT 16 +#define NV34TCL_RC_CONSTANT_COLOR0_R_MASK 0x00ff0000 +#define NV34TCL_RC_CONSTANT_COLOR0_A_SHIFT 24 +#define NV34TCL_RC_CONSTANT_COLOR0_A_MASK 0xff000000 +#define NV34TCL_RC_CONSTANT_COLOR1(x) (0x0000090c+((x)*32)) +#define NV34TCL_RC_CONSTANT_COLOR1__SIZE 0x00000008 +#define NV34TCL_RC_CONSTANT_COLOR1_B_SHIFT 0 +#define NV34TCL_RC_CONSTANT_COLOR1_B_MASK 0x000000ff +#define NV34TCL_RC_CONSTANT_COLOR1_G_SHIFT 8 +#define NV34TCL_RC_CONSTANT_COLOR1_G_MASK 0x0000ff00 +#define NV34TCL_RC_CONSTANT_COLOR1_R_SHIFT 16 +#define NV34TCL_RC_CONSTANT_COLOR1_R_MASK 0x00ff0000 +#define NV34TCL_RC_CONSTANT_COLOR1_A_SHIFT 24 +#define NV34TCL_RC_CONSTANT_COLOR1_A_MASK 0xff000000 +#define NV34TCL_RC_OUT_ALPHA(x) (0x00000910+((x)*32)) +#define NV34TCL_RC_OUT_ALPHA__SIZE 0x00000008 +#define NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_SHIFT 0 +#define NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_MASK 0x0000000f +#define NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_ZERO 0x00000000 +#define NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR0_NV 0x00000001 +#define NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_CONSTANT_COLOR1_NV 0x00000002 +#define NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_FOG 0x00000003 +#define NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_PRIMARY_COLOR_NV 0x00000004 +#define NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_SECONDARY_COLOR_NV 0x00000005 +#define NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE0_ARB 0x00000008 +#define NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_TEXTURE1_ARB 0x00000009 +#define NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE0_NV 0x0000000c +#define NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE1_NV 0x0000000d +#define NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0000000e +#define NV34TCL_RC_OUT_ALPHA_CD_OUTPUT_E_TIMES_F_NV 0x0000000f +#define NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_SHIFT 4 +#define NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_MASK 0x000000f0 +#define 
NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_ZERO 0x00000000 +#define NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR0_NV 0x00000010 +#define NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_CONSTANT_COLOR1_NV 0x00000020 +#define NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_FOG 0x00000030 +#define NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_PRIMARY_COLOR_NV 0x00000040 +#define NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_SECONDARY_COLOR_NV 0x00000050 +#define NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE0_ARB 0x00000080 +#define NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_TEXTURE1_ARB 0x00000090 +#define NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE0_NV 0x000000c0 +#define NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE1_NV 0x000000d0 +#define NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000000e0 +#define NV34TCL_RC_OUT_ALPHA_AB_OUTPUT_E_TIMES_F_NV 0x000000f0 +#define NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_SHIFT 8 +#define NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_MASK 0x00000f00 +#define NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_ZERO 0x00000000 +#define NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_FOG 0x00000300 +#define NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE0_ARB 0x00000800 +#define NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_TEXTURE1_ARB 0x00000900 +#define NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0_NV 0x00000c00 +#define NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE1_NV 0x00000d00 +#define NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV34TCL_RC_OUT_ALPHA_SUM_OUTPUT_E_TIMES_F_NV 0x00000f00 +#define NV34TCL_RC_OUT_ALPHA_CD_DOT_PRODUCT (1 << 12) +#define NV34TCL_RC_OUT_ALPHA_AB_DOT_PRODUCT (1 << 13) +#define NV34TCL_RC_OUT_ALPHA_MUX_SUM (1 << 14) +#define NV34TCL_RC_OUT_ALPHA_BIAS (1 << 15) +#define NV34TCL_RC_OUT_ALPHA_BIAS_NONE 0x00000000 +#define NV34TCL_RC_OUT_ALPHA_BIAS_BIAS_BY_NEGATIVE_ONE_HALF_NV 0x00008000 +#define NV34TCL_RC_OUT_ALPHA_SCALE_SHIFT 17 +#define NV34TCL_RC_OUT_ALPHA_SCALE_MASK 0x00000000 +#define NV34TCL_RC_OUT_ALPHA_SCALE_NONE 0x00000000 +#define NV34TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_TWO_NV 0x00020000 +#define NV34TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_FOUR_NV 0x00040000 +#define NV34TCL_RC_OUT_ALPHA_SCALE_SCALE_BY_ONE_HALF_NV 0x00060000 +#define NV34TCL_RC_OUT_RGB(x) (0x00000914+((x)*32)) +#define NV34TCL_RC_OUT_RGB__SIZE 0x00000008 +#define NV34TCL_RC_OUT_RGB_CD_OUTPUT_SHIFT 0 +#define NV34TCL_RC_OUT_RGB_CD_OUTPUT_MASK 0x0000000f +#define NV34TCL_RC_OUT_RGB_CD_OUTPUT_ZERO 0x00000000 +#define NV34TCL_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR0_NV 0x00000001 +#define NV34TCL_RC_OUT_RGB_CD_OUTPUT_CONSTANT_COLOR1_NV 0x00000002 +#define NV34TCL_RC_OUT_RGB_CD_OUTPUT_FOG 0x00000003 +#define NV34TCL_RC_OUT_RGB_CD_OUTPUT_PRIMARY_COLOR_NV 0x00000004 +#define NV34TCL_RC_OUT_RGB_CD_OUTPUT_SECONDARY_COLOR_NV 0x00000005 +#define NV34TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE0_ARB 0x00000008 +#define NV34TCL_RC_OUT_RGB_CD_OUTPUT_TEXTURE1_ARB 0x00000009 +#define NV34TCL_RC_OUT_RGB_CD_OUTPUT_SPARE0_NV 0x0000000c +#define NV34TCL_RC_OUT_RGB_CD_OUTPUT_SPARE1_NV 0x0000000d +#define NV34TCL_RC_OUT_RGB_CD_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x0000000e +#define NV34TCL_RC_OUT_RGB_CD_OUTPUT_E_TIMES_F_NV 0x0000000f +#define NV34TCL_RC_OUT_RGB_AB_OUTPUT_SHIFT 4 +#define NV34TCL_RC_OUT_RGB_AB_OUTPUT_MASK 0x000000f0 +#define NV34TCL_RC_OUT_RGB_AB_OUTPUT_ZERO 0x00000000 +#define NV34TCL_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR0_NV 0x00000010 +#define 
NV34TCL_RC_OUT_RGB_AB_OUTPUT_CONSTANT_COLOR1_NV 0x00000020 +#define NV34TCL_RC_OUT_RGB_AB_OUTPUT_FOG 0x00000030 +#define NV34TCL_RC_OUT_RGB_AB_OUTPUT_PRIMARY_COLOR_NV 0x00000040 +#define NV34TCL_RC_OUT_RGB_AB_OUTPUT_SECONDARY_COLOR_NV 0x00000050 +#define NV34TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE0_ARB 0x00000080 +#define NV34TCL_RC_OUT_RGB_AB_OUTPUT_TEXTURE1_ARB 0x00000090 +#define NV34TCL_RC_OUT_RGB_AB_OUTPUT_SPARE0_NV 0x000000c0 +#define NV34TCL_RC_OUT_RGB_AB_OUTPUT_SPARE1_NV 0x000000d0 +#define NV34TCL_RC_OUT_RGB_AB_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x000000e0 +#define NV34TCL_RC_OUT_RGB_AB_OUTPUT_E_TIMES_F_NV 0x000000f0 +#define NV34TCL_RC_OUT_RGB_SUM_OUTPUT_SHIFT 8 +#define NV34TCL_RC_OUT_RGB_SUM_OUTPUT_MASK 0x00000f00 +#define NV34TCL_RC_OUT_RGB_SUM_OUTPUT_ZERO 0x00000000 +#define NV34TCL_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR0_NV 0x00000100 +#define NV34TCL_RC_OUT_RGB_SUM_OUTPUT_CONSTANT_COLOR1_NV 0x00000200 +#define NV34TCL_RC_OUT_RGB_SUM_OUTPUT_FOG 0x00000300 +#define NV34TCL_RC_OUT_RGB_SUM_OUTPUT_PRIMARY_COLOR_NV 0x00000400 +#define NV34TCL_RC_OUT_RGB_SUM_OUTPUT_SECONDARY_COLOR_NV 0x00000500 +#define NV34TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE0_ARB 0x00000800 +#define NV34TCL_RC_OUT_RGB_SUM_OUTPUT_TEXTURE1_ARB 0x00000900 +#define NV34TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE0_NV 0x00000c00 +#define NV34TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE1_NV 0x00000d00 +#define NV34TCL_RC_OUT_RGB_SUM_OUTPUT_SPARE0_PLUS_SECONDARY_COLOR_NV 0x00000e00 +#define NV34TCL_RC_OUT_RGB_SUM_OUTPUT_E_TIMES_F_NV 0x00000f00 +#define NV34TCL_RC_OUT_RGB_CD_DOT_PRODUCT (1 << 12) +#define NV34TCL_RC_OUT_RGB_AB_DOT_PRODUCT (1 << 13) +#define NV34TCL_RC_OUT_RGB_MUX_SUM (1 << 14) +#define NV34TCL_RC_OUT_RGB_BIAS (1 << 15) +#define NV34TCL_RC_OUT_RGB_BIAS_NONE 0x00000000 +#define NV34TCL_RC_OUT_RGB_BIAS_BIAS_BY_NEGATIVE_ONE_HALF_NV 0x00008000 +#define NV34TCL_RC_OUT_RGB_SCALE_SHIFT 17 +#define NV34TCL_RC_OUT_RGB_SCALE_MASK 0x00000000 +#define NV34TCL_RC_OUT_RGB_SCALE_NONE 0x00000000 +#define NV34TCL_RC_OUT_RGB_SCALE_SCALE_BY_TWO_NV 0x00020000 +#define NV34TCL_RC_OUT_RGB_SCALE_SCALE_BY_FOUR_NV 0x00040000 +#define NV34TCL_RC_OUT_RGB_SCALE_SCALE_BY_ONE_HALF_NV 0x00060000 +#define NV34TCL_VIEWPORT_HORIZ 0x00000a00 +#define NV34TCL_VIEWPORT_HORIZ_X_SHIFT 0 +#define NV34TCL_VIEWPORT_HORIZ_X_MASK 0x0000ffff +#define NV34TCL_VIEWPORT_HORIZ_W_SHIFT 16 +#define NV34TCL_VIEWPORT_HORIZ_W_MASK 0xffff0000 +#define NV34TCL_VIEWPORT_VERT 0x00000a04 +#define NV34TCL_VIEWPORT_VERT_Y_SHIFT 0 +#define NV34TCL_VIEWPORT_VERT_Y_MASK 0x0000ffff +#define NV34TCL_VIEWPORT_VERT_H_SHIFT 16 +#define NV34TCL_VIEWPORT_VERT_H_MASK 0xffff0000 +#define NV34TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_R 0x00000a10 +#define NV34TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_G 0x00000a14 +#define NV34TCL_LIGHT_MODEL_FRONT_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_B 0x00000a18 +#define NV34TCL_VIEWPORT_TRANSLATE_X 0x00000a20 +#define NV34TCL_VIEWPORT_TRANSLATE_Y 0x00000a24 +#define NV34TCL_VIEWPORT_TRANSLATE_Z 0x00000a28 +#define NV34TCL_VIEWPORT_TRANSLATE_W 0x00000a2c +#define NV34TCL_VIEWPORT_SCALE_X 0x00000a30 +#define NV34TCL_VIEWPORT_SCALE_Y 0x00000a34 +#define NV34TCL_VIEWPORT_SCALE_Z 0x00000a38 +#define NV34TCL_VIEWPORT_SCALE_W 0x00000a3c +#define NV34TCL_POLYGON_OFFSET_POINT_ENABLE 0x00000a60 +#define NV34TCL_POLYGON_OFFSET_LINE_ENABLE 0x00000a64 +#define NV34TCL_POLYGON_OFFSET_FILL_ENABLE 0x00000a68 +#define NV34TCL_DEPTH_FUNC 0x00000a6c +#define NV34TCL_DEPTH_FUNC_NEVER 0x00000200 +#define NV34TCL_DEPTH_FUNC_LESS 0x00000201 +#define 
NV34TCL_DEPTH_FUNC_EQUAL 0x00000202 +#define NV34TCL_DEPTH_FUNC_LEQUAL 0x00000203 +#define NV34TCL_DEPTH_FUNC_GREATER 0x00000204 +#define NV34TCL_DEPTH_FUNC_GREATER 0x00000204 +#define NV34TCL_DEPTH_FUNC_NOTEQUAL 0x00000205 +#define NV34TCL_DEPTH_FUNC_GEQUAL 0x00000206 +#define NV34TCL_DEPTH_FUNC_ALWAYS 0x00000207 +#define NV34TCL_DEPTH_WRITE_ENABLE 0x00000a70 +#define NV34TCL_DEPTH_TEST_ENABLE 0x00000a74 +#define NV34TCL_POLYGON_OFFSET_FACTOR 0x00000a78 +#define NV34TCL_POLYGON_OFFSET_UNITS 0x00000a7c +#define NV34TCL_VTX_ATTR_3I_XY(x) (0x00000a80+((x)*8)) +#define NV34TCL_VTX_ATTR_3I_XY__SIZE 0x00000010 +#define NV34TCL_VTX_ATTR_3I_XY_X_SHIFT 0 +#define NV34TCL_VTX_ATTR_3I_XY_X_MASK 0x0000ffff +#define NV34TCL_VTX_ATTR_3I_XY_Y_SHIFT 16 +#define NV34TCL_VTX_ATTR_3I_XY_Y_MASK 0xffff0000 +#define NV34TCL_VTX_ATTR_3I_Z(x) (0x00000a84+((x)*8)) +#define NV34TCL_VTX_ATTR_3I_Z__SIZE 0x00000010 +#define NV34TCL_VTX_ATTR_3I_Z_Z_SHIFT 0 +#define NV34TCL_VTX_ATTR_3I_Z_Z_MASK 0x0000ffff +#define NV34TCL_VP_UPLOAD_INST(x) (0x00000b80+((x)*4)) +#define NV34TCL_VP_UPLOAD_INST__SIZE 0x00000004 +#define NV34TCL_TX0_CLIP_PLANE_A(x) (0x00000e00+((x)*16)) +#define NV34TCL_TX0_CLIP_PLANE_A__SIZE 0x00000004 +#define NV34TCL_TX0_CLIP_PLANE_B(x) (0x00000e04+((x)*16)) +#define NV34TCL_TX0_CLIP_PLANE_B__SIZE 0x00000004 +#define NV34TCL_TX0_CLIP_PLANE_C(x) (0x00000e08+((x)*16)) +#define NV34TCL_TX0_CLIP_PLANE_C__SIZE 0x00000004 +#define NV34TCL_TX0_CLIP_PLANE_D(x) (0x00000e0c+((x)*16)) +#define NV34TCL_TX0_CLIP_PLANE_D__SIZE 0x00000004 +#define NV34TCL_TX1_CLIP_PLANE_A(x) (0x00000e40+((x)*16)) +#define NV34TCL_TX1_CLIP_PLANE_A__SIZE 0x00000004 +#define NV34TCL_TX1_CLIP_PLANE_B(x) (0x00000e44+((x)*16)) +#define NV34TCL_TX1_CLIP_PLANE_B__SIZE 0x00000004 +#define NV34TCL_TX1_CLIP_PLANE_C(x) (0x00000e48+((x)*16)) +#define NV34TCL_TX1_CLIP_PLANE_C__SIZE 0x00000004 +#define NV34TCL_TX1_CLIP_PLANE_D(x) (0x00000e4c+((x)*16)) +#define NV34TCL_TX1_CLIP_PLANE_D__SIZE 0x00000004 +#define NV34TCL_TX2_CLIP_PLANE_A(x) (0x00000e80+((x)*16)) +#define NV34TCL_TX2_CLIP_PLANE_A__SIZE 0x00000004 +#define NV34TCL_TX2_CLIP_PLANE_B(x) (0x00000e84+((x)*16)) +#define NV34TCL_TX2_CLIP_PLANE_B__SIZE 0x00000004 +#define NV34TCL_TX2_CLIP_PLANE_C(x) (0x00000e88+((x)*16)) +#define NV34TCL_TX2_CLIP_PLANE_C__SIZE 0x00000004 +#define NV34TCL_TX2_CLIP_PLANE_D(x) (0x00000e8c+((x)*16)) +#define NV34TCL_TX2_CLIP_PLANE_D__SIZE 0x00000004 +#define NV34TCL_TX3_CLIP_PLANE_A(x) (0x00000ec0+((x)*16)) +#define NV34TCL_TX3_CLIP_PLANE_A__SIZE 0x00000004 +#define NV34TCL_TX3_CLIP_PLANE_B(x) (0x00000ec4+((x)*16)) +#define NV34TCL_TX3_CLIP_PLANE_B__SIZE 0x00000004 +#define NV34TCL_TX3_CLIP_PLANE_C(x) (0x00000ec8+((x)*16)) +#define NV34TCL_TX3_CLIP_PLANE_C__SIZE 0x00000004 +#define NV34TCL_TX3_CLIP_PLANE_D(x) (0x00000ecc+((x)*16)) +#define NV34TCL_TX3_CLIP_PLANE_D__SIZE 0x00000004 +#define NV34TCL_TX4_CLIP_PLANE_A(x) (0x00000f00+((x)*16)) +#define NV34TCL_TX4_CLIP_PLANE_A__SIZE 0x00000004 +#define NV34TCL_TX4_CLIP_PLANE_B(x) (0x00000f04+((x)*16)) +#define NV34TCL_TX4_CLIP_PLANE_B__SIZE 0x00000004 +#define NV34TCL_TX4_CLIP_PLANE_C(x) (0x00000f08+((x)*16)) +#define NV34TCL_TX4_CLIP_PLANE_C__SIZE 0x00000004 +#define NV34TCL_TX4_CLIP_PLANE_D(x) (0x00000f0c+((x)*16)) +#define NV34TCL_TX4_CLIP_PLANE_D__SIZE 0x00000004 +#define NV34TCL_TX5_CLIP_PLANE_A(x) (0x00000f40+((x)*16)) +#define NV34TCL_TX5_CLIP_PLANE_A__SIZE 0x00000004 +#define NV34TCL_TX5_CLIP_PLANE_B(x) (0x00000f44+((x)*16)) +#define NV34TCL_TX5_CLIP_PLANE_B__SIZE 0x00000004 +#define NV34TCL_TX5_CLIP_PLANE_C(x) 
(0x00000f48+((x)*16)) +#define NV34TCL_TX5_CLIP_PLANE_C__SIZE 0x00000004 +#define NV34TCL_TX5_CLIP_PLANE_D(x) (0x00000f4c+((x)*16)) +#define NV34TCL_TX5_CLIP_PLANE_D__SIZE 0x00000004 +#define NV34TCL_TX6_CLIP_PLANE_A(x) (0x00000f80+((x)*16)) +#define NV34TCL_TX6_CLIP_PLANE_A__SIZE 0x00000004 +#define NV34TCL_TX6_CLIP_PLANE_B(x) (0x00000f84+((x)*16)) +#define NV34TCL_TX6_CLIP_PLANE_B__SIZE 0x00000004 +#define NV34TCL_TX6_CLIP_PLANE_C(x) (0x00000f88+((x)*16)) +#define NV34TCL_TX6_CLIP_PLANE_C__SIZE 0x00000004 +#define NV34TCL_TX6_CLIP_PLANE_D(x) (0x00000f8c+((x)*16)) +#define NV34TCL_TX6_CLIP_PLANE_D__SIZE 0x00000004 +#define NV34TCL_TX7_CLIP_PLANE_A(x) (0x00000fc0+((x)*16)) +#define NV34TCL_TX7_CLIP_PLANE_A__SIZE 0x00000004 +#define NV34TCL_TX7_CLIP_PLANE_B(x) (0x00000fc4+((x)*16)) +#define NV34TCL_TX7_CLIP_PLANE_B__SIZE 0x00000004 +#define NV34TCL_TX7_CLIP_PLANE_C(x) (0x00000fc8+((x)*16)) +#define NV34TCL_TX7_CLIP_PLANE_C__SIZE 0x00000004 +#define NV34TCL_TX7_CLIP_PLANE_D(x) (0x00000fcc+((x)*16)) +#define NV34TCL_TX7_CLIP_PLANE_D__SIZE 0x00000004 +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_R(x) (0x00001000+((x)*64)) +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_R__SIZE 0x00000008 +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_G(x) (0x00001004+((x)*64)) +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_G__SIZE 0x00000008 +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_B(x) (0x00001008+((x)*64)) +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_AMBIENT_B__SIZE 0x00000008 +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_R(x) (0x0000100c+((x)*64)) +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_R__SIZE 0x00000008 +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_G(x) (0x00001010+((x)*64)) +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_G__SIZE 0x00000008 +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_B(x) (0x00001014+((x)*64)) +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_DIFFUSE_B__SIZE 0x00000008 +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_R(x) (0x00001018+((x)*64)) +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_R__SIZE 0x00000008 +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_G(x) (0x0000101c+((x)*64)) +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_G__SIZE 0x00000008 +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_B(x) (0x00001020+((x)*64)) +#define NV34TCL_LIGHT_FRONT_SIDE_PRODUCT_SPECULAR_B__SIZE 0x00000008 +#define NV34TCL_LIGHT_HALF_VECTOR_X(x) (0x00001028+((x)*64)) +#define NV34TCL_LIGHT_HALF_VECTOR_X__SIZE 0x00000008 +#define NV34TCL_LIGHT_HALF_VECTOR_Y(x) (0x0000102c+((x)*64)) +#define NV34TCL_LIGHT_HALF_VECTOR_Y__SIZE 0x00000008 +#define NV34TCL_LIGHT_HALF_VECTOR_Z(x) (0x00001030+((x)*64)) +#define NV34TCL_LIGHT_HALF_VECTOR_Z__SIZE 0x00000008 +#define NV34TCL_LIGHT_DIRECTION_X(x) (0x00001034+((x)*64)) +#define NV34TCL_LIGHT_DIRECTION_X__SIZE 0x00000008 +#define NV34TCL_LIGHT_DIRECTION_Y(x) (0x00001038+((x)*64)) +#define NV34TCL_LIGHT_DIRECTION_Y__SIZE 0x00000008 +#define NV34TCL_LIGHT_DIRECTION_Z(x) (0x0000103c+((x)*64)) +#define NV34TCL_LIGHT_DIRECTION_Z__SIZE 0x00000008 +#define NV34TCL_LIGHT_SPOT_CUTOFF_A(x) (0x00001200+((x)*64)) +#define NV34TCL_LIGHT_SPOT_CUTOFF_A__SIZE 0x00000008 +#define NV34TCL_LIGHT_SPOT_CUTOFF_B(x) (0x00001204+((x)*64)) +#define NV34TCL_LIGHT_SPOT_CUTOFF_B__SIZE 0x00000008 +#define NV34TCL_LIGHT_SPOT_CUTOFF_C(x) (0x00001208+((x)*64)) +#define NV34TCL_LIGHT_SPOT_CUTOFF_C__SIZE 0x00000008 +#define NV34TCL_LIGHT_SPOT_DIR_X(x) (0x0000120c+((x)*64)) +#define NV34TCL_LIGHT_SPOT_DIR_X__SIZE 0x00000008 +#define 
NV34TCL_LIGHT_SPOT_DIR_Y(x) (0x00001210+((x)*64)) +#define NV34TCL_LIGHT_SPOT_DIR_Y__SIZE 0x00000008 +#define NV34TCL_LIGHT_SPOT_DIR_Z(x) (0x00001214+((x)*64)) +#define NV34TCL_LIGHT_SPOT_DIR_Z__SIZE 0x00000008 +#define NV34TCL_LIGHT_SPOT_CUTOFF_D(x) (0x00001218+((x)*64)) +#define NV34TCL_LIGHT_SPOT_CUTOFF_D__SIZE 0x00000008 +#define NV34TCL_LIGHT_POSITION_X(x) (0x0000121c+((x)*64)) +#define NV34TCL_LIGHT_POSITION_X__SIZE 0x00000008 +#define NV34TCL_LIGHT_POSITION_Y(x) (0x00001220+((x)*64)) +#define NV34TCL_LIGHT_POSITION_Y__SIZE 0x00000008 +#define NV34TCL_LIGHT_POSITION_Z(x) (0x00001224+((x)*64)) +#define NV34TCL_LIGHT_POSITION_Z__SIZE 0x00000008 +#define NV34TCL_LIGHT_ATTENUATION_CONSTANT(x) (0x00001228+((x)*64)) +#define NV34TCL_LIGHT_ATTENUATION_CONSTANT__SIZE 0x00000008 +#define NV34TCL_LIGHT_ATTENUATION_LINEAR(x) (0x0000122c+((x)*64)) +#define NV34TCL_LIGHT_ATTENUATION_LINEAR__SIZE 0x00000008 +#define NV34TCL_LIGHT_ATTENUATION_QUADRATIC(x) (0x00001230+((x)*64)) +#define NV34TCL_LIGHT_ATTENUATION_QUADRATIC__SIZE 0x00000008 +#define NV34TCL_FRONT_MATERIAL_SHININESS(x) (0x00001400+((x)*4)) +#define NV34TCL_FRONT_MATERIAL_SHININESS__SIZE 0x00000006 +#define NV34TCL_ENABLED_LIGHTS 0x00001420 +#define NV34TCL_FP_REG_CONTROL 0x00001450 +#define NV34TCL_FP_REG_CONTROL_UNK1_SHIFT 16 +#define NV34TCL_FP_REG_CONTROL_UNK1_MASK 0xffff0000 +#define NV34TCL_FP_REG_CONTROL_UNK0_SHIFT 0 +#define NV34TCL_FP_REG_CONTROL_UNK0_MASK 0x0000ffff +#define NV34TCL_VP_CLIP_PLANES_ENABLE 0x00001478 +#define NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 (1 << 1) +#define NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 (1 << 5) +#define NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE2 (1 << 9) +#define NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE3 (1 << 13) +#define NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE4 (1 << 17) +#define NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE5 (1 << 21) +#define NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE6 (1 << 25) +#define NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE7 (1 << 29) +#define NV34TCL_POLYGON_STIPPLE_ENABLE 0x0000147c +#define NV34TCL_POLYGON_STIPPLE_PATTERN(x) (0x00001480+((x)*4)) +#define NV34TCL_POLYGON_STIPPLE_PATTERN__SIZE 0x00000020 +#define NV34TCL_VTX_ATTR_3F_X(x) (0x00001500+((x)*16)) +#define NV34TCL_VTX_ATTR_3F_X__SIZE 0x00000010 +#define NV34TCL_VTX_ATTR_3F_Y(x) (0x00001504+((x)*16)) +#define NV34TCL_VTX_ATTR_3F_Y__SIZE 0x00000010 +#define NV34TCL_VTX_ATTR_3F_Z(x) (0x00001508+((x)*16)) +#define NV34TCL_VTX_ATTR_3F_Z__SIZE 0x00000010 +#define NV34TCL_VP_CLIP_PLANE_A(x) (0x00001600+((x)*16)) +#define NV34TCL_VP_CLIP_PLANE_A__SIZE 0x00000006 +#define NV34TCL_VP_CLIP_PLANE_B(x) (0x00001604+((x)*16)) +#define NV34TCL_VP_CLIP_PLANE_B__SIZE 0x00000006 +#define NV34TCL_VP_CLIP_PLANE_C(x) (0x00001608+((x)*16)) +#define NV34TCL_VP_CLIP_PLANE_C__SIZE 0x00000006 +#define NV34TCL_VP_CLIP_PLANE_D(x) (0x0000160c+((x)*16)) +#define NV34TCL_VP_CLIP_PLANE_D__SIZE 0x00000006 +#define NV34TCL_VTXBUF_ADDRESS(x) (0x00001680+((x)*4)) +#define NV34TCL_VTXBUF_ADDRESS__SIZE 0x00000010 +#define NV34TCL_VTXBUF_ADDRESS_DMA1 (1 << 31) +#define NV34TCL_VTXBUF_ADDRESS_OFFSET_SHIFT 0 +#define NV34TCL_VTXBUF_ADDRESS_OFFSET_MASK 0x0fffffff +#define NV34TCL_VTXFMT(x) (0x00001740+((x)*4)) +#define NV34TCL_VTXFMT__SIZE 0x00000010 +#define NV34TCL_VTXFMT_TYPE_SHIFT 0 +#define NV34TCL_VTXFMT_TYPE_MASK 0x0000000f +#define NV34TCL_VTXFMT_TYPE_FLOAT 0x00000002 +#define NV34TCL_VTXFMT_TYPE_UBYTE 0x00000004 +#define NV34TCL_VTXFMT_TYPE_USHORT 0x00000005 +#define NV34TCL_VTXFMT_SIZE_SHIFT 4 +#define NV34TCL_VTXFMT_SIZE_MASK 0x000000f0 +#define NV34TCL_VTXFMT_STRIDE_SHIFT 8 
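(Editorial sketch, not part of the patch: the parameterized NV34TCL_*(x) macros above map per-unit state such as lights, texture units and clip planes onto a base method plus a fixed stride, with the matching __SIZE define giving the unit count, while enable registers like VP_CLIP_PLANES_ENABLE use plain (1 << n) flags. The macros and bits below are copied from the listing; the program only prints the expanded offsets and is not driver code.)

#include <stdint.h>
#include <stdio.h>

#define NV34TCL_VP_CLIP_PLANE_A(x)           (0x00001600+((x)*16))
#define NV34TCL_VP_CLIP_PLANE_A__SIZE        0x00000006
#define NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 (1 << 1)
#define NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1 (1 << 5)

int main(void)
{
   int i;
   uint32_t enable;

   /* The A coefficient of clip plane i lives at 0x1600 + i*16: 0x1600, 0x1610, ... */
   for (i = 0; i < NV34TCL_VP_CLIP_PLANE_A__SIZE; i++)
      printf("clip plane %d: A coefficient method 0x%04x\n",
             i, (unsigned)NV34TCL_VP_CLIP_PLANE_A(i));

   /* Enable bits are single flags and can simply be OR'd together. */
   enable = NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE0 |
            NV34TCL_VP_CLIP_PLANES_ENABLE_PLANE1;
   printf("VP_CLIP_PLANES_ENABLE data = 0x%08x\n", (unsigned)enable);  /* 0x00000022 */
   return 0;
}
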
+#define NV34TCL_VTXFMT_STRIDE_MASK 0x0000ff00 +#define NV34TCL_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_R 0x000017a0 +#define NV34TCL_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_G 0x000017a4 +#define NV34TCL_LIGHT_MODEL_BACK_SIDE_PRODUCT_AMBIENT_PLUS_EMISSION_B 0x000017a8 +#define NV34TCL_COLOR_MATERIAL_BACK_R 0x000017b0 +#define NV34TCL_COLOR_MATERIAL_BACK_G 0x000017b4 +#define NV34TCL_COLOR_MATERIAL_BACK_B 0x000017b8 +#define NV34TCL_COLOR_MATERIAL_BACK_A 0x000017c0 +#define NV34TCL_QUERY_RESET 0x000017c8 +#define NV34TCL_QUERY_UNK17CC 0x000017cc +#define NV34TCL_QUERY_GET 0x00001800 +#define NV34TCL_QUERY_GET_UNK24_SHIFT 24 +#define NV34TCL_QUERY_GET_UNK24_MASK 0xff000000 +#define NV34TCL_QUERY_GET_OFFSET_SHIFT 0 +#define NV34TCL_QUERY_GET_OFFSET_MASK 0x00ffffff +#define NV34TCL_VERTEX_BEGIN_END 0x00001808 +#define NV34TCL_VERTEX_BEGIN_END_STOP 0x00000000 +#define NV34TCL_VERTEX_BEGIN_END_POINTS 0x00000001 +#define NV34TCL_VERTEX_BEGIN_END_LINES 0x00000002 +#define NV34TCL_VERTEX_BEGIN_END_LINE_LOOP 0x00000003 +#define NV34TCL_VERTEX_BEGIN_END_LINE_STRIP 0x00000004 +#define NV34TCL_VERTEX_BEGIN_END_TRIANGLES 0x00000005 +#define NV34TCL_VERTEX_BEGIN_END_TRIANGLE_STRIP 0x00000006 +#define NV34TCL_VERTEX_BEGIN_END_TRIANGLE_FAN 0x00000007 +#define NV34TCL_VERTEX_BEGIN_END_QUADS 0x00000008 +#define NV34TCL_VERTEX_BEGIN_END_QUAD_STRIP 0x00000009 +#define NV34TCL_VERTEX_BEGIN_END_POLYGON 0x0000000a +#define NV34TCL_VB_ELEMENT_U16 0x0000180c +#define NV34TCL_VB_ELEMENT_U16_I0_SHIFT 0 +#define NV34TCL_VB_ELEMENT_U16_I0_MASK 0x0000ffff +#define NV34TCL_VB_ELEMENT_U16_I1_SHIFT 16 +#define NV34TCL_VB_ELEMENT_U16_I1_MASK 0xffff0000 +#define NV34TCL_VB_ELEMENT_U32 0x00001810 +#define NV34TCL_VB_VERTEX_BATCH 0x00001814 +#define NV34TCL_VB_VERTEX_BATCH_OFFSET_SHIFT 0 +#define NV34TCL_VB_VERTEX_BATCH_OFFSET_MASK 0x00ffffff +#define NV34TCL_VB_VERTEX_BATCH_COUNT_SHIFT 24 +#define NV34TCL_VB_VERTEX_BATCH_COUNT_MASK 0xff000000 +#define NV34TCL_VERTEX_DATA 0x00001818 +#define NV34TCL_IDXBUF_ADDRESS 0x0000181c +#define NV34TCL_IDXBUF_FORMAT 0x00001820 +#define NV34TCL_IDXBUF_FORMAT_TYPE_SHIFT 4 +#define NV34TCL_IDXBUF_FORMAT_TYPE_MASK 0x000000f0 +#define NV34TCL_IDXBUF_FORMAT_TYPE_U32 0x00000000 +#define NV34TCL_IDXBUF_FORMAT_TYPE_U16 0x00000010 +#define NV34TCL_IDXBUF_FORMAT_DMA1 (1 << 0) +#define NV34TCL_VB_INDEX_BATCH 0x00001824 +#define NV34TCL_VB_INDEX_BATCH_COUNT_SHIFT 24 +#define NV34TCL_VB_INDEX_BATCH_COUNT_MASK 0xff000000 +#define NV34TCL_VB_INDEX_BATCH_START_SHIFT 0 +#define NV34TCL_VB_INDEX_BATCH_START_MASK 0x00ffffff +#define NV34TCL_POLYGON_MODE_FRONT 0x00001828 +#define NV34TCL_POLYGON_MODE_FRONT_POINT 0x00001b00 +#define NV34TCL_POLYGON_MODE_FRONT_LINE 0x00001b01 +#define NV34TCL_POLYGON_MODE_FRONT_FILL 0x00001b02 +#define NV34TCL_POLYGON_MODE_BACK 0x0000182c +#define NV34TCL_POLYGON_MODE_BACK_POINT 0x00001b00 +#define NV34TCL_POLYGON_MODE_BACK_LINE 0x00001b01 +#define NV34TCL_POLYGON_MODE_BACK_FILL 0x00001b02 +#define NV34TCL_CULL_FACE 0x00001830 +#define NV34TCL_CULL_FACE_FRONT 0x00000404 +#define NV34TCL_CULL_FACE_BACK 0x00000405 +#define NV34TCL_CULL_FACE_FRONT_AND_BACK 0x00000408 +#define NV34TCL_FRONT_FACE 0x00001834 +#define NV34TCL_FRONT_FACE_CW 0x00000900 +#define NV34TCL_FRONT_FACE_CCW 0x00000901 +#define NV34TCL_POLYGON_SMOOTH_ENABLE 0x00001838 +#define NV34TCL_CULL_FACE_ENABLE 0x0000183c +#define NV34TCL_TX_PALETTE_OFFSET(x) (0x00001840+((x)*4)) +#define NV34TCL_TX_PALETTE_OFFSET__SIZE 0x00000004 +#define NV34TCL_VTX_ATTR_2F_X(x) (0x00001880+((x)*8)) +#define 
NV34TCL_VTX_ATTR_2F_X__SIZE 0x00000010 +#define NV34TCL_VTX_ATTR_2F_Y(x) (0x00001884+((x)*8)) +#define NV34TCL_VTX_ATTR_2F_Y__SIZE 0x00000010 +#define NV34TCL_VTX_ATTR_2I(x) (0x00001900+((x)*4)) +#define NV34TCL_VTX_ATTR_2I__SIZE 0x00000010 +#define NV34TCL_VTX_ATTR_2I_X_SHIFT 0 +#define NV34TCL_VTX_ATTR_2I_X_MASK 0x0000ffff +#define NV34TCL_VTX_ATTR_2I_Y_SHIFT 16 +#define NV34TCL_VTX_ATTR_2I_Y_MASK 0xffff0000 +#define NV34TCL_VTX_ATTR_4UB(x) (0x00001940+((x)*4)) +#define NV34TCL_VTX_ATTR_4UB__SIZE 0x00000010 +#define NV34TCL_VTX_ATTR_4UB_X_SHIFT 0 +#define NV34TCL_VTX_ATTR_4UB_X_MASK 0x000000ff +#define NV34TCL_VTX_ATTR_4UB_Y_SHIFT 8 +#define NV34TCL_VTX_ATTR_4UB_Y_MASK 0x0000ff00 +#define NV34TCL_VTX_ATTR_4UB_Z_SHIFT 16 +#define NV34TCL_VTX_ATTR_4UB_Z_MASK 0x00ff0000 +#define NV34TCL_VTX_ATTR_4UB_W_SHIFT 24 +#define NV34TCL_VTX_ATTR_4UB_W_MASK 0xff000000 +#define NV34TCL_VTX_ATTR_4I_XY(x) (0x00001980+((x)*8)) +#define NV34TCL_VTX_ATTR_4I_XY__SIZE 0x00000010 +#define NV34TCL_VTX_ATTR_4I_XY_X_SHIFT 0 +#define NV34TCL_VTX_ATTR_4I_XY_X_MASK 0x0000ffff +#define NV34TCL_VTX_ATTR_4I_XY_Y_SHIFT 16 +#define NV34TCL_VTX_ATTR_4I_XY_Y_MASK 0xffff0000 +#define NV34TCL_VTX_ATTR_4I_ZW(x) (0x00001984+((x)*8)) +#define NV34TCL_VTX_ATTR_4I_ZW__SIZE 0x00000010 +#define NV34TCL_VTX_ATTR_4I_ZW_Z_SHIFT 0 +#define NV34TCL_VTX_ATTR_4I_ZW_Z_MASK 0x0000ffff +#define NV34TCL_VTX_ATTR_4I_ZW_W_SHIFT 16 +#define NV34TCL_VTX_ATTR_4I_ZW_W_MASK 0xffff0000 +#define NV34TCL_TX_OFFSET(x) (0x00001a00+((x)*32)) +#define NV34TCL_TX_OFFSET__SIZE 0x00000004 +#define NV34TCL_TX_FORMAT(x) (0x00001a04+((x)*32)) +#define NV34TCL_TX_FORMAT__SIZE 0x00000004 +#define NV34TCL_TX_FORMAT_DMA0 (1 << 0) +#define NV34TCL_TX_FORMAT_DMA1 (1 << 1) +#define NV34TCL_TX_FORMAT_CUBIC (1 << 2) +#define NV34TCL_TX_FORMAT_NO_BORDER (1 << 3) +#define NV34TCL_TX_FORMAT_DIMS_SHIFT 4 +#define NV34TCL_TX_FORMAT_DIMS_MASK 0x000000f0 +#define NV34TCL_TX_FORMAT_DIMS_1D 0x00000010 +#define NV34TCL_TX_FORMAT_DIMS_2D 0x00000020 +#define NV34TCL_TX_FORMAT_DIMS_3D 0x00000030 +#define NV34TCL_TX_FORMAT_FORMAT_SHIFT 8 +#define NV34TCL_TX_FORMAT_FORMAT_MASK 0x0000ff00 +#define NV34TCL_TX_FORMAT_FORMAT_L8 0x00000000 +#define NV34TCL_TX_FORMAT_FORMAT_A8 0x00000100 +#define NV34TCL_TX_FORMAT_FORMAT_A1R5G5B5 0x00000200 +#define NV34TCL_TX_FORMAT_FORMAT_A8_RECT 0x00000300 +#define NV34TCL_TX_FORMAT_FORMAT_A4R4G4B4 0x00000400 +#define NV34TCL_TX_FORMAT_FORMAT_R5G6B5 0x00000500 +#define NV34TCL_TX_FORMAT_FORMAT_A8R8G8B8 0x00000600 +#define NV34TCL_TX_FORMAT_FORMAT_X8R8G8B8 0x00000700 +#define NV34TCL_TX_FORMAT_FORMAT_INDEX8 0x00000b00 +#define NV34TCL_TX_FORMAT_FORMAT_DXT1 0x00000c00 +#define NV34TCL_TX_FORMAT_FORMAT_DXT3 0x00000e00 +#define NV34TCL_TX_FORMAT_FORMAT_DXT5 0x00000f00 +#define NV34TCL_TX_FORMAT_FORMAT_A1R5G5B5_RECT 0x00001000 +#define NV34TCL_TX_FORMAT_FORMAT_R5G6B5_RECT 0x00001100 +#define NV34TCL_TX_FORMAT_FORMAT_A8R8G8B8_RECT 0x00001200 +#define NV34TCL_TX_FORMAT_FORMAT_L8_RECT 0x00001300 +#define NV34TCL_TX_FORMAT_FORMAT_A8L8 0x00001a00 +#define NV34TCL_TX_FORMAT_FORMAT_A8_RECT2 0x00001b00 +#define NV34TCL_TX_FORMAT_FORMAT_A4R4G4B4_RECT 0x00001d00 +#define NV34TCL_TX_FORMAT_FORMAT_R8G8B8_RECT 0x00001e00 +#define NV34TCL_TX_FORMAT_FORMAT_L8A8_RECT 0x00002000 +#define NV34TCL_TX_FORMAT_FORMAT_DSDT 0x00002800 +#define NV34TCL_TX_FORMAT_FORMAT_A16 0x00003200 +#define NV34TCL_TX_FORMAT_FORMAT_HILO16 0x00003300 +#define NV34TCL_TX_FORMAT_FORMAT_A16_RECT 0x00003500 +#define NV34TCL_TX_FORMAT_FORMAT_HILO16_RECT 0x00003600 +#define NV34TCL_TX_FORMAT_FORMAT_HILO8 
0x00004400 +#define NV34TCL_TX_FORMAT_FORMAT_SIGNED_HILO8 0x00004500 +#define NV34TCL_TX_FORMAT_FORMAT_HILO8_RECT 0x00004600 +#define NV34TCL_TX_FORMAT_FORMAT_SIGNED_HILO8_RECT 0x00004700 +#define NV34TCL_TX_FORMAT_FORMAT_FLOAT_RGBA16_NV 0x00004a00 +#define NV34TCL_TX_FORMAT_FORMAT_FLOAT_RGBA32_NV 0x00004b00 +#define NV34TCL_TX_FORMAT_FORMAT_FLOAT_R32_NV 0x00004c00 +#define NV34TCL_TX_FORMAT_MIPMAP (1 << 19) +#define NV34TCL_TX_FORMAT_BASE_SIZE_U_SHIFT 20 +#define NV34TCL_TX_FORMAT_BASE_SIZE_U_MASK 0x00f00000 +#define NV34TCL_TX_FORMAT_BASE_SIZE_V_SHIFT 24 +#define NV34TCL_TX_FORMAT_BASE_SIZE_V_MASK 0x0f000000 +#define NV34TCL_TX_FORMAT_BASE_SIZE_W_SHIFT 28 +#define NV34TCL_TX_FORMAT_BASE_SIZE_W_MASK 0xf0000000 +#define NV34TCL_TX_WRAP(x) (0x00001a08+((x)*32)) +#define NV34TCL_TX_WRAP__SIZE 0x00000004 +#define NV34TCL_TX_WRAP_S_SHIFT 0 +#define NV34TCL_TX_WRAP_S_MASK 0x000000ff +#define NV34TCL_TX_WRAP_S_REPEAT 0x00000001 +#define NV34TCL_TX_WRAP_S_MIRRORED_REPEAT 0x00000002 +#define NV34TCL_TX_WRAP_S_CLAMP_TO_EDGE 0x00000003 +#define NV34TCL_TX_WRAP_S_CLAMP_TO_BORDER 0x00000004 +#define NV34TCL_TX_WRAP_S_CLAMP 0x00000005 +#define NV34TCL_TX_WRAP_T_SHIFT 8 +#define NV34TCL_TX_WRAP_T_MASK 0x00000f00 +#define NV34TCL_TX_WRAP_T_REPEAT 0x00000100 +#define NV34TCL_TX_WRAP_T_MIRRORED_REPEAT 0x00000200 +#define NV34TCL_TX_WRAP_T_CLAMP_TO_EDGE 0x00000300 +#define NV34TCL_TX_WRAP_T_CLAMP_TO_BORDER 0x00000400 +#define NV34TCL_TX_WRAP_T_CLAMP 0x00000500 +#define NV34TCL_TX_WRAP_EXPAND_NORMAL_SHIFT 12 +#define NV34TCL_TX_WRAP_EXPAND_NORMAL_MASK 0x0000f000 +#define NV34TCL_TX_WRAP_R_SHIFT 16 +#define NV34TCL_TX_WRAP_R_MASK 0x000f0000 +#define NV34TCL_TX_WRAP_R_REPEAT 0x00010000 +#define NV34TCL_TX_WRAP_R_MIRRORED_REPEAT 0x00020000 +#define NV34TCL_TX_WRAP_R_CLAMP_TO_EDGE 0x00030000 +#define NV34TCL_TX_WRAP_R_CLAMP_TO_BORDER 0x00040000 +#define NV34TCL_TX_WRAP_R_CLAMP 0x00050000 +#define NV34TCL_TX_WRAP_RCOMP_SHIFT 28 +#define NV34TCL_TX_WRAP_RCOMP_MASK 0xf0000000 +#define NV34TCL_TX_WRAP_RCOMP_NEVER 0x00000000 +#define NV34TCL_TX_WRAP_RCOMP_GREATER 0x10000000 +#define NV34TCL_TX_WRAP_RCOMP_EQUAL 0x20000000 +#define NV34TCL_TX_WRAP_RCOMP_GEQUAL 0x30000000 +#define NV34TCL_TX_WRAP_RCOMP_LESS 0x40000000 +#define NV34TCL_TX_WRAP_RCOMP_NOTEQUAL 0x50000000 +#define NV34TCL_TX_WRAP_RCOMP_LEQUAL 0x60000000 +#define NV34TCL_TX_WRAP_RCOMP_ALWAYS 0x70000000 +#define NV34TCL_TX_ENABLE(x) (0x00001a0c+((x)*32)) +#define NV34TCL_TX_ENABLE__SIZE 0x00000004 +#define NV34TCL_TX_ENABLE_ANISO_SHIFT 4 +#define NV34TCL_TX_ENABLE_ANISO_MASK 0x00000030 +#define NV34TCL_TX_ENABLE_ANISO_NONE 0x00000000 +#define NV34TCL_TX_ENABLE_ANISO_2X 0x00000010 +#define NV34TCL_TX_ENABLE_ANISO_4X 0x00000020 +#define NV34TCL_TX_ENABLE_ANISO_8X 0x00000030 +#define NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT 14 +#define NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_MASK 0x0003c000 +#define NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT 26 +#define NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_MASK 0x3c000000 +#define NV34TCL_TX_ENABLE_ENABLE (1 << 30) +#define NV34TCL_TX_SWIZZLE(x) (0x00001a10+((x)*32)) +#define NV34TCL_TX_SWIZZLE__SIZE 0x00000004 +#define NV34TCL_TX_SWIZZLE_S0_X_SHIFT 14 +#define NV34TCL_TX_SWIZZLE_S0_X_MASK 0x0000c000 +#define NV34TCL_TX_SWIZZLE_S0_X_ZERO 0x00000000 +#define NV34TCL_TX_SWIZZLE_S0_X_ONE 0x00004000 +#define NV34TCL_TX_SWIZZLE_S0_X_S1 0x00008000 +#define NV34TCL_TX_SWIZZLE_S0_Y_SHIFT 12 +#define NV34TCL_TX_SWIZZLE_S0_Y_MASK 0x00003000 +#define NV34TCL_TX_SWIZZLE_S0_Y_ZERO 0x00000000 +#define NV34TCL_TX_SWIZZLE_S0_Y_ONE 0x00001000 +#define 
NV34TCL_TX_SWIZZLE_S0_Y_S1 0x00002000 +#define NV34TCL_TX_SWIZZLE_S0_Z_SHIFT 10 +#define NV34TCL_TX_SWIZZLE_S0_Z_MASK 0x00000c00 +#define NV34TCL_TX_SWIZZLE_S0_Z_ZERO 0x00000000 +#define NV34TCL_TX_SWIZZLE_S0_Z_ONE 0x00000400 +#define NV34TCL_TX_SWIZZLE_S0_Z_S1 0x00000800 +#define NV34TCL_TX_SWIZZLE_S0_W_SHIFT 8 +#define NV34TCL_TX_SWIZZLE_S0_W_MASK 0x00000300 +#define NV34TCL_TX_SWIZZLE_S0_W_ZERO 0x00000000 +#define NV34TCL_TX_SWIZZLE_S0_W_ONE 0x00000100 +#define NV34TCL_TX_SWIZZLE_S0_W_S1 0x00000200 +#define NV34TCL_TX_SWIZZLE_S1_X_SHIFT 6 +#define NV34TCL_TX_SWIZZLE_S1_X_MASK 0x000000c0 +#define NV34TCL_TX_SWIZZLE_S1_X_W 0x00000000 +#define NV34TCL_TX_SWIZZLE_S1_X_Z 0x00000040 +#define NV34TCL_TX_SWIZZLE_S1_X_Y 0x00000080 +#define NV34TCL_TX_SWIZZLE_S1_X_X 0x000000c0 +#define NV34TCL_TX_SWIZZLE_S1_Y_SHIFT 4 +#define NV34TCL_TX_SWIZZLE_S1_Y_MASK 0x00000030 +#define NV34TCL_TX_SWIZZLE_S1_Y_W 0x00000000 +#define NV34TCL_TX_SWIZZLE_S1_Y_Z 0x00000010 +#define NV34TCL_TX_SWIZZLE_S1_Y_Y 0x00000020 +#define NV34TCL_TX_SWIZZLE_S1_Y_X 0x00000030 +#define NV34TCL_TX_SWIZZLE_S1_Z_SHIFT 2 +#define NV34TCL_TX_SWIZZLE_S1_Z_MASK 0x0000000c +#define NV34TCL_TX_SWIZZLE_S1_Z_W 0x00000000 +#define NV34TCL_TX_SWIZZLE_S1_Z_Z 0x00000004 +#define NV34TCL_TX_SWIZZLE_S1_Z_Y 0x00000008 +#define NV34TCL_TX_SWIZZLE_S1_Z_X 0x0000000c +#define NV34TCL_TX_SWIZZLE_S1_W_SHIFT 0 +#define NV34TCL_TX_SWIZZLE_S1_W_MASK 0x00000003 +#define NV34TCL_TX_SWIZZLE_S1_W_W 0x00000000 +#define NV34TCL_TX_SWIZZLE_S1_W_Z 0x00000001 +#define NV34TCL_TX_SWIZZLE_S1_W_Y 0x00000002 +#define NV34TCL_TX_SWIZZLE_S1_W_X 0x00000003 +#define NV34TCL_TX_SWIZZLE_RECT_PITCH_SHIFT 16 +#define NV34TCL_TX_SWIZZLE_RECT_PITCH_MASK 0xffff0000 +#define NV34TCL_TX_FILTER(x) (0x00001a14+((x)*32)) +#define NV34TCL_TX_FILTER__SIZE 0x00000004 +#define NV34TCL_TX_FILTER_LOD_BIAS_SHIFT 8 +#define NV34TCL_TX_FILTER_LOD_BIAS_MASK 0x00000f00 +#define NV34TCL_TX_FILTER_MINIFY_SHIFT 16 +#define NV34TCL_TX_FILTER_MINIFY_MASK 0x000f0000 +#define NV34TCL_TX_FILTER_MINIFY_NEAREST 0x00010000 +#define NV34TCL_TX_FILTER_MINIFY_LINEAR 0x00020000 +#define NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST 0x00030000 +#define NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST 0x00040000 +#define NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR 0x00050000 +#define NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR 0x00060000 +#define NV34TCL_TX_FILTER_MAGNIFY_SHIFT 24 +#define NV34TCL_TX_FILTER_MAGNIFY_MASK 0x0f000000 +#define NV34TCL_TX_FILTER_MAGNIFY_NEAREST 0x01000000 +#define NV34TCL_TX_FILTER_MAGNIFY_LINEAR 0x02000000 +#define NV34TCL_TX_FILTER_SIGNED_BLUE (1 << 28) +#define NV34TCL_TX_FILTER_SIGNED_GREEN (1 << 29) +#define NV34TCL_TX_FILTER_SIGNED_RED (1 << 30) +#define NV34TCL_TX_FILTER_SIGNED_ALPHA (1 << 31) +#define NV34TCL_TX_NPOT_SIZE(x) (0x00001a18+((x)*32)) +#define NV34TCL_TX_NPOT_SIZE__SIZE 0x00000004 +#define NV34TCL_TX_NPOT_SIZE_H_SHIFT 0 +#define NV34TCL_TX_NPOT_SIZE_H_MASK 0x0000ffff +#define NV34TCL_TX_NPOT_SIZE_W_SHIFT 16 +#define NV34TCL_TX_NPOT_SIZE_W_MASK 0xffff0000 +#define NV34TCL_TX_BORDER_COLOR(x) (0x00001a1c+((x)*32)) +#define NV34TCL_TX_BORDER_COLOR__SIZE 0x00000004 +#define NV34TCL_TX_BORDER_COLOR_B_SHIFT 0 +#define NV34TCL_TX_BORDER_COLOR_B_MASK 0x000000ff +#define NV34TCL_TX_BORDER_COLOR_G_SHIFT 8 +#define NV34TCL_TX_BORDER_COLOR_G_MASK 0x0000ff00 +#define NV34TCL_TX_BORDER_COLOR_R_SHIFT 16 +#define NV34TCL_TX_BORDER_COLOR_R_MASK 0x00ff0000 +#define NV34TCL_TX_BORDER_COLOR_A_SHIFT 24 +#define NV34TCL_TX_BORDER_COLOR_A_MASK 0xff000000 +#define 
NV34TCL_VTX_ATTR_4F_X(x) (0x00001c00+((x)*16)) +#define NV34TCL_VTX_ATTR_4F_X__SIZE 0x00000010 +#define NV34TCL_VTX_ATTR_4F_Y(x) (0x00001c04+((x)*16)) +#define NV34TCL_VTX_ATTR_4F_Y__SIZE 0x00000010 +#define NV34TCL_VTX_ATTR_4F_Z(x) (0x00001c08+((x)*16)) +#define NV34TCL_VTX_ATTR_4F_Z__SIZE 0x00000010 +#define NV34TCL_VTX_ATTR_4F_W(x) (0x00001c0c+((x)*16)) +#define NV34TCL_VTX_ATTR_4F_W__SIZE 0x00000010 +#define NV34TCL_FP_CONTROL 0x00001d60 +#define NV34TCL_FP_CONTROL_USES_KIL (1 << 7) +#define NV34TCL_FP_CONTROL_USED_REGS_MINUS1_DIV2_SHIFT 0 +#define NV34TCL_FP_CONTROL_USED_REGS_MINUS1_DIV2_MASK 0x0000000f +#define NV34TCL_DEPTH_UNK17D8 0x00001d78 +#define NV34TCL_DEPTH_UNK17D8_CLAMP_SHIFT 4 +#define NV34TCL_DEPTH_UNK17D8_CLAMP_MASK 0x000000f0 +#define NV34TCL_MULTISAMPLE_CONTROL 0x00001d7c +#define NV34TCL_MULTISAMPLE_CONTROL_ENABLE (1 << 0) +#define NV34TCL_MULTISAMPLE_CONTROL_SAMPLE_ALPHA_TO_COVERAGE (1 << 4) +#define NV34TCL_MULTISAMPLE_CONTROL_SAMPLE_ALPHA_TO_ONE (1 << 8) +#define NV34TCL_MULTISAMPLE_CONTROL_SAMPLE_COVERAGE_SHIFT 16 +#define NV34TCL_MULTISAMPLE_CONTROL_SAMPLE_COVERAGE_MASK 0xffff0000 +#define NV34TCL_CLEAR_DEPTH_VALUE 0x00001d8c +#define NV34TCL_CLEAR_COLOR_VALUE 0x00001d90 +#define NV34TCL_CLEAR_COLOR_VALUE_B_SHIFT 0 +#define NV34TCL_CLEAR_COLOR_VALUE_B_MASK 0x000000ff +#define NV34TCL_CLEAR_COLOR_VALUE_G_SHIFT 8 +#define NV34TCL_CLEAR_COLOR_VALUE_G_MASK 0x0000ff00 +#define NV34TCL_CLEAR_COLOR_VALUE_R_SHIFT 16 +#define NV34TCL_CLEAR_COLOR_VALUE_R_MASK 0x00ff0000 +#define NV34TCL_CLEAR_COLOR_VALUE_A_SHIFT 24 +#define NV34TCL_CLEAR_COLOR_VALUE_A_MASK 0xff000000 +#define NV34TCL_CLEAR_BUFFERS 0x00001d94 +#define NV34TCL_CLEAR_BUFFERS_COLOR_A (1 << 7) +#define NV34TCL_CLEAR_BUFFERS_COLOR_B (1 << 6) +#define NV34TCL_CLEAR_BUFFERS_COLOR_G (1 << 5) +#define NV34TCL_CLEAR_BUFFERS_COLOR_R (1 << 4) +#define NV34TCL_CLEAR_BUFFERS_STENCIL (1 << 1) +#define NV34TCL_CLEAR_BUFFERS_DEPTH (1 << 0) +#define NV34TCL_DO_VERTICES 0x00001dac +#define NV34TCL_LINE_STIPPLE_ENABLE 0x00001db4 +#define NV34TCL_LINE_STIPPLE_PATTERN 0x00001db8 +#define NV34TCL_LINE_STIPPLE_PATTERN_FACTOR_SHIFT 0 +#define NV34TCL_LINE_STIPPLE_PATTERN_FACTOR_MASK 0x0000ffff +#define NV34TCL_LINE_STIPPLE_PATTERN_PATTERN_SHIFT 16 +#define NV34TCL_LINE_STIPPLE_PATTERN_PATTERN_MASK 0xffff0000 +#define NV34TCL_BACK_MATERIAL_SHININESS(x) (0x00001e20+((x)*4)) +#define NV34TCL_BACK_MATERIAL_SHININESS__SIZE 0x00000006 +#define NV34TCL_VTX_ATTR_1F(x) (0x00001e40+((x)*4)) +#define NV34TCL_VTX_ATTR_1F__SIZE 0x00000010 +#define NV34TCL_ENGINE 0x00001e94 +#define NV34TCL_ENGINE_FP (1 << 0) +#define NV34TCL_ENGINE_VP (1 << 1) +#define NV34TCL_ENGINE_FIXED (1 << 2) +#define NV34TCL_VP_UPLOAD_FROM_ID 0x00001e9c +#define NV34TCL_VP_START_FROM_ID 0x00001ea0 +#define NV34TCL_POINT_PARAMETERS(x) (0x00001ec0+((x)*4)) +#define NV34TCL_POINT_PARAMETERS__SIZE 0x00000008 +#define NV34TCL_POINT_SIZE 0x00001ee0 +#define NV34TCL_POINT_PARAMETERS_ENABLE 0x00001ee4 +#define NV34TCL_POINT_SPRITE 0x00001ee8 +#define NV34TCL_POINT_SPRITE_ENABLE (1 << 0) +#define NV34TCL_POINT_SPRITE_R_MODE_SHIFT 1 +#define NV34TCL_POINT_SPRITE_R_MODE_MASK 0x00000006 +#define NV34TCL_POINT_SPRITE_R_MODE_ZERO 0x00000000 +#define NV34TCL_POINT_SPRITE_R_MODE_R 0x00000002 +#define NV34TCL_POINT_SPRITE_R_MODE_S 0x00000004 +#define NV34TCL_POINT_SPRITE_COORD_REPLACE (1 << 11) +#define NV34TCL_VP_UPLOAD_CONST_ID 0x00001efc +#define NV34TCL_VP_UPLOAD_CONST_X(x) (0x00001f00+((x)*16)) +#define NV34TCL_VP_UPLOAD_CONST_X__SIZE 0x00000004 +#define NV34TCL_VP_UPLOAD_CONST_Y(x) 
(0x00001f04+((x)*16)) +#define NV34TCL_VP_UPLOAD_CONST_Y__SIZE 0x00000004 +#define NV34TCL_VP_UPLOAD_CONST_Z(x) (0x00001f08+((x)*16)) +#define NV34TCL_VP_UPLOAD_CONST_Z__SIZE 0x00000004 +#define NV34TCL_VP_UPLOAD_CONST_W(x) (0x00001f0c+((x)*16)) +#define NV34TCL_VP_UPLOAD_CONST_W__SIZE 0x00000004 +#define NV34TCL_UNK1f80(x) (0x00001f80+((x)*4)) +#define NV34TCL_UNK1f80__SIZE 0x00000010 + + +#define NV40_CONTEXT_SURFACES_2D 0x00003062 + + + +#define NV40_STRETCHED_IMAGE_FROM_CPU 0x00003066 + + + +#define NV40_TEXTURE_FROM_CPU 0x0000307b + + + +#define NV40_SCALED_IMAGE_FROM_MEMORY 0x00003089 + + + +#define NV40_IMAGE_FROM_CPU 0x0000308a + + + +#define NV40_SWIZZLED_SURFACE 0x0000309e + + + +#define NV40TCL 0x00004097 + +#define NV40TCL_REF_CNT 0x00000050 +#define NV40TCL_NOP 0x00000100 +#define NV40TCL_NOTIFY 0x00000104 +#define NV40TCL_DMA_NOTIFY 0x00000180 +#define NV40TCL_DMA_TEXTURE0 0x00000184 +#define NV40TCL_DMA_TEXTURE1 0x00000188 +#define NV40TCL_DMA_COLOR1 0x0000018c +#define NV40TCL_DMA_COLOR0 0x00000194 +#define NV40TCL_DMA_ZETA 0x00000198 +#define NV40TCL_DMA_VTXBUF0 0x0000019c +#define NV40TCL_DMA_VTXBUF1 0x000001a0 +#define NV40TCL_DMA_FENCE 0x000001a4 +#define NV40TCL_DMA_QUERY 0x000001a8 +#define NV40TCL_DMA_UNK01AC 0x000001ac +#define NV40TCL_DMA_UNK01B0 0x000001b0 +#define NV40TCL_DMA_COLOR2 0x000001b4 +#define NV40TCL_DMA_COLOR3 0x000001b8 +#define NV40TCL_RT_HORIZ 0x00000200 +#define NV40TCL_RT_HORIZ_W_SHIFT 16 +#define NV40TCL_RT_HORIZ_W_MASK 0xffff0000 +#define NV40TCL_RT_HORIZ_X_SHIFT 0 +#define NV40TCL_RT_HORIZ_X_MASK 0x0000ffff +#define NV40TCL_RT_VERT 0x00000204 +#define NV40TCL_RT_VERT_H_SHIFT 16 +#define NV40TCL_RT_VERT_H_MASK 0xffff0000 +#define NV40TCL_RT_VERT_Y_SHIFT 0 +#define NV40TCL_RT_VERT_Y_MASK 0x0000ffff +#define NV40TCL_RT_FORMAT 0x00000208 +#define NV40TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT 24 +#define NV40TCL_RT_FORMAT_LOG2_HEIGHT_MASK 0xff000000 +#define NV40TCL_RT_FORMAT_LOG2_WIDTH_SHIFT 16 +#define NV40TCL_RT_FORMAT_LOG2_WIDTH_MASK 0x00ff0000 +#define NV40TCL_RT_FORMAT_TYPE_SHIFT 8 +#define NV40TCL_RT_FORMAT_TYPE_MASK 0x00000f00 +#define NV40TCL_RT_FORMAT_TYPE_LINEAR 0x00000100 +#define NV40TCL_RT_FORMAT_TYPE_SWIZZLED 0x00000200 +#define NV40TCL_RT_FORMAT_ZETA_SHIFT 5 +#define NV40TCL_RT_FORMAT_ZETA_MASK 0x000000e0 +#define NV40TCL_RT_FORMAT_ZETA_Z16 0x00000020 +#define NV40TCL_RT_FORMAT_ZETA_Z24S8 0x00000040 +#define NV40TCL_RT_FORMAT_COLOR_SHIFT 0 +#define NV40TCL_RT_FORMAT_COLOR_MASK 0x0000001f +#define NV40TCL_RT_FORMAT_COLOR_R5G6B5 0x00000003 +#define NV40TCL_RT_FORMAT_COLOR_X8R8G8B8 0x00000005 +#define NV40TCL_RT_FORMAT_COLOR_A8R8G8B8 0x00000008 +#define NV40TCL_RT_FORMAT_COLOR_B8 0x00000009 +#define NV40TCL_RT_FORMAT_COLOR_UNKNOWN 0x0000000d +#define NV40TCL_RT_FORMAT_COLOR_X8B8G8R8 0x0000000f +#define NV40TCL_RT_FORMAT_COLOR_A8B8G8R8 0x00000010 +#define NV40TCL_COLOR0_PITCH 0x0000020c +#define NV40TCL_COLOR0_OFFSET 0x00000210 +#define NV40TCL_ZETA_OFFSET 0x00000214 +#define NV40TCL_COLOR1_OFFSET 0x00000218 +#define NV40TCL_COLOR1_PITCH 0x0000021c +#define NV40TCL_RT_ENABLE 0x00000220 +#define NV40TCL_RT_ENABLE_MRT (1 << 4) +#define NV40TCL_RT_ENABLE_COLOR3 (1 << 3) +#define NV40TCL_RT_ENABLE_COLOR2 (1 << 2) +#define NV40TCL_RT_ENABLE_COLOR1 (1 << 1) +#define NV40TCL_RT_ENABLE_COLOR0 (1 << 0) +#define NV40TCL_ZETA_PITCH 0x0000022c +#define NV40TCL_COLOR2_PITCH 0x00000280 +#define NV40TCL_COLOR3_PITCH 0x00000284 +#define NV40TCL_COLOR2_OFFSET 0x00000288 +#define NV40TCL_COLOR3_OFFSET 0x0000028c +#define NV40TCL_VIEWPORT_CLIP_HORIZ(x) 
(0x000002c0+((x)*8)) +#define NV40TCL_VIEWPORT_CLIP_HORIZ__SIZE 0x00000008 +#define NV40TCL_VIEWPORT_CLIP_VERT(x) (0x000002c4+((x)*8)) +#define NV40TCL_VIEWPORT_CLIP_VERT__SIZE 0x00000008 +#define NV40TCL_DITHER_ENABLE 0x00000300 +#define NV40TCL_ALPHA_TEST_ENABLE 0x00000304 +#define NV40TCL_ALPHA_TEST_FUNC 0x00000308 +#define NV40TCL_ALPHA_TEST_FUNC_NEVER 0x00000200 +#define NV40TCL_ALPHA_TEST_FUNC_LESS 0x00000201 +#define NV40TCL_ALPHA_TEST_FUNC_EQUAL 0x00000202 +#define NV40TCL_ALPHA_TEST_FUNC_LEQUAL 0x00000203 +#define NV40TCL_ALPHA_TEST_FUNC_GREATER 0x00000204 +#define NV40TCL_ALPHA_TEST_FUNC_GREATER 0x00000204 +#define NV40TCL_ALPHA_TEST_FUNC_NOTEQUAL 0x00000205 +#define NV40TCL_ALPHA_TEST_FUNC_GEQUAL 0x00000206 +#define NV40TCL_ALPHA_TEST_FUNC_ALWAYS 0x00000207 +#define NV40TCL_ALPHA_TEST_REF 0x0000030c +#define NV40TCL_BLEND_ENABLE 0x00000310 +#define NV40TCL_BLEND_FUNC_SRC 0x00000314 +#define NV40TCL_BLEND_FUNC_SRC_RGB_SHIFT 0 +#define NV40TCL_BLEND_FUNC_SRC_RGB_MASK 0x0000ffff +#define NV40TCL_BLEND_FUNC_SRC_RGB_ZERO 0x00000000 +#define NV40TCL_BLEND_FUNC_SRC_RGB_ONE 0x00000001 +#define NV40TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR 0x00000300 +#define NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR 0x00000301 +#define NV40TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA 0x00000302 +#define NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA 0x00000303 +#define NV40TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA 0x00000304 +#define NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA 0x00000305 +#define NV40TCL_BLEND_FUNC_SRC_RGB_DST_COLOR 0x00000306 +#define NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR 0x00000307 +#define NV40TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE 0x00000308 +#define NV40TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR 0x00008001 +#define NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR 0x00008002 +#define NV40TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA 0x00008003 +#define NV40TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_SHIFT 16 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_MASK 0xffff0000 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_ZERO 0x00000000 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE 0x00010000 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR 0x03000000 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR 0x03010000 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA 0x03020000 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA 0x03030000 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA 0x03040000 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA 0x03050000 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR 0x03060000 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR 0x03070000 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE 0x03080000 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR 0x80010000 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x80020000 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA 0x80030000 +#define NV40TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x80040000 +#define NV40TCL_BLEND_FUNC_DST 0x00000318 +#define NV40TCL_BLEND_FUNC_DST_RGB_SHIFT 0 +#define NV40TCL_BLEND_FUNC_DST_RGB_MASK 0x0000ffff +#define NV40TCL_BLEND_FUNC_DST_RGB_ZERO 0x00000000 +#define NV40TCL_BLEND_FUNC_DST_RGB_ONE 0x00000001 +#define NV40TCL_BLEND_FUNC_DST_RGB_SRC_COLOR 0x00000300 +#define NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR 0x00000301 +#define NV40TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA 0x00000302 +#define NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA 0x00000303 +#define NV40TCL_BLEND_FUNC_DST_RGB_DST_ALPHA 0x00000304 +#define 
NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA 0x00000305 +#define NV40TCL_BLEND_FUNC_DST_RGB_DST_COLOR 0x00000306 +#define NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR 0x00000307 +#define NV40TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE 0x00000308 +#define NV40TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR 0x00008001 +#define NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR 0x00008002 +#define NV40TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA 0x00008003 +#define NV40TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_SHIFT 16 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_MASK 0xffff0000 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_ZERO 0x00000000 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_ONE 0x00010000 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR 0x03000000 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR 0x03010000 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA 0x03020000 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA 0x03030000 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA 0x03040000 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA 0x03050000 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR 0x03060000 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR 0x03070000 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE 0x03080000 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR 0x80010000 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x80020000 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA 0x80030000 +#define NV40TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x80040000 +#define NV40TCL_BLEND_COLOR 0x0000031c +#define NV40TCL_BLEND_EQUATION 0x00000320 +#define NV40TCL_BLEND_EQUATION_RGB_SHIFT 0 +#define NV40TCL_BLEND_EQUATION_RGB_MASK 0x0000ffff +#define NV40TCL_BLEND_EQUATION_RGB_FUNC_ADD 0x00008006 +#define NV40TCL_BLEND_EQUATION_RGB_MIN 0x00008007 +#define NV40TCL_BLEND_EQUATION_RGB_MAX 0x00008008 +#define NV40TCL_BLEND_EQUATION_RGB_FUNC_SUBTRACT 0x0000800a +#define NV40TCL_BLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT 0x0000800b +#define NV40TCL_BLEND_EQUATION_ALPHA_SHIFT 16 +#define NV40TCL_BLEND_EQUATION_ALPHA_MASK 0xffff0000 +#define NV40TCL_BLEND_EQUATION_ALPHA_FUNC_ADD 0x80060000 +#define NV40TCL_BLEND_EQUATION_ALPHA_MIN 0x80070000 +#define NV40TCL_BLEND_EQUATION_ALPHA_MAX 0x80080000 +#define NV40TCL_BLEND_EQUATION_ALPHA_FUNC_SUBTRACT 0x800a0000 +#define NV40TCL_BLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT 0x800b0000 +#define NV40TCL_COLOR_MASK 0x00000324 +#define NV40TCL_COLOR_MASK_BUFFER0_B_SHIFT 0 +#define NV40TCL_COLOR_MASK_BUFFER0_B_MASK 0x000000ff +#define NV40TCL_COLOR_MASK_BUFFER0_G_SHIFT 8 +#define NV40TCL_COLOR_MASK_BUFFER0_G_MASK 0x0000ff00 +#define NV40TCL_COLOR_MASK_BUFFER0_R_SHIFT 16 +#define NV40TCL_COLOR_MASK_BUFFER0_R_MASK 0x00ff0000 +#define NV40TCL_COLOR_MASK_BUFFER0_A_SHIFT 24 +#define NV40TCL_COLOR_MASK_BUFFER0_A_MASK 0xff000000 +#define NV40TCL_STENCIL_FRONT_ENABLE 0x00000328 +#define NV40TCL_STENCIL_FRONT_MASK 0x0000032c +#define NV40TCL_STENCIL_FRONT_FUNC_FUNC 0x00000330 +#define NV40TCL_STENCIL_FRONT_FUNC_FUNC_NEVER 0x00000200 +#define NV40TCL_STENCIL_FRONT_FUNC_FUNC_LESS 0x00000201 +#define NV40TCL_STENCIL_FRONT_FUNC_FUNC_EQUAL 0x00000202 +#define NV40TCL_STENCIL_FRONT_FUNC_FUNC_LEQUAL 0x00000203 +#define NV40TCL_STENCIL_FRONT_FUNC_FUNC_GREATER 0x00000204 +#define NV40TCL_STENCIL_FRONT_FUNC_FUNC_GREATER 0x00000204 +#define NV40TCL_STENCIL_FRONT_FUNC_FUNC_NOTEQUAL 0x00000205 +#define NV40TCL_STENCIL_FRONT_FUNC_FUNC_GEQUAL 0x00000206 +#define NV40TCL_STENCIL_FRONT_FUNC_FUNC_ALWAYS 
0x00000207 +#define NV40TCL_STENCIL_FRONT_FUNC_REF 0x00000334 +#define NV40TCL_STENCIL_FRONT_FUNC_MASK 0x00000338 +#define NV40TCL_STENCIL_FRONT_OP_FAIL 0x0000033c +#define NV40TCL_STENCIL_FRONT_OP_FAIL_ZERO 0x00000000 +#define NV40TCL_STENCIL_FRONT_OP_FAIL_INVERT 0x0000150a +#define NV40TCL_STENCIL_FRONT_OP_FAIL_KEEP 0x00001e00 +#define NV40TCL_STENCIL_FRONT_OP_FAIL_REPLACE 0x00001e01 +#define NV40TCL_STENCIL_FRONT_OP_FAIL_INCR 0x00001e02 +#define NV40TCL_STENCIL_FRONT_OP_FAIL_DECR 0x00001e03 +#define NV40TCL_STENCIL_FRONT_OP_FAIL_INCR_WRAP 0x00008507 +#define NV40TCL_STENCIL_FRONT_OP_FAIL_DECR_WRAP 0x00008508 +#define NV40TCL_STENCIL_FRONT_OP_ZFAIL 0x00000340 +#define NV40TCL_STENCIL_FRONT_OP_ZFAIL_ZERO 0x00000000 +#define NV40TCL_STENCIL_FRONT_OP_ZFAIL_INVERT 0x0000150a +#define NV40TCL_STENCIL_FRONT_OP_ZFAIL_KEEP 0x00001e00 +#define NV40TCL_STENCIL_FRONT_OP_ZFAIL_REPLACE 0x00001e01 +#define NV40TCL_STENCIL_FRONT_OP_ZFAIL_INCR 0x00001e02 +#define NV40TCL_STENCIL_FRONT_OP_ZFAIL_DECR 0x00001e03 +#define NV40TCL_STENCIL_FRONT_OP_ZFAIL_INCR_WRAP 0x00008507 +#define NV40TCL_STENCIL_FRONT_OP_ZFAIL_DECR_WRAP 0x00008508 +#define NV40TCL_STENCIL_FRONT_OP_ZPASS 0x00000344 +#define NV40TCL_STENCIL_FRONT_OP_ZPASS_ZERO 0x00000000 +#define NV40TCL_STENCIL_FRONT_OP_ZPASS_INVERT 0x0000150a +#define NV40TCL_STENCIL_FRONT_OP_ZPASS_KEEP 0x00001e00 +#define NV40TCL_STENCIL_FRONT_OP_ZPASS_REPLACE 0x00001e01 +#define NV40TCL_STENCIL_FRONT_OP_ZPASS_INCR 0x00001e02 +#define NV40TCL_STENCIL_FRONT_OP_ZPASS_DECR 0x00001e03 +#define NV40TCL_STENCIL_FRONT_OP_ZPASS_INCR_WRAP 0x00008507 +#define NV40TCL_STENCIL_FRONT_OP_ZPASS_DECR_WRAP 0x00008508 +#define NV40TCL_STENCIL_BACK_ENABLE 0x00000348 +#define NV40TCL_STENCIL_BACK_MASK 0x0000034c +#define NV40TCL_STENCIL_BACK_FUNC_FUNC 0x00000350 +#define NV40TCL_STENCIL_BACK_FUNC_FUNC_NEVER 0x00000200 +#define NV40TCL_STENCIL_BACK_FUNC_FUNC_LESS 0x00000201 +#define NV40TCL_STENCIL_BACK_FUNC_FUNC_EQUAL 0x00000202 +#define NV40TCL_STENCIL_BACK_FUNC_FUNC_LEQUAL 0x00000203 +#define NV40TCL_STENCIL_BACK_FUNC_FUNC_GREATER 0x00000204 +#define NV40TCL_STENCIL_BACK_FUNC_FUNC_GREATER 0x00000204 +#define NV40TCL_STENCIL_BACK_FUNC_FUNC_NOTEQUAL 0x00000205 +#define NV40TCL_STENCIL_BACK_FUNC_FUNC_GEQUAL 0x00000206 +#define NV40TCL_STENCIL_BACK_FUNC_FUNC_ALWAYS 0x00000207 +#define NV40TCL_STENCIL_BACK_FUNC_REF 0x00000354 +#define NV40TCL_STENCIL_BACK_FUNC_MASK 0x00000358 +#define NV40TCL_STENCIL_BACK_OP_FAIL 0x0000035c +#define NV40TCL_STENCIL_BACK_OP_FAIL_ZERO 0x00000000 +#define NV40TCL_STENCIL_BACK_OP_FAIL_INVERT 0x0000150a +#define NV40TCL_STENCIL_BACK_OP_FAIL_KEEP 0x00001e00 +#define NV40TCL_STENCIL_BACK_OP_FAIL_REPLACE 0x00001e01 +#define NV40TCL_STENCIL_BACK_OP_FAIL_INCR 0x00001e02 +#define NV40TCL_STENCIL_BACK_OP_FAIL_DECR 0x00001e03 +#define NV40TCL_STENCIL_BACK_OP_FAIL_INCR_WRAP 0x00008507 +#define NV40TCL_STENCIL_BACK_OP_FAIL_DECR_WRAP 0x00008508 +#define NV40TCL_STENCIL_BACK_OP_ZFAIL 0x00000360 +#define NV40TCL_STENCIL_BACK_OP_ZFAIL_ZERO 0x00000000 +#define NV40TCL_STENCIL_BACK_OP_ZFAIL_INVERT 0x0000150a +#define NV40TCL_STENCIL_BACK_OP_ZFAIL_KEEP 0x00001e00 +#define NV40TCL_STENCIL_BACK_OP_ZFAIL_REPLACE 0x00001e01 +#define NV40TCL_STENCIL_BACK_OP_ZFAIL_INCR 0x00001e02 +#define NV40TCL_STENCIL_BACK_OP_ZFAIL_DECR 0x00001e03 +#define NV40TCL_STENCIL_BACK_OP_ZFAIL_INCR_WRAP 0x00008507 +#define NV40TCL_STENCIL_BACK_OP_ZFAIL_DECR_WRAP 0x00008508 +#define NV40TCL_STENCIL_BACK_OP_ZPASS 0x00000364 +#define NV40TCL_STENCIL_BACK_OP_ZPASS_ZERO 0x00000000 +#define 
NV40TCL_STENCIL_BACK_OP_ZPASS_INVERT 0x0000150a +#define NV40TCL_STENCIL_BACK_OP_ZPASS_KEEP 0x00001e00 +#define NV40TCL_STENCIL_BACK_OP_ZPASS_REPLACE 0x00001e01 +#define NV40TCL_STENCIL_BACK_OP_ZPASS_INCR 0x00001e02 +#define NV40TCL_STENCIL_BACK_OP_ZPASS_DECR 0x00001e03 +#define NV40TCL_STENCIL_BACK_OP_ZPASS_INCR_WRAP 0x00008507 +#define NV40TCL_STENCIL_BACK_OP_ZPASS_DECR_WRAP 0x00008508 +#define NV40TCL_SHADE_MODEL 0x00000368 +#define NV40TCL_SHADE_MODEL_FLAT 0x00001d00 +#define NV40TCL_SHADE_MODEL_SMOOTH 0x00001d01 +#define NV40TCL_MRT_COLOR_MASK 0x00000370 +#define NV40TCL_MRT_COLOR_MASK_BUFFER1_A (1 << 4) +#define NV40TCL_MRT_COLOR_MASK_BUFFER1_R (1 << 5) +#define NV40TCL_MRT_COLOR_MASK_BUFFER1_G (1 << 6) +#define NV40TCL_MRT_COLOR_MASK_BUFFER1_B (1 << 7) +#define NV40TCL_MRT_COLOR_MASK_BUFFER2_A (1 << 8) +#define NV40TCL_MRT_COLOR_MASK_BUFFER2_R (1 << 9) +#define NV40TCL_MRT_COLOR_MASK_BUFFER2_G (1 << 10) +#define NV40TCL_MRT_COLOR_MASK_BUFFER2_B (1 << 11) +#define NV40TCL_MRT_COLOR_MASK_BUFFER3_A (1 << 12) +#define NV40TCL_MRT_COLOR_MASK_BUFFER3_R (1 << 13) +#define NV40TCL_MRT_COLOR_MASK_BUFFER3_G (1 << 14) +#define NV40TCL_MRT_COLOR_MASK_BUFFER3_B (1 << 15) +#define NV40TCL_COLOR_LOGIC_OP_ENABLE 0x00000374 +#define NV40TCL_COLOR_LOGIC_OP 0x00000378 +#define NV40TCL_COLOR_LOGIC_OP_CLEAR 0x00001500 +#define NV40TCL_COLOR_LOGIC_OP_AND 0x00001501 +#define NV40TCL_COLOR_LOGIC_OP_AND_REVERSE 0x00001502 +#define NV40TCL_COLOR_LOGIC_OP_COPY 0x00001503 +#define NV40TCL_COLOR_LOGIC_OP_AND_INVERTED 0x00001504 +#define NV40TCL_COLOR_LOGIC_OP_NOOP 0x00001505 +#define NV40TCL_COLOR_LOGIC_OP_XOR 0x00001506 +#define NV40TCL_COLOR_LOGIC_OP_OR 0x00001507 +#define NV40TCL_COLOR_LOGIC_OP_NOR 0x00001508 +#define NV40TCL_COLOR_LOGIC_OP_EQUIV 0x00001509 +#define NV40TCL_COLOR_LOGIC_OP_INVERT 0x0000150a +#define NV40TCL_COLOR_LOGIC_OP_OR_REVERSE 0x0000150b +#define NV40TCL_COLOR_LOGIC_OP_COPY_INVERTED 0x0000150c +#define NV40TCL_COLOR_LOGIC_OP_OR_INVERTED 0x0000150d +#define NV40TCL_COLOR_LOGIC_OP_NAND 0x0000150e +#define NV40TCL_COLOR_LOGIC_OP_SET 0x0000150f +#define NV40TCL_DEPTH_RANGE_NEAR 0x00000394 +#define NV40TCL_DEPTH_RANGE_FAR 0x00000398 +#define NV40TCL_LINE_WIDTH 0x000003b8 +#define NV40TCL_LINE_SMOOTH_ENABLE 0x000003bc +#define NV40TCL_UNK03C0(x) (0x000003c0+((x)*4)) +#define NV40TCL_UNK03C0__SIZE 0x00000010 +#define NV40TCL_UNK0400(x) (0x00000400+((x)*4)) +#define NV40TCL_UNK0400__SIZE 0x00000010 +#define NV40TCL_UNK0440(x) (0x00000440+((x)*4)) +#define NV40TCL_UNK0440__SIZE 0x00000020 +#define NV40TCL_SCISSOR_HORIZ 0x000008c0 +#define NV40TCL_SCISSOR_HORIZ_X_SHIFT 0 +#define NV40TCL_SCISSOR_HORIZ_X_MASK 0x0000ffff +#define NV40TCL_SCISSOR_HORIZ_W_SHIFT 16 +#define NV40TCL_SCISSOR_HORIZ_W_MASK 0xffff0000 +#define NV40TCL_SCISSOR_VERT 0x000008c4 +#define NV40TCL_SCISSOR_VERT_Y_SHIFT 0 +#define NV40TCL_SCISSOR_VERT_Y_MASK 0x0000ffff +#define NV40TCL_SCISSOR_VERT_H_SHIFT 16 +#define NV40TCL_SCISSOR_VERT_H_MASK 0xffff0000 +#define NV40TCL_FOG_MODE 0x000008cc +#define NV40TCL_FOG_EQUATION_CONSTANT 0x000008d0 +#define NV40TCL_FOG_EQUATION_LINEAR 0x000008d4 +#define NV40TCL_FOG_EQUATION_QUADRATIC 0x000008d8 +#define NV40TCL_FP_ADDRESS 0x000008e4 +#define NV40TCL_FP_ADDRESS_OFFSET_SHIFT 8 +#define NV40TCL_FP_ADDRESS_OFFSET_MASK 0xffffff00 +#define NV40TCL_FP_ADDRESS_DMA1 (1 << 1) +#define NV40TCL_FP_ADDRESS_DMA0 (1 << 0) +#define NV40TCL_VIEWPORT_HORIZ 0x00000a00 +#define NV40TCL_VIEWPORT_HORIZ_W_SHIFT 16 +#define NV40TCL_VIEWPORT_HORIZ_W_MASK 0xffff0000 +#define NV40TCL_VIEWPORT_HORIZ_X_SHIFT 0 
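(Likewise, a minimal sketch, not from this header, of packing the NV40TCL_SCISSOR_HORIZ word from a scissor x origin and width using the field macros above; the helper name is hypothetical and again assumes <stdint.h>.)

/* x occupies the low 16 bits, width the high 16 bits.  Illustrative only. */
static inline uint32_t
nv40_scissor_horiz(uint32_t x, uint32_t w)
{
   return ((x << NV40TCL_SCISSOR_HORIZ_X_SHIFT) &
           NV40TCL_SCISSOR_HORIZ_X_MASK) |
          ((w << NV40TCL_SCISSOR_HORIZ_W_SHIFT) &
           NV40TCL_SCISSOR_HORIZ_W_MASK);
}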
+#define NV40TCL_VIEWPORT_HORIZ_X_MASK 0x0000ffff +#define NV40TCL_VIEWPORT_VERT 0x00000a04 +#define NV40TCL_VIEWPORT_VERT_H_SHIFT 16 +#define NV40TCL_VIEWPORT_VERT_H_MASK 0xffff0000 +#define NV40TCL_VIEWPORT_VERT_Y_SHIFT 0 +#define NV40TCL_VIEWPORT_VERT_Y_MASK 0x0000ffff +#define NV40TCL_VIEWPORT_TRANSLATE_X 0x00000a20 +#define NV40TCL_VIEWPORT_TRANSLATE_Y 0x00000a24 +#define NV40TCL_VIEWPORT_TRANSLATE_Z 0x00000a28 +#define NV40TCL_VIEWPORT_TRANSLATE_W 0x00000a2c +#define NV40TCL_VIEWPORT_SCALE_X 0x00000a30 +#define NV40TCL_VIEWPORT_SCALE_Y 0x00000a34 +#define NV40TCL_VIEWPORT_SCALE_Z 0x00000a38 +#define NV40TCL_VIEWPORT_SCALE_W 0x00000a3c +#define NV40TCL_POLYGON_OFFSET_POINT_ENABLE 0x00000a60 +#define NV40TCL_POLYGON_OFFSET_LINE_ENABLE 0x00000a64 +#define NV40TCL_POLYGON_OFFSET_FILL_ENABLE 0x00000a68 +#define NV40TCL_DEPTH_FUNC 0x00000a6c +#define NV40TCL_DEPTH_FUNC_NEVER 0x00000200 +#define NV40TCL_DEPTH_FUNC_LESS 0x00000201 +#define NV40TCL_DEPTH_FUNC_EQUAL 0x00000202 +#define NV40TCL_DEPTH_FUNC_LEQUAL 0x00000203 +#define NV40TCL_DEPTH_FUNC_GREATER 0x00000204 +#define NV40TCL_DEPTH_FUNC_GREATER 0x00000204 +#define NV40TCL_DEPTH_FUNC_NOTEQUAL 0x00000205 +#define NV40TCL_DEPTH_FUNC_GEQUAL 0x00000206 +#define NV40TCL_DEPTH_FUNC_ALWAYS 0x00000207 +#define NV40TCL_DEPTH_WRITE_ENABLE 0x00000a70 +#define NV40TCL_DEPTH_TEST_ENABLE 0x00000a74 +#define NV40TCL_POLYGON_OFFSET_FACTOR 0x00000a78 +#define NV40TCL_POLYGON_OFFSET_UNITS 0x00000a7c +#define NV40TCL_VTX_ATTR_3I_XY(x) (0x00000a80+((x)*8)) +#define NV40TCL_VTX_ATTR_3I_XY__SIZE 0x00000010 +#define NV40TCL_VTX_ATTR_3I_XY_X_SHIFT 0 +#define NV40TCL_VTX_ATTR_3I_XY_X_MASK 0x0000ffff +#define NV40TCL_VTX_ATTR_3I_XY_Y_SHIFT 16 +#define NV40TCL_VTX_ATTR_3I_XY_Y_MASK 0xffff0000 +#define NV40TCL_VTX_ATTR_3I_Z(x) (0x00000a84+((x)*8)) +#define NV40TCL_VTX_ATTR_3I_Z__SIZE 0x00000010 +#define NV40TCL_VTX_ATTR_3I_Z_Z_SHIFT 0 +#define NV40TCL_VTX_ATTR_3I_Z_Z_MASK 0x0000ffff +#define NV40TCL_UNK0B40(x) (0x00000b40+((x)*4)) +#define NV40TCL_UNK0B40__SIZE 0x00000008 +#define NV40TCL_VP_UPLOAD_INST(x) (0x00000b80+((x)*4)) +#define NV40TCL_VP_UPLOAD_INST__SIZE 0x00000004 +#define NV40TCL_CLIP_PLANE_ENABLE 0x00001478 +#define NV40TCL_CLIP_PLANE_ENABLE_PLANE0 (1 << 1) +#define NV40TCL_CLIP_PLANE_ENABLE_PLANE1 (1 << 5) +#define NV40TCL_CLIP_PLANE_ENABLE_PLANE2 (1 << 9) +#define NV40TCL_CLIP_PLANE_ENABLE_PLANE3 (1 << 13) +#define NV40TCL_CLIP_PLANE_ENABLE_PLANE4 (1 << 17) +#define NV40TCL_CLIP_PLANE_ENABLE_PLANE5 (1 << 21) +#define NV40TCL_POLYGON_STIPPLE_ENABLE 0x0000147c +#define NV40TCL_POLYGON_STIPPLE_PATTERN(x) (0x00001480+((x)*4)) +#define NV40TCL_POLYGON_STIPPLE_PATTERN__SIZE 0x00000020 +#define NV40TCL_VTX_ATTR_3F_X(x) (0x00001500+((x)*16)) +#define NV40TCL_VTX_ATTR_3F_X__SIZE 0x00000010 +#define NV40TCL_VTX_ATTR_3F_Y(x) (0x00001504+((x)*16)) +#define NV40TCL_VTX_ATTR_3F_Y__SIZE 0x00000010 +#define NV40TCL_VTX_ATTR_3F_Z(x) (0x00001508+((x)*16)) +#define NV40TCL_VTX_ATTR_3F_Z__SIZE 0x00000010 +#define NV40TCL_VTXBUF_ADDRESS(x) (0x00001680+((x)*4)) +#define NV40TCL_VTXBUF_ADDRESS__SIZE 0x00000010 +#define NV40TCL_VTXBUF_ADDRESS_DMA1 (1 << 31) +#define NV40TCL_VTXBUF_ADDRESS_OFFSET_SHIFT 0 +#define NV40TCL_VTXBUF_ADDRESS_OFFSET_MASK 0x0fffffff +#define NV40TCL_VTX_CACHE_INVALIDATE 0x00001714 +#define NV40TCL_VTXFMT(x) (0x00001740+((x)*4)) +#define NV40TCL_VTXFMT__SIZE 0x00000010 +#define NV40TCL_VTXFMT_TYPE_SHIFT 0 +#define NV40TCL_VTXFMT_TYPE_MASK 0x0000000f +#define NV40TCL_VTXFMT_TYPE_FLOAT 0x00000002 +#define NV40TCL_VTXFMT_TYPE_UBYTE 0x00000004 +#define 
NV40TCL_VTXFMT_TYPE_USHORT 0x00000005 +#define NV40TCL_VTXFMT_SIZE_SHIFT 4 +#define NV40TCL_VTXFMT_SIZE_MASK 0x000000f0 +#define NV40TCL_VTXFMT_STRIDE_SHIFT 8 +#define NV40TCL_VTXFMT_STRIDE_MASK 0x0000ff00 +#define NV40TCL_QUERY_RESET 0x000017c8 +#define NV40TCL_QUERY_UNK17CC 0x000017cc +#define NV40TCL_QUERY_GET 0x00001800 +#define NV40TCL_QUERY_GET_UNK24_SHIFT 24 +#define NV40TCL_QUERY_GET_UNK24_MASK 0xff000000 +#define NV40TCL_QUERY_GET_OFFSET_SHIFT 0 +#define NV40TCL_QUERY_GET_OFFSET_MASK 0x00ffffff +#define NV40TCL_BEGIN_END 0x00001808 +#define NV40TCL_BEGIN_END_STOP 0x00000000 +#define NV40TCL_BEGIN_END_POINTS 0x00000001 +#define NV40TCL_BEGIN_END_LINES 0x00000002 +#define NV40TCL_BEGIN_END_LINE_LOOP 0x00000003 +#define NV40TCL_BEGIN_END_LINE_STRIP 0x00000004 +#define NV40TCL_BEGIN_END_TRIANGLES 0x00000005 +#define NV40TCL_BEGIN_END_TRIANGLE_STRIP 0x00000006 +#define NV40TCL_BEGIN_END_TRIANGLE_FAN 0x00000007 +#define NV40TCL_BEGIN_END_QUADS 0x00000008 +#define NV40TCL_BEGIN_END_QUAD_STRIP 0x00000009 +#define NV40TCL_BEGIN_END_POLYGON 0x0000000a +#define NV40TCL_VB_ELEMENT_U16 0x0000180c +#define NV40TCL_VB_ELEMENT_U16_1_SHIFT 16 +#define NV40TCL_VB_ELEMENT_U16_1_MASK 0xffff0000 +#define NV40TCL_VB_ELEMENT_U16_0_SHIFT 0 +#define NV40TCL_VB_ELEMENT_U16_0_MASK 0x0000ffff +#define NV40TCL_VB_ELEMENT_U32 0x00001810 +#define NV40TCL_VB_VERTEX_BATCH 0x00001814 +#define NV40TCL_VB_VERTEX_BATCH_COUNT_SHIFT 24 +#define NV40TCL_VB_VERTEX_BATCH_COUNT_MASK 0xff000000 +#define NV40TCL_VB_VERTEX_BATCH_START_SHIFT 0 +#define NV40TCL_VB_VERTEX_BATCH_START_MASK 0x00ffffff +#define NV40TCL_VERTEX_DATA 0x00001818 +#define NV40TCL_IDXBUF_ADDRESS 0x0000181c +#define NV40TCL_IDXBUF_FORMAT 0x00001820 +#define NV40TCL_IDXBUF_FORMAT_TYPE_SHIFT 4 +#define NV40TCL_IDXBUF_FORMAT_TYPE_MASK 0x000000f0 +#define NV40TCL_IDXBUF_FORMAT_TYPE_U32 0x00000000 +#define NV40TCL_IDXBUF_FORMAT_TYPE_U16 0x00000010 +#define NV40TCL_IDXBUF_FORMAT_DMA1 (1 << 0) +#define NV40TCL_VB_INDEX_BATCH 0x00001824 +#define NV40TCL_VB_INDEX_BATCH_COUNT_SHIFT 24 +#define NV40TCL_VB_INDEX_BATCH_COUNT_MASK 0xff000000 +#define NV40TCL_VB_INDEX_BATCH_START_SHIFT 0 +#define NV40TCL_VB_INDEX_BATCH_START_MASK 0x00ffffff +#define NV40TCL_POLYGON_MODE_FRONT 0x00001828 +#define NV40TCL_POLYGON_MODE_FRONT_POINT 0x00001b00 +#define NV40TCL_POLYGON_MODE_FRONT_LINE 0x00001b01 +#define NV40TCL_POLYGON_MODE_FRONT_FILL 0x00001b02 +#define NV40TCL_POLYGON_MODE_BACK 0x0000182c +#define NV40TCL_POLYGON_MODE_BACK_POINT 0x00001b00 +#define NV40TCL_POLYGON_MODE_BACK_LINE 0x00001b01 +#define NV40TCL_POLYGON_MODE_BACK_FILL 0x00001b02 +#define NV40TCL_CULL_FACE 0x00001830 +#define NV40TCL_CULL_FACE_FRONT 0x00000404 +#define NV40TCL_CULL_FACE_BACK 0x00000405 +#define NV40TCL_CULL_FACE_FRONT_AND_BACK 0x00000408 +#define NV40TCL_FRONT_FACE 0x00001834 +#define NV40TCL_FRONT_FACE_CW 0x00000900 +#define NV40TCL_FRONT_FACE_CCW 0x00000901 +#define NV40TCL_POLYGON_SMOOTH_ENABLE 0x00001838 +#define NV40TCL_CULL_FACE_ENABLE 0x0000183c +#define NV40TCL_TEX_SIZE1(x) (0x00001840+((x)*4)) +#define NV40TCL_TEX_SIZE1__SIZE 0x00000008 +#define NV40TCL_TEX_SIZE1_DEPTH_SHIFT 20 +#define NV40TCL_TEX_SIZE1_DEPTH_MASK 0xfff00000 +#define NV40TCL_TEX_SIZE1_PITCH_SHIFT 0 +#define NV40TCL_TEX_SIZE1_PITCH_MASK 0x0000ffff +#define NV40TCL_VTX_ATTR_2F_X(x) (0x00001880+((x)*8)) +#define NV40TCL_VTX_ATTR_2F_X__SIZE 0x00000010 +#define NV40TCL_VTX_ATTR_2F_Y(x) (0x00001884+((x)*8)) +#define NV40TCL_VTX_ATTR_2F_Y__SIZE 0x00000010 +#define NV40TCL_VTX_ATTR_2I(x) (0x00001900+((x)*4)) +#define 
NV40TCL_VTX_ATTR_2I__SIZE 0x00000010 +#define NV40TCL_VTX_ATTR_2I_X_SHIFT 0 +#define NV40TCL_VTX_ATTR_2I_X_MASK 0x0000ffff +#define NV40TCL_VTX_ATTR_2I_Y_SHIFT 16 +#define NV40TCL_VTX_ATTR_2I_Y_MASK 0xffff0000 +#define NV40TCL_VTX_ATTR_4UB(x) (0x00001940+((x)*4)) +#define NV40TCL_VTX_ATTR_4UB__SIZE 0x00000010 +#define NV40TCL_VTX_ATTR_4UB_X_SHIFT 0 +#define NV40TCL_VTX_ATTR_4UB_X_MASK 0x000000ff +#define NV40TCL_VTX_ATTR_4UB_Y_SHIFT 8 +#define NV40TCL_VTX_ATTR_4UB_Y_MASK 0x0000ff00 +#define NV40TCL_VTX_ATTR_4UB_Z_SHIFT 16 +#define NV40TCL_VTX_ATTR_4UB_Z_MASK 0x00ff0000 +#define NV40TCL_VTX_ATTR_4UB_W_SHIFT 24 +#define NV40TCL_VTX_ATTR_4UB_W_MASK 0xff000000 +#define NV40TCL_VTX_ATTR_4I_XY(x) (0x00001980+((x)*8)) +#define NV40TCL_VTX_ATTR_4I_XY__SIZE 0x00000010 +#define NV40TCL_VTX_ATTR_4I_XY_X_SHIFT 0 +#define NV40TCL_VTX_ATTR_4I_XY_X_MASK 0x0000ffff +#define NV40TCL_VTX_ATTR_4I_XY_Y_SHIFT 16 +#define NV40TCL_VTX_ATTR_4I_XY_Y_MASK 0xffff0000 +#define NV40TCL_VTX_ATTR_4I_ZW(x) (0x00001984+((x)*8)) +#define NV40TCL_VTX_ATTR_4I_ZW__SIZE 0x00000010 +#define NV40TCL_VTX_ATTR_4I_ZW_Z_SHIFT 0 +#define NV40TCL_VTX_ATTR_4I_ZW_Z_MASK 0x0000ffff +#define NV40TCL_VTX_ATTR_4I_ZW_W_SHIFT 16 +#define NV40TCL_VTX_ATTR_4I_ZW_W_MASK 0xffff0000 +#define NV40TCL_TEX_OFFSET(x) (0x00001a00+((x)*32)) +#define NV40TCL_TEX_OFFSET__SIZE 0x00000010 +#define NV40TCL_TEX_FORMAT(x) (0x00001a04+((x)*32)) +#define NV40TCL_TEX_FORMAT__SIZE 0x00000010 +#define NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT 16 +#define NV40TCL_TEX_FORMAT_MIPMAP_COUNT_MASK 0x000f0000 +#define NV40TCL_TEX_FORMAT_RECT (1 << 14) +#define NV40TCL_TEX_FORMAT_LINEAR (1 << 13) +#define NV40TCL_TEX_FORMAT_FORMAT_SHIFT 8 +#define NV40TCL_TEX_FORMAT_FORMAT_MASK 0x00001f00 +#define NV40TCL_TEX_FORMAT_FORMAT_L8 0x00000100 +#define NV40TCL_TEX_FORMAT_FORMAT_A1R5G5B5 0x00000200 +#define NV40TCL_TEX_FORMAT_FORMAT_A4R4G4B4 0x00000300 +#define NV40TCL_TEX_FORMAT_FORMAT_R5G6B5 0x00000400 +#define NV40TCL_TEX_FORMAT_FORMAT_A8R8G8B8 0x00000500 +#define NV40TCL_TEX_FORMAT_FORMAT_DXT1 0x00000600 +#define NV40TCL_TEX_FORMAT_FORMAT_DXT3 0x00000700 +#define NV40TCL_TEX_FORMAT_FORMAT_DXT5 0x00000800 +#define NV40TCL_TEX_FORMAT_FORMAT_A8L8 0x00000b00 +#define NV40TCL_TEX_FORMAT_FORMAT_Z24 0x00001000 +#define NV40TCL_TEX_FORMAT_FORMAT_Z16 0x00001200 +#define NV40TCL_TEX_FORMAT_FORMAT_A16 0x00001400 +#define NV40TCL_TEX_FORMAT_FORMAT_A16L16 0x00001500 +#define NV40TCL_TEX_FORMAT_FORMAT_HILO8 0x00001800 +#define NV40TCL_TEX_FORMAT_FORMAT_RGBA16F 0x00001a00 +#define NV40TCL_TEX_FORMAT_FORMAT_RGBA32F 0x00001b00 +#define NV40TCL_TEX_FORMAT_DIMS_SHIFT 4 +#define NV40TCL_TEX_FORMAT_DIMS_MASK 0x000000f0 +#define NV40TCL_TEX_FORMAT_DIMS_1D 0x00000010 +#define NV40TCL_TEX_FORMAT_DIMS_2D 0x00000020 +#define NV40TCL_TEX_FORMAT_DIMS_3D 0x00000030 +#define NV40TCL_TEX_FORMAT_NO_BORDER (1 << 3) +#define NV40TCL_TEX_FORMAT_CUBIC (1 << 2) +#define NV40TCL_TEX_FORMAT_DMA1 (1 << 1) +#define NV40TCL_TEX_FORMAT_DMA0 (1 << 0) +#define NV40TCL_TEX_WRAP(x) (0x00001a08+((x)*32)) +#define NV40TCL_TEX_WRAP__SIZE 0x00000010 +#define NV40TCL_TEX_WRAP_S_SHIFT 0 +#define NV40TCL_TEX_WRAP_S_MASK 0x000000ff +#define NV40TCL_TEX_WRAP_S_REPEAT 0x00000001 +#define NV40TCL_TEX_WRAP_S_MIRRORED_REPEAT 0x00000002 +#define NV40TCL_TEX_WRAP_S_CLAMP_TO_EDGE 0x00000003 +#define NV40TCL_TEX_WRAP_S_CLAMP_TO_BORDER 0x00000004 +#define NV40TCL_TEX_WRAP_S_CLAMP 0x00000005 +#define NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_EDGE 0x00000006 +#define NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_BORDER 0x00000007 +#define 
NV40TCL_TEX_WRAP_S_MIRROR_CLAMP 0x00000008 +#define NV40TCL_TEX_WRAP_T_SHIFT 8 +#define NV40TCL_TEX_WRAP_T_MASK 0x00000f00 +#define NV40TCL_TEX_WRAP_T_REPEAT 0x00000100 +#define NV40TCL_TEX_WRAP_T_MIRRORED_REPEAT 0x00000200 +#define NV40TCL_TEX_WRAP_T_CLAMP_TO_EDGE 0x00000300 +#define NV40TCL_TEX_WRAP_T_CLAMP_TO_BORDER 0x00000400 +#define NV40TCL_TEX_WRAP_T_CLAMP 0x00000500 +#define NV40TCL_TEX_WRAP_T_MIRROR_CLAMP_TO_EDGE 0x00000600 +#define NV40TCL_TEX_WRAP_T_MIRROR_CLAMP_TO_BORDER 0x00000700 +#define NV40TCL_TEX_WRAP_T_MIRROR_CLAMP 0x00000800 +#define NV40TCL_TEX_WRAP_EXPAND_NORMAL_SHIFT 12 +#define NV40TCL_TEX_WRAP_EXPAND_NORMAL_MASK 0x0000f000 +#define NV40TCL_TEX_WRAP_R_SHIFT 16 +#define NV40TCL_TEX_WRAP_R_MASK 0x00ff0000 +#define NV40TCL_TEX_WRAP_R_REPEAT 0x00010000 +#define NV40TCL_TEX_WRAP_R_MIRRORED_REPEAT 0x00020000 +#define NV40TCL_TEX_WRAP_R_CLAMP_TO_EDGE 0x00030000 +#define NV40TCL_TEX_WRAP_R_CLAMP_TO_BORDER 0x00040000 +#define NV40TCL_TEX_WRAP_R_CLAMP 0x00050000 +#define NV40TCL_TEX_WRAP_R_MIRROR_CLAMP_TO_EDGE 0x00060000 +#define NV40TCL_TEX_WRAP_R_MIRROR_CLAMP_TO_BORDER 0x00070000 +#define NV40TCL_TEX_WRAP_R_MIRROR_CLAMP 0x00080000 +#define NV40TCL_TEX_WRAP_RCOMP_SHIFT 28 +#define NV40TCL_TEX_WRAP_RCOMP_MASK 0xf0000000 +#define NV40TCL_TEX_WRAP_RCOMP_NEVER 0x00000000 +#define NV40TCL_TEX_WRAP_RCOMP_GREATER 0x10000000 +#define NV40TCL_TEX_WRAP_RCOMP_EQUAL 0x20000000 +#define NV40TCL_TEX_WRAP_RCOMP_GEQUAL 0x30000000 +#define NV40TCL_TEX_WRAP_RCOMP_LESS 0x40000000 +#define NV40TCL_TEX_WRAP_RCOMP_NOTEQUAL 0x50000000 +#define NV40TCL_TEX_WRAP_RCOMP_LEQUAL 0x60000000 +#define NV40TCL_TEX_WRAP_RCOMP_ALWAYS 0x70000000 +#define NV40TCL_TEX_ENABLE(x) (0x00001a0c+((x)*32)) +#define NV40TCL_TEX_ENABLE__SIZE 0x00000010 +#define NV40TCL_TEX_ENABLE_ENABLE (1 << 31) +#define NV40TCL_TEX_ENABLE_MIPMAP_MIN_LOD_SHIFT 27 +#define NV40TCL_TEX_ENABLE_MIPMAP_MIN_LOD_MASK 0x38000000 +#define NV40TCL_TEX_ENABLE_MIPMAP_MAX_LOD_SHIFT 15 +#define NV40TCL_TEX_ENABLE_MIPMAP_MAX_LOD_MASK 0x00038000 +#define NV40TCL_TEX_ENABLE_ANISO_SHIFT 4 +#define NV40TCL_TEX_ENABLE_ANISO_MASK 0x000000f0 +#define NV40TCL_TEX_ENABLE_ANISO_NONE 0x00000000 +#define NV40TCL_TEX_ENABLE_ANISO_2X 0x00000010 +#define NV40TCL_TEX_ENABLE_ANISO_4X 0x00000020 +#define NV40TCL_TEX_ENABLE_ANISO_6X 0x00000030 +#define NV40TCL_TEX_ENABLE_ANISO_8X 0x00000040 +#define NV40TCL_TEX_ENABLE_ANISO_10X 0x00000050 +#define NV40TCL_TEX_ENABLE_ANISO_12X 0x00000060 +#define NV40TCL_TEX_ENABLE_ANISO_16X 0x00000070 +#define NV40TCL_TEX_SWIZZLE(x) (0x00001a10+((x)*32)) +#define NV40TCL_TEX_SWIZZLE__SIZE 0x00000010 +#define NV40TCL_TEX_SWIZZLE_S0_X_SHIFT 14 +#define NV40TCL_TEX_SWIZZLE_S0_X_MASK 0x0000c000 +#define NV40TCL_TEX_SWIZZLE_S0_X_ZERO 0x00000000 +#define NV40TCL_TEX_SWIZZLE_S0_X_ONE 0x00004000 +#define NV40TCL_TEX_SWIZZLE_S0_X_S1 0x00008000 +#define NV40TCL_TEX_SWIZZLE_S0_Y_SHIFT 12 +#define NV40TCL_TEX_SWIZZLE_S0_Y_MASK 0x00003000 +#define NV40TCL_TEX_SWIZZLE_S0_Y_ZERO 0x00000000 +#define NV40TCL_TEX_SWIZZLE_S0_Y_ONE 0x00001000 +#define NV40TCL_TEX_SWIZZLE_S0_Y_S1 0x00002000 +#define NV40TCL_TEX_SWIZZLE_S0_Z_SHIFT 10 +#define NV40TCL_TEX_SWIZZLE_S0_Z_MASK 0x00000c00 +#define NV40TCL_TEX_SWIZZLE_S0_Z_ZERO 0x00000000 +#define NV40TCL_TEX_SWIZZLE_S0_Z_ONE 0x00000400 +#define NV40TCL_TEX_SWIZZLE_S0_Z_S1 0x00000800 +#define NV40TCL_TEX_SWIZZLE_S0_W_SHIFT 8 +#define NV40TCL_TEX_SWIZZLE_S0_W_MASK 0x00000300 +#define NV40TCL_TEX_SWIZZLE_S0_W_ZERO 0x00000000 +#define NV40TCL_TEX_SWIZZLE_S0_W_ONE 0x00000100 +#define NV40TCL_TEX_SWIZZLE_S0_W_S1 
0x00000200 +#define NV40TCL_TEX_SWIZZLE_S1_X_SHIFT 6 +#define NV40TCL_TEX_SWIZZLE_S1_X_MASK 0x000000c0 +#define NV40TCL_TEX_SWIZZLE_S1_X_W 0x00000000 +#define NV40TCL_TEX_SWIZZLE_S1_X_Z 0x00000040 +#define NV40TCL_TEX_SWIZZLE_S1_X_Y 0x00000080 +#define NV40TCL_TEX_SWIZZLE_S1_X_X 0x000000c0 +#define NV40TCL_TEX_SWIZZLE_S1_Y_SHIFT 4 +#define NV40TCL_TEX_SWIZZLE_S1_Y_MASK 0x00000030 +#define NV40TCL_TEX_SWIZZLE_S1_Y_W 0x00000000 +#define NV40TCL_TEX_SWIZZLE_S1_Y_Z 0x00000010 +#define NV40TCL_TEX_SWIZZLE_S1_Y_Y 0x00000020 +#define NV40TCL_TEX_SWIZZLE_S1_Y_X 0x00000030 +#define NV40TCL_TEX_SWIZZLE_S1_Z_SHIFT 2 +#define NV40TCL_TEX_SWIZZLE_S1_Z_MASK 0x0000000c +#define NV40TCL_TEX_SWIZZLE_S1_Z_W 0x00000000 +#define NV40TCL_TEX_SWIZZLE_S1_Z_Z 0x00000004 +#define NV40TCL_TEX_SWIZZLE_S1_Z_Y 0x00000008 +#define NV40TCL_TEX_SWIZZLE_S1_Z_X 0x0000000c +#define NV40TCL_TEX_SWIZZLE_S1_W_SHIFT 0 +#define NV40TCL_TEX_SWIZZLE_S1_W_MASK 0x00000003 +#define NV40TCL_TEX_SWIZZLE_S1_W_W 0x00000000 +#define NV40TCL_TEX_SWIZZLE_S1_W_Z 0x00000001 +#define NV40TCL_TEX_SWIZZLE_S1_W_Y 0x00000002 +#define NV40TCL_TEX_SWIZZLE_S1_W_X 0x00000003 +#define NV40TCL_TEX_FILTER(x) (0x00001a14+((x)*32)) +#define NV40TCL_TEX_FILTER__SIZE 0x00000010 +#define NV40TCL_TEX_FILTER_SIGNED_ALPHA (1 << 31) +#define NV40TCL_TEX_FILTER_SIGNED_RED (1 << 30) +#define NV40TCL_TEX_FILTER_SIGNED_GREEN (1 << 29) +#define NV40TCL_TEX_FILTER_SIGNED_BLUE (1 << 28) +#define NV40TCL_TEX_FILTER_MIN_SHIFT 16 +#define NV40TCL_TEX_FILTER_MIN_MASK 0x000f0000 +#define NV40TCL_TEX_FILTER_MIN_NEAREST 0x00010000 +#define NV40TCL_TEX_FILTER_MIN_LINEAR 0x00020000 +#define NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_NEAREST 0x00030000 +#define NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_NEAREST 0x00040000 +#define NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_LINEAR 0x00050000 +#define NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_LINEAR 0x00060000 +#define NV40TCL_TEX_FILTER_MAG_SHIFT 24 +#define NV40TCL_TEX_FILTER_MAG_MASK 0x0f000000 +#define NV40TCL_TEX_FILTER_MAG_NEAREST 0x01000000 +#define NV40TCL_TEX_FILTER_MAG_LINEAR 0x02000000 +#define NV40TCL_TEX_SIZE0(x) (0x00001a18+((x)*32)) +#define NV40TCL_TEX_SIZE0__SIZE 0x00000010 +#define NV40TCL_TEX_SIZE0_H_SHIFT 0 +#define NV40TCL_TEX_SIZE0_H_MASK 0x0000ffff +#define NV40TCL_TEX_SIZE0_W_SHIFT 16 +#define NV40TCL_TEX_SIZE0_W_MASK 0xffff0000 +#define NV40TCL_TEX_BORDER_COLOR(x) (0x00001a1c+((x)*32)) +#define NV40TCL_TEX_BORDER_COLOR__SIZE 0x00000010 +#define NV40TCL_TEX_BORDER_COLOR_B_SHIFT 0 +#define NV40TCL_TEX_BORDER_COLOR_B_MASK 0x000000ff +#define NV40TCL_TEX_BORDER_COLOR_G_SHIFT 8 +#define NV40TCL_TEX_BORDER_COLOR_G_MASK 0x0000ff00 +#define NV40TCL_TEX_BORDER_COLOR_R_SHIFT 16 +#define NV40TCL_TEX_BORDER_COLOR_R_MASK 0x00ff0000 +#define NV40TCL_TEX_BORDER_COLOR_A_SHIFT 24 +#define NV40TCL_TEX_BORDER_COLOR_A_MASK 0xff000000 +#define NV40TCL_VTX_ATTR_4F_X(x) (0x00001c00+((x)*16)) +#define NV40TCL_VTX_ATTR_4F_X__SIZE 0x00000010 +#define NV40TCL_VTX_ATTR_4F_Y(x) (0x00001c04+((x)*16)) +#define NV40TCL_VTX_ATTR_4F_Y__SIZE 0x00000010 +#define NV40TCL_VTX_ATTR_4F_Z(x) (0x00001c08+((x)*16)) +#define NV40TCL_VTX_ATTR_4F_Z__SIZE 0x00000010 +#define NV40TCL_VTX_ATTR_4F_W(x) (0x00001c0c+((x)*16)) +#define NV40TCL_VTX_ATTR_4F_W__SIZE 0x00000010 +#define NV40TCL_FP_CONTROL 0x00001d60 +#define NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT 24 +#define NV40TCL_FP_CONTROL_TEMP_COUNT_MASK 0xff000000 +#define NV40TCL_FP_CONTROL_KIL (1 << 7) +#define NV40TCL_MULTISAMPLE_CONTROL 0x00001d7c +#define NV40TCL_CLEAR_VALUE_DEPTH 0x00001d8c +#define NV40TCL_CLEAR_VALUE_COLOR 
0x00001d90 +#define NV40TCL_CLEAR_BUFFERS 0x00001d94 +#define NV40TCL_CLEAR_BUFFERS_COLOR_A (1 << 7) +#define NV40TCL_CLEAR_BUFFERS_COLOR_B (1 << 6) +#define NV40TCL_CLEAR_BUFFERS_COLOR_G (1 << 5) +#define NV40TCL_CLEAR_BUFFERS_COLOR_R (1 << 4) +#define NV40TCL_CLEAR_BUFFERS_STENCIL (1 << 1) +#define NV40TCL_CLEAR_BUFFERS_DEPTH (1 << 0) +#define NV40TCL_LINE_STIPPLE_ENABLE 0x00001db4 +#define NV40TCL_LINE_STIPPLE_PATTERN 0x00001db8 +#define NV40TCL_LINE_STIPPLE_PATTERN_FACTOR_SHIFT 0 +#define NV40TCL_LINE_STIPPLE_PATTERN_FACTOR_MASK 0x0000ffff +#define NV40TCL_LINE_STIPPLE_PATTERN_PATTERN_SHIFT 16 +#define NV40TCL_LINE_STIPPLE_PATTERN_PATTERN_MASK 0xffff0000 +#define NV40TCL_VTX_ATTR_1F(x) (0x00001e40+((x)*4)) +#define NV40TCL_VTX_ATTR_1F__SIZE 0x00000010 +#define NV40TCL_VP_UPLOAD_FROM_ID 0x00001e9c +#define NV40TCL_VP_START_FROM_ID 0x00001ea0 +#define NV40TCL_POINT_SIZE 0x00001ee0 +#define NV40TCL_POINT_SPRITE 0x00001ee8 +#define NV40TCL_VP_UPLOAD_CONST_ID 0x00001efc +#define NV40TCL_VP_UPLOAD_CONST_X(x) (0x00001f00+((x)*16)) +#define NV40TCL_VP_UPLOAD_CONST_X__SIZE 0x00000004 +#define NV40TCL_VP_UPLOAD_CONST_Y(x) (0x00001f04+((x)*16)) +#define NV40TCL_VP_UPLOAD_CONST_Y__SIZE 0x00000004 +#define NV40TCL_VP_UPLOAD_CONST_Z(x) (0x00001f08+((x)*16)) +#define NV40TCL_VP_UPLOAD_CONST_Z__SIZE 0x00000004 +#define NV40TCL_VP_UPLOAD_CONST_W(x) (0x00001f0c+((x)*16)) +#define NV40TCL_VP_UPLOAD_CONST_W__SIZE 0x00000004 +#define NV40TCL_TEX_CACHE_CTL 0x00001fd8 +#define NV40TCL_VP_ATTRIB_EN 0x00001ff0 +#define NV40TCL_VP_RESULT_EN 0x00001ff4 + + +#define NV44TCL 0x00004497 + + + +#define NV50_2D 0x0000502d + +#define NV50_2D_NOP 0x00000100 +#define NV50_2D_NOTIFY 0x00000104 +#define NV50_2D_DMA_NOTIFY 0x00000180 +#define NV50_2D_DMA_IN_MEMORY0 0x00000184 +#define NV50_2D_DMA_IN_MEMORY1 0x00000188 +#define NV50_2D_DMA_IN_MEMORY2 0x0000018c +#define NV50_2D_DST_FORMAT 0x00000200 +#define NV50_2D_DST_FORMAT_32BPP 0x000000cf +#define NV50_2D_DST_FORMAT_24BPP 0x000000e6 +#define NV50_2D_DST_FORMAT_16BPP 0x000000e8 +#define NV50_2D_DST_FORMAT_8BPP 0x000000f3 +#define NV50_2D_DST_FORMAT_15BPP 0x000000f8 +#define NV50_2D_DST_PITCH 0x00000214 +#define NV50_2D_DST_WIDTH 0x00000218 +#define NV50_2D_DST_HEIGHT 0x0000021c +#define NV50_2D_DST_ADDRESS_HIGH 0x00000220 +#define NV50_2D_DST_ADDRESS_LOW 0x00000224 +#define NV50_2D_SRC_FORMAT 0x00000230 +#define NV50_2D_SRC_FORMAT_32BPP 0x000000cf +#define NV50_2D_SRC_FORMAT_24BPP 0x000000e6 +#define NV50_2D_SRC_FORMAT_16BPP 0x000000e8 +#define NV50_2D_SRC_FORMAT_8BPP 0x000000f3 +#define NV50_2D_SRC_FORMAT_15BPP 0x000000f8 +#define NV50_2D_SRC_PITCH 0x00000244 +#define NV50_2D_SRC_WIDTH 0x00000248 +#define NV50_2D_SRC_HEIGHT 0x0000024c +#define NV50_2D_SRC_ADDRESS_HIGH 0x00000250 +#define NV50_2D_SRC_ADDRESS_LOW 0x00000254 +#define NV50_2D_CLIP_X 0x00000280 +#define NV50_2D_CLIP_Y 0x00000284 +#define NV50_2D_CLIP_Z 0x00000288 +#define NV50_2D_CLIP_W 0x0000028c +#define NV50_2D_ROP 0x000002a0 +#define NV50_2D_OPERATION 0x000002ac +#define NV50_2D_OPERATION_SRCCOPY_AND 0x00000000 +#define NV50_2D_OPERATION_ROP_AND 0x00000001 +#define NV50_2D_OPERATION_BLEND_AND 0x00000002 +#define NV50_2D_OPERATION_SRCCOPY 0x00000003 +#define NV50_2D_OPERATION_SRCCOPY_PREMULT 0x00000004 +#define NV50_2D_OPERATION_BLEND_PREMULT 0x00000005 +#define NV50_2D_PATTERN_FORMAT 0x000002e8 +#define NV50_2D_PATTERN_FORMAT_16BPP 0x00000000 +#define NV50_2D_PATTERN_FORMAT_15BPP 0x00000001 +#define NV50_2D_PATTERN_FORMAT_32BPP 0x00000002 +#define NV50_2D_PATTERN_FORMAT_8BPP 0x00000003 +#define 
NV50_2D_PATTERN_COLOR(x) (0x000002f0+((x)*4)) +#define NV50_2D_PATTERN_COLOR__SIZE 0x00000002 +#define NV50_2D_PATTERN_BITMAP(x) (0x000002f8+((x)*4)) +#define NV50_2D_PATTERN_BITMAP__SIZE 0x00000002 +#define NV50_2D_RECT_FORMAT 0x00000584 +#define NV50_2D_RECT_FORMAT_32BPP 0x000000cf +#define NV50_2D_RECT_FORMAT_24BPP 0x000000e6 +#define NV50_2D_RECT_FORMAT_16BPP 0x000000e8 +#define NV50_2D_RECT_FORMAT_8BPP 0x000000f3 +#define NV50_2D_RECT_FORMAT_15BPP 0x000000f8 +#define NV50_2D_RECT_COLOR 0x00000588 +#define NV50_2D_RECT_X1 0x00000600 +#define NV50_2D_RECT_Y1 0x00000604 +#define NV50_2D_RECT_X2 0x00000608 +#define NV50_2D_RECT_Y2 0x0000060c +#define NV50_2D_SIFC_UNK0800 0x00000800 +#define NV50_2D_SIFC_FORMAT 0x00000804 +#define NV50_2D_SIFC_FORMAT_32BPP 0x000000cf +#define NV50_2D_SIFC_FORMAT_24BPP 0x000000e6 +#define NV50_2D_SIFC_FORMAT_16BPP 0x000000e8 +#define NV50_2D_SIFC_FORMAT_8BPP 0x000000f3 +#define NV50_2D_SIFC_FORMAT_15BPP 0x000000f8 +#define NV50_2D_SIFC_WIDTH 0x00000838 +#define NV50_2D_SIFC_HEIGHT 0x0000083c +#define NV50_2D_SIFC_SCALE_UNK0840 0x00000840 +#define NV50_2D_SIFC_SCALE_UNK0844 0x00000844 +#define NV50_2D_SIFC_SCALE_UNK0848 0x00000848 +#define NV50_2D_SIFC_SCALE_UNK084C 0x0000084c +#define NV50_2D_SIFC_UNK0850 0x00000850 +#define NV50_2D_SIFC_DST_X 0x00000854 +#define NV50_2D_SIFC_UNK0858 0x00000858 +#define NV50_2D_SIFC_DST_Y 0x0000085c +#define NV50_2D_SIFC_DATA 0x00000860 +#define NV50_2D_BLIT_DST_X 0x000008b0 +#define NV50_2D_BLIT_DST_Y 0x000008b4 +#define NV50_2D_BLIT_DST_W 0x000008b8 +#define NV50_2D_BLIT_DST_H 0x000008bc +#define NV50_2D_BLIT_SRC_X 0x000008d4 +#define NV50_2D_BLIT_SRC_Y 0x000008dc + + +#define NV50_MEMORY_TO_MEMORY_FORMAT 0x00005039 + +#define NV50_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN_HIGH 0x00000238 +#define NV50_MEMORY_TO_MEMORY_FORMAT_OFFSET_OUT_HIGH 0x0000023c + + +#define NV50TCL 0x00005097 + +#define NV50TCL_NOP 0x00000100 +#define NV50TCL_NOTIFY 0x00000104 +#define NV50TCL_DMA_NOTIFY 0x00000180 +#define NV50TCL_DMA_UNK0(x) (0x00000184+((x)*4)) +#define NV50TCL_DMA_UNK0__SIZE 0x0000000b +#define NV50TCL_DMA_UNK1(x) (0x000001c0+((x)*4)) +#define NV50TCL_DMA_UNK1__SIZE 0x00000008 +#define NV50TCL_RT_ADDRESS_HIGH(x) (0x00000200+((x)*32)) +#define NV50TCL_RT_ADDRESS_HIGH__SIZE 0x00000008 +#define NV50TCL_RT_ADDRESS_LOW(x) (0x00000204+((x)*32)) +#define NV50TCL_RT_ADDRESS_LOW__SIZE 0x00000008 +#define NV50TCL_RT_FORMAT(x) (0x00000208+((x)*32)) +#define NV50TCL_RT_FORMAT__SIZE 0x00000008 +#define NV50TCL_RT_FORMAT_32BPP 0x000000cf +#define NV50TCL_RT_FORMAT_24BPP 0x000000e6 +#define NV50TCL_RT_FORMAT_16BPP 0x000000e8 +#define NV50TCL_RT_FORMAT_8BPP 0x000000f3 +#define NV50TCL_RT_FORMAT_15BPP 0x000000f8 +#define NV50TCL_RT_TILE_UNK(x) (0x0000020c+((x)*32)) +#define NV50TCL_RT_TILE_UNK__SIZE 0x00000008 +#define NV50TCL_RT_UNK4(x) (0x00000210+((x)*32)) +#define NV50TCL_RT_UNK4__SIZE 0x00000008 +#define NV50TCL_VTX_ATTR_1F(x) (0x00000300+((x)*4)) +#define NV50TCL_VTX_ATTR_1F__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_2F_X(x) (0x00000380+((x)*8)) +#define NV50TCL_VTX_ATTR_2F_X__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_2F_Y(x) (0x00000384+((x)*8)) +#define NV50TCL_VTX_ATTR_2F_Y__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_3F_X(x) (0x00000400+((x)*16)) +#define NV50TCL_VTX_ATTR_3F_X__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_3F_Y(x) (0x00000404+((x)*16)) +#define NV50TCL_VTX_ATTR_3F_Y__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_3F_Z(x) (0x00000408+((x)*16)) +#define NV50TCL_VTX_ATTR_3F_Z__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_3F_W(x) 
(0x0000040c+((x)*16)) +#define NV50TCL_VTX_ATTR_3F_W__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_4F_X(x) (0x00000500+((x)*16)) +#define NV50TCL_VTX_ATTR_4F_X__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_4F_Y(x) (0x00000504+((x)*16)) +#define NV50TCL_VTX_ATTR_4F_Y__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_4F_Z(x) (0x00000508+((x)*16)) +#define NV50TCL_VTX_ATTR_4F_Z__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_4F_W(x) (0x0000050c+((x)*16)) +#define NV50TCL_VTX_ATTR_4F_W__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_2I(x) (0x00000680+((x)*4)) +#define NV50TCL_VTX_ATTR_2I__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_2I_X_SHIFT 0 +#define NV50TCL_VTX_ATTR_2I_X_MASK 0x0000ffff +#define NV50TCL_VTX_ATTR_2I_Y_SHIFT 16 +#define NV50TCL_VTX_ATTR_2I_Y_MASK 0xffff0000 +#define NV50TCL_VTX_ATTR_4I_0(x) (0x00000700+((x)*8)) +#define NV50TCL_VTX_ATTR_4I_0__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_4I_0_X_SHIFT 0 +#define NV50TCL_VTX_ATTR_4I_0_X_MASK 0x0000ffff +#define NV50TCL_VTX_ATTR_4I_0_Y_SHIFT 16 +#define NV50TCL_VTX_ATTR_4I_0_Y_MASK 0xffff0000 +#define NV50TCL_VTX_ATTR_4I_1(x) (0x00000704+((x)*8)) +#define NV50TCL_VTX_ATTR_4I_1__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_4I_1_Z_SHIFT 0 +#define NV50TCL_VTX_ATTR_4I_1_Z_MASK 0x0000ffff +#define NV50TCL_VTX_ATTR_4I_1_W_SHIFT 16 +#define NV50TCL_VTX_ATTR_4I_1_W_MASK 0xffff0000 +#define NV50TCL_VTX_ATTR_4NI_0(x) (0x00000780+((x)*8)) +#define NV50TCL_VTX_ATTR_4NI_0__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_4NI_0_X_SHIFT 0 +#define NV50TCL_VTX_ATTR_4NI_0_X_MASK 0x0000ffff +#define NV50TCL_VTX_ATTR_4NI_0_Y_SHIFT 16 +#define NV50TCL_VTX_ATTR_4NI_0_Y_MASK 0xffff0000 +#define NV50TCL_VTX_ATTR_4NI_1(x) (0x00000784+((x)*8)) +#define NV50TCL_VTX_ATTR_4NI_1__SIZE 0x00000010 +#define NV50TCL_VTX_ATTR_4NI_1_Z_SHIFT 0 +#define NV50TCL_VTX_ATTR_4NI_1_Z_MASK 0x0000ffff +#define NV50TCL_VTX_ATTR_4NI_1_W_SHIFT 16 +#define NV50TCL_VTX_ATTR_4NI_1_W_MASK 0xffff0000 +#define NV50TCL_VERTEX_ARRAY_FORMAT(x) (0x00000900+((x)*16)) +#define NV50TCL_VERTEX_ARRAY_FORMAT__SIZE 0x00000010 +#define NV50TCL_VIEWPORT_UNK0(x) (0x00000a00+((x)*4)) +#define NV50TCL_VIEWPORT_UNK0__SIZE 0x00000003 +#define NV50TCL_VIEWPORT_UNK1(x) (0x00000a0c+((x)*4)) +#define NV50TCL_VIEWPORT_UNK1__SIZE 0x00000003 +#define NV50TCL_VIEWPORT_HORIZ 0x00000c00 +#define NV50TCL_VIEWPORT_HORIZ_X_SHIFT 0 +#define NV50TCL_VIEWPORT_HORIZ_X_MASK 0x0000ffff +#define NV50TCL_VIEWPORT_HORIZ_W_SHIFT 16 +#define NV50TCL_VIEWPORT_HORIZ_W_MASK 0xffff0000 +#define NV50TCL_VIEWPORT_VERT 0x00000c04 +#define NV50TCL_VIEWPORT_VERT_Y_SHIFT 0 +#define NV50TCL_VIEWPORT_VERT_Y_MASK 0x0000ffff +#define NV50TCL_VIEWPORT_VERT_H_SHIFT 16 +#define NV50TCL_VIEWPORT_VERT_H_MASK 0xffff0000 +#define NV50TCL_DEPTH_RANGE_NEAR 0x00000c08 +#define NV50TCL_DEPTH_RANGE_FAR 0x00000c0c +#define NV50TCL_VIEWPORT_CLIP_HORIZ(x) (0x00000d00+((x)*8)) +#define NV50TCL_VIEWPORT_CLIP_HORIZ__SIZE 0x00000008 +#define NV50TCL_VIEWPORT_CLIP_VERT(x) (0x00000d04+((x)*8)) +#define NV50TCL_VIEWPORT_CLIP_VERT__SIZE 0x00000008 +#define NV50TCL_VERTEX_BUFFER_FIRST 0x00000d74 +#define NV50TCL_VERTEX_BUFFER_COUNT 0x00000d78 +#define NV50TCL_CLEAR_COLOR(x) (0x00000d80+((x)*4)) +#define NV50TCL_CLEAR_COLOR__SIZE 0x00000004 +#define NV50TCL_CLEAR_DEPTH 0x00000d90 +#define NV50TCL_CLEAR_STENCIL 0x00000da0 +#define NV50TCL_POLYGON_MODE_FRONT 0x00000dac +#define NV50TCL_POLYGON_MODE_FRONT_POINT 0x00001b00 +#define NV50TCL_POLYGON_MODE_FRONT_LINE 0x00001b01 +#define NV50TCL_POLYGON_MODE_FRONT_FILL 0x00001b02 +#define NV50TCL_POLYGON_MODE_BACK 0x00000db0 +#define 
NV50TCL_POLYGON_MODE_BACK_POINT 0x00001b00 +#define NV50TCL_POLYGON_MODE_BACK_LINE 0x00001b01 +#define NV50TCL_POLYGON_MODE_BACK_FILL 0x00001b02 +#define NV50TCL_POLYGON_SMOOTH_ENABLE 0x00000db4 +#define NV50TCL_POLYGON_OFFSET_POINT_ENABLE 0x00000dc0 +#define NV50TCL_POLYGON_OFFSET_LINE_ENABLE 0x00000dc4 +#define NV50TCL_POLYGON_OFFSET_FILL_ENABLE 0x00000dc8 +#define NV50TCL_SCISSOR_HORIZ 0x00000e04 +#define NV50TCL_SCISSOR_HORIZ_L_SHIFT 0 +#define NV50TCL_SCISSOR_HORIZ_L_MASK 0x0000ffff +#define NV50TCL_SCISSOR_HORIZ_R_SHIFT 16 +#define NV50TCL_SCISSOR_HORIZ_R_MASK 0xffff0000 +#define NV50TCL_SCISSOR_VERT 0x00000e08 +#define NV50TCL_SCISSOR_VERT_T_SHIFT 0 +#define NV50TCL_SCISSOR_VERT_T_MASK 0x0000ffff +#define NV50TCL_SCISSOR_VERT_B_SHIFT 16 +#define NV50TCL_SCISSOR_VERT_B_MASK 0xffff0000 +#define NV50TCL_CB_ADDR 0x00000f00 +#define NV50TCL_CB_ADDR_ID_SHIFT 8 +#define NV50TCL_CB_ADDR_ID_MASK 0xffffff00 +#define NV50TCL_CB_ADDR_BUFFER_SHIFT 0 +#define NV50TCL_CB_ADDR_BUFFER_MASK 0x000000ff +#define NV50TCL_CB_DATA(x) (0x00000f04+((x)*4)) +#define NV50TCL_CB_DATA__SIZE 0x00000010 +#define NV50TCL_STENCIL_FRONT_FUNC_REF 0x00000f54 +#define NV50TCL_STENCIL_FRONT_MASK 0x00000f58 +#define NV50TCL_STENCIL_FRONT_FUNC_MASK 0x00000f5c +#define NV50TCL_GP_ADDRESS_HIGH 0x00000f70 +#define NV50TCL_GP_ADDRESS_LOW 0x00000f74 +#define NV50TCL_VP_ADDRESS_HIGH 0x00000f7c +#define NV50TCL_VP_ADDRESS_LOW 0x00000f80 +#define NV50TCL_FP_ADDRESS_HIGH 0x00000fa4 +#define NV50TCL_FP_ADDRESS_LOW 0x00000fa8 +#define NV50TCL_ZETA_ADDRESS_HIGH 0x00000fe0 +#define NV50TCL_ZETA_ADDRESS_LOW 0x00000fe4 +#define NV50TCL_UNKFF4 0x00000ff4 +#define NV50TCL_UNKFF4_W_SHIFT 16 +#define NV50TCL_UNKFF4_W_MASK 0xffff0000 +#define NV50TCL_UNKFF8 0x00000ff8 +#define NV50TCL_UNKFF8_H_SHIFT 16 +#define NV50TCL_UNKFF8_H_MASK 0xffff0000 +#define NV50TCL_RT_HORIZ(x) (0x00001240+((x)*8)) +#define NV50TCL_RT_HORIZ__SIZE 0x00000008 +#define NV50TCL_RT_VERT(x) (0x00001244+((x)*8)) +#define NV50TCL_RT_VERT__SIZE 0x00000008 +#define NV50TCL_CB_DEF_ADDRESS_HIGH 0x00001280 +#define NV50TCL_CB_DEF_ADDRESS_LOW 0x00001284 +#define NV50TCL_CB_DEF_SET 0x00001288 +#define NV50TCL_CB_DEF_SET_SIZE_SHIFT 0 +#define NV50TCL_CB_DEF_SET_SIZE_MASK 0x0000ffff +#define NV50TCL_CB_DEF_SET_BUFFER_SHIFT 16 +#define NV50TCL_CB_DEF_SET_BUFFER_MASK 0xffff0000 +#define NV50TCL_DEPTH_TEST_ENABLE 0x000012cc +#define NV50TCL_SHADE_MODEL 0x000012d4 +#define NV50TCL_SHADE_MODEL_FLAT 0x00001d00 +#define NV50TCL_SHADE_MODEL_SMOOTH 0x00001d01 +#define NV50TCL_DEPTH_WRITE_ENABLE 0x000012e8 +#define NV50TCL_ALPHA_TEST_ENABLE 0x000012ec +#define NV50TCL_DEPTH_TEST_FUNC 0x0000130c +#define NV50TCL_DEPTH_TEST_FUNC_NEVER 0x00000200 +#define NV50TCL_DEPTH_TEST_FUNC_LESS 0x00000201 +#define NV50TCL_DEPTH_TEST_FUNC_EQUAL 0x00000202 +#define NV50TCL_DEPTH_TEST_FUNC_LEQUAL 0x00000203 +#define NV50TCL_DEPTH_TEST_FUNC_GREATER 0x00000204 +#define NV50TCL_DEPTH_TEST_FUNC_GREATER 0x00000204 +#define NV50TCL_DEPTH_TEST_FUNC_NOTEQUAL 0x00000205 +#define NV50TCL_DEPTH_TEST_FUNC_GEQUAL 0x00000206 +#define NV50TCL_DEPTH_TEST_FUNC_ALWAYS 0x00000207 +#define NV50TCL_ALPHA_TEST_REF 0x00001310 +#define NV50TCL_ALPHA_TEST_FUNC 0x00001314 +#define NV50TCL_ALPHA_TEST_FUNC_NEVER 0x00000200 +#define NV50TCL_ALPHA_TEST_FUNC_LESS 0x00000201 +#define NV50TCL_ALPHA_TEST_FUNC_EQUAL 0x00000202 +#define NV50TCL_ALPHA_TEST_FUNC_LEQUAL 0x00000203 +#define NV50TCL_ALPHA_TEST_FUNC_GREATER 0x00000204 +#define NV50TCL_ALPHA_TEST_FUNC_GREATER 0x00000204 +#define NV50TCL_ALPHA_TEST_FUNC_NOTEQUAL 0x00000205 +#define 
NV50TCL_ALPHA_TEST_FUNC_GEQUAL 0x00000206 +#define NV50TCL_ALPHA_TEST_FUNC_ALWAYS 0x00000207 +#define NV50TCL_BLEND_COLOR(x) (0x0000131c+((x)*4)) +#define NV50TCL_BLEND_COLOR__SIZE 0x00000004 +#define NV50TCL_BLEND_EQUATION_RGB 0x00001340 +#define NV50TCL_BLEND_EQUATION_RGB_FUNC_ADD 0x00008006 +#define NV50TCL_BLEND_EQUATION_RGB_MIN 0x00008007 +#define NV50TCL_BLEND_EQUATION_RGB_MAX 0x00008008 +#define NV50TCL_BLEND_EQUATION_RGB_FUNC_SUBTRACT 0x0000800a +#define NV50TCL_BLEND_EQUATION_RGB_FUNC_REVERSE_SUBTRACT 0x0000800b +#define NV50TCL_BLEND_FUNC_SRC_RGB 0x00001344 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ZERO 0x00000000 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE 0x00000001 +#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC_COLOR 0x00000300 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_COLOR 0x00000301 +#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA 0x00000302 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_SRC_ALPHA 0x00000303 +#define NV50TCL_BLEND_FUNC_SRC_RGB_DST_ALPHA 0x00000304 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_ALPHA 0x00000305 +#define NV50TCL_BLEND_FUNC_SRC_RGB_DST_COLOR 0x00000306 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_DST_COLOR 0x00000307 +#define NV50TCL_BLEND_FUNC_SRC_RGB_SRC_ALPHA_SATURATE 0x00000308 +#define NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_COLOR 0x00008001 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_COLOR 0x00008002 +#define NV50TCL_BLEND_FUNC_SRC_RGB_CONSTANT_ALPHA 0x00008003 +#define NV50TCL_BLEND_FUNC_SRC_RGB_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV50TCL_BLEND_FUNC_DST_RGB 0x00001348 +#define NV50TCL_BLEND_FUNC_DST_RGB_ZERO 0x00000000 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE 0x00000001 +#define NV50TCL_BLEND_FUNC_DST_RGB_SRC_COLOR 0x00000300 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_COLOR 0x00000301 +#define NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA 0x00000302 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_SRC_ALPHA 0x00000303 +#define NV50TCL_BLEND_FUNC_DST_RGB_DST_ALPHA 0x00000304 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_ALPHA 0x00000305 +#define NV50TCL_BLEND_FUNC_DST_RGB_DST_COLOR 0x00000306 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_DST_COLOR 0x00000307 +#define NV50TCL_BLEND_FUNC_DST_RGB_SRC_ALPHA_SATURATE 0x00000308 +#define NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_COLOR 0x00008001 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_COLOR 0x00008002 +#define NV50TCL_BLEND_FUNC_DST_RGB_CONSTANT_ALPHA 0x00008003 +#define NV50TCL_BLEND_FUNC_DST_RGB_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV50TCL_BLEND_EQUATION_ALPHA 0x0000134c +#define NV50TCL_BLEND_EQUATION_ALPHA_FUNC_ADD 0x00008006 +#define NV50TCL_BLEND_EQUATION_ALPHA_MIN 0x00008007 +#define NV50TCL_BLEND_EQUATION_ALPHA_MAX 0x00008008 +#define NV50TCL_BLEND_EQUATION_ALPHA_FUNC_SUBTRACT 0x0000800a +#define NV50TCL_BLEND_EQUATION_ALPHA_FUNC_REVERSE_SUBTRACT 0x0000800b +#define NV50TCL_BLEND_FUNC_SRC_ALPHA 0x00001350 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ZERO 0x00000000 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE 0x00000001 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_COLOR 0x00000300 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_COLOR 0x00000301 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA 0x00000302 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_SRC_ALPHA 0x00000303 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_ALPHA 0x00000304 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_ALPHA 0x00000305 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_DST_COLOR 0x00000306 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_DST_COLOR 0x00000307 +#define 
NV50TCL_BLEND_FUNC_SRC_ALPHA_SRC_ALPHA_SATURATE 0x00000308 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_COLOR 0x00008001 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x00008002 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_CONSTANT_ALPHA 0x00008003 +#define NV50TCL_BLEND_FUNC_SRC_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV50TCL_BLEND_FUNC_DST_ALPHA 0x00001358 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ZERO 0x00000000 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE 0x00000001 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_COLOR 0x00000300 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_COLOR 0x00000301 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA 0x00000302 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_SRC_ALPHA 0x00000303 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_DST_ALPHA 0x00000304 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_ALPHA 0x00000305 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_DST_COLOR 0x00000306 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_DST_COLOR 0x00000307 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_SRC_ALPHA_SATURATE 0x00000308 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_COLOR 0x00008001 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_COLOR 0x00008002 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_CONSTANT_ALPHA 0x00008003 +#define NV50TCL_BLEND_FUNC_DST_ALPHA_ONE_MINUS_CONSTANT_ALPHA 0x00008004 +#define NV50TCL_BLEND_ENABLE(x) (0x00001360+((x)*4)) +#define NV50TCL_BLEND_ENABLE__SIZE 0x00000008 +#define NV50TCL_STENCIL_BACK_ENABLE 0x00001380 +#define NV50TCL_STENCIL_BACK_OP_FAIL 0x00001384 +#define NV50TCL_STENCIL_BACK_OP_FAIL_ZERO 0x00000000 +#define NV50TCL_STENCIL_BACK_OP_FAIL_INVERT 0x0000150a +#define NV50TCL_STENCIL_BACK_OP_FAIL_KEEP 0x00001e00 +#define NV50TCL_STENCIL_BACK_OP_FAIL_REPLACE 0x00001e01 +#define NV50TCL_STENCIL_BACK_OP_FAIL_INCR 0x00001e02 +#define NV50TCL_STENCIL_BACK_OP_FAIL_DECR 0x00001e03 +#define NV50TCL_STENCIL_BACK_OP_FAIL_INCR_WRAP 0x00008507 +#define NV50TCL_STENCIL_BACK_OP_FAIL_DECR_WRAP 0x00008508 +#define NV50TCL_STENCIL_BACK_OP_ZFAIL 0x00001388 +#define NV50TCL_STENCIL_BACK_OP_ZFAIL_ZERO 0x00000000 +#define NV50TCL_STENCIL_BACK_OP_ZFAIL_INVERT 0x0000150a +#define NV50TCL_STENCIL_BACK_OP_ZFAIL_KEEP 0x00001e00 +#define NV50TCL_STENCIL_BACK_OP_ZFAIL_REPLACE 0x00001e01 +#define NV50TCL_STENCIL_BACK_OP_ZFAIL_INCR 0x00001e02 +#define NV50TCL_STENCIL_BACK_OP_ZFAIL_DECR 0x00001e03 +#define NV50TCL_STENCIL_BACK_OP_ZFAIL_INCR_WRAP 0x00008507 +#define NV50TCL_STENCIL_BACK_OP_ZFAIL_DECR_WRAP 0x00008508 +#define NV50TCL_STENCIL_BACK_OP_ZPASS 0x0000138c +#define NV50TCL_STENCIL_BACK_OP_ZPASS_ZERO 0x00000000 +#define NV50TCL_STENCIL_BACK_OP_ZPASS_INVERT 0x0000150a +#define NV50TCL_STENCIL_BACK_OP_ZPASS_KEEP 0x00001e00 +#define NV50TCL_STENCIL_BACK_OP_ZPASS_REPLACE 0x00001e01 +#define NV50TCL_STENCIL_BACK_OP_ZPASS_INCR 0x00001e02 +#define NV50TCL_STENCIL_BACK_OP_ZPASS_DECR 0x00001e03 +#define NV50TCL_STENCIL_BACK_OP_ZPASS_INCR_WRAP 0x00008507 +#define NV50TCL_STENCIL_BACK_OP_ZPASS_DECR_WRAP 0x00008508 +#define NV50TCL_STENCIL_BACK_FUNC_FUNC 0x00001390 +#define NV50TCL_STENCIL_BACK_FUNC_FUNC_NEVER 0x00000200 +#define NV50TCL_STENCIL_BACK_FUNC_FUNC_LESS 0x00000201 +#define NV50TCL_STENCIL_BACK_FUNC_FUNC_EQUAL 0x00000202 +#define NV50TCL_STENCIL_BACK_FUNC_FUNC_LEQUAL 0x00000203 +#define NV50TCL_STENCIL_BACK_FUNC_FUNC_GREATER 0x00000204 +#define NV50TCL_STENCIL_BACK_FUNC_FUNC_GREATER 0x00000204 +#define NV50TCL_STENCIL_BACK_FUNC_FUNC_NOTEQUAL 0x00000205 +#define NV50TCL_STENCIL_BACK_FUNC_FUNC_GEQUAL 0x00000206 +#define 
NV50TCL_STENCIL_BACK_FUNC_FUNC_ALWAYS 0x00000207 +#define NV50TCL_STENCIL_BACK_FUNC_REF 0x00001394 +#define NV50TCL_STENCIL_BACK_MASK 0x00001398 +#define NV50TCL_STENCIL_BACK_FUNC_MASK 0x0000139c +#define NV50TCL_LINE_WIDTH 0x000013b0 +#define NV50TCL_VP_START_ID 0x0000140c +#define NV50TCL_GP_START_ID 0x00001410 +#define NV50TCL_FP_START_ID 0x00001414 +#define NV50TCL_POINT_SIZE 0x00001518 +#define NV50TCL_TSC_ADDRESS_HIGH 0x0000155c +#define NV50TCL_TSC_ADDRESS_LOW 0x00001560 +#define NV50TCL_POLYGON_OFFSET_FACTOR 0x0000156c +#define NV50TCL_LINE_SMOOTH_ENABLE 0x00001570 +#define NV50TCL_TIC_ADDRESS_HIGH 0x00001574 +#define NV50TCL_TIC_ADDRESS_LOW 0x00001578 +#define NV50TCL_STENCIL_FRONT_ENABLE 0x00001594 +#define NV50TCL_STENCIL_FRONT_OP_FAIL 0x00001598 +#define NV50TCL_STENCIL_FRONT_OP_FAIL_ZERO 0x00000000 +#define NV50TCL_STENCIL_FRONT_OP_FAIL_INVERT 0x0000150a +#define NV50TCL_STENCIL_FRONT_OP_FAIL_KEEP 0x00001e00 +#define NV50TCL_STENCIL_FRONT_OP_FAIL_REPLACE 0x00001e01 +#define NV50TCL_STENCIL_FRONT_OP_FAIL_INCR 0x00001e02 +#define NV50TCL_STENCIL_FRONT_OP_FAIL_DECR 0x00001e03 +#define NV50TCL_STENCIL_FRONT_OP_FAIL_INCR_WRAP 0x00008507 +#define NV50TCL_STENCIL_FRONT_OP_FAIL_DECR_WRAP 0x00008508 +#define NV50TCL_STENCIL_FRONT_OP_ZFAIL 0x0000159c +#define NV50TCL_STENCIL_FRONT_OP_ZFAIL_ZERO 0x00000000 +#define NV50TCL_STENCIL_FRONT_OP_ZFAIL_INVERT 0x0000150a +#define NV50TCL_STENCIL_FRONT_OP_ZFAIL_KEEP 0x00001e00 +#define NV50TCL_STENCIL_FRONT_OP_ZFAIL_REPLACE 0x00001e01 +#define NV50TCL_STENCIL_FRONT_OP_ZFAIL_INCR 0x00001e02 +#define NV50TCL_STENCIL_FRONT_OP_ZFAIL_DECR 0x00001e03 +#define NV50TCL_STENCIL_FRONT_OP_ZFAIL_INCR_WRAP 0x00008507 +#define NV50TCL_STENCIL_FRONT_OP_ZFAIL_DECR_WRAP 0x00008508 +#define NV50TCL_STENCIL_FRONT_OP_ZPASS 0x000015a0 +#define NV50TCL_STENCIL_FRONT_OP_ZPASS_ZERO 0x00000000 +#define NV50TCL_STENCIL_FRONT_OP_ZPASS_INVERT 0x0000150a +#define NV50TCL_STENCIL_FRONT_OP_ZPASS_KEEP 0x00001e00 +#define NV50TCL_STENCIL_FRONT_OP_ZPASS_REPLACE 0x00001e01 +#define NV50TCL_STENCIL_FRONT_OP_ZPASS_INCR 0x00001e02 +#define NV50TCL_STENCIL_FRONT_OP_ZPASS_DECR 0x00001e03 +#define NV50TCL_STENCIL_FRONT_OP_ZPASS_INCR_WRAP 0x00008507 +#define NV50TCL_STENCIL_FRONT_OP_ZPASS_DECR_WRAP 0x00008508 +#define NV50TCL_STENCIL_FRONT_FUNC_FUNC 0x000015a4 +#define NV50TCL_STENCIL_FRONT_FUNC_FUNC_NEVER 0x00000200 +#define NV50TCL_STENCIL_FRONT_FUNC_FUNC_LESS 0x00000201 +#define NV50TCL_STENCIL_FRONT_FUNC_FUNC_EQUAL 0x00000202 +#define NV50TCL_STENCIL_FRONT_FUNC_FUNC_LEQUAL 0x00000203 +#define NV50TCL_STENCIL_FRONT_FUNC_FUNC_GREATER 0x00000204 +#define NV50TCL_STENCIL_FRONT_FUNC_FUNC_GREATER 0x00000204 +#define NV50TCL_STENCIL_FRONT_FUNC_FUNC_NOTEQUAL 0x00000205 +#define NV50TCL_STENCIL_FRONT_FUNC_FUNC_GEQUAL 0x00000206 +#define NV50TCL_STENCIL_FRONT_FUNC_FUNC_ALWAYS 0x00000207 +#define NV50TCL_POLYGON_OFFSET_UNITS 0x000015bc +#define NV50TCL_VERTEX_BEGIN 0x000015dc +#define NV50TCL_VERTEX_BEGIN_POINTS 0x00000000 +#define NV50TCL_VERTEX_BEGIN_LINES 0x00000001 +#define NV50TCL_VERTEX_BEGIN_LINE_LOOP 0x00000002 +#define NV50TCL_VERTEX_BEGIN_LINE_STRIP 0x00000003 +#define NV50TCL_VERTEX_BEGIN_TRIANGLES 0x00000004 +#define NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP 0x00000005 +#define NV50TCL_VERTEX_BEGIN_TRIANGLE_FAN 0x00000006 +#define NV50TCL_VERTEX_BEGIN_QUADS 0x00000007 +#define NV50TCL_VERTEX_BEGIN_QUAD_STRIP 0x00000008 +#define NV50TCL_VERTEX_BEGIN_POLYGON 0x00000009 +#define NV50TCL_VERTEX_END 0x000015e0 +#define NV50TCL_VERTEX_DATA 0x00001640 +#define NV50TCL_VP_ATTR_EN_0 0x00001650 
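The *_SHIFT/*_MASK pairs in the register header above (for example NV50TCL_SCISSOR_HORIZ_L_SHIFT/NV50TCL_SCISSOR_HORIZ_L_MASK and the matching _R_ pair) describe bit-fields packed into a single 32-bit method word. A minimal sketch of how such a word might be assembled, assuming only the definitions above are in scope; the helper name is hypothetical and not part of this commit:

#include <stdint.h>

/* Hypothetical helper: pack the left/right scissor bounds into the one
 * 32-bit value written to NV50TCL_SCISSOR_HORIZ, using the shift/mask
 * pairs defined in the register header above. */
static inline uint32_t
nv50_pack_scissor_horiz(uint32_t left, uint32_t right)
{
	return ((left << NV50TCL_SCISSOR_HORIZ_L_SHIFT) &
	        NV50TCL_SCISSOR_HORIZ_L_MASK) |
	       ((right << NV50TCL_SCISSOR_HORIZ_R_SHIFT) &
	        NV50TCL_SCISSOR_HORIZ_R_MASK);
}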
+#define NV50TCL_VP_ATTR_EN_0_7_SHIFT 28 +#define NV50TCL_VP_ATTR_EN_0_7_MASK 0xf0000000 +#define NV50TCL_VP_ATTR_EN_0_7_NONE 0x00000000 +#define NV50TCL_VP_ATTR_EN_0_7_XNNN 0x10000000 +#define NV50TCL_VP_ATTR_EN_0_7_NYNN 0x20000000 +#define NV50TCL_VP_ATTR_EN_0_7_XYNN 0x30000000 +#define NV50TCL_VP_ATTR_EN_0_7_NNZN 0x40000000 +#define NV50TCL_VP_ATTR_EN_0_7_XNZN 0x50000000 +#define NV50TCL_VP_ATTR_EN_0_7_NYZN 0x60000000 +#define NV50TCL_VP_ATTR_EN_0_7_XYZN 0x70000000 +#define NV50TCL_VP_ATTR_EN_0_7_NNNW 0x80000000 +#define NV50TCL_VP_ATTR_EN_0_7_XNNW 0x90000000 +#define NV50TCL_VP_ATTR_EN_0_7_NYNW 0xa0000000 +#define NV50TCL_VP_ATTR_EN_0_7_XYNW 0xb0000000 +#define NV50TCL_VP_ATTR_EN_0_7_NNZW 0xc0000000 +#define NV50TCL_VP_ATTR_EN_0_7_XNZW 0xd0000000 +#define NV50TCL_VP_ATTR_EN_0_7_NYZW 0xe0000000 +#define NV50TCL_VP_ATTR_EN_0_7_XYZW 0xf0000000 +#define NV50TCL_VP_ATTR_EN_0_6_SHIFT 24 +#define NV50TCL_VP_ATTR_EN_0_6_MASK 0x0f000000 +#define NV50TCL_VP_ATTR_EN_0_6_NONE 0x00000000 +#define NV50TCL_VP_ATTR_EN_0_6_XNNN 0x01000000 +#define NV50TCL_VP_ATTR_EN_0_6_NYNN 0x02000000 +#define NV50TCL_VP_ATTR_EN_0_6_XYNN 0x03000000 +#define NV50TCL_VP_ATTR_EN_0_6_NNZN 0x04000000 +#define NV50TCL_VP_ATTR_EN_0_6_XNZN 0x05000000 +#define NV50TCL_VP_ATTR_EN_0_6_NYZN 0x06000000 +#define NV50TCL_VP_ATTR_EN_0_6_XYZN 0x07000000 +#define NV50TCL_VP_ATTR_EN_0_6_NNNW 0x08000000 +#define NV50TCL_VP_ATTR_EN_0_6_XNNW 0x09000000 +#define NV50TCL_VP_ATTR_EN_0_6_NYNW 0x0a000000 +#define NV50TCL_VP_ATTR_EN_0_6_XYNW 0x0b000000 +#define NV50TCL_VP_ATTR_EN_0_6_NNZW 0x0c000000 +#define NV50TCL_VP_ATTR_EN_0_6_XNZW 0x0d000000 +#define NV50TCL_VP_ATTR_EN_0_6_NYZW 0x0e000000 +#define NV50TCL_VP_ATTR_EN_0_6_XYZW 0x0f000000 +#define NV50TCL_VP_ATTR_EN_0_5_SHIFT 20 +#define NV50TCL_VP_ATTR_EN_0_5_MASK 0x00f00000 +#define NV50TCL_VP_ATTR_EN_0_5_NONE 0x00000000 +#define NV50TCL_VP_ATTR_EN_0_5_XNNN 0x00100000 +#define NV50TCL_VP_ATTR_EN_0_5_NYNN 0x00200000 +#define NV50TCL_VP_ATTR_EN_0_5_XYNN 0x00300000 +#define NV50TCL_VP_ATTR_EN_0_5_NNZN 0x00400000 +#define NV50TCL_VP_ATTR_EN_0_5_XNZN 0x00500000 +#define NV50TCL_VP_ATTR_EN_0_5_NYZN 0x00600000 +#define NV50TCL_VP_ATTR_EN_0_5_XYZN 0x00700000 +#define NV50TCL_VP_ATTR_EN_0_5_NNNW 0x00800000 +#define NV50TCL_VP_ATTR_EN_0_5_XNNW 0x00900000 +#define NV50TCL_VP_ATTR_EN_0_5_NYNW 0x00a00000 +#define NV50TCL_VP_ATTR_EN_0_5_XYNW 0x00b00000 +#define NV50TCL_VP_ATTR_EN_0_5_NNZW 0x00c00000 +#define NV50TCL_VP_ATTR_EN_0_5_XNZW 0x00d00000 +#define NV50TCL_VP_ATTR_EN_0_5_NYZW 0x00e00000 +#define NV50TCL_VP_ATTR_EN_0_5_XYZW 0x00f00000 +#define NV50TCL_VP_ATTR_EN_0_4_SHIFT 16 +#define NV50TCL_VP_ATTR_EN_0_4_MASK 0x000f0000 +#define NV50TCL_VP_ATTR_EN_0_4_NONE 0x00000000 +#define NV50TCL_VP_ATTR_EN_0_4_XNNN 0x00010000 +#define NV50TCL_VP_ATTR_EN_0_4_NYNN 0x00020000 +#define NV50TCL_VP_ATTR_EN_0_4_XYNN 0x00030000 +#define NV50TCL_VP_ATTR_EN_0_4_NNZN 0x00040000 +#define NV50TCL_VP_ATTR_EN_0_4_XNZN 0x00050000 +#define NV50TCL_VP_ATTR_EN_0_4_NYZN 0x00060000 +#define NV50TCL_VP_ATTR_EN_0_4_XYZN 0x00070000 +#define NV50TCL_VP_ATTR_EN_0_4_NNNW 0x00080000 +#define NV50TCL_VP_ATTR_EN_0_4_XNNW 0x00090000 +#define NV50TCL_VP_ATTR_EN_0_4_NYNW 0x000a0000 +#define NV50TCL_VP_ATTR_EN_0_4_XYNW 0x000b0000 +#define NV50TCL_VP_ATTR_EN_0_4_NNZW 0x000c0000 +#define NV50TCL_VP_ATTR_EN_0_4_XNZW 0x000d0000 +#define NV50TCL_VP_ATTR_EN_0_4_NYZW 0x000e0000 +#define NV50TCL_VP_ATTR_EN_0_4_XYZW 0x000f0000 +#define NV50TCL_VP_ATTR_EN_0_3_SHIFT 12 +#define NV50TCL_VP_ATTR_EN_0_3_MASK 0x0000f000 +#define NV50TCL_VP_ATTR_EN_0_3_NONE 
0x00000000 +#define NV50TCL_VP_ATTR_EN_0_3_XNNN 0x00001000 +#define NV50TCL_VP_ATTR_EN_0_3_NYNN 0x00002000 +#define NV50TCL_VP_ATTR_EN_0_3_XYNN 0x00003000 +#define NV50TCL_VP_ATTR_EN_0_3_NNZN 0x00004000 +#define NV50TCL_VP_ATTR_EN_0_3_XNZN 0x00005000 +#define NV50TCL_VP_ATTR_EN_0_3_NYZN 0x00006000 +#define NV50TCL_VP_ATTR_EN_0_3_XYZN 0x00007000 +#define NV50TCL_VP_ATTR_EN_0_3_NNNW 0x00008000 +#define NV50TCL_VP_ATTR_EN_0_3_XNNW 0x00009000 +#define NV50TCL_VP_ATTR_EN_0_3_NYNW 0x0000a000 +#define NV50TCL_VP_ATTR_EN_0_3_XYNW 0x0000b000 +#define NV50TCL_VP_ATTR_EN_0_3_NNZW 0x0000c000 +#define NV50TCL_VP_ATTR_EN_0_3_XNZW 0x0000d000 +#define NV50TCL_VP_ATTR_EN_0_3_NYZW 0x0000e000 +#define NV50TCL_VP_ATTR_EN_0_3_XYZW 0x0000f000 +#define NV50TCL_VP_ATTR_EN_0_2_SHIFT 8 +#define NV50TCL_VP_ATTR_EN_0_2_MASK 0x00000f00 +#define NV50TCL_VP_ATTR_EN_0_2_NONE 0x00000000 +#define NV50TCL_VP_ATTR_EN_0_2_XNNN 0x00000100 +#define NV50TCL_VP_ATTR_EN_0_2_NYNN 0x00000200 +#define NV50TCL_VP_ATTR_EN_0_2_XYNN 0x00000300 +#define NV50TCL_VP_ATTR_EN_0_2_NNZN 0x00000400 +#define NV50TCL_VP_ATTR_EN_0_2_XNZN 0x00000500 +#define NV50TCL_VP_ATTR_EN_0_2_NYZN 0x00000600 +#define NV50TCL_VP_ATTR_EN_0_2_XYZN 0x00000700 +#define NV50TCL_VP_ATTR_EN_0_2_NNNW 0x00000800 +#define NV50TCL_VP_ATTR_EN_0_2_XNNW 0x00000900 +#define NV50TCL_VP_ATTR_EN_0_2_NYNW 0x00000a00 +#define NV50TCL_VP_ATTR_EN_0_2_XYNW 0x00000b00 +#define NV50TCL_VP_ATTR_EN_0_2_NNZW 0x00000c00 +#define NV50TCL_VP_ATTR_EN_0_2_XNZW 0x00000d00 +#define NV50TCL_VP_ATTR_EN_0_2_NYZW 0x00000e00 +#define NV50TCL_VP_ATTR_EN_0_2_XYZW 0x00000f00 +#define NV50TCL_VP_ATTR_EN_0_1_SHIFT 4 +#define NV50TCL_VP_ATTR_EN_0_1_MASK 0x000000f0 +#define NV50TCL_VP_ATTR_EN_0_1_NONE 0x00000000 +#define NV50TCL_VP_ATTR_EN_0_1_XNNN 0x00000010 +#define NV50TCL_VP_ATTR_EN_0_1_NYNN 0x00000020 +#define NV50TCL_VP_ATTR_EN_0_1_XYNN 0x00000030 +#define NV50TCL_VP_ATTR_EN_0_1_NNZN 0x00000040 +#define NV50TCL_VP_ATTR_EN_0_1_XNZN 0x00000050 +#define NV50TCL_VP_ATTR_EN_0_1_NYZN 0x00000060 +#define NV50TCL_VP_ATTR_EN_0_1_XYZN 0x00000070 +#define NV50TCL_VP_ATTR_EN_0_1_NNNW 0x00000080 +#define NV50TCL_VP_ATTR_EN_0_1_XNNW 0x00000090 +#define NV50TCL_VP_ATTR_EN_0_1_NYNW 0x000000a0 +#define NV50TCL_VP_ATTR_EN_0_1_XYNW 0x000000b0 +#define NV50TCL_VP_ATTR_EN_0_1_NNZW 0x000000c0 +#define NV50TCL_VP_ATTR_EN_0_1_XNZW 0x000000d0 +#define NV50TCL_VP_ATTR_EN_0_1_NYZW 0x000000e0 +#define NV50TCL_VP_ATTR_EN_0_1_XYZW 0x000000f0 +#define NV50TCL_VP_ATTR_EN_0_0_SHIFT 0 +#define NV50TCL_VP_ATTR_EN_0_0_MASK 0x0000000f +#define NV50TCL_VP_ATTR_EN_0_0_NONE 0x00000000 +#define NV50TCL_VP_ATTR_EN_0_0_XNNN 0x00000001 +#define NV50TCL_VP_ATTR_EN_0_0_NYNN 0x00000002 +#define NV50TCL_VP_ATTR_EN_0_0_XYNN 0x00000003 +#define NV50TCL_VP_ATTR_EN_0_0_NNZN 0x00000004 +#define NV50TCL_VP_ATTR_EN_0_0_XNZN 0x00000005 +#define NV50TCL_VP_ATTR_EN_0_0_NYZN 0x00000006 +#define NV50TCL_VP_ATTR_EN_0_0_XYZN 0x00000007 +#define NV50TCL_VP_ATTR_EN_0_0_NNNW 0x00000008 +#define NV50TCL_VP_ATTR_EN_0_0_XNNW 0x00000009 +#define NV50TCL_VP_ATTR_EN_0_0_NYNW 0x0000000a +#define NV50TCL_VP_ATTR_EN_0_0_XYNW 0x0000000b +#define NV50TCL_VP_ATTR_EN_0_0_NNZW 0x0000000c +#define NV50TCL_VP_ATTR_EN_0_0_XNZW 0x0000000d +#define NV50TCL_VP_ATTR_EN_0_0_NYZW 0x0000000e +#define NV50TCL_VP_ATTR_EN_0_0_XYZW 0x0000000f +#define NV50TCL_VP_ATTR_EN_1 0x00001654 +#define NV50TCL_VP_ATTR_EN_1_15_SHIFT 28 +#define NV50TCL_VP_ATTR_EN_1_15_MASK 0xf0000000 +#define NV50TCL_VP_ATTR_EN_1_15_NONE 0x00000000 +#define NV50TCL_VP_ATTR_EN_1_15_XNNN 0x10000000 +#define 
NV50TCL_VP_ATTR_EN_1_15_NYNN 0x20000000 +#define NV50TCL_VP_ATTR_EN_1_15_XYNN 0x30000000 +#define NV50TCL_VP_ATTR_EN_1_15_NNZN 0x40000000 +#define NV50TCL_VP_ATTR_EN_1_15_XNZN 0x50000000 +#define NV50TCL_VP_ATTR_EN_1_15_NYZN 0x60000000 +#define NV50TCL_VP_ATTR_EN_1_15_XYZN 0x70000000 +#define NV50TCL_VP_ATTR_EN_1_15_NNNW 0x80000000 +#define NV50TCL_VP_ATTR_EN_1_15_XNNW 0x90000000 +#define NV50TCL_VP_ATTR_EN_1_15_NYNW 0xa0000000 +#define NV50TCL_VP_ATTR_EN_1_15_XYNW 0xb0000000 +#define NV50TCL_VP_ATTR_EN_1_15_NNZW 0xc0000000 +#define NV50TCL_VP_ATTR_EN_1_15_XNZW 0xd0000000 +#define NV50TCL_VP_ATTR_EN_1_15_NYZW 0xe0000000 +#define NV50TCL_VP_ATTR_EN_1_15_XYZW 0xf0000000 +#define NV50TCL_VP_ATTR_EN_1_14_SHIFT 24 +#define NV50TCL_VP_ATTR_EN_1_14_MASK 0x0f000000 +#define NV50TCL_VP_ATTR_EN_1_14_NONE 0x00000000 +#define NV50TCL_VP_ATTR_EN_1_14_XNNN 0x01000000 +#define NV50TCL_VP_ATTR_EN_1_14_NYNN 0x02000000 +#define NV50TCL_VP_ATTR_EN_1_14_XYNN 0x03000000 +#define NV50TCL_VP_ATTR_EN_1_14_NNZN 0x04000000 +#define NV50TCL_VP_ATTR_EN_1_14_XNZN 0x05000000 +#define NV50TCL_VP_ATTR_EN_1_14_NYZN 0x06000000 +#define NV50TCL_VP_ATTR_EN_1_14_XYZN 0x07000000 +#define NV50TCL_VP_ATTR_EN_1_14_NNNW 0x08000000 +#define NV50TCL_VP_ATTR_EN_1_14_XNNW 0x09000000 +#define NV50TCL_VP_ATTR_EN_1_14_NYNW 0x0a000000 +#define NV50TCL_VP_ATTR_EN_1_14_XYNW 0x0b000000 +#define NV50TCL_VP_ATTR_EN_1_14_NNZW 0x0c000000 +#define NV50TCL_VP_ATTR_EN_1_14_XNZW 0x0d000000 +#define NV50TCL_VP_ATTR_EN_1_14_NYZW 0x0e000000 +#define NV50TCL_VP_ATTR_EN_1_14_XYZW 0x0f000000 +#define NV50TCL_VP_ATTR_EN_1_13_SHIFT 20 +#define NV50TCL_VP_ATTR_EN_1_13_MASK 0x00f00000 +#define NV50TCL_VP_ATTR_EN_1_13_NONE 0x00000000 +#define NV50TCL_VP_ATTR_EN_1_13_XNNN 0x00100000 +#define NV50TCL_VP_ATTR_EN_1_13_NYNN 0x00200000 +#define NV50TCL_VP_ATTR_EN_1_13_XYNN 0x00300000 +#define NV50TCL_VP_ATTR_EN_1_13_NNZN 0x00400000 +#define NV50TCL_VP_ATTR_EN_1_13_XNZN 0x00500000 +#define NV50TCL_VP_ATTR_EN_1_13_NYZN 0x00600000 +#define NV50TCL_VP_ATTR_EN_1_13_XYZN 0x00700000 +#define NV50TCL_VP_ATTR_EN_1_13_NNNW 0x00800000 +#define NV50TCL_VP_ATTR_EN_1_13_XNNW 0x00900000 +#define NV50TCL_VP_ATTR_EN_1_13_NYNW 0x00a00000 +#define NV50TCL_VP_ATTR_EN_1_13_XYNW 0x00b00000 +#define NV50TCL_VP_ATTR_EN_1_13_NNZW 0x00c00000 +#define NV50TCL_VP_ATTR_EN_1_13_XNZW 0x00d00000 +#define NV50TCL_VP_ATTR_EN_1_13_NYZW 0x00e00000 +#define NV50TCL_VP_ATTR_EN_1_13_XYZW 0x00f00000 +#define NV50TCL_VP_ATTR_EN_1_12_SHIFT 16 +#define NV50TCL_VP_ATTR_EN_1_12_MASK 0x000f0000 +#define NV50TCL_VP_ATTR_EN_1_12_NONE 0x00000000 +#define NV50TCL_VP_ATTR_EN_1_12_XNNN 0x00010000 +#define NV50TCL_VP_ATTR_EN_1_12_NYNN 0x00020000 +#define NV50TCL_VP_ATTR_EN_1_12_XYNN 0x00030000 +#define NV50TCL_VP_ATTR_EN_1_12_NNZN 0x00040000 +#define NV50TCL_VP_ATTR_EN_1_12_XNZN 0x00050000 +#define NV50TCL_VP_ATTR_EN_1_12_NYZN 0x00060000 +#define NV50TCL_VP_ATTR_EN_1_12_XYZN 0x00070000 +#define NV50TCL_VP_ATTR_EN_1_12_NNNW 0x00080000 +#define NV50TCL_VP_ATTR_EN_1_12_XNNW 0x00090000 +#define NV50TCL_VP_ATTR_EN_1_12_NYNW 0x000a0000 +#define NV50TCL_VP_ATTR_EN_1_12_XYNW 0x000b0000 +#define NV50TCL_VP_ATTR_EN_1_12_NNZW 0x000c0000 +#define NV50TCL_VP_ATTR_EN_1_12_XNZW 0x000d0000 +#define NV50TCL_VP_ATTR_EN_1_12_NYZW 0x000e0000 +#define NV50TCL_VP_ATTR_EN_1_12_XYZW 0x000f0000 +#define NV50TCL_VP_ATTR_EN_1_11_SHIFT 12 +#define NV50TCL_VP_ATTR_EN_1_11_MASK 0x0000f000 +#define NV50TCL_VP_ATTR_EN_1_11_NONE 0x00000000 +#define NV50TCL_VP_ATTR_EN_1_11_XNNN 0x00001000 +#define NV50TCL_VP_ATTR_EN_1_11_NYNN 0x00002000 +#define 
NV50TCL_VP_ATTR_EN_1_11_XYNN 0x00003000 +#define NV50TCL_VP_ATTR_EN_1_11_NNZN 0x00004000 +#define NV50TCL_VP_ATTR_EN_1_11_XNZN 0x00005000 +#define NV50TCL_VP_ATTR_EN_1_11_NYZN 0x00006000 +#define NV50TCL_VP_ATTR_EN_1_11_XYZN 0x00007000 +#define NV50TCL_VP_ATTR_EN_1_11_NNNW 0x00008000 +#define NV50TCL_VP_ATTR_EN_1_11_XNNW 0x00009000 +#define NV50TCL_VP_ATTR_EN_1_11_NYNW 0x0000a000 +#define NV50TCL_VP_ATTR_EN_1_11_XYNW 0x0000b000 +#define NV50TCL_VP_ATTR_EN_1_11_NNZW 0x0000c000 +#define NV50TCL_VP_ATTR_EN_1_11_XNZW 0x0000d000 +#define NV50TCL_VP_ATTR_EN_1_11_NYZW 0x0000e000 +#define NV50TCL_VP_ATTR_EN_1_11_XYZW 0x0000f000 +#define NV50TCL_VP_ATTR_EN_1_10_SHIFT 8 +#define NV50TCL_VP_ATTR_EN_1_10_MASK 0x00000f00 +#define NV50TCL_VP_ATTR_EN_1_10_NONE 0x00000000 +#define NV50TCL_VP_ATTR_EN_1_10_XNNN 0x00000100 +#define NV50TCL_VP_ATTR_EN_1_10_NYNN 0x00000200 +#define NV50TCL_VP_ATTR_EN_1_10_XYNN 0x00000300 +#define NV50TCL_VP_ATTR_EN_1_10_NNZN 0x00000400 +#define NV50TCL_VP_ATTR_EN_1_10_XNZN 0x00000500 +#define NV50TCL_VP_ATTR_EN_1_10_NYZN 0x00000600 +#define NV50TCL_VP_ATTR_EN_1_10_XYZN 0x00000700 +#define NV50TCL_VP_ATTR_EN_1_10_NNNW 0x00000800 +#define NV50TCL_VP_ATTR_EN_1_10_XNNW 0x00000900 +#define NV50TCL_VP_ATTR_EN_1_10_NYNW 0x00000a00 +#define NV50TCL_VP_ATTR_EN_1_10_XYNW 0x00000b00 +#define NV50TCL_VP_ATTR_EN_1_10_NNZW 0x00000c00 +#define NV50TCL_VP_ATTR_EN_1_10_XNZW 0x00000d00 +#define NV50TCL_VP_ATTR_EN_1_10_NYZW 0x00000e00 +#define NV50TCL_VP_ATTR_EN_1_10_XYZW 0x00000f00 +#define NV50TCL_VP_ATTR_EN_1_9_SHIFT 4 +#define NV50TCL_VP_ATTR_EN_1_9_MASK 0x000000f0 +#define NV50TCL_VP_ATTR_EN_1_9_NONE 0x00000000 +#define NV50TCL_VP_ATTR_EN_1_9_XNNN 0x00000010 +#define NV50TCL_VP_ATTR_EN_1_9_NYNN 0x00000020 +#define NV50TCL_VP_ATTR_EN_1_9_XYNN 0x00000030 +#define NV50TCL_VP_ATTR_EN_1_9_NNZN 0x00000040 +#define NV50TCL_VP_ATTR_EN_1_9_XNZN 0x00000050 +#define NV50TCL_VP_ATTR_EN_1_9_NYZN 0x00000060 +#define NV50TCL_VP_ATTR_EN_1_9_XYZN 0x00000070 +#define NV50TCL_VP_ATTR_EN_1_9_NNNW 0x00000080 +#define NV50TCL_VP_ATTR_EN_1_9_XNNW 0x00000090 +#define NV50TCL_VP_ATTR_EN_1_9_NYNW 0x000000a0 +#define NV50TCL_VP_ATTR_EN_1_9_XYNW 0x000000b0 +#define NV50TCL_VP_ATTR_EN_1_9_NNZW 0x000000c0 +#define NV50TCL_VP_ATTR_EN_1_9_XNZW 0x000000d0 +#define NV50TCL_VP_ATTR_EN_1_9_NYZW 0x000000e0 +#define NV50TCL_VP_ATTR_EN_1_9_XYZW 0x000000f0 +#define NV50TCL_VP_ATTR_EN_1_8_SHIFT 0 +#define NV50TCL_VP_ATTR_EN_1_8_MASK 0x0000000f +#define NV50TCL_VP_ATTR_EN_1_8_NONE 0x00000000 +#define NV50TCL_VP_ATTR_EN_1_8_XNNN 0x00000001 +#define NV50TCL_VP_ATTR_EN_1_8_NYNN 0x00000002 +#define NV50TCL_VP_ATTR_EN_1_8_XYNN 0x00000003 +#define NV50TCL_VP_ATTR_EN_1_8_NNZN 0x00000004 +#define NV50TCL_VP_ATTR_EN_1_8_XNZN 0x00000005 +#define NV50TCL_VP_ATTR_EN_1_8_NYZN 0x00000006 +#define NV50TCL_VP_ATTR_EN_1_8_XYZN 0x00000007 +#define NV50TCL_VP_ATTR_EN_1_8_NNNW 0x00000008 +#define NV50TCL_VP_ATTR_EN_1_8_XNNW 0x00000009 +#define NV50TCL_VP_ATTR_EN_1_8_NYNW 0x0000000a +#define NV50TCL_VP_ATTR_EN_1_8_XYNW 0x0000000b +#define NV50TCL_VP_ATTR_EN_1_8_NNZW 0x0000000c +#define NV50TCL_VP_ATTR_EN_1_8_XNZW 0x0000000d +#define NV50TCL_VP_ATTR_EN_1_8_NYZW 0x0000000e +#define NV50TCL_VP_ATTR_EN_1_8_XYZW 0x0000000f +#define NV50TCL_LINE_STIPPLE_ENABLE 0x0000166c +#define NV50TCL_LINE_STIPPLE_PATTERN 0x00001680 +#define NV50TCL_POLYGON_STIPPLE_ENABLE 0x0000168c +#define NV50TCL_VP_REG_HPOS 0x000016bc +#define NV50TCL_VP_REG_HPOS_X_SHIFT 0 +#define NV50TCL_VP_REG_HPOS_X_MASK 0x000000ff +#define NV50TCL_VP_REG_HPOS_Y_SHIFT 8 +#define 
NV50TCL_VP_REG_HPOS_Y_MASK 0x0000ff00 +#define NV50TCL_VP_REG_HPOS_Z_SHIFT 16 +#define NV50TCL_VP_REG_HPOS_Z_MASK 0x00ff0000 +#define NV50TCL_VP_REG_HPOS_W_SHIFT 24 +#define NV50TCL_VP_REG_HPOS_W_MASK 0xff000000 +#define NV50TCL_VP_REG_COL0 0x000016c0 +#define NV50TCL_VP_REG_COL0_X_SHIFT 0 +#define NV50TCL_VP_REG_COL0_X_MASK 0x000000ff +#define NV50TCL_VP_REG_COL0_Y_SHIFT 8 +#define NV50TCL_VP_REG_COL0_Y_MASK 0x0000ff00 +#define NV50TCL_VP_REG_COL0_Z_SHIFT 16 +#define NV50TCL_VP_REG_COL0_Z_MASK 0x00ff0000 +#define NV50TCL_VP_REG_COL0_W_SHIFT 24 +#define NV50TCL_VP_REG_COL0_W_MASK 0xff000000 +#define NV50TCL_POLYGON_STIPPLE_PATTERN(x) (0x00001700+((x)*4)) +#define NV50TCL_POLYGON_STIPPLE_PATTERN__SIZE 0x00000020 +#define NV50TCL_CULL_FACE_ENABLE 0x00001918 +#define NV50TCL_FRONT_FACE 0x0000191c +#define NV50TCL_FRONT_FACE_CW 0x00000900 +#define NV50TCL_FRONT_FACE_CCW 0x00000901 +#define NV50TCL_CULL_FACE 0x00001920 +#define NV50TCL_CULL_FACE_FRONT 0x00000404 +#define NV50TCL_CULL_FACE_BACK 0x00000405 +#define NV50TCL_CULL_FACE_FRONT_AND_BACK 0x00000408 +#define NV50TCL_LOGIC_OP_ENABLE 0x000019c4 +#define NV50TCL_LOGIC_OP 0x000019c8 +#define NV50TCL_LOGIC_OP_CLEAR 0x00001500 +#define NV50TCL_LOGIC_OP_AND 0x00001501 +#define NV50TCL_LOGIC_OP_AND_REVERSE 0x00001502 +#define NV50TCL_LOGIC_OP_COPY 0x00001503 +#define NV50TCL_LOGIC_OP_AND_INVERTED 0x00001504 +#define NV50TCL_LOGIC_OP_NOOP 0x00001505 +#define NV50TCL_LOGIC_OP_XOR 0x00001506 +#define NV50TCL_LOGIC_OP_OR 0x00001507 +#define NV50TCL_LOGIC_OP_NOR 0x00001508 +#define NV50TCL_LOGIC_OP_EQUIV 0x00001509 +#define NV50TCL_LOGIC_OP_INVERT 0x0000150a +#define NV50TCL_LOGIC_OP_OR_REVERSE 0x0000150b +#define NV50TCL_LOGIC_OP_COPY_INVERTED 0x0000150c +#define NV50TCL_LOGIC_OP_OR_INVERTED 0x0000150d +#define NV50TCL_LOGIC_OP_NAND 0x0000150e +#define NV50TCL_LOGIC_OP_SET 0x0000150f +#define NV50TCL_CLEAR_BUFFERS 0x000019d0 +#define NV50TCL_COLOR_MASK(x) (0x00001a00+((x)*4)) +#define NV50TCL_COLOR_MASK__SIZE 0x00000008 +#define NV50TCL_COLOR_MASK_R_SHIFT 0 +#define NV50TCL_COLOR_MASK_R_MASK 0x0000000f +#define NV50TCL_COLOR_MASK_G_SHIFT 4 +#define NV50TCL_COLOR_MASK_G_MASK 0x000000f0 +#define NV50TCL_COLOR_MASK_B_SHIFT 8 +#define NV50TCL_COLOR_MASK_B_MASK 0x00000f00 +#define NV50TCL_COLOR_MASK_A_SHIFT 12 +#define NV50TCL_COLOR_MASK_A_MASK 0x0000f000 + + +#define NV50_COMPUTE 0x000050c0 + +#define NV50_COMPUTE_DMA_UNK0 0x000001a0 +#define NV50_COMPUTE_DMA_STATUS 0x000001a4 +#define NV50_COMPUTE_DMA_UNK1 0x000001b8 +#define NV50_COMPUTE_DMA_UNK2 0x000001bc +#define NV50_COMPUTE_DMA_UNK3 0x000001c0 +#define NV50_COMPUTE_UNK4_HIGH 0x00000210 +#define NV50_COMPUTE_UNK4_LOW 0x00000214 +#define NV50_COMPUTE_UNK5_HIGH 0x00000218 +#define NV50_COMPUTE_UNK5_LOW 0x0000021c +#define NV50_COMPUTE_UNK6_HIGH 0x00000294 +#define NV50_COMPUTE_UNK6_LOW 0x00000298 +#define NV50_COMPUTE_CONST_BASE_HIGH 0x000002a4 +#define NV50_COMPUTE_CONST_BASE_LO 0x000002a8 +#define NV50_COMPUTE_CONST_SIZE_SEG 0x000002ac +#define NV50_COMPUTE_REG_COUNT 0x000002c0 +#define NV50_COMPUTE_STATUS_HIGH 0x00000310 +#define NV50_COMPUTE_STATUS_LOW 0x00000314 +#define NV50_COMPUTE_EXECUTE 0x0000031c +#define NV50_COMPUTE_USER_PARAM_COUNT 0x00000374 +#define NV50_COMPUTE_GRIDDIM_YX 0x000003a4 +#define NV50_COMPUTE_SHARED_SIZE 0x000003a8 +#define NV50_COMPUTE_BLOCKDIM_YX 0x000003ac +#define NV50_COMPUTE_BLOCKDIM_Z 0x000003b0 +#define NV50_COMPUTE_CALL_ADDRESS 0x000003b4 +#define NV50_COMPUTE_GLOBAL_BASE_HIGH(x) (0x00000400+((x)*32)) +#define NV50_COMPUTE_GLOBAL_BASE_HIGH__SIZE 0x00000010 
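Register macros that take an (x) argument, such as NV50_COMPUTE_GLOBAL_BASE_HIGH(x) just above, expand to an array of methods at a fixed stride, and the matching __SIZE define gives the number of valid indices. A purely illustrative sketch, assuming the definitions above are in scope, that prints the method offset of every global-buffer slot:

#include <stdio.h>

/* Illustrative only: walk the indexed NV50_COMPUTE_GLOBAL_BASE_HIGH methods.
 * The __SIZE constant bounds the valid values of the index. */
int
main(void)
{
	unsigned i;

	for (i = 0; i < NV50_COMPUTE_GLOBAL_BASE_HIGH__SIZE; i++)
		printf("global buffer %u: base-high method 0x%04x\n",
		       i, NV50_COMPUTE_GLOBAL_BASE_HIGH(i));
	return 0;
}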
+#define NV50_COMPUTE_GLOBAL_BASE_LOW(x) (0x00000404+((x)*32)) +#define NV50_COMPUTE_GLOBAL_BASE_LOW__SIZE 0x00000010 +#define NV50_COMPUTE_GLOBAL_LIMIT_HIGH(x) (0x00000408+((x)*32)) +#define NV50_COMPUTE_GLOBAL_LIMIT_HIGH__SIZE 0x00000010 +#define NV50_COMPUTE_GLOBAL_LIMIT_LOW(x) (0x0000040c+((x)*32)) +#define NV50_COMPUTE_GLOBAL_LIMIT_LOW__SIZE 0x00000010 +#define NV50_COMPUTE_GLOBAL_UNK(x) (0x00000410+((x)*32)) +#define NV50_COMPUTE_GLOBAL_UNK__SIZE 0x00000010 +#define NV50_COMPUTE_USER_PARAM(x) (0x00000600+((x)*4)) +#define NV50_COMPUTE_USER_PARAM__SIZE 0x00000040 + + +#define NV54TCL 0x00008297 + + + +#endif /* NOUVEAU_REG_H */ diff --git a/src/gallium/drivers/nouveau/nouveau_device.h b/src/gallium/drivers/nouveau/nouveau_device.h new file mode 100644 index 0000000000..e25e89fedd --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_device.h @@ -0,0 +1,30 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __NOUVEAU_DEVICE_H__ +#define __NOUVEAU_DEVICE_H__ + +struct nouveau_device { + unsigned chipset; +}; + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_gldefs.h b/src/gallium/drivers/nouveau/nouveau_gldefs.h new file mode 100644 index 0000000000..ff97aaa9af --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_gldefs.h @@ -0,0 +1,196 @@ +#ifndef __NOUVEAU_GLDEFS_H__ +#define __NOUVEAU_GLDEFS_H__ + +static INLINE unsigned +nvgl_blend_func(unsigned factor) +{ + switch (factor) { + case PIPE_BLENDFACTOR_ZERO: + return 0x0000; + case PIPE_BLENDFACTOR_ONE: + return 0x0001; + case PIPE_BLENDFACTOR_SRC_COLOR: + return 0x0300; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return 0x0301; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return 0x0302; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return 0x0303; + case PIPE_BLENDFACTOR_DST_ALPHA: + return 0x0304; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return 0x0305; + case PIPE_BLENDFACTOR_DST_COLOR: + return 0x0306; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return 0x0307; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return 0x0308; + case PIPE_BLENDFACTOR_CONST_COLOR: + return 0x8001; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return 0x8002; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return 0x8003; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return 0x8004; + default: + return 0x0000; + } +} + +static INLINE unsigned +nvgl_blend_eqn(unsigned func) +{ + switch (func) { + case PIPE_BLEND_ADD: + return 0x8006; + case PIPE_BLEND_MIN: + return 0x8007; + case PIPE_BLEND_MAX: + return 0x8008; + case PIPE_BLEND_SUBTRACT: + return 0x800a; + case PIPE_BLEND_REVERSE_SUBTRACT: + return 0x800b; + default: + return 0x8006; + } +} + +static INLINE unsigned +nvgl_logicop_func(unsigned func) +{ + switch (func) { + case PIPE_LOGICOP_CLEAR: + return 0x1500; + case PIPE_LOGICOP_NOR: + return 0x1508; + case PIPE_LOGICOP_AND_INVERTED: + return 0x1504; + case PIPE_LOGICOP_COPY_INVERTED: + return 0x150c; + case PIPE_LOGICOP_AND_REVERSE: + return 0x1502; + case PIPE_LOGICOP_INVERT: + return 0x150a; + case PIPE_LOGICOP_XOR: + return 0x1506; + case PIPE_LOGICOP_NAND: + return 0x150e; + case PIPE_LOGICOP_AND: + return 0x1501; + case PIPE_LOGICOP_EQUIV: + return 0x1509; + case PIPE_LOGICOP_NOOP: + return 0x1505; + case PIPE_LOGICOP_OR_INVERTED: + return 0x150d; + case PIPE_LOGICOP_COPY: + return 0x1503; + case PIPE_LOGICOP_OR_REVERSE: + return 0x150b; + case PIPE_LOGICOP_OR: + return 0x1507; + case PIPE_LOGICOP_SET: + return 0x150f; + default: + return 0x1505; + } +} + +static INLINE unsigned +nvgl_comparison_op(unsigned op) +{ + switch (op) { + case PIPE_FUNC_NEVER: + return 0x0200; + case PIPE_FUNC_LESS: + return 0x0201; + case PIPE_FUNC_EQUAL: + return 0x0202; + case PIPE_FUNC_LEQUAL: + return 0x0203; + case PIPE_FUNC_GREATER: + return 0x0204; + case PIPE_FUNC_NOTEQUAL: + return 0x0205; + case PIPE_FUNC_GEQUAL: + return 0x0206; + case PIPE_FUNC_ALWAYS: + return 0x0207; + default: + return 0x0207; + } +} + +static INLINE unsigned +nvgl_polygon_mode(unsigned mode) +{ + switch (mode) { + case PIPE_POLYGON_MODE_POINT: + return 0x1b00; + case PIPE_POLYGON_MODE_LINE: + return 0x1b01; + case PIPE_POLYGON_MODE_FILL: + return 0x1b02; + default: + return 0x1b02; + } +} + +static INLINE unsigned +nvgl_stencil_op(unsigned op) +{ + switch (op) { + case PIPE_STENCIL_OP_ZERO: + return 0x0000; + case PIPE_STENCIL_OP_INVERT: + return 0x150a; + case PIPE_STENCIL_OP_KEEP: + return 0x1e00; + case PIPE_STENCIL_OP_REPLACE: + return 0x1e01; + case PIPE_STENCIL_OP_INCR: + return 0x1e02; + case 
PIPE_STENCIL_OP_DECR: + return 0x1e03; + case PIPE_STENCIL_OP_INCR_WRAP: + return 0x8507; + case PIPE_STENCIL_OP_DECR_WRAP: + return 0x8508; + default: + return 0x1e00; + } +} + +static INLINE unsigned +nvgl_primitive(unsigned prim) { + switch (prim) { + case PIPE_PRIM_POINTS: + return 0x0001; + case PIPE_PRIM_LINES: + return 0x0002; + case PIPE_PRIM_LINE_LOOP: + return 0x0003; + case PIPE_PRIM_LINE_STRIP: + return 0x0004; + case PIPE_PRIM_TRIANGLES: + return 0x0005; + case PIPE_PRIM_TRIANGLE_STRIP: + return 0x0006; + case PIPE_PRIM_TRIANGLE_FAN: + return 0x0007; + case PIPE_PRIM_QUADS: + return 0x0008; + case PIPE_PRIM_QUAD_STRIP: + return 0x0009; + case PIPE_PRIM_POLYGON: + return 0x000a; + default: + return 0; + } +} + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_grobj.h b/src/gallium/drivers/nouveau/nouveau_grobj.h new file mode 100644 index 0000000000..8f5abf9051 --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_grobj.h @@ -0,0 +1,35 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __NOUVEAU_GROBJ_H__ +#define __NOUVEAU_GROBJ_H__ + +#include "nouveau_channel.h" + +struct nouveau_grobj { + struct nouveau_channel *channel; + int grclass; + uint32_t handle; + int subc; +}; + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_notifier.h b/src/gallium/drivers/nouveau/nouveau_notifier.h new file mode 100644 index 0000000000..35adde1e32 --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_notifier.h @@ -0,0 +1,43 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __NOUVEAU_NOTIFIER_H__ +#define __NOUVEAU_NOTIFIER_H__ + +#define NV_NOTIFIER_SIZE 32 +#define NV_NOTIFY_TIME_0 0x00000000 +#define NV_NOTIFY_TIME_1 0x00000004 +#define NV_NOTIFY_RETURN_VALUE 0x00000008 +#define NV_NOTIFY_STATE 0x0000000C +#define NV_NOTIFY_STATE_STATUS_MASK 0xFF000000 +#define NV_NOTIFY_STATE_STATUS_SHIFT 24 +#define NV_NOTIFY_STATE_STATUS_COMPLETED 0x00 +#define NV_NOTIFY_STATE_STATUS_IN_PROCESS 0x01 +#define NV_NOTIFY_STATE_ERROR_CODE_MASK 0x0000FFFF +#define NV_NOTIFY_STATE_ERROR_CODE_SHIFT 0 + +struct nouveau_notifier { + struct nouveau_channel *channel; + uint32_t handle; +}; + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_push.h b/src/gallium/drivers/nouveau/nouveau_push.h new file mode 100644 index 0000000000..54ef1c1291 --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_push.h @@ -0,0 +1,82 @@ +#ifndef __NOUVEAU_PUSH_H__ +#define __NOUVEAU_PUSH_H__ + +#include "nouveau/nouveau_winsys.h" + +#ifndef NOUVEAU_PUSH_CONTEXT +#error undefined push context +#endif + +#define OUT_RING(data) do { \ + NOUVEAU_PUSH_CONTEXT(pc); \ + (*pc->nvws->channel->pushbuf->cur++) = (data); \ +} while(0) + +#define OUT_RINGp(src,size) do { \ + NOUVEAU_PUSH_CONTEXT(pc); \ + memcpy(pc->nvws->channel->pushbuf->cur, (src), (size) * 4); \ + pc->nvws->channel->pushbuf->cur += (size); \ +} while(0) + +#define OUT_RINGf(data) do { \ + union { float v; uint32_t u; } c; \ + c.v = (data); \ + OUT_RING(c.u); \ +} while(0) + +#define BEGIN_RING(obj,mthd,size) do { \ + NOUVEAU_PUSH_CONTEXT(pc); \ + if (pc->nvws->channel->pushbuf->remaining < ((size) + 1)) \ + pc->nvws->push_flush(pc->nvws, ((size) + 1), NULL); \ + OUT_RING((pc->obj->subc << 13) | ((size) << 18) | (mthd)); \ + pc->nvws->channel->pushbuf->remaining -= ((size) + 1); \ +} while(0) + +#define BEGIN_RING_NI(obj,mthd,size) do { \ + BEGIN_RING(obj, (mthd) | 0x40000000, (size)); \ +} while(0) + +#define FIRE_RING(fence) do { \ + NOUVEAU_PUSH_CONTEXT(pc); \ + pc->nvws->push_flush(pc->nvws, 0, fence); \ +} while(0) + +#define OUT_RELOC(bo,data,flags,vor,tor) do { \ + NOUVEAU_PUSH_CONTEXT(pc); \ + pc->nvws->push_reloc(pc->nvws, pc->nvws->channel->pushbuf->cur++, \ + (bo), (data), (flags), (vor), (tor)); \ +} while(0) + +/* Raw data + flags depending on FB/TT buffer */ +#define OUT_RELOCd(bo,data,flags,vor,tor) do { \ + OUT_RELOC((bo), (data), (flags) | NOUVEAU_BO_OR, (vor), (tor)); \ +} while(0) + +/* FB/TT object handle */ +#define OUT_RELOCo(bo,flags) do { \ + OUT_RELOC((bo), 0, (flags) | NOUVEAU_BO_OR, \ + pc->nvws->channel->vram->handle, \ + pc->nvws->channel->gart->handle); \ +} while(0) + +/* Low 32-bits of offset */ +#define OUT_RELOCl(bo,delta,flags) do { \ + OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_LOW, 0, 0); \ +} while(0) + +/* High 32-bits of offset */ +#define OUT_RELOCh(bo,delta,flags) do { \ + OUT_RELOC((bo), (delta), (flags) | NOUVEAU_BO_HIGH, 0, 0); \ +} while(0) + +/* A reloc which'll recombine into a NV_DMA_METHOD packet header */ +#define OUT_RELOCm(bo, flags, obj, mthd, size) do { \ + NOUVEAU_PUSH_CONTEXT(pc); \ + if (pc->nvws->channel->pushbuf->remaining < ((size) + 1)) \ + pc->nvws->push_flush(pc->nvws->channel, ((size) + 1), NULL); \ + OUT_RELOCd((bo), (pc->obj->subc << 13) | ((size) << 18) | (mthd), \ + (flags), 0, 0); \ + 
pc->nvws->channel->pushbuf->remaining -= ((size) + 1); \ +} while(0) + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_pushbuf.h b/src/gallium/drivers/nouveau/nouveau_pushbuf.h new file mode 100644 index 0000000000..1909765098 --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_pushbuf.h @@ -0,0 +1,32 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __NOUVEAU_PUSHBUF_H__ +#define __NOUVEAU_PUSHBUF_H__ + +struct nouveau_pushbuf { + struct nouveau_channel *channel; + unsigned remaining; + uint32_t *cur; +}; + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_resource.h b/src/gallium/drivers/nouveau/nouveau_resource.h new file mode 100644 index 0000000000..1af7961d30 --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_resource.h @@ -0,0 +1,37 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __NOUVEAU_RESOURCE_H__ +#define __NOUVEAU_RESOURCE_H__ + +struct nouveau_resource { + struct nouveau_resource *prev; + struct nouveau_resource *next; + + int in_use; + void *priv; + + unsigned int start; + unsigned int size; +}; + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_stateobj.h b/src/gallium/drivers/nouveau/nouveau_stateobj.h new file mode 100644 index 0000000000..729988b095 --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_stateobj.h @@ -0,0 +1,158 @@ +#ifndef __NOUVEAU_STATEOBJ_H__ +#define __NOUVEAU_STATEOBJ_H__ + +#include "pipe/p_debug.h" + +struct nouveau_stateobj_reloc { + struct pipe_buffer *bo; + + unsigned offset; + unsigned packet; + + unsigned data; + unsigned flags; + unsigned vor; + unsigned tor; +}; + +struct nouveau_stateobj { + int refcount; + + unsigned *push; + struct nouveau_stateobj_reloc *reloc; + + unsigned *cur; + unsigned cur_packet; + unsigned cur_reloc; +}; + +static INLINE struct nouveau_stateobj * +so_new(unsigned push, unsigned reloc) +{ + struct nouveau_stateobj *so; + + so = MALLOC(sizeof(struct nouveau_stateobj)); + so->refcount = 0; + so->push = MALLOC(sizeof(unsigned) * push); + so->reloc = MALLOC(sizeof(struct nouveau_stateobj_reloc) * reloc); + + so->cur = so->push; + so->cur_reloc = so->cur_packet = 0; + + return so; +} + +static INLINE void +so_ref(struct nouveau_stateobj *ref, struct nouveau_stateobj **pso) +{ + struct nouveau_stateobj *so = *pso; + + if (ref) { + ref->refcount++; + } + + if (so && --so->refcount <= 0) { + free(so->push); + free(so->reloc); + free(so); + } + + *pso = ref; +} + +static INLINE void +so_data(struct nouveau_stateobj *so, unsigned data) +{ + (*so->cur++) = (data); + so->cur_packet += 4; +} + +static INLINE void +so_datap(struct nouveau_stateobj *so, unsigned *data, unsigned size) +{ + so->cur_packet += (4 * size); + while (size--) + (*so->cur++) = (*data++); +} + +static INLINE void +so_method(struct nouveau_stateobj *so, struct nouveau_grobj *gr, + unsigned mthd, unsigned size) +{ + so->cur_packet = (gr->subc << 13) | (1 << 18) | (mthd - 4); + so_data(so, (gr->subc << 13) | (size << 18) | mthd); +} + +static INLINE void +so_reloc(struct nouveau_stateobj *so, struct pipe_buffer *bo, + unsigned data, unsigned flags, unsigned vor, unsigned tor) +{ + struct nouveau_stateobj_reloc *r = &so->reloc[so->cur_reloc++]; + + r->bo = bo; + r->offset = so->cur - so->push; + r->packet = so->cur_packet; + r->data = data; + r->flags = flags; + r->vor = vor; + r->tor = tor; + so_data(so, data); +} + +static INLINE void +so_dump(struct nouveau_stateobj *so) +{ + unsigned i, nr = so->cur - so->push; + + for (i = 0; i < nr; i++) + debug_printf("+0x%04x: 0x%08x\n", i, so->push[i]); +} + +static INLINE void +so_emit(struct nouveau_winsys *nvws, struct nouveau_stateobj *so) +{ + struct nouveau_pushbuf *pb = nvws->channel->pushbuf; + unsigned nr, i; + + nr = so->cur - so->push; + if (pb->remaining < nr) + nvws->push_flush(nvws, nr, NULL); + pb->remaining -= nr; + + memcpy(pb->cur, so->push, nr * 4); + for (i = 0; i < so->cur_reloc; i++) { + struct nouveau_stateobj_reloc *r = &so->reloc[i]; + + nvws->push_reloc(nvws, pb->cur + r->offset, r->bo, + r->data, r->flags, r->vor, r->tor); + } + pb->cur += nr; +} + +static INLINE void +so_emit_reloc_markers(struct nouveau_winsys *nvws, struct nouveau_stateobj *so) +{ + struct nouveau_pushbuf *pb = nvws->channel->pushbuf; + unsigned i; + + if (!so) + return; + + i = so->cur_reloc << 1; + if (nvws->channel->pushbuf->remaining < i) + nvws->push_flush(nvws, i, NULL); 
+ nvws->channel->pushbuf->remaining -= i; + + for (i = 0; i < so->cur_reloc; i++) { + struct nouveau_stateobj_reloc *r = &so->reloc[i]; + + nvws->push_reloc(nvws, pb->cur++, r->bo, r->packet, + (r->flags & + (NOUVEAU_BO_VRAM | NOUVEAU_BO_GART)) | + NOUVEAU_BO_DUMMY, 0, 0); + nvws->push_reloc(nvws, pb->cur++, r->bo, r->data, + r->flags | NOUVEAU_BO_DUMMY, r->vor, r->tor); + } +} + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_util.h b/src/gallium/drivers/nouveau/nouveau_util.h new file mode 100644 index 0000000000..a10114beab --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_util.h @@ -0,0 +1,91 @@ +#ifndef __NOUVEAU_UTIL_H__ +#define __NOUVEAU_UTIL_H__ + +/* Determine how many vertices can be pushed into the command stream. + * Where the remaining space isn't large enough to represent all verices, + * split the buffer at primitive boundaries. + * + * Returns a count of vertices that can be rendered, and an index to + * restart drawing at after a flush. + */ +static INLINE unsigned +nouveau_vbuf_split(unsigned remaining, unsigned overhead, unsigned vpp, + unsigned mode, unsigned start, unsigned count, + unsigned *restart) +{ + int max, adj = 0; + + max = remaining - overhead; + if (max < 0) + return 0; + + max *= vpp; + if (max >= count) + return count; + + switch (mode) { + case PIPE_PRIM_POINTS: + break; + case PIPE_PRIM_LINES: + max = max & 1; + break; + case PIPE_PRIM_TRIANGLES: + max = max - (max % 3); + break; + case PIPE_PRIM_QUADS: + max = max & 3; + break; + case PIPE_PRIM_LINE_LOOP: + case PIPE_PRIM_LINE_STRIP: + if (max < 2) + max = 0; + adj = 1; + break; + case PIPE_PRIM_POLYGON: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_TRIANGLE_FAN: + if (max < 3) + max = 0; + adj = 2; + break; + case PIPE_PRIM_QUAD_STRIP: + if (max < 4) + max = 0; + adj = 3; + break; + default: + assert(0); + } + + *restart = start + max - adj; + return max; +} + +/* Integer base-2 logarithm, rounded towards zero. 
*/ +static INLINE unsigned log2i(unsigned i) +{ + unsigned r = 0; + + if (i & 0xffff0000) { + i >>= 16; + r += 16; + } + if (i & 0x0000ff00) { + i >>= 8; + r += 8; + } + if (i & 0x000000f0) { + i >>= 4; + r += 4; + } + if (i & 0x0000000c) { + i >>= 2; + r += 2; + } + if (i & 0x00000002) { + r += 1; + } + return r; +} + +#endif diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h new file mode 100644 index 0000000000..5535ebb6a9 --- /dev/null +++ b/src/gallium/drivers/nouveau/nouveau_winsys.h @@ -0,0 +1,98 @@ +#ifndef NOUVEAU_WINSYS_H +#define NOUVEAU_WINSYS_H + +#include <stdint.h> +#include "pipe/p_winsys.h" +#include "pipe/p_defines.h" + +#include "nouveau/nouveau_bo.h" +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_class.h" +#include "nouveau/nouveau_device.h" +#include "nouveau/nouveau_grobj.h" +#include "nouveau/nouveau_notifier.h" +#include "nouveau/nouveau_resource.h" +#include "nouveau/nouveau_pushbuf.h" + +#define NOUVEAU_CAP_HW_VTXBUF (0xbeef0000) +#define NOUVEAU_CAP_HW_IDXBUF (0xbeef0001) + +#define NOUVEAU_TEXTURE_USAGE_LINEAR (1 << 16) + +#define NOUVEAU_BUFFER_USAGE_TEXTURE (1 << 16) +#define NOUVEAU_BUFFER_USAGE_ZETA (1 << 17) + +struct nouveau_winsys { + struct nouveau_context *nv; + + struct nouveau_channel *channel; + + int (*res_init)(struct nouveau_resource **heap, unsigned start, + unsigned size); + int (*res_alloc)(struct nouveau_resource *heap, int size, void *priv, + struct nouveau_resource **); + void (*res_free)(struct nouveau_resource **); + + int (*push_reloc)(struct nouveau_winsys *, void *ptr, + struct pipe_buffer *, uint32_t data, + uint32_t flags, uint32_t vor, uint32_t tor); + int (*push_flush)(struct nouveau_winsys *, unsigned size, + struct pipe_fence_handle **fence); + + int (*grobj_alloc)(struct nouveau_winsys *, int grclass, + struct nouveau_grobj **); + void (*grobj_free)(struct nouveau_grobj **); + + int (*notifier_alloc)(struct nouveau_winsys *, int count, + struct nouveau_notifier **); + void (*notifier_free)(struct nouveau_notifier **); + void (*notifier_reset)(struct nouveau_notifier *, int id); + uint32_t (*notifier_status)(struct nouveau_notifier *, int id); + uint32_t (*notifier_retval)(struct nouveau_notifier *, int id); + int (*notifier_wait)(struct nouveau_notifier *, int id, + int status, int timeout); + + int (*surface_copy)(struct nouveau_winsys *, struct pipe_surface *, + unsigned, unsigned, struct pipe_surface *, + unsigned, unsigned, unsigned, unsigned); + int (*surface_fill)(struct nouveau_winsys *, struct pipe_surface *, + unsigned, unsigned, unsigned, unsigned, unsigned); +}; + +extern struct pipe_screen * +nv04_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv04_create(struct pipe_screen *, unsigned pctx_id); + +extern struct pipe_screen * +nv10_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv10_create(struct pipe_screen *, unsigned pctx_id); + +extern struct pipe_screen * +nv20_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv20_create(struct pipe_screen *, unsigned pctx_id); + +extern struct pipe_screen * +nv30_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv30_create(struct pipe_screen *, unsigned pctx_id); + +extern struct pipe_screen * +nv40_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv40_create(struct 
pipe_screen *, unsigned pctx_id); + +extern struct pipe_screen * +nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *); + +extern struct pipe_context * +nv50_create(struct pipe_screen *, unsigned pctx_id); + +#endif diff --git a/src/gallium/drivers/nv04/Makefile b/src/gallium/drivers/nv04/Makefile new file mode 100644 index 0000000000..5ea51a2f42 --- /dev/null +++ b/src/gallium/drivers/nv04/Makefile @@ -0,0 +1,28 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = nv04 + +DRIVER_SOURCES = \ + nv04_clear.c \ + nv04_context.c \ + nv04_fragprog.c \ + nv04_fragtex.c \ + nv04_miptree.c \ + nv04_prim_vbuf.c \ + nv04_screen.c \ + nv04_state.c \ + nv04_state_emit.c \ + nv04_surface.c \ + nv04_vbo.c + +C_SOURCES = \ + $(COMMON_SOURCES) \ + $(DRIVER_SOURCES) + +ASM_SOURCES = + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/nv04/nv04_clear.c b/src/gallium/drivers/nv04/nv04_clear.c new file mode 100644 index 0000000000..01cacd36fe --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_clear.c @@ -0,0 +1,12 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv04_context.h" + +void +nv04_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); +} diff --git a/src/gallium/drivers/nv04/nv04_context.c b/src/gallium/drivers/nv04/nv04_context.c new file mode 100644 index 0000000000..9f75253363 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_context.c @@ -0,0 +1,106 @@ +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_winsys.h" + +#include "nv04_context.h" +#include "nv04_screen.h" + +static void +nv04_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + draw_flush(nv04->draw); + + FIRE_RING(fence); +} + +static void +nv04_destroy(struct pipe_context *pipe) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + if (nv04->draw) + draw_destroy(nv04->draw); + + FREE(nv04); +} + +static void +nv04_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ +} + +static boolean +nv04_init_hwctx(struct nv04_context *nv04) +{ + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_NOTIFY, 1); + OUT_RING(0); + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_NOP, 1); + OUT_RING(0); + + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1); + OUT_RING(0x40182800); +// OUT_RING(1<<20/*no cull*/); + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_BLEND, 1); +// OUT_RING(0x24|(1<<6)|(1<<8)); + OUT_RING(0x120001a4); + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FORMAT, 1); + OUT_RING(0x332213a1); + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FILTER, 1); + OUT_RING(0x11001010); + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_COLORKEY, 1); + OUT_RING(0x0); +// BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 1); +// OUT_RING(SCREEN_OFFSET); + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR, 1); + OUT_RING(0xff000000); + + + + FIRE_RING (NULL); + return TRUE; +} + +struct pipe_context * +nv04_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ + struct nv04_screen *screen = nv04_screen(pscreen); + struct pipe_winsys *ws = pscreen->winsys; + struct nv04_context *nv04; + struct nouveau_winsys *nvws = screen->nvws; + + nv04 = CALLOC(1, sizeof(struct nv04_context)); + if (!nv04) + return NULL; + nv04->screen = screen; + nv04->pctx_id = pctx_id; + 
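	/*
	 * Everything hardware-related from here on is reached through the
	 * nouveau_winsys vtable stashed below: grobj_alloc()/notifier_alloc()
	 * for GPU objects, push_reloc()/push_flush() for the command stream,
	 * and surface_copy()/surface_fill() for 2D blits.  nv04_surface_fill(),
	 * for instance, reduces to a single forwarded call:
	 *
	 *     nvws->surface_fill(nvws, dest, destx, desty, width, height, value);
	 *
	 * so the pipe driver itself stays independent of the window system
	 * underneath.
	 */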
+ nv04->nvws = nvws; + + nv04->pipe.winsys = ws; + nv04->pipe.screen = pscreen; + nv04->pipe.destroy = nv04_destroy; + nv04->pipe.set_edgeflags = nv04_set_edgeflags; + nv04->pipe.draw_arrays = nv04_draw_arrays; + nv04->pipe.draw_elements = nv04_draw_elements; + nv04->pipe.clear = nv04_clear; + nv04->pipe.flush = nv04_flush; + + nv04_init_surface_functions(nv04); + nv04_init_state_functions(nv04); + + nv04->draw = draw_create(); + assert(nv04->draw); + draw_wide_point_threshold(nv04->draw, 0.0); + draw_wide_line_threshold(nv04->draw, 0.0); + draw_enable_line_stipple(nv04->draw, FALSE); + draw_enable_point_sprites(nv04->draw, FALSE); + draw_set_rasterize_stage(nv04->draw, nv04_draw_vbuf_stage(nv04)); + + nv04_init_hwctx(nv04); + + return &nv04->pipe; +} + diff --git a/src/gallium/drivers/nv04/nv04_context.h b/src/gallium/drivers/nv04/nv04_context.h new file mode 100644 index 0000000000..3e6a085270 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_context.h @@ -0,0 +1,146 @@ +#ifndef __NV04_CONTEXT_H__ +#define __NV04_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx) \ + struct nv04_screen *ctx = nv04->screen +#include "nouveau/nouveau_push.h" + +#include "nv04_state.h" + +#define NOUVEAU_ERR(fmt, args...) \ + fprintf(stderr, "%s:%d - "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) \ + fprintf(stderr, "nouveau: "fmt, ##args); + +#include "nv04_screen.h" + +#define NV04_NEW_VERTPROG (1 << 1) +#define NV04_NEW_FRAGPROG (1 << 2) +#define NV04_NEW_BLEND (1 << 3) +#define NV04_NEW_RAST (1 << 4) +#define NV04_NEW_CONTROL (1 << 5) +#define NV04_NEW_VIEWPORT (1 << 6) +#define NV04_NEW_SAMPLER (1 << 7) + +struct nv04_context { + struct pipe_context pipe; + + struct nouveau_winsys *nvws; + struct nv04_screen *screen; + unsigned pctx_id; + + struct draw_context *draw; + + int chipset; + struct nouveau_notifier *sync; + + uint32_t dirty; + + struct nv04_blend_state *blend; + struct nv04_sampler_state *sampler[PIPE_MAX_SAMPLERS]; + struct nv04_fragtex_state fragtex; + struct nv04_rasterizer_state *rast; + struct nv04_depth_stencil_alpha_state *dsa; + + struct nv04_miptree *tex_miptree[PIPE_MAX_SAMPLERS]; + unsigned dirty_samplers; + unsigned fp_samplers; + unsigned vp_samplers; + + uint32_t rt_enable; + struct pipe_buffer *rt[4]; + struct pipe_buffer *zeta; + + struct { + struct pipe_buffer *buffer; + uint32_t format; + } tex[16]; + + unsigned vb_enable; + struct { + struct pipe_buffer *buffer; + unsigned delta; + } vb[16]; + + struct vertex_info vertex_info; + struct { + + struct nouveau_resource *exec_heap; + struct nouveau_resource *data_heap; + + struct nv04_vertex_program *active; + + struct nv04_vertex_program *current; + struct pipe_buffer *constant_buf; + } vertprog; + + struct { + struct nv04_fragment_program *active; + + struct nv04_fragment_program *current; + struct pipe_buffer *constant_buf; + } fragprog; + + struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; + unsigned num_vertex_buffers; + unsigned num_vertex_elements; + + struct pipe_viewport_state viewport; +}; + +static INLINE struct nv04_context * +nv04_context(struct pipe_context *pipe) +{ + return (struct nv04_context *)pipe; +} + +extern void nv04_init_state_functions(struct nv04_context *nv04); +extern void 
nv04_init_surface_functions(struct nv04_context *nv04); +extern void nv04_init_miptree_functions(struct pipe_screen *screen); + +/* nv04_clear.c */ +extern void nv04_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + +/* nv04_draw.c */ +extern struct draw_stage *nv04_draw_render_stage(struct nv04_context *nv04); + +/* nv04_fragprog.c */ +extern void nv04_fragprog_bind(struct nv04_context *, + struct nv04_fragment_program *); +extern void nv04_fragprog_destroy(struct nv04_context *, + struct nv04_fragment_program *); + +/* nv04_fragtex.c */ +extern void nv04_fragtex_bind(struct nv04_context *); + +/* nv04_prim_vbuf.c */ +struct draw_stage *nv04_draw_vbuf_stage( struct nv04_context *nv04 ); + +/* nv04_state.c and friends */ +extern void nv04_emit_hw_state(struct nv04_context *nv04); +extern void nv04_state_tex_update(struct nv04_context *nv04); + +/* nv04_vbo.c */ +extern boolean nv04_draw_arrays(struct pipe_context *, unsigned mode, + unsigned start, unsigned count); +extern boolean nv04_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned prim, unsigned start, unsigned count); + + +#endif diff --git a/src/gallium/drivers/nv04/nv04_fragprog.c b/src/gallium/drivers/nv04/nv04_fragprog.c new file mode 100644 index 0000000000..8a2af41fe0 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_fragprog.c @@ -0,0 +1,21 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv04_context.h" + +void +nv04_fragprog_bind(struct nv04_context *nv04, struct nv04_fragment_program *fp) +{ +} + +void +nv04_fragprog_destroy(struct nv04_context *nv04, + struct nv04_fragment_program *fp) +{ +} + diff --git a/src/gallium/drivers/nv04/nv04_fragtex.c b/src/gallium/drivers/nv04/nv04_fragtex.c new file mode 100644 index 0000000000..21f990fd53 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_fragtex.c @@ -0,0 +1,73 @@ +#include "nv04_context.h" +#include "nouveau/nouveau_util.h" + +#define _(m,tf) \ +{ \ + PIPE_FORMAT_##m, \ + NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_##tf, \ +} + +struct nv04_texture_format { + uint pipe; + int format; +}; + +static struct nv04_texture_format +nv04_texture_formats[] = { + _(A8R8G8B8_UNORM, A8R8G8B8), + _(X8R8G8B8_UNORM, X8R8G8B8), + _(A1R5G5B5_UNORM, A1R5G5B5), + _(A4R4G4B4_UNORM, A4R4G4B4), + _(L8_UNORM, Y8 ), + _(A8_UNORM, Y8 ), +}; + +static uint32_t +nv04_fragtex_format(uint pipe_format) +{ + struct nv04_texture_format *tf = nv04_texture_formats; + int i; + + for (i=0; i< sizeof(nv04_texture_formats)/sizeof(nv04_texture_formats[0]); i++) { + if (tf->pipe == pipe_format) + return tf->format; + tf++; + } + + NOUVEAU_ERR("unknown texture format %s\n", pf_name(pipe_format)); + return 0; +} + + +static void +nv04_fragtex_build(struct nv04_context *nv04, int unit) +{ + struct nv04_miptree *nv04mt = nv04->tex_miptree[unit]; + struct pipe_texture *pt = &nv04mt->base; + + switch (pt->target) { + case PIPE_TEXTURE_2D: + break; + default: + NOUVEAU_ERR("Unknown target %d\n", pt->target); + return; + } + + nv04->fragtex.format = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_CORNER + | NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_CORNER + | nv04_fragtex_format(pt->format) + | ( (pt->last_level + 1) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_MIPMAP_LEVELS_SHIFT ) + | ( log2i(pt->width[0]) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_U_SHIFT ) + | ( log2i(pt->height[0]) << 
NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_V_SHIFT ) + | NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE + | NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP_TO_EDGE + ; +} + + +void +nv04_fragtex_bind(struct nv04_context *nv04) +{ + nv04_fragtex_build(nv04, 0); +} + diff --git a/src/gallium/drivers/nv04/nv04_miptree.c b/src/gallium/drivers/nv04/nv04_miptree.c new file mode 100644 index 0000000000..0cbb91e187 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_miptree.c @@ -0,0 +1,138 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv04_context.h" +#include "nv04_screen.h" + +static void +nv04_miptree_layout(struct nv04_miptree *nv04mt) +{ + struct pipe_texture *pt = &nv04mt->base; + uint width = pt->width[0], height = pt->height[0]; + uint offset = 0; + int nr_faces, l, f; + + nr_faces = 1; + + for (l = 0; l <= pt->last_level; l++) { + pt->width[l] = width; + pt->height[l] = height; + + pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); + pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + + nv04mt->level[l].pitch = pt->width[0] * pt->block.size; + nv04mt->level[l].pitch = (nv04mt->level[l].pitch + 63) & ~63; + + nv04mt->level[l].image_offset = + CALLOC(nr_faces, sizeof(unsigned)); + + width = MAX2(1, width >> 1); + height = MAX2(1, height >> 1); + + } + + for (f = 0; f < nr_faces; f++) { + for (l = 0; l <= pt->last_level; l++) { + nv04mt->level[l].image_offset[f] = offset; + offset += nv04mt->level[l].pitch * pt->height[l]; + } + } + + nv04mt->total_size = offset; +} + +static struct pipe_texture * +nv04_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt) +{ + struct pipe_winsys *ws = screen->winsys; + struct nv04_miptree *mt; + + mt = MALLOC(sizeof(struct nv04_miptree)); + if (!mt) + return NULL; + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = screen; + + nv04_miptree_layout(mt); + + mt->buffer = ws->buffer_create(ws, 256, PIPE_BUFFER_USAGE_PIXEL, + mt->total_size); + if (!mt->buffer) { + FREE(mt); + return NULL; + } + + return &mt->base; +} + +static void +nv04_miptree_release(struct pipe_screen *screen, struct pipe_texture **pt) +{ + struct pipe_texture *mt = *pt; + + *pt = NULL; + if (--mt->refcount <= 0) { + struct nv04_miptree *nv04mt = (struct nv04_miptree *)mt; + int l; + + pipe_buffer_reference(screen, &nv04mt->buffer, NULL); + for (l = 0; l <= mt->last_level; l++) { + if (nv04mt->level[l].image_offset) + FREE(nv04mt->level[l].image_offset); + } + FREE(nv04mt); + } +} + +static struct pipe_surface * +nv04_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned flags) +{ + struct pipe_winsys *ws = pscreen->winsys; + struct nv04_miptree *nv04mt = (struct nv04_miptree *)pt; + struct pipe_surface *ps; + + ps = ws->surface_alloc(ws); + if (!ps) + return NULL; + pipe_buffer_reference(pscreen, &ps->buffer, nv04mt->buffer); + ps->format = pt->format; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->block = pt->block; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = nv04mt->level[level].pitch; + ps->refcount = 1; + ps->winsys = pscreen->winsys; + + if (pt->target == PIPE_TEXTURE_CUBE) { + ps->offset = nv04mt->level[level].image_offset[face]; + } else { + ps->offset = nv04mt->level[level].image_offset[0]; + } + + return ps; +} + +static void +nv04_miptree_surface_del(struct 
pipe_screen *pscreen, + struct pipe_surface **psurface) +{ +} + +void +nv04_init_miptree_functions(struct pipe_screen *pscreen) +{ + pscreen->texture_create = nv04_miptree_create; + pscreen->texture_release = nv04_miptree_release; + pscreen->get_tex_surface = nv04_miptree_surface_new; + pscreen->tex_surface_release = nv04_miptree_surface_del; +} + diff --git a/src/gallium/drivers/nv04/nv04_prim_vbuf.c b/src/gallium/drivers/nv04/nv04_prim_vbuf.c new file mode 100644 index 0000000000..19979fff79 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_prim_vbuf.c @@ -0,0 +1,309 @@ + +#include "pipe/p_debug.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" +#include "pipe/p_compiler.h" + +#include "draw/draw_vbuf.h" + +#include "nv04_context.h" +#include "nv04_state.h" + +#define VERTEX_SIZE 40 +#define VERTEX_BUFFER_SIZE (4096*VERTEX_SIZE) // 4096 vertices of 40 bytes each + +/** + * Primitive renderer for nv04. + */ +struct nv04_vbuf_render { + struct vbuf_render base; + + struct nv04_context *nv04; + + /** Vertex buffer */ + unsigned char* buffer; + + /** Vertex size in bytes */ + unsigned vertex_size; + + /** Current primitive */ + unsigned prim; +}; + + +/** + * Basically a cast wrapper. + */ +static INLINE struct nv04_vbuf_render * +nv04_vbuf_render( struct vbuf_render *render ) +{ + assert(render); + return (struct nv04_vbuf_render *)render; +} + + +static const struct vertex_info * +nv04_vbuf_render_get_vertex_info( struct vbuf_render *render ) +{ + struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + struct nv04_context *nv04 = nv04_render->nv04; + return &nv04->vertex_info; +} + + +static void * +nv04_vbuf_render_allocate_vertices( struct vbuf_render *render, + ushort vertex_size, + ushort nr_vertices ) +{ + struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + + nv04_render->buffer = (unsigned char*) MALLOC(VERTEX_BUFFER_SIZE); + assert(!nv04_render->buffer); + + return nv04_render->buffer; +} + + +static boolean +nv04_vbuf_render_set_primitive( struct vbuf_render *render, + unsigned prim ) +{ + struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + + if (prim <= PIPE_PRIM_LINE_STRIP) + return FALSE; + + nv04_render->prim = prim; + return TRUE; +} + +static INLINE void nv04_2triangles(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5) +{ + BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xA),49); + OUT_RINGp(buffer + VERTEX_SIZE * v0,8); + OUT_RINGp(buffer + VERTEX_SIZE * v1,8); + OUT_RINGp(buffer + VERTEX_SIZE * v2,8); + OUT_RINGp(buffer + VERTEX_SIZE * v3,8); + OUT_RINGp(buffer + VERTEX_SIZE * v4,8); + OUT_RINGp(buffer + VERTEX_SIZE * v5,8); + OUT_RING(0xFEDCBA); +} + +static INLINE void nv04_1triangle(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2) +{ + BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xD),25); + OUT_RINGp(buffer + VERTEX_SIZE * v0,8); + OUT_RINGp(buffer + VERTEX_SIZE * v1,8); + OUT_RINGp(buffer + VERTEX_SIZE * v2,8); + OUT_RING(0xFED); +} + +static INLINE void nv04_1quad(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3) +{ + BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xC),33); + OUT_RINGp(buffer + VERTEX_SIZE * v0,8); + OUT_RINGp(buffer + VERTEX_SIZE * v1,8); + OUT_RINGp(buffer + VERTEX_SIZE * v2,8); + OUT_RINGp(buffer + VERTEX_SIZE * v3,8); + OUT_RING(0xFECEDC); +} + +static void nv04_vbuf_render_triangles_elts(struct nv04_vbuf_render 
* render, const ushort * indices, uint nr_indices) +{ + unsigned char* buffer = render->buffer; + struct nv04_context* nv04 = render->nv04; + int i; + + for( i=0; i< nr_indices-5; i+=6) + nv04_2triangles(nv04, + buffer, + indices[i+0], + indices[i+1], + indices[i+2], + indices[i+3], + indices[i+4], + indices[i+5] + ); + if (i != nr_indices) + { + nv04_1triangle(nv04, + buffer, + indices[i+0], + indices[i+1], + indices[i+2] + ); + i+=3; + } + if (i != nr_indices) + NOUVEAU_ERR("Houston, we have lost some vertices\n"); +} + +static void nv04_vbuf_render_tri_strip_elts(struct nv04_vbuf_render* render, const ushort* indices, uint nr_indices) +{ + const uint32_t striptbl[]={0x321210,0x543432,0x765654,0x987876,0xBA9A98,0xDCBCBA,0xFEDEDC}; + unsigned char* buffer = render->buffer; + struct nv04_context* nv04 = render->nv04; + int i,j; + + for(i = 0; i<nr_indices; i+=14) + { + int numvert = MIN2(16, nr_indices - i); + int numtri = numvert - 2; + if (numvert<3) + break; + + BEGIN_RING( fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), numvert*8 ); + for(j = 0; j<numvert; j++) + OUT_RINGp( buffer + VERTEX_SIZE * indices [i+j], 8 ); + + BEGIN_RING_NI( fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE(0), (numtri+1)/2 ); + for(j = 0; j<numtri/2; j++ ) + OUT_RING(striptbl[j]); + if (numtri%2) + OUT_RING(striptbl[numtri/2]&0xFFF); + } +} + +static void nv04_vbuf_render_tri_fan_elts(struct nv04_vbuf_render* render, const ushort* indices, uint nr_indices) +{ + const uint32_t fantbl[]={0x320210,0x540430,0x760650,0x980870,0xBA0A90,0xDC0CB0,0xFE0ED0}; + unsigned char* buffer = render->buffer; + struct nv04_context* nv04 = render->nv04; + int i,j; + + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), 8); + OUT_RINGp(buffer + VERTEX_SIZE * indices[0], 8); + + for(i = 1; i<nr_indices; i+=14) + { + int numvert=MIN2(15, nr_indices - i); + int numtri=numvert-2; + if (numvert < 3) + break; + + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x1), numvert*8); + + for(j=0;j<numvert;j++) + OUT_RINGp( buffer + VERTEX_SIZE * indices[ i+j ], 8 ); + + BEGIN_RING_NI(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE(0), (numtri+1)/2); + for(j = 0; j<numtri/2; j++) + OUT_RING(fantbl[j]); + if (numtri%2) + OUT_RING(fantbl[numtri/2]&0xFFF); + } +} + +static void nv04_vbuf_render_quads_elts(struct nv04_vbuf_render* render, const ushort* indices, uint nr_indices) +{ + unsigned char* buffer = render->buffer; + struct nv04_context* nv04 = render->nv04; + int i; + + for(i = 0; i < nr_indices; i += 4) + nv04_1quad(nv04, + buffer, + indices[i+0], + indices[i+1], + indices[i+2], + indices[i+3] + ); +} + + +static void +nv04_vbuf_render_draw( struct vbuf_render *render, + const ushort *indices, + uint nr_indices) +{ + struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + + // emit the indices + switch( nv04_render->prim ) + { + case PIPE_PRIM_TRIANGLES: + nv04_vbuf_render_triangles_elts(nv04_render, indices, nr_indices); + break; + case PIPE_PRIM_QUAD_STRIP: + case PIPE_PRIM_TRIANGLE_STRIP: + nv04_vbuf_render_tri_strip_elts(nv04_render, indices, nr_indices); + break; + case PIPE_PRIM_TRIANGLE_FAN: + case PIPE_PRIM_POLYGON: + nv04_vbuf_render_tri_fan_elts(nv04_render, indices, nr_indices); + break; + case PIPE_PRIM_QUADS: + nv04_vbuf_render_quads_elts(nv04_render, indices, nr_indices); + break; + default: + NOUVEAU_ERR("You have to implement primitive %d, young padawan\n", nv04_render->prim); + break; + } +} + + +static void +nv04_vbuf_render_release_vertices( 
struct vbuf_render *render, + void *vertices, + unsigned vertex_size, + unsigned vertices_used ) +{ + struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + + free(nv04_render->buffer); + nv04_render->buffer = NULL; +} + + +static void +nv04_vbuf_render_destroy( struct vbuf_render *render ) +{ + struct nv04_vbuf_render *nv04_render = nv04_vbuf_render(render); + FREE(nv04_render); +} + + +/** + * Create a new primitive render. + */ +static struct vbuf_render * +nv04_vbuf_render_create( struct nv04_context *nv04 ) +{ + struct nv04_vbuf_render *nv04_render = CALLOC_STRUCT(nv04_vbuf_render); + + nv04_render->nv04 = nv04; + + nv04_render->base.max_vertex_buffer_bytes = VERTEX_BUFFER_SIZE; + nv04_render->base.max_indices = 65536; + nv04_render->base.get_vertex_info = nv04_vbuf_render_get_vertex_info; + nv04_render->base.allocate_vertices = nv04_vbuf_render_allocate_vertices; + nv04_render->base.set_primitive = nv04_vbuf_render_set_primitive; + nv04_render->base.draw = nv04_vbuf_render_draw; + nv04_render->base.release_vertices = nv04_vbuf_render_release_vertices; + nv04_render->base.destroy = nv04_vbuf_render_destroy; + + return &nv04_render->base; +} + + +/** + * Create a new primitive vbuf/render stage. + */ +struct draw_stage *nv04_draw_vbuf_stage( struct nv04_context *nv04 ) +{ + struct vbuf_render *render; + struct draw_stage *stage; + + render = nv04_vbuf_render_create(nv04); + if(!render) + return NULL; + + stage = draw_vbuf_stage( nv04->draw, render ); + if(!stage) { + render->destroy(render); + return NULL; + } + + return stage; +} diff --git a/src/gallium/drivers/nv04/nv04_screen.c b/src/gallium/drivers/nv04/nv04_screen.c new file mode 100644 index 0000000000..3966a29ffa --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_screen.c @@ -0,0 +1,212 @@ +#include "pipe/p_screen.h" + +#include "nv04_context.h" +#include "nv04_screen.h" + +static const char * +nv04_screen_get_name(struct pipe_screen *screen) +{ + struct nv04_screen *nv04screen = nv04_screen(screen); + struct nouveau_device *dev = nv04screen->nvws->channel->device; + static char buffer[128]; + + snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); + return buffer; +} + +static const char * +nv04_screen_get_vendor(struct pipe_screen *screen) +{ + return "nouveau"; +} + +static int +nv04_screen_get_param(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return 1; + case PIPE_CAP_NPOT_TEXTURES: + return 0; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 0; + case PIPE_CAP_GLSL: + return 0; + case PIPE_CAP_S3TC: + return 0; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 0; + case PIPE_CAP_POINT_SPRITE: + return 0; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 1; + case PIPE_CAP_OCCLUSION_QUERY: + return 0; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 0; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return 10; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 0; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0; + } +} + +static float +nv04_screen_get_paramf(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 0.0; + case PIPE_CAP_MAX_POINT_WIDTH: + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 0.0; + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 0.0; + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 0.0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0.0; + } +} + +static boolean 
+nv04_screen_is_format_supported(struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, unsigned geom_flags) +{ + if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_Z16_UNORM: + return TRUE; + default: + break; + } + } else { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_X8R8G8B8_UNORM: + case PIPE_FORMAT_A1R5G5B5_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_A8_UNORM: + return TRUE; + default: + break; + } + } + + return FALSE; +} + +static void * +nv04_surface_map(struct pipe_screen *screen, struct pipe_surface *surface, + unsigned flags ) +{ + struct pipe_winsys *ws = screen->winsys; + void *map; + + map = ws->buffer_map(ws, surface->buffer, flags); + if (!map) + return NULL; + + return map + surface->offset; +} + +static void +nv04_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface) +{ + struct pipe_winsys *ws = screen->winsys; + + ws->buffer_unmap(ws, surface->buffer); +} + +static void +nv04_screen_destroy(struct pipe_screen *pscreen) +{ + struct nv04_screen *screen = nv04_screen(pscreen); + struct nouveau_winsys *nvws = screen->nvws; + + nvws->notifier_free(&screen->sync); + nvws->grobj_free(&screen->fahrenheit); + + FREE(pscreen); +} + +struct pipe_screen * +nv04_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ + struct nv04_screen *screen = CALLOC_STRUCT(nv04_screen); + unsigned fahrenheit_class = 0, sub3d_class = 0; + unsigned chipset = nvws->channel->device->chipset; + int ret; + + if (!screen) + return NULL; + screen->nvws = nvws; + + if (chipset>=0x20) { + fahrenheit_class = 0; + sub3d_class = 0; + } else if (chipset>=0x10) { + fahrenheit_class = NV10_DX5_TEXTURED_TRIANGLE; + sub3d_class = NV10_CONTEXT_SURFACES_3D; + } else { + fahrenheit_class=NV04_DX5_TEXTURED_TRIANGLE; + sub3d_class = NV04_CONTEXT_SURFACES_3D; + } + + if (!fahrenheit_class) { + NOUVEAU_ERR("Unknown nv04 chipset: nv%02x\n", chipset); + return NULL; + } + + /* 3D object */ + ret = nvws->grobj_alloc(nvws, fahrenheit_class, &screen->fahrenheit); + if (ret) { + NOUVEAU_ERR("Error creating 3D object: %d\n", ret); + return NULL; + } + + /* 3D surface object */ + ret = nvws->grobj_alloc(nvws, sub3d_class, &screen->context_surfaces_3d); + if (ret) { + NOUVEAU_ERR("Error creating 3D surface object: %d\n", ret); + return NULL; + } + + /* Notifier for sync purposes */ + ret = nvws->notifier_alloc(nvws, 1, &screen->sync); + if (ret) { + NOUVEAU_ERR("Error creating notifier object: %d\n", ret); + nv04_screen_destroy(&screen->pipe); + return NULL; + } + + screen->pipe.winsys = ws; + screen->pipe.destroy = nv04_screen_destroy; + + screen->pipe.get_name = nv04_screen_get_name; + screen->pipe.get_vendor = nv04_screen_get_vendor; + screen->pipe.get_param = nv04_screen_get_param; + screen->pipe.get_paramf = nv04_screen_get_paramf; + + screen->pipe.is_format_supported = nv04_screen_is_format_supported; + + screen->pipe.surface_map = nv04_surface_map; + screen->pipe.surface_unmap = nv04_surface_unmap; + + nv04_screen_init_miptree_functions(&screen->pipe); + + return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv04/nv04_screen.h b/src/gallium/drivers/nv04/nv04_screen.h new file mode 100644 index 0000000000..99a49cdf7a --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_screen.h @@ -0,0 +1,25 @@ +#ifndef __NV04_SCREEN_H__ +#define __NV04_SCREEN_H__ + 
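/*
 * The screen bundles the state every nv04 context shares: the winsys handle,
 * the "fahrenheit" 3D object (NV04_DX5_TEXTURED_TRIANGLE, or the NV10 variants
 * on nv1x chips), the NV04_CONTEXT_SURFACES_3D object and a sync notifier.
 * State trackers only ever see the embedded pipe_screen and query it for
 * capabilities; an illustrative check against the limits returned by
 * nv04_screen_get_param() would be:
 *
 *     if (pscreen->get_param(pscreen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS) < 10)
 *             use_smaller_mipmaps();   /+ hypothetical fallback helper +/
 *
 * (nv04 reports 10 levels, i.e. a 512x512 maximum).
 */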
+#include "pipe/p_screen.h" + +struct nv04_screen { + struct pipe_screen pipe; + + struct nouveau_winsys *nvws; + unsigned chipset; + + /* HW graphics objects */ + struct nouveau_grobj *fahrenheit; + struct nouveau_grobj *context_surfaces_3d; + struct nouveau_notifier *sync; + +}; + +static INLINE struct nv04_screen * +nv04_screen(struct pipe_screen *screen) +{ + return (struct nv04_screen *)screen; +} + +#endif diff --git a/src/gallium/drivers/nv04/nv04_state.c b/src/gallium/drivers/nv04/nv04_state.c new file mode 100644 index 0000000000..ff1933b550 --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_state.c @@ -0,0 +1,495 @@ +#include "draw/draw_context.h" +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv04_context.h" +#include "nv04_state.h" + +static void * +nv04_blend_state_create(struct pipe_context *pipe, + const struct pipe_blend_state *cso) +{ + struct nv04_blend_state *cb; + + cb = MALLOC(sizeof(struct nv04_blend_state)); + + cb->b_enable = cso->blend_enable ? 1 : 0; + cb->b_src = ((nvgl_blend_func(cso->alpha_src_factor)<<16) | + (nvgl_blend_func(cso->rgb_src_factor))); + cb->b_dst = ((nvgl_blend_func(cso->alpha_dst_factor)<<16) | + (nvgl_blend_func(cso->rgb_dst_factor))); + + + return (void *)cb; +} + +static void +nv04_blend_state_bind(struct pipe_context *pipe, void *blend) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + nv04->blend = (struct nv04_blend_state*)blend; + + nv04->dirty |= NV04_NEW_BLEND; +} + +static void +nv04_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ + free(hwcso); +} + + +static INLINE unsigned +wrap_mode(unsigned wrap) { + unsigned ret; + + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_REPEAT; + break; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_MIRRORED_REPEAT; + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE; + break; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_BORDER; + break; + case PIPE_TEX_WRAP_CLAMP: + ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + default: + NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); + ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP; + } + return ret >> NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT; +} + +static void * +nv04_sampler_state_create(struct pipe_context *pipe, + const struct pipe_sampler_state *cso) +{ + + struct nv04_sampler_state *ss; + uint32_t filter = 0; + + ss = MALLOC(sizeof(struct nv04_sampler_state)); + + ss->format = ((wrap_mode(cso->wrap_s) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT) | + (wrap_mode(cso->wrap_t) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_SHIFT)); + + if (cso->max_anisotropy > 1.0) { + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MINIFY_ENABLE | NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MAGNIFY_ENABLE; + } + + switch (cso->mag_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_LINEAR; + break; + case PIPE_TEX_FILTER_NEAREST: + default: + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_NEAREST; + break; + } + + switch (cso->min_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= 
NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR; + break; + } + break; + case PIPE_TEX_FILTER_NEAREST: + default: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST; + break; + } + break; + } + + ss->filter = filter; + + return (void *)ss; +} + +static void +nv04_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ + struct nv04_context *nv04 = nv04_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv04->sampler[unit] = sampler[unit]; + nv04->dirty_samplers |= (1 << unit); + } +} + +static void +nv04_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ + free(hwcso); +} + +static void +nv04_set_sampler_texture(struct pipe_context *pipe, unsigned nr, + struct pipe_texture **miptree) +{ + struct nv04_context *nv04 = nv04_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv04->tex_miptree[unit] = (struct nv04_miptree *)miptree[unit]; + nv04->dirty_samplers |= (1 << unit); + } +} + +static void * +nv04_rasterizer_state_create(struct pipe_context *pipe, + const struct pipe_rasterizer_state *cso) +{ + struct nv04_rasterizer_state *rs; + + /*XXX: ignored: + * scissor + * points/lines (no hw support, emulated with tris in gallium) + */ + rs = MALLOC(sizeof(struct nv04_rasterizer_state)); + + rs->blend = cso->flatshade ? NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_FLAT : NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_GOURAUD; + + return (void *)rs; +} + +static void +nv04_rasterizer_state_bind(struct pipe_context *pipe, void *rast) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + nv04->rast = (struct nv04_rasterizer_state*)rast; + + draw_set_rasterizer_state(nv04->draw, (nv04->rast ? nv04->rast->templ : NULL)); + + nv04->dirty |= NV04_NEW_RAST | NV04_NEW_BLEND; +} + +static void +nv04_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ + free(hwcso); +} + +static INLINE uint32_t nv04_compare_func(uint32_t f) +{ + switch ( f ) { + case PIPE_FUNC_NEVER: return 1; + case PIPE_FUNC_LESS: return 2; + case PIPE_FUNC_EQUAL: return 3; + case PIPE_FUNC_LEQUAL: return 4; + case PIPE_FUNC_GREATER: return 5; + case PIPE_FUNC_NOTEQUAL: return 6; + case PIPE_FUNC_GEQUAL: return 7; + case PIPE_FUNC_ALWAYS: return 8; + } + NOUVEAU_MSG("Unable to find the function\n"); + return 0; +} + +static void * +nv04_depth_stencil_alpha_state_create(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *cso) +{ + struct nv04_depth_stencil_alpha_state *hw; + + hw = MALLOC(sizeof(struct nv04_depth_stencil_alpha_state)); + + hw->control = float_to_ubyte(cso->alpha.ref); + hw->control |= ( nv04_compare_func(cso->alpha.func) << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_FUNC_SHIFT ); + hw->control |= cso->alpha.enabled ? NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_TEST_ENABLE : 0; + hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ORIGIN; + hw->control |= cso->depth.enabled ? 
(1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_ENABLE_SHIFT) : 0; + hw->control |= ( nv04_compare_func(cso->depth.func)<< NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FUNC_SHIFT ); + hw->control |= 1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_SHIFT; // no culling, handled by the draw module + hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_DITHER_ENABLE; + hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_PERSPECTIVE_ENABLE; + hw->control |= cso->depth.writemask ? (1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_WRITE_ENABLE_SHIFT) : 0; + hw->control |= 1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FORMAT_SHIFT; // integer zbuffer format + + return (void *)hw; +} + +static void +nv04_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + nv04->dsa = hwcso; + nv04->dirty |= NV04_NEW_CONTROL; +} + +static void +nv04_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ + free(hwcso); +} + +static void * +nv04_vp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + return draw_create_vertex_shader(nv04->draw, templ); +} + +static void +nv04_vp_state_bind(struct pipe_context *pipe, void *shader) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + draw_bind_vertex_shader(nv04->draw, (struct draw_vertex_shader *) shader); + + nv04->dirty |= NV04_NEW_VERTPROG; +} + +static void +nv04_vp_state_delete(struct pipe_context *pipe, void *shader) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + draw_delete_vertex_shader(nv04->draw, (struct draw_vertex_shader *) shader); +} + +static void * +nv04_fp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv04_fragment_program *fp; + + fp = CALLOC(1, sizeof(struct nv04_fragment_program)); + fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + + return (void *)fp; +} + +static void +nv04_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv04_context *nv04 = nv04_context(pipe); + struct nv04_fragment_program *fp = hwcso; + + nv04->fragprog.current = fp; + nv04->dirty |= NV04_NEW_FRAGPROG; +} + +static void +nv04_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv04_context *nv04 = nv04_context(pipe); + struct nv04_fragment_program *fp = hwcso; + + nv04_fragprog_destroy(nv04, fp); + free((void*)fp->pipe.tokens); + free(fp); +} + +static void +nv04_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *bcol) +{ +} + +static void +nv04_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ +} + +static void +nv04_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, + const struct pipe_constant_buffer *buf ) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + if (shader == PIPE_SHADER_VERTEX) { + nv04->vertprog.constant_buf = buf->buffer; + nv04->dirty |= NV04_NEW_VERTPROG; + } else + if (shader == PIPE_SHADER_FRAGMENT) { + nv04->fragprog.constant_buf = buf->buffer; + nv04->dirty |= NV04_NEW_FRAGPROG; + } +} + +static void +nv04_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct nv04_context *nv04 = nv04_context(pipe); + struct pipe_surface *rt, *zeta; + uint32_t rt_format, w, h; + int colour_format = 0, zeta_format = 0; + + w = fb->cbufs[0]->width; + h = fb->cbufs[0]->height; + colour_format = fb->cbufs[0]->format; + rt = fb->cbufs[0]; + + if (fb->zsbuf) { + if (colour_format) { + assert(w == 
fb->zsbuf->width); + assert(h == fb->zsbuf->height); + } else { + w = fb->zsbuf->width; + h = fb->zsbuf->height; + } + + zeta_format = fb->zsbuf->format; + zeta = fb->zsbuf; + } + + switch (colour_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case 0: + rt_format = 0x108; + break; + case PIPE_FORMAT_R5G6B5_UNORM: + rt_format = 0x103; + break; + default: + assert(0); + } + + BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_FORMAT, 1); + OUT_RING(rt_format); + + /* FIXME pitches have to be aligned ! */ + BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2); + OUT_RING(rt->stride|(zeta->stride<<16)); + OUT_RELOCl(rt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + if (fb->zsbuf) { + BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1); + OUT_RELOCl(zeta->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + } +} + +static void +nv04_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + NOUVEAU_ERR("line stipple hahaha\n"); +} + +static void +nv04_set_scissor_state(struct pipe_context *pipe, + const struct pipe_scissor_state *s) +{ +/* struct nv04_context *nv04 = nv04_context(pipe); + + // XXX + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_SCISSOR_HORIZ, 2); + OUT_RING (((s->maxx - s->minx) << 16) | s->minx); + OUT_RING (((s->maxy - s->miny) << 16) | s->miny);*/ +} + +static void +nv04_set_viewport_state(struct pipe_context *pipe, + const struct pipe_viewport_state *viewport) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + nv04->viewport = *viewport; + + draw_set_viewport_state(nv04->draw, &nv04->viewport); +} + +static void +nv04_set_vertex_buffers(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_buffer *buffers) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + draw_flush(nv04->draw); + + memcpy(nv04->vertex_buffer, buffers, count * sizeof(buffers[0])); + nv04->num_vertex_buffers = count; + + draw_set_vertex_buffers(nv04->draw, count, buffers); +} + +static void +nv04_set_vertex_elements(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_element *elements) +{ + struct nv04_context *nv04 = nv04_context(pipe); + + draw_flush(nv04->draw); + + nv04->num_vertex_elements = count; + draw_set_vertex_elements(nv04->draw, count, elements); +} + +void +nv04_init_state_functions(struct nv04_context *nv04) +{ + nv04->pipe.create_blend_state = nv04_blend_state_create; + nv04->pipe.bind_blend_state = nv04_blend_state_bind; + nv04->pipe.delete_blend_state = nv04_blend_state_delete; + + nv04->pipe.create_sampler_state = nv04_sampler_state_create; + nv04->pipe.bind_sampler_states = nv04_sampler_state_bind; + nv04->pipe.delete_sampler_state = nv04_sampler_state_delete; + nv04->pipe.set_sampler_textures = nv04_set_sampler_texture; + + nv04->pipe.create_rasterizer_state = nv04_rasterizer_state_create; + nv04->pipe.bind_rasterizer_state = nv04_rasterizer_state_bind; + nv04->pipe.delete_rasterizer_state = nv04_rasterizer_state_delete; + + nv04->pipe.create_depth_stencil_alpha_state = nv04_depth_stencil_alpha_state_create; + nv04->pipe.bind_depth_stencil_alpha_state = nv04_depth_stencil_alpha_state_bind; + nv04->pipe.delete_depth_stencil_alpha_state = nv04_depth_stencil_alpha_state_delete; + + nv04->pipe.create_vs_state = nv04_vp_state_create; + nv04->pipe.bind_vs_state = nv04_vp_state_bind; + nv04->pipe.delete_vs_state = nv04_vp_state_delete; + + nv04->pipe.create_fs_state = nv04_fp_state_create; + nv04->pipe.bind_fs_state = nv04_fp_state_bind; + nv04->pipe.delete_fs_state = 
nv04_fp_state_delete; + + nv04->pipe.set_blend_color = nv04_set_blend_color; + nv04->pipe.set_clip_state = nv04_set_clip_state; + nv04->pipe.set_constant_buffer = nv04_set_constant_buffer; + nv04->pipe.set_framebuffer_state = nv04_set_framebuffer_state; + nv04->pipe.set_polygon_stipple = nv04_set_polygon_stipple; + nv04->pipe.set_scissor_state = nv04_set_scissor_state; + nv04->pipe.set_viewport_state = nv04_set_viewport_state; + + nv04->pipe.set_vertex_buffers = nv04_set_vertex_buffers; + nv04->pipe.set_vertex_elements = nv04_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv04/nv04_state.h b/src/gallium/drivers/nv04/nv04_state.h new file mode 100644 index 0000000000..399f750dbe --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_state.h @@ -0,0 +1,71 @@ +#ifndef __NV04_STATE_H__ +#define __NV04_STATE_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv04_blend_state { + uint32_t b_enable; + uint32_t b_src; + uint32_t b_dst; +}; + +struct nv04_fragtex_state { + uint32_t format; +}; + +struct nv04_sampler_state { + uint32_t filter; + uint32_t format; +}; + +struct nv04_depth_stencil_alpha_state { + uint32_t control; +}; + +struct nv04_rasterizer_state { + uint32_t blend; + + const struct pipe_rasterizer_state *templ; +}; + +struct nv04_miptree { + struct pipe_texture base; + + struct pipe_buffer *buffer; + uint total_size; + + struct { + uint pitch; + uint *image_offset; + } level[PIPE_MAX_TEXTURE_LEVELS]; +}; + +struct nv04_fragment_program_data { + unsigned offset; + unsigned index; +}; + +struct nv04_fragment_program { + struct pipe_shader_state pipe; + struct tgsi_shader_info info; + + boolean translated; + boolean on_hw; + unsigned samplers; + + uint32_t *insn; + int insn_len; + + struct nv04_fragment_program_data *consts; + unsigned nr_consts; + + struct pipe_buffer *buffer; + + uint32_t fp_control; + uint32_t fp_reg_control; +}; + + + +#endif diff --git a/src/gallium/drivers/nv04/nv04_state_emit.c b/src/gallium/drivers/nv04/nv04_state_emit.c new file mode 100644 index 0000000000..0ad40a092e --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_state_emit.c @@ -0,0 +1,156 @@ +#include "nv04_context.h" +#include "nv04_state.h" + +static void nv04_vertex_layout(struct pipe_context* pipe) +{ + struct nv04_context *nv04 = nv04_context(pipe); + struct nv04_fragment_program *fp = nv04->fragprog.current; + uint32_t src = 0; + int i; + struct vertex_info vinfo; + + memset(&vinfo, 0, sizeof(vinfo)); + + for (i = 0; i < fp->info.num_inputs; i++) { + switch (i) { + case TGSI_SEMANTIC_POSITION: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++); + break; + case TGSI_SEMANTIC_COLOR: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++); + break; + default: + case TGSI_SEMANTIC_GENERIC: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++); + break; + case TGSI_SEMANTIC_FOG: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++); + break; + } + } + draw_compute_vertex_size(&vinfo); +} + +static uint32_t nv04_blend_func(uint32_t f) +{ + switch ( f ) { + case PIPE_BLENDFACTOR_ZERO: return 0x1; + case PIPE_BLENDFACTOR_ONE: return 0x2; + case PIPE_BLENDFACTOR_SRC_COLOR: return 0x3; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: return 0x4; + case PIPE_BLENDFACTOR_SRC_ALPHA: return 0x5; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: return 0x6; + case PIPE_BLENDFACTOR_DST_ALPHA: return 0x7; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: return 0x8; + case PIPE_BLENDFACTOR_DST_COLOR: return 0x9; + case PIPE_BLENDFACTOR_INV_DST_COLOR: return 0xA; + case 
PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: return 0xB; + } + NOUVEAU_MSG("Unable to find the blend function 0x%x\n",f); + return 0; +} + +static void nv04_emit_control(struct nv04_context* nv04) +{ + uint32_t control = nv04->dsa->control; + + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1); + OUT_RING(control); +} + +static void nv04_emit_blend(struct nv04_context* nv04) +{ + uint32_t blend; + + blend=0x4; // texture MODULATE_ALPHA + blend|=0x20; // alpha is MSB + blend|=(2<<6); // flat shading + blend|=(1<<8); // persp correct + blend|=(0<<16); // no fog + blend|=(nv04->blend->b_enable<<20); + blend|=(nv04_blend_func(nv04->blend->b_src)<<24); + blend|=(nv04_blend_func(nv04->blend->b_dst)<<28); + + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_BLEND, 1); + OUT_RING(blend); +} + +static void nv04_emit_sampler(struct nv04_context *nv04, int unit) +{ + struct nv04_miptree *nv04mt = nv04->tex_miptree[unit]; + struct pipe_texture *pt = &nv04mt->base; + + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 3); + OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); + OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[unit]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/); + OUT_RING(nv04->sampler[unit]->filter); +} + +void +nv04_emit_hw_state(struct nv04_context *nv04) +{ + int i; + + if (nv04->dirty & NV04_NEW_VERTPROG) { + //nv04_vertprog_bind(nv04, nv04->vertprog.current); + nv04->dirty &= ~NV04_NEW_VERTPROG; + } + + if (nv04->dirty & NV04_NEW_FRAGPROG) { + nv04_fragprog_bind(nv04, nv04->fragprog.current); + /*XXX: clear NV04_NEW_FRAGPROG if no new program uploaded */ + nv04->dirty_samplers |= (1<<10); + nv04->dirty_samplers = 0; + } + + if (nv04->dirty & NV04_NEW_CONTROL) { + nv04->dirty &= ~NV04_NEW_CONTROL; + + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1); + OUT_RING(nv04->dsa->control); + } + + if (nv04->dirty & NV04_NEW_BLEND) { + nv04->dirty &= ~NV04_NEW_BLEND; + + nv04_emit_blend(nv04); + } + + if (nv04->dirty & NV04_NEW_SAMPLER) { + nv04->dirty &= ~NV04_NEW_SAMPLER; + + nv04_emit_sampler(nv04, 0); + } + + if (nv04->dirty & NV04_NEW_VIEWPORT) { + nv04->dirty &= ~NV04_NEW_VIEWPORT; +// nv04_state_emit_viewport(nv04); + } + + /* Emit relocs for every referenced buffer. + * This is to ensure the bufmgr has an accurate idea of how + * the buffer is used. This isn't very efficient, but we don't + * seem to take a significant performance hit. Will be improved + * at some point. 
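	 * (A reloc ties a pushbuf entry to a buffer object plus its NOUVEAU_BO_*
	 * placement flags, so the memory manager can patch in the buffer's real
	 * offset if it gets moved before submission.)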
Vertex arrays are emitted by nv04_vbo.c + */ + + /* Render target */ +/* BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_PITCH, 2); + OUT_RING(rt->stride|(zeta->stride<<16)); + OUT_RELOCl(rt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + if (fb->zsbuf) { + BEGIN_RING(context_surfaces_3d, NV04_CONTEXT_SURFACES_3D_OFFSET_ZETA, 1); + OUT_RELOCl(zeta->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + }*/ + + /* Texture images */ + for (i = 0; i < 1; i++) { + if (!(nv04->fp_samplers & (1 << i))) + continue; + struct nv04_miptree *nv04mt = nv04->tex_miptree[i]; + BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 2); + OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); + OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[i]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/); + } +} + diff --git a/src/gallium/drivers/nv04/nv04_surface.c b/src/gallium/drivers/nv04/nv04_surface.c new file mode 100644 index 0000000000..9d9943ed4e --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_surface.c @@ -0,0 +1,64 @@ + +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "nv04_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_winsys.h" +#include "pipe/p_inlines.h" +#include "util/u_tile.h" + +static void +nv04_surface_copy(struct pipe_context *pipe, boolean do_flip, + struct pipe_surface *dest, unsigned destx, unsigned desty, + struct pipe_surface *src, unsigned srcx, unsigned srcy, + unsigned width, unsigned height) +{ + struct nv04_context *nv04 = nv04_context(pipe); + struct nouveau_winsys *nvws = nv04->nvws; + + nvws->surface_copy(nvws, dest, destx, desty, src, srcx, srcy, + width, height); +} + +static void +nv04_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, + unsigned destx, unsigned desty, unsigned width, + unsigned height, unsigned value) +{ + struct nv04_context *nv04 = nv04_context(pipe); + struct nouveau_winsys *nvws = nv04->nvws; + + nvws->surface_fill(nvws, dest, destx, desty, width, height, value); +} + +void +nv04_init_surface_functions(struct nv04_context *nv04) +{ + nv04->pipe.surface_copy = nv04_surface_copy; + nv04->pipe.surface_fill = nv04_surface_fill; +} diff --git a/src/gallium/drivers/nv04/nv04_vbo.c b/src/gallium/drivers/nv04/nv04_vbo.c new file mode 100644 index 0000000000..91f919d48e --- /dev/null +++ b/src/gallium/drivers/nv04/nv04_vbo.c @@ -0,0 +1,71 @@ +#include "draw/draw_context.h" +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv04_context.h" +#include "nv04_state.h" + +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_pushbuf.h" + +boolean nv04_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned prim, unsigned start, unsigned count) +{ + struct nv04_context *nv04 = nv04_context( pipe ); + struct draw_context *draw = nv04->draw; + unsigned i; + + /* + * Map vertex buffers + */ + for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { + if (nv04->vertex_buffer[i].buffer) { + void *buf + = pipe->winsys->buffer_map(pipe->winsys, + nv04->vertex_buffer[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_vertex_buffer(draw, i, buf); + } + } + /* Map index buffer, if present */ + if (indexBuffer) { + void *mapped_indexes + = pipe->winsys->buffer_map(pipe->winsys, indexBuffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes); + } + else { + /* no index/element buffer */ + draw_set_mapped_element_buffer(draw, 0, NULL); + } + + /* draw! */ + draw_arrays(nv04->draw, prim, start, count); + + /* + * unmap vertex/index buffers + */ + for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { + if (nv04->vertex_buffer[i].buffer) { + pipe->winsys->buffer_unmap(pipe->winsys, nv04->vertex_buffer[i].buffer); + draw_set_mapped_vertex_buffer(draw, i, NULL); + } + } + if (indexBuffer) { + pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer); + draw_set_mapped_element_buffer(draw, 0, NULL); + } + + return TRUE; +} + +boolean nv04_draw_arrays( struct pipe_context *pipe, + unsigned prim, unsigned start, unsigned count) +{ + return nv04_draw_elements(pipe, NULL, 0, prim, start, count); +} + + + diff --git a/src/gallium/drivers/nv10/Makefile b/src/gallium/drivers/nv10/Makefile new file mode 100644 index 0000000000..4ba7ce586d --- /dev/null +++ b/src/gallium/drivers/nv10/Makefile @@ -0,0 +1,28 @@ +TOP = ../../../.. 
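# Identical structure to drivers/nv04/Makefile: set LIBNAME plus the list of
# nv10_*.c files and let the shared ../../Makefile.template (included below)
# do the building.  Adding a source file is just another entry in
# DRIVER_SOURCES, e.g. a hypothetical nv10_query.c line alongside the others.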
+include $(TOP)/configs/current + +LIBNAME = nv10 + +DRIVER_SOURCES = \ + nv10_clear.c \ + nv10_context.c \ + nv10_fragprog.c \ + nv10_fragtex.c \ + nv10_miptree.c \ + nv10_prim_vbuf.c \ + nv10_screen.c \ + nv10_state.c \ + nv10_state_emit.c \ + nv10_surface.c \ + nv10_vbo.c + +C_SOURCES = \ + $(COMMON_SOURCES) \ + $(DRIVER_SOURCES) + +ASM_SOURCES = + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/nv10/nv10_clear.c b/src/gallium/drivers/nv10/nv10_clear.c new file mode 100644 index 0000000000..be7e09cf4b --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_clear.c @@ -0,0 +1,12 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv10_context.h" + +void +nv10_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); +} diff --git a/src/gallium/drivers/nv10/nv10_context.c b/src/gallium/drivers/nv10/nv10_context.c new file mode 100644 index 0000000000..4eb4ed9185 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_context.c @@ -0,0 +1,296 @@ +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_winsys.h" + +#include "nv10_context.h" +#include "nv10_screen.h" + +static void +nv10_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + draw_flush(nv10->draw); + + FIRE_RING(fence); +} + +static void +nv10_destroy(struct pipe_context *pipe) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + if (nv10->draw) + draw_destroy(nv10->draw); + + FREE(nv10); +} + +static void nv10_init_hwctx(struct nv10_context *nv10) +{ + struct nv10_screen *screen = nv10->screen; + struct nouveau_winsys *nvws = screen->nvws; + int i; + float projectionmatrix[16]; + + BEGIN_RING(celsius, NV10TCL_DMA_NOTIFY, 1); + OUT_RING (screen->sync->handle); + BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY0, 2); + OUT_RING (nvws->channel->vram->handle); + OUT_RING (nvws->channel->gart->handle); + BEGIN_RING(celsius, NV10TCL_DMA_IN_MEMORY2, 2); + OUT_RING (nvws->channel->vram->handle); + OUT_RING (nvws->channel->vram->handle); + + BEGIN_RING(celsius, NV10TCL_NOP, 1); + OUT_RING (0); + + BEGIN_RING(celsius, NV10TCL_RT_HORIZ, 2); + OUT_RING (0); + OUT_RING (0); + + BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 1); + OUT_RING ((0x7ff<<16)|0x800); + BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_VERT(0), 1); + OUT_RING ((0x7ff<<16)|0x800); + + for (i=1;i<8;i++) { + BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(i), 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_VERT(i), 1); + OUT_RING (0); + } + + BEGIN_RING(celsius, 0x290, 1); + OUT_RING ((0x10<<16)|1); + BEGIN_RING(celsius, 0x3f4, 1); + OUT_RING (0); + + BEGIN_RING(celsius, NV10TCL_NOP, 1); + OUT_RING (0); + + if (nv10->screen->celsius->grclass != NV10TCL) { + /* For nv11, nv17 */ + BEGIN_RING(celsius, 0x120, 3); + OUT_RING (0); + OUT_RING (1); + OUT_RING (2); + + BEGIN_RING(celsius, NV10TCL_NOP, 1); + OUT_RING (0); + } + + BEGIN_RING(celsius, NV10TCL_NOP, 1); + OUT_RING (0); + + /* Set state */ + BEGIN_RING(celsius, NV10TCL_FOG_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_FUNC, 2); + OUT_RING (0x207); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_TX_ENABLE(0), 2); + OUT_RING (0); + OUT_RING (0); + + BEGIN_RING(celsius, NV10TCL_RC_IN_ALPHA(0), 12); + OUT_RING (0x30141010); + OUT_RING (0); + 
OUT_RING (0x20040000); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0x00000c00); + OUT_RING (0); + OUT_RING (0x00000c00); + OUT_RING (0x18000000); + OUT_RING (0x300e0300); + OUT_RING (0x0c091c80); + + BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_DITHER_ENABLE, 2); + OUT_RING (1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_VERTEX_WEIGHT_ENABLE, 2); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_SRC, 4); + OUT_RING (1); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0x8006); + BEGIN_RING(celsius, NV10TCL_STENCIL_MASK, 8); + OUT_RING (0xff); + OUT_RING (0x207); + OUT_RING (0); + OUT_RING (0xff); + OUT_RING (0x1e00); + OUT_RING (0x1e00); + OUT_RING (0x1e00); + OUT_RING (0x1d01); + BEGIN_RING(celsius, NV10TCL_NORMALIZE_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_FOG_ENABLE, 2); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_LIGHT_MODEL, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_COLOR_CONTROL, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_ENABLED_LIGHTS, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_POLYGON_OFFSET_POINT_ENABLE, 3); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_DEPTH_FUNC, 1); + OUT_RING (0x201); + BEGIN_RING(celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_DEPTH_TEST_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_POLYGON_OFFSET_FACTOR, 2); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_POINT_SIZE, 1); + OUT_RING (8); + BEGIN_RING(celsius, NV10TCL_POINT_PARAMETERS_ENABLE, 2); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_LINE_WIDTH, 1); + OUT_RING (8); + BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_POLYGON_MODE_FRONT, 2); + OUT_RING (0x1b02); + OUT_RING (0x1b02); + BEGIN_RING(celsius, NV10TCL_CULL_FACE, 2); + OUT_RING (0x405); + OUT_RING (0x901); + BEGIN_RING(celsius, NV10TCL_POLYGON_SMOOTH_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_CULL_FACE_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_TX_GEN_S(0), 8); + for (i=0;i<8;i++) { + OUT_RING (0); + } + BEGIN_RING(celsius, NV10TCL_FOG_EQUATION_CONSTANT, 3); + OUT_RING (0x3fc00000); /* -1.50 */ + OUT_RING (0xbdb8aa0a); /* -0.09 */ + OUT_RING (0); /* 0.00 */ + + BEGIN_RING(celsius, NV10TCL_NOP, 1); + OUT_RING (0); + + BEGIN_RING(celsius, NV10TCL_FOG_MODE, 2); + OUT_RING (0x802); + OUT_RING (2); + /* for some reason VIEW_MATRIX_ENABLE need to be 6 instead of 4 when + * using texturing, except when using the texture matrix + */ + BEGIN_RING(celsius, NV10TCL_VIEW_MATRIX_ENABLE, 1); + OUT_RING (6); + BEGIN_RING(celsius, NV10TCL_COLOR_MASK, 1); + OUT_RING (0x01010101); + + /* Set vertex component */ + BEGIN_RING(celsius, NV10TCL_VERTEX_COL_4F_R, 4); + OUT_RINGf (1.0); + OUT_RINGf (1.0); + OUT_RINGf (1.0); + OUT_RINGf (1.0); + BEGIN_RING(celsius, NV10TCL_VERTEX_COL2_3F_R, 3); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(celsius, NV10TCL_VERTEX_NOR_3F_X, 3); + OUT_RING (0); + OUT_RING (0); + OUT_RINGf (1.0); + BEGIN_RING(celsius, NV10TCL_VERTEX_TX0_4F_S, 4); + OUT_RINGf (0.0); + OUT_RINGf (0.0); + OUT_RINGf (0.0); + OUT_RINGf (1.0); + BEGIN_RING(celsius, NV10TCL_VERTEX_TX1_4F_S, 4); + OUT_RINGf (0.0); + OUT_RINGf (0.0); + OUT_RINGf (0.0); + OUT_RINGf (1.0); + BEGIN_RING(celsius, NV10TCL_VERTEX_FOG_1F, 1); + OUT_RINGf (0.0); + 
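+	/* Remaining transform defaults: edge flags enabled, an identity
+	 * projection matrix, the full 0..16777216 depth range and an
+	 * initial viewport scale/offset. */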
BEGIN_RING(celsius, NV10TCL_EDGEFLAG_ENABLE, 1); + OUT_RING (1); + + memset(projectionmatrix, 0, sizeof(projectionmatrix)); + BEGIN_RING(celsius, NV10TCL_PROJECTION_MATRIX(0), 16); + projectionmatrix[0*4+0] = 1.0; + projectionmatrix[1*4+1] = 1.0; + projectionmatrix[2*4+2] = 1.0; + projectionmatrix[3*4+3] = 1.0; + for (i=0;i<16;i++) { + OUT_RINGf (projectionmatrix[i]); + } + + BEGIN_RING(celsius, NV10TCL_DEPTH_RANGE_NEAR, 2); + OUT_RING (0.0); + OUT_RINGf (16777216.0); + + BEGIN_RING(celsius, NV10TCL_VIEWPORT_SCALE_X, 4); + OUT_RINGf (-2048.0); + OUT_RINGf (-2048.0); + OUT_RINGf (16777215.0 * 0.5); + OUT_RING (0); + + FIRE_RING (NULL); +} + +static void +nv10_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ +} + +struct pipe_context * +nv10_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ + struct nv10_screen *screen = nv10_screen(pscreen); + struct pipe_winsys *ws = pscreen->winsys; + struct nv10_context *nv10; + struct nouveau_winsys *nvws = screen->nvws; + + nv10 = CALLOC(1, sizeof(struct nv10_context)); + if (!nv10) + return NULL; + nv10->screen = screen; + nv10->pctx_id = pctx_id; + + nv10->nvws = nvws; + + nv10->pipe.winsys = ws; + nv10->pipe.screen = pscreen; + nv10->pipe.destroy = nv10_destroy; + nv10->pipe.set_edgeflags = nv10_set_edgeflags; + nv10->pipe.draw_arrays = nv10_draw_arrays; + nv10->pipe.draw_elements = nv10_draw_elements; + nv10->pipe.clear = nv10_clear; + nv10->pipe.flush = nv10_flush; + + nv10_init_surface_functions(nv10); + nv10_init_state_functions(nv10); + + nv10->draw = draw_create(); + assert(nv10->draw); + draw_set_rasterize_stage(nv10->draw, nv10_draw_vbuf_stage(nv10)); + + nv10_init_hwctx(nv10); + + return &nv10->pipe; +} + diff --git a/src/gallium/drivers/nv10/nv10_context.h b/src/gallium/drivers/nv10/nv10_context.h new file mode 100644 index 0000000000..f3b56de25a --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_context.h @@ -0,0 +1,153 @@ +#ifndef __NV10_CONTEXT_H__ +#define __NV10_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx) \ + struct nv10_screen *ctx = nv10->screen +#include "nouveau/nouveau_push.h" + +#include "nv10_state.h" + +#define NOUVEAU_ERR(fmt, args...) \ + fprintf(stderr, "%s:%d - "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) 
\ + fprintf(stderr, "nouveau: "fmt, ##args); + +#define NV10_NEW_VERTPROG (1 << 0) +#define NV10_NEW_FRAGPROG (1 << 1) +#define NV10_NEW_VTXARRAYS (1 << 2) +#define NV10_NEW_BLEND (1 << 3) +#define NV10_NEW_BLENDCOL (1 << 4) +#define NV10_NEW_RAST (1 << 5) +#define NV10_NEW_DSA (1 << 6) +#define NV10_NEW_VIEWPORT (1 << 7) +#define NV10_NEW_SCISSOR (1 << 8) +#define NV10_NEW_FRAMEBUFFER (1 << 9) + +#include "nv10_screen.h" + +struct nv10_context { + struct pipe_context pipe; + + struct nouveau_winsys *nvws; + struct nv10_screen *screen; + unsigned pctx_id; + + struct draw_context *draw; + + uint32_t dirty; + + struct nv10_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS]; + struct nv10_miptree *tex_miptree[PIPE_MAX_SAMPLERS]; + unsigned dirty_samplers; + unsigned fp_samplers; + unsigned vp_samplers; + + uint32_t rt_enable; + struct pipe_buffer *rt[4]; + struct pipe_buffer *zeta; + uint32_t lma_offset; + + struct nv10_blend_state *blend; + struct pipe_blend_color *blend_color; + struct nv10_rasterizer_state *rast; + struct nv10_depth_stencil_alpha_state *dsa; + struct pipe_viewport_state *viewport; + struct pipe_scissor_state *scissor; + struct pipe_framebuffer_state *framebuffer; + + //struct pipe_buffer *constbuf[PIPE_SHADER_TYPES]; + float *constbuf[PIPE_SHADER_TYPES][32][4]; + unsigned constbuf_nr[PIPE_SHADER_TYPES]; + + struct vertex_info vertex_info; + + struct { + struct pipe_buffer *buffer; + uint32_t format; + } tex[2]; + + unsigned vb_enable; + struct { + struct pipe_buffer *buffer; + unsigned delta; + } vb[16]; + +/* struct { + + struct nouveau_resource *exec_heap; + struct nouveau_resource *data_heap; + + struct nv10_vertex_program *active; + + struct nv10_vertex_program *current; + } vertprog; +*/ + struct { + struct nv10_fragment_program *active; + + struct nv10_fragment_program *current; + struct pipe_buffer *constant_buf; + } fragprog; + + struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; + struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; +}; + +static INLINE struct nv10_context * +nv10_context(struct pipe_context *pipe) +{ + return (struct nv10_context *)pipe; +} + +extern void nv10_init_state_functions(struct nv10_context *nv10); +extern void nv10_init_surface_functions(struct nv10_context *nv10); + +extern void nv10_screen_init_miptree_functions(struct pipe_screen *pscreen); + +/* nv10_clear.c */ +extern void nv10_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + +/* nv10_draw.c */ +extern struct draw_stage *nv10_draw_render_stage(struct nv10_context *nv10); + +/* nv10_fragprog.c */ +extern void nv10_fragprog_bind(struct nv10_context *, + struct nv10_fragment_program *); +extern void nv10_fragprog_destroy(struct nv10_context *, + struct nv10_fragment_program *); + +/* nv10_fragtex.c */ +extern void nv10_fragtex_bind(struct nv10_context *); + +/* nv10_prim_vbuf.c */ +struct draw_stage *nv10_draw_vbuf_stage( struct nv10_context *nv10 ); +extern void nv10_vtxbuf_bind(struct nv10_context* nv10); + +/* nv10_state.c and friends */ +extern void nv10_emit_hw_state(struct nv10_context *nv10); +extern void nv10_state_tex_update(struct nv10_context *nv10); + +/* nv10_vbo.c */ +extern boolean nv10_draw_arrays(struct pipe_context *, unsigned mode, + unsigned start, unsigned count); +extern boolean nv10_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned prim, unsigned start, unsigned count); + + +#endif diff --git a/src/gallium/drivers/nv10/nv10_fragprog.c 
b/src/gallium/drivers/nv10/nv10_fragprog.c new file mode 100644 index 0000000000..698db5a16a --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_fragprog.c @@ -0,0 +1,21 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv10_context.h" + +void +nv10_fragprog_bind(struct nv10_context *nv10, struct nv10_fragment_program *fp) +{ +} + +void +nv10_fragprog_destroy(struct nv10_context *nv10, + struct nv10_fragment_program *fp) +{ +} + diff --git a/src/gallium/drivers/nv10/nv10_fragtex.c b/src/gallium/drivers/nv10/nv10_fragtex.c new file mode 100644 index 0000000000..27f2f87584 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_fragtex.c @@ -0,0 +1,124 @@ +#include "nv10_context.h" +#include "nouveau/nouveau_util.h" + +#define _(m,tf) \ +{ \ + TRUE, \ + PIPE_FORMAT_##m, \ + NV10TCL_TX_FORMAT_FORMAT_##tf, \ +} + +struct nv10_texture_format { + boolean defined; + uint pipe; + int format; +}; + +static struct nv10_texture_format +nv10_texture_formats[] = { + _(A8R8G8B8_UNORM, A8R8G8B8), + _(A1R5G5B5_UNORM, A1R5G5B5), + _(A4R4G4B4_UNORM, A4R4G4B4), + _(L8_UNORM , L8 ), + _(A8_UNORM , A8 ), + _(A8L8_UNORM , A8L8 ), +// _(RGB_DXT1 , DXT1, ), +// _(RGBA_DXT1 , DXT1, ), +// _(RGBA_DXT3 , DXT3, ), +// _(RGBA_DXT5 , DXT5, ), + {}, +}; + +static struct nv10_texture_format * +nv10_fragtex_format(uint pipe_format) +{ + struct nv10_texture_format *tf = nv10_texture_formats; + + while (tf->defined) { + if (tf->pipe == pipe_format) + return tf; + tf++; + } + + return NULL; +} + + +static void +nv10_fragtex_build(struct nv10_context *nv10, int unit) +{ +#if 0 + struct nv10_sampler_state *ps = nv10->tex_sampler[unit]; + struct nv10_miptree *nv10mt = nv10->tex_miptree[unit]; + struct pipe_texture *pt = &nv10mt->base; + struct nv10_texture_format *tf; + uint32_t txf, txs, txp; + + tf = nv10_fragtex_format(pt->format); + if (!tf || !tf->defined) { + NOUVEAU_ERR("Unsupported texture format: 0x%x\n", pt->format); + return; + } + + txf = tf->format << 8; + txf |= (pt->last_level + 1) << 16; + txf |= log2i(pt->width[0]) << 20; + txf |= log2i(pt->height[0]) << 24; + txf |= log2i(pt->depth[0]) << 28; + txf |= 8; + + switch (pt->target) { + case PIPE_TEXTURE_CUBE: + txf |= NV10TCL_TX_FORMAT_CUBE_MAP; + /* fall-through */ + case PIPE_TEXTURE_2D: + txf |= (2<<4); + break; + case PIPE_TEXTURE_1D: + txf |= (1<<4); + break; + default: + NOUVEAU_ERR("Unknown target %d\n", pt->target); + return; + } + + BEGIN_RING(celsius, NV10TCL_TX_OFFSET(unit), 8); + OUT_RELOCl(nv10mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); + OUT_RELOCd(nv10mt->buffer,txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/); + OUT_RING (ps->wrap); + OUT_RING (0x40000000); /* enable */ + OUT_RING (txs); + OUT_RING (ps->filt | 0x2000 /* magic */); + OUT_RING ((pt->width[0] << 16) | pt->height[0]); + OUT_RING (ps->bcol); +#endif +} + +void +nv10_fragtex_bind(struct nv10_context *nv10) +{ +#if 0 + struct nv10_fragment_program *fp = nv10->fragprog.active; + unsigned samplers, unit; + + samplers = nv10->fp_samplers & ~fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + BEGIN_RING(celsius, NV10TCL_TX_ENABLE(unit), 1); + OUT_RING (0); + } + + samplers = nv10->dirty_samplers & fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + nv10_fragtex_build(nv10, unit); + } + + nv10->fp_samplers = 
fp->samplers; +#endif +} + diff --git a/src/gallium/drivers/nv10/nv10_miptree.c b/src/gallium/drivers/nv10/nv10_miptree.c new file mode 100644 index 0000000000..943f9e21e9 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_miptree.c @@ -0,0 +1,149 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv10_context.h" +#include "nv10_screen.h" + +static void +nv10_miptree_layout(struct nv10_miptree *nv10mt) +{ + struct pipe_texture *pt = &nv10mt->base; + boolean swizzled = FALSE; + uint width = pt->width[0], height = pt->height[0]; + uint offset = 0; + int nr_faces, l, f; + + if (pt->target == PIPE_TEXTURE_CUBE) { + nr_faces = 6; + } else { + nr_faces = 1; + } + + for (l = 0; l <= pt->last_level; l++) { + pt->width[l] = width; + pt->height[l] = height; + pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); + pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + + if (swizzled) + nv10mt->level[l].pitch = pt->nblocksx[l] * pt->block.size; + else + nv10mt->level[l].pitch = pt->nblocksx[0] * pt->block.size; + nv10mt->level[l].pitch = (nv10mt->level[l].pitch + 63) & ~63; + + nv10mt->level[l].image_offset = + CALLOC(nr_faces, sizeof(unsigned)); + + width = MAX2(1, width >> 1); + height = MAX2(1, height >> 1); + + } + + for (f = 0; f < nr_faces; f++) { + for (l = 0; l <= pt->last_level; l++) { + nv10mt->level[l].image_offset[f] = offset; + offset += nv10mt->level[l].pitch * pt->height[l]; + } + } + + nv10mt->total_size = offset; +} + +static struct pipe_texture * +nv10_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt) +{ + struct pipe_winsys *ws = screen->winsys; + struct nv10_miptree *mt; + + mt = MALLOC(sizeof(struct nv10_miptree)); + if (!mt) + return NULL; + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = screen; + + nv10_miptree_layout(mt); + + mt->buffer = ws->buffer_create(ws, 256, PIPE_BUFFER_USAGE_PIXEL, + mt->total_size); + if (!mt->buffer) { + FREE(mt); + return NULL; + } + + return &mt->base; +} + +static void +nv10_miptree_release(struct pipe_screen *screen, struct pipe_texture **pt) +{ + struct pipe_texture *mt = *pt; + + *pt = NULL; + if (--mt->refcount <= 0) { + struct nv10_miptree *nv10mt = (struct nv10_miptree *)mt; + int l; + + pipe_buffer_reference(screen, &nv10mt->buffer, NULL); + for (l = 0; l <= mt->last_level; l++) { + if (nv10mt->level[l].image_offset) + FREE(nv10mt->level[l].image_offset); + } + FREE(nv10mt); + } +} + +static void +nv10_miptree_update(struct pipe_context *pipe, struct pipe_texture *mt, + uint face, uint levels) +{ +} + + +static struct pipe_surface * +nv10_miptree_surface_get(struct pipe_screen *screen, struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned flags) +{ + struct pipe_winsys *ws = screen->winsys; + struct nv10_miptree *nv10mt = (struct nv10_miptree *)pt; + struct pipe_surface *ps; + + ps = ws->surface_alloc(ws); + if (!ps) + return NULL; + pipe_buffer_reference(screen, &ps->buffer, nv10mt->buffer); + ps->format = pt->format; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->block = pt->block; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = nv10mt->level[level].pitch; + ps->refcount = 1; + ps->winsys = screen->winsys; + + if (pt->target == PIPE_TEXTURE_CUBE) { + ps->offset = nv10mt->level[level].image_offset[face]; + } else { + ps->offset = nv10mt->level[level].image_offset[0]; + } + + return ps; +} + +static void +nv10_miptree_surface_release(struct pipe_screen 
*screen, + struct pipe_surface **surface) +{ +} + +void nv10_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ + pscreen->texture_create = nv10_miptree_create; + pscreen->texture_release = nv10_miptree_release; + pscreen->get_tex_surface = nv10_miptree_surface_get; + pscreen->tex_surface_release = nv10_miptree_surface_release; +} + diff --git a/src/gallium/drivers/nv10/nv10_prim_vbuf.c b/src/gallium/drivers/nv10/nv10_prim_vbuf.c new file mode 100644 index 0000000000..e7e81d3dff --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_prim_vbuf.c @@ -0,0 +1,245 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Build post-transformation, post-clipping vertex buffers and element + * lists by hooking into the end of the primitive pipeline and + * manipulating the vertex_id field in the vertex headers. + * + * XXX: work in progress + * + * \author José Fonseca <jrfonseca@tungstengraphics.com> + * \author Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "pipe/p_debug.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" + +#include "nv10_context.h" +#include "nv10_state.h" + +#include "draw/draw_vbuf.h" + +/** + * Primitive renderer for nv10. + */ +struct nv10_vbuf_render { + struct vbuf_render base; + + struct nv10_context *nv10; + + /** Vertex buffer */ + struct pipe_buffer* buffer; + + /** Vertex size in bytes */ + unsigned vertex_size; + + /** Hardware primitive */ + unsigned hwprim; +}; + + +void nv10_vtxbuf_bind( struct nv10_context* nv10 ) +{ + int i; + for(i = 0; i < 8; i++) { + BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_ATTRIB_OFFSET(i), 1); + OUT_RING(0/*nv10->vtxbuf*/); + BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_ATTRIB_FORMAT(i) ,1); + OUT_RING(0/*XXX*/); + } +} + +/** + * Basically a cast wrapper. 
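+ * The downcast is safe because struct nv10_vbuf_render embeds its
+ * struct vbuf_render base as the first member.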
+ */ +static INLINE struct nv10_vbuf_render * +nv10_vbuf_render( struct vbuf_render *render ) +{ + assert(render); + return (struct nv10_vbuf_render *)render; +} + + +static const struct vertex_info * +nv10_vbuf_render_get_vertex_info( struct vbuf_render *render ) +{ + struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); + struct nv10_context *nv10 = nv10_render->nv10; + + nv10_emit_hw_state(nv10); + + return &nv10->vertex_info; +} + + +static void * +nv10_vbuf_render_allocate_vertices( struct vbuf_render *render, + ushort vertex_size, + ushort nr_vertices ) +{ + struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); + struct nv10_context *nv10 = nv10_render->nv10; + struct pipe_winsys *winsys = nv10->pipe.winsys; + size_t size = (size_t)vertex_size * (size_t)nr_vertices; + + assert(!nv10_render->buffer); + nv10_render->buffer = winsys->buffer_create(winsys, 64, PIPE_BUFFER_USAGE_VERTEX, size); + + nv10->dirty |= NV10_NEW_VTXARRAYS; + + return winsys->buffer_map(winsys, + nv10_render->buffer, + PIPE_BUFFER_USAGE_CPU_WRITE); +} + + +static boolean +nv10_vbuf_render_set_primitive( struct vbuf_render *render, + unsigned prim ) +{ + struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); + unsigned hwp = nvgl_primitive(prim); + if (hwp == 0) + return FALSE; + + nv10_render->hwprim = hwp; + return TRUE; +} + + +static void +nv10_vbuf_render_draw( struct vbuf_render *render, + const ushort *indices, + uint nr_indices) +{ + struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); + struct nv10_context *nv10 = nv10_render->nv10; + int push, i; + + nv10_emit_hw_state(nv10); + + BEGIN_RING(celsius, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1); + OUT_RELOCl(nv10_render->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); + + BEGIN_RING(celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1); + OUT_RING(nv10_render->hwprim); + + if (nr_indices & 1) { + BEGIN_RING(celsius, NV10TCL_VB_ELEMENT_U32, 1); + OUT_RING (indices[0]); + indices++; nr_indices--; + } + + while (nr_indices) { + // XXX too big/small ? check the size + push = MIN2(nr_indices, 1200 * 2); + + BEGIN_RING_NI(celsius, NV10TCL_VB_ELEMENT_U16, push >> 1); + for (i = 0; i < push; i+=2) + OUT_RING((indices[i+1] << 16) | indices[i]); + + nr_indices -= push; + indices += push; + } + + BEGIN_RING(celsius, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1); + OUT_RING (0); +} + + +static void +nv10_vbuf_render_release_vertices( struct vbuf_render *render, + void *vertices, + unsigned vertex_size, + unsigned vertices_used ) +{ + struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); + struct nv10_context *nv10 = nv10_render->nv10; + struct pipe_winsys *winsys = nv10->pipe.winsys; + struct pipe_screen *pscreen = &nv10->screen->pipe; + + assert(nv10_render->buffer); + winsys->buffer_unmap(winsys, nv10_render->buffer); + pipe_buffer_reference(pscreen, &nv10_render->buffer, NULL); +} + + +static void +nv10_vbuf_render_destroy( struct vbuf_render *render ) +{ + struct nv10_vbuf_render *nv10_render = nv10_vbuf_render(render); + FREE(nv10_render); +} + + +/** + * Create a new primitive render. 
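+ * The renderer advertises a 16 KiB vertex buffer and at most 1024
+ * indices per batch to the draw module's vbuf stage.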
+ */ +static struct vbuf_render * +nv10_vbuf_render_create( struct nv10_context *nv10 ) +{ + struct nv10_vbuf_render *nv10_render = CALLOC_STRUCT(nv10_vbuf_render); + + nv10_render->nv10 = nv10; + + nv10_render->base.max_vertex_buffer_bytes = 16*1024; + nv10_render->base.max_indices = 1024; + nv10_render->base.get_vertex_info = nv10_vbuf_render_get_vertex_info; + nv10_render->base.allocate_vertices = nv10_vbuf_render_allocate_vertices; + nv10_render->base.set_primitive = nv10_vbuf_render_set_primitive; + nv10_render->base.draw = nv10_vbuf_render_draw; + nv10_render->base.release_vertices = nv10_vbuf_render_release_vertices; + nv10_render->base.destroy = nv10_vbuf_render_destroy; + + return &nv10_render->base; +} + + +/** + * Create a new primitive vbuf/render stage. + */ +struct draw_stage *nv10_draw_vbuf_stage( struct nv10_context *nv10 ) +{ + struct vbuf_render *render; + struct draw_stage *stage; + + render = nv10_vbuf_render_create(nv10); + if(!render) + return NULL; + + stage = draw_vbuf_stage( nv10->draw, render ); + if(!stage) { + render->destroy(render); + return NULL; + } + + return stage; +} diff --git a/src/gallium/drivers/nv10/nv10_screen.c b/src/gallium/drivers/nv10/nv10_screen.c new file mode 100644 index 0000000000..27a9edf9bb --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_screen.c @@ -0,0 +1,208 @@ +#include "pipe/p_screen.h" + +#include "nv10_context.h" +#include "nv10_screen.h" + +static const char * +nv10_screen_get_name(struct pipe_screen *screen) +{ + struct nv10_screen *nv10screen = nv10_screen(screen); + struct nouveau_device *dev = nv10screen->nvws->channel->device; + static char buffer[128]; + + snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); + return buffer; +} + +static const char * +nv10_screen_get_vendor(struct pipe_screen *screen) +{ + return "nouveau"; +} + +static int +nv10_screen_get_param(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return 2; + case PIPE_CAP_NPOT_TEXTURES: + return 0; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 0; + case PIPE_CAP_GLSL: + return 0; + case PIPE_CAP_S3TC: + return 0; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 1; + case PIPE_CAP_POINT_SPRITE: + return 0; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 1; + case PIPE_CAP_OCCLUSION_QUERY: + return 0; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 0; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return 12; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 0; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 12; + case NOUVEAU_CAP_HW_VTXBUF: + case NOUVEAU_CAP_HW_IDXBUF: + return 0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0; + } +} + +static float +nv10_screen_get_paramf(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 10.0; + case PIPE_CAP_MAX_POINT_WIDTH: + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 64.0; + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 2.0; + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 4.0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0.0; + } +} + +static boolean +nv10_screen_is_format_supported(struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, unsigned geom_flags) +{ + if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + return TRUE; + default: + break; 
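+		/* no other formats are exposed as render targets */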
+ } + } else { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_A1R5G5B5_UNORM: + case PIPE_FORMAT_A4R4G4B4_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_A8_UNORM: + case PIPE_FORMAT_I8_UNORM: + return TRUE; + default: + break; + } + } + + return FALSE; +} + +static void * +nv10_surface_map(struct pipe_screen *screen, struct pipe_surface *surface, + unsigned flags ) +{ + struct pipe_winsys *ws = screen->winsys; + void *map; + + map = ws->buffer_map(ws, surface->buffer, flags); + if (!map) + return NULL; + + return map + surface->offset; +} + +static void +nv10_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface) +{ + struct pipe_winsys *ws = screen->winsys; + + ws->buffer_unmap(ws, surface->buffer); +} + +static void +nv10_screen_destroy(struct pipe_screen *pscreen) +{ + struct nv10_screen *screen = nv10_screen(pscreen); + struct nouveau_winsys *nvws = screen->nvws; + + nvws->notifier_free(&screen->sync); + nvws->grobj_free(&screen->celsius); + + FREE(pscreen); +} + +struct pipe_screen * +nv10_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ + struct nv10_screen *screen = CALLOC_STRUCT(nv10_screen); + unsigned celsius_class; + unsigned chipset = nvws->channel->device->chipset; + int ret; + + if (!screen) + return NULL; + screen->nvws = nvws; + + /* 3D object */ + if (chipset>=0x20) + celsius_class=NV11TCL; + else if (chipset>=0x17) + celsius_class=NV17TCL; + else if (chipset>=0x11) + celsius_class=NV11TCL; + else + celsius_class=NV10TCL; + + if (!celsius_class) { + NOUVEAU_ERR("Unknown nv1x chipset: nv%02x\n", chipset); + return NULL; + } + + ret = nvws->grobj_alloc(nvws, celsius_class, &screen->celsius); + if (ret) { + NOUVEAU_ERR("Error creating 3D object: %d\n", ret); + return FALSE; + } + + /* Notifier for sync purposes */ + ret = nvws->notifier_alloc(nvws, 1, &screen->sync); + if (ret) { + NOUVEAU_ERR("Error creating notifier object: %d\n", ret); + nv10_screen_destroy(&screen->pipe); + return NULL; + } + + screen->pipe.winsys = ws; + screen->pipe.destroy = nv10_screen_destroy; + + screen->pipe.get_name = nv10_screen_get_name; + screen->pipe.get_vendor = nv10_screen_get_vendor; + screen->pipe.get_param = nv10_screen_get_param; + screen->pipe.get_paramf = nv10_screen_get_paramf; + + screen->pipe.is_format_supported = nv10_screen_is_format_supported; + + screen->pipe.surface_map = nv10_surface_map; + screen->pipe.surface_unmap = nv10_surface_unmap; + + nv10_screen_init_miptree_functions(&screen->pipe); + + return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv10/nv10_screen.h b/src/gallium/drivers/nv10/nv10_screen.h new file mode 100644 index 0000000000..3f8750a13f --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_screen.h @@ -0,0 +1,22 @@ +#ifndef __NV10_SCREEN_H__ +#define __NV10_SCREEN_H__ + +#include "pipe/p_screen.h" + +struct nv10_screen { + struct pipe_screen pipe; + + struct nouveau_winsys *nvws; + + /* HW graphics objects */ + struct nouveau_grobj *celsius; + struct nouveau_notifier *sync; +}; + +static INLINE struct nv10_screen * +nv10_screen(struct pipe_screen *screen) +{ + return (struct nv10_screen *)screen; +} + +#endif diff --git a/src/gallium/drivers/nv10/nv10_state.c b/src/gallium/drivers/nv10/nv10_state.c new file mode 100644 index 0000000000..d2375aa2f6 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_state.c @@ -0,0 +1,588 @@ +#include "draw/draw_context.h" +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" + 
+#include "tgsi/tgsi_parse.h" + +#include "nv10_context.h" +#include "nv10_state.h" + +static void * +nv10_blend_state_create(struct pipe_context *pipe, + const struct pipe_blend_state *cso) +{ + struct nv10_blend_state *cb; + + cb = MALLOC(sizeof(struct nv10_blend_state)); + + cb->b_enable = cso->blend_enable ? 1 : 0; + cb->b_srcfunc = ((nvgl_blend_func(cso->alpha_src_factor)<<16) | + (nvgl_blend_func(cso->rgb_src_factor))); + cb->b_dstfunc = ((nvgl_blend_func(cso->alpha_dst_factor)<<16) | + (nvgl_blend_func(cso->rgb_dst_factor))); + + cb->c_mask = (((cso->colormask & PIPE_MASK_A) ? (0x01<<24) : 0) | + ((cso->colormask & PIPE_MASK_R) ? (0x01<<16) : 0) | + ((cso->colormask & PIPE_MASK_G) ? (0x01<< 8) : 0) | + ((cso->colormask & PIPE_MASK_B) ? (0x01<< 0) : 0)); + + cb->d_enable = cso->dither ? 1 : 0; + + return (void *)cb; +} + +static void +nv10_blend_state_bind(struct pipe_context *pipe, void *blend) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + nv10->blend = (struct nv10_blend_state*)blend; + + nv10->dirty |= NV10_NEW_BLEND; +} + +static void +nv10_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + + +static INLINE unsigned +wrap_mode(unsigned wrap) { + unsigned ret; + + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + ret = NV10TCL_TX_FORMAT_WRAP_S_REPEAT; + break; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + ret = NV10TCL_TX_FORMAT_WRAP_S_MIRRORED_REPEAT; + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + ret = NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_EDGE; + break; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + ret = NV10TCL_TX_FORMAT_WRAP_S_CLAMP_TO_BORDER; + break; + case PIPE_TEX_WRAP_CLAMP: + ret = NV10TCL_TX_FORMAT_WRAP_S_CLAMP; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + default: + NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); + ret = NV10TCL_TX_FORMAT_WRAP_S_REPEAT; + break; + } + + return ret >> NV10TCL_TX_FORMAT_WRAP_S_SHIFT; +} + +static void * +nv10_sampler_state_create(struct pipe_context *pipe, + const struct pipe_sampler_state *cso) +{ + struct nv10_sampler_state *ps; + uint32_t filter = 0; + + ps = MALLOC(sizeof(struct nv10_sampler_state)); + + ps->wrap = ((wrap_mode(cso->wrap_s) << NV10TCL_TX_FORMAT_WRAP_S_SHIFT) | + (wrap_mode(cso->wrap_t) << NV10TCL_TX_FORMAT_WRAP_T_SHIFT)); + + ps->en = 0; + if (cso->max_anisotropy > 1.0) { + /* no idea, binary driver sets it, works without it.. meh.. 
*/ + ps->wrap |= (1 << 5); + +/* if (cso->max_anisotropy >= 16.0) { + ps->en |= NV10TCL_TX_ENABLE_ANISO_16X; + } else + if (cso->max_anisotropy >= 12.0) { + ps->en |= NV10TCL_TX_ENABLE_ANISO_12X; + } else + if (cso->max_anisotropy >= 10.0) { + ps->en |= NV10TCL_TX_ENABLE_ANISO_10X; + } else + if (cso->max_anisotropy >= 8.0) { + ps->en |= NV10TCL_TX_ENABLE_ANISO_8X; + } else + if (cso->max_anisotropy >= 6.0) { + ps->en |= NV10TCL_TX_ENABLE_ANISO_6X; + } else + if (cso->max_anisotropy >= 4.0) { + ps->en |= NV10TCL_TX_ENABLE_ANISO_4X; + } else { + ps->en |= NV10TCL_TX_ENABLE_ANISO_2X; + }*/ + } + + switch (cso->mag_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + filter |= NV10TCL_TX_FILTER_MAGNIFY_LINEAR; + break; + case PIPE_TEX_FILTER_NEAREST: + default: + filter |= NV10TCL_TX_FILTER_MAGNIFY_NEAREST; + break; + } + + switch (cso->min_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV10TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV10TCL_TX_FILTER_MINIFY_LINEAR; + break; + } + break; + case PIPE_TEX_FILTER_NEAREST: + default: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV10TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV10TCL_TX_FILTER_MINIFY_NEAREST; + break; + } + break; + } + + ps->filt = filter; + +/* if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + switch (cso->compare_func) { + case PIPE_FUNC_NEVER: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NEVER; + break; + case PIPE_FUNC_GREATER: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GREATER; + break; + case PIPE_FUNC_EQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_EQUAL; + break; + case PIPE_FUNC_GEQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GEQUAL; + break; + case PIPE_FUNC_LESS: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LESS; + break; + case PIPE_FUNC_NOTEQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NOTEQUAL; + break; + case PIPE_FUNC_LEQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LEQUAL; + break; + case PIPE_FUNC_ALWAYS: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_ALWAYS; + break; + default: + break; + } + }*/ + + ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) | + (float_to_ubyte(cso->border_color[0]) << 16) | + (float_to_ubyte(cso->border_color[1]) << 8) | + (float_to_ubyte(cso->border_color[2]) << 0)); + + return (void *)ps; +} + +static void +nv10_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ + struct nv10_context *nv10 = nv10_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv10->tex_sampler[unit] = sampler[unit]; + nv10->dirty_samplers |= (1 << unit); + } +} + +static void +nv10_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void +nv10_set_sampler_texture(struct pipe_context *pipe, unsigned nr, + struct pipe_texture **miptree) +{ + struct nv10_context *nv10 = nv10_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv10->tex_miptree[unit] = (struct nv10_miptree *)miptree[unit]; + nv10->dirty_samplers |= (1 << unit); + } +} + +static void * +nv10_rasterizer_state_create(struct pipe_context *pipe, + const struct pipe_rasterizer_state *cso) +{ + struct nv10_rasterizer_state *rs; + int i; + + /*XXX: 
ignored: + * light_twoside + * offset_cw/ccw -nohw + * scissor + * point_smooth -nohw + * multisample + * offset_units / offset_scale + */ + rs = MALLOC(sizeof(struct nv10_rasterizer_state)); + + rs->templ = cso; + + rs->shade_model = cso->flatshade ? 0x1d00 : 0x1d01; + + rs->line_width = (unsigned char)(cso->line_width * 8.0) & 0xff; + rs->line_smooth_en = cso->line_smooth ? 1 : 0; + + rs->point_size = *(uint32_t*)&cso->point_size; + + rs->poly_smooth_en = cso->poly_smooth ? 1 : 0; + + if (cso->front_winding == PIPE_WINDING_CCW) { + rs->front_face = NV10TCL_FRONT_FACE_CCW; + rs->poly_mode_front = nvgl_polygon_mode(cso->fill_ccw); + rs->poly_mode_back = nvgl_polygon_mode(cso->fill_cw); + } else { + rs->front_face = NV10TCL_FRONT_FACE_CW; + rs->poly_mode_front = nvgl_polygon_mode(cso->fill_cw); + rs->poly_mode_back = nvgl_polygon_mode(cso->fill_ccw); + } + + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + rs->cull_face_en = 1; + if (cso->front_winding == PIPE_WINDING_CCW) + rs->cull_face = NV10TCL_CULL_FACE_FRONT; + else + rs->cull_face = NV10TCL_CULL_FACE_BACK; + break; + case PIPE_WINDING_CW: + rs->cull_face_en = 1; + if (cso->front_winding == PIPE_WINDING_CW) + rs->cull_face = NV10TCL_CULL_FACE_FRONT; + else + rs->cull_face = NV10TCL_CULL_FACE_BACK; + break; + case PIPE_WINDING_BOTH: + rs->cull_face_en = 1; + rs->cull_face = NV10TCL_CULL_FACE_FRONT_AND_BACK; + break; + case PIPE_WINDING_NONE: + default: + rs->cull_face_en = 0; + rs->cull_face = 0; + break; + } + + if (cso->point_sprite) { + rs->point_sprite = (1 << 0); + for (i = 0; i < 8; i++) { + if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE) + rs->point_sprite |= (1 << (8 + i)); + } + } else { + rs->point_sprite = 0; + } + + return (void *)rs; +} + +static void +nv10_rasterizer_state_bind(struct pipe_context *pipe, void *rast) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + nv10->rast = (struct nv10_rasterizer_state*)rast; + + draw_set_rasterizer_state(nv10->draw, (nv10->rast ? nv10->rast->templ : NULL)); + + nv10->dirty |= NV10_NEW_RAST; +} + +static void +nv10_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void * +nv10_depth_stencil_alpha_state_create(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *cso) +{ + struct nv10_depth_stencil_alpha_state *hw; + + hw = MALLOC(sizeof(struct nv10_depth_stencil_alpha_state)); + + hw->depth.func = nvgl_comparison_op(cso->depth.func); + hw->depth.write_enable = cso->depth.writemask ? 1 : 0; + hw->depth.test_enable = cso->depth.enabled ? 1 : 0; + + hw->stencil.enable = cso->stencil[0].enabled ? 1 : 0; + hw->stencil.wmask = cso->stencil[0].write_mask; + hw->stencil.func = nvgl_comparison_op(cso->stencil[0].func); + hw->stencil.ref = cso->stencil[0].ref_value; + hw->stencil.vmask = cso->stencil[0].value_mask; + hw->stencil.fail = nvgl_stencil_op(cso->stencil[0].fail_op); + hw->stencil.zfail = nvgl_stencil_op(cso->stencil[0].zfail_op); + hw->stencil.zpass = nvgl_stencil_op(cso->stencil[0].zpass_op); + + hw->alpha.enabled = cso->alpha.enabled ? 
1 : 0; + hw->alpha.func = nvgl_comparison_op(cso->alpha.func); + hw->alpha.ref = float_to_ubyte(cso->alpha.ref); + + return (void *)hw; +} + +static void +nv10_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *dsa) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + nv10->dsa = (struct nv10_depth_stencil_alpha_state*)dsa; + + nv10->dirty |= NV10_NEW_DSA; +} + +static void +nv10_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void * +nv10_vp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + return draw_create_vertex_shader(nv10->draw, templ); +} + +static void +nv10_vp_state_bind(struct pipe_context *pipe, void *shader) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + draw_bind_vertex_shader(nv10->draw, (struct draw_vertex_shader *) shader); + + nv10->dirty |= NV10_NEW_VERTPROG; +} + +static void +nv10_vp_state_delete(struct pipe_context *pipe, void *shader) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + draw_delete_vertex_shader(nv10->draw, (struct draw_vertex_shader *) shader); +} + +static void * +nv10_fp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv10_fragment_program *fp; + + fp = CALLOC(1, sizeof(struct nv10_fragment_program)); + fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + + tgsi_scan_shader(cso->tokens, &fp->info); + + return (void *)fp; +} + +static void +nv10_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv10_context *nv10 = nv10_context(pipe); + struct nv10_fragment_program *fp = hwcso; + + nv10->fragprog.current = fp; + nv10->dirty |= NV10_NEW_FRAGPROG; +} + +static void +nv10_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv10_context *nv10 = nv10_context(pipe); + struct nv10_fragment_program *fp = hwcso; + + nv10_fragprog_destroy(nv10, fp); + FREE((void*)fp->pipe.tokens); + FREE(fp); +} + +static void +nv10_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *bcol) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + nv10->blend_color = (struct pipe_blend_color*)bcol; + + nv10->dirty |= NV10_NEW_BLENDCOL; +} + +static void +nv10_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + draw_set_clip_state(nv10->draw, clip); +} + +static void +nv10_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, + const struct pipe_constant_buffer *buf ) +{ + struct nv10_context *nv10 = nv10_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + + assert(shader < PIPE_SHADER_TYPES); + assert(index == 0); + + if (buf) { + void *mapped; + if (buf->size && (mapped = ws->buffer_map(ws, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ))) + { + memcpy(nv10->constbuf[shader], mapped, buf->size); + nv10->constbuf_nr[shader] = + buf->size / (4 * sizeof(float)); + ws->buffer_unmap(ws, buf->buffer); + } + } +} + +static void +nv10_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + nv10->framebuffer = (struct pipe_framebuffer_state*)fb; + + nv10->dirty |= NV10_NEW_FRAMEBUFFER; +} + +static void +nv10_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + NOUVEAU_ERR("line stipple hahaha\n"); +} + +static void +nv10_set_scissor_state(struct pipe_context *pipe, + const struct 
pipe_scissor_state *s) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + nv10->scissor = (struct pipe_scissor_state*)s; + + nv10->dirty |= NV10_NEW_SCISSOR; +} + +static void +nv10_set_viewport_state(struct pipe_context *pipe, + const struct pipe_viewport_state *vpt) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + nv10->viewport = (struct pipe_viewport_state*)vpt; + + draw_set_viewport_state(nv10->draw, nv10->viewport); + + nv10->dirty |= NV10_NEW_VIEWPORT; +} + +static void +nv10_set_vertex_buffers(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_buffer *vb) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + memcpy(nv10->vtxbuf, vb, sizeof(*vb) * count); + nv10->dirty |= NV10_NEW_VTXARRAYS; + + draw_set_vertex_buffers(nv10->draw, count, vb); +} + +static void +nv10_set_vertex_elements(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_element *ve) +{ + struct nv10_context *nv10 = nv10_context(pipe); + + memcpy(nv10->vtxelt, ve, sizeof(*ve) * count); + nv10->dirty |= NV10_NEW_VTXARRAYS; + + draw_set_vertex_elements(nv10->draw, count, ve); +} + +void +nv10_init_state_functions(struct nv10_context *nv10) +{ + nv10->pipe.create_blend_state = nv10_blend_state_create; + nv10->pipe.bind_blend_state = nv10_blend_state_bind; + nv10->pipe.delete_blend_state = nv10_blend_state_delete; + + nv10->pipe.create_sampler_state = nv10_sampler_state_create; + nv10->pipe.bind_sampler_states = nv10_sampler_state_bind; + nv10->pipe.delete_sampler_state = nv10_sampler_state_delete; + nv10->pipe.set_sampler_textures = nv10_set_sampler_texture; + + nv10->pipe.create_rasterizer_state = nv10_rasterizer_state_create; + nv10->pipe.bind_rasterizer_state = nv10_rasterizer_state_bind; + nv10->pipe.delete_rasterizer_state = nv10_rasterizer_state_delete; + + nv10->pipe.create_depth_stencil_alpha_state = + nv10_depth_stencil_alpha_state_create; + nv10->pipe.bind_depth_stencil_alpha_state = + nv10_depth_stencil_alpha_state_bind; + nv10->pipe.delete_depth_stencil_alpha_state = + nv10_depth_stencil_alpha_state_delete; + + nv10->pipe.create_vs_state = nv10_vp_state_create; + nv10->pipe.bind_vs_state = nv10_vp_state_bind; + nv10->pipe.delete_vs_state = nv10_vp_state_delete; + + nv10->pipe.create_fs_state = nv10_fp_state_create; + nv10->pipe.bind_fs_state = nv10_fp_state_bind; + nv10->pipe.delete_fs_state = nv10_fp_state_delete; + + nv10->pipe.set_blend_color = nv10_set_blend_color; + nv10->pipe.set_clip_state = nv10_set_clip_state; + nv10->pipe.set_constant_buffer = nv10_set_constant_buffer; + nv10->pipe.set_framebuffer_state = nv10_set_framebuffer_state; + nv10->pipe.set_polygon_stipple = nv10_set_polygon_stipple; + nv10->pipe.set_scissor_state = nv10_set_scissor_state; + nv10->pipe.set_viewport_state = nv10_set_viewport_state; + + nv10->pipe.set_vertex_buffers = nv10_set_vertex_buffers; + nv10->pipe.set_vertex_elements = nv10_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv10/nv10_state.h b/src/gallium/drivers/nv10/nv10_state.h new file mode 100644 index 0000000000..3a3fd0d4f4 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_state.h @@ -0,0 +1,139 @@ +#ifndef __NV10_STATE_H__ +#define __NV10_STATE_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv10_blend_state { + uint32_t b_enable; + uint32_t b_srcfunc; + uint32_t b_dstfunc; + + uint32_t c_mask; + + uint32_t d_enable; +}; + +struct nv10_sampler_state { + uint32_t wrap; + uint32_t en; + uint32_t filt; + uint32_t bcol; +}; + +struct nv10_rasterizer_state { + uint32_t 
shade_model; + + uint32_t line_width; + uint32_t line_smooth_en; + + uint32_t point_size; + + uint32_t poly_smooth_en; + + uint32_t poly_mode_front; + uint32_t poly_mode_back; + + uint32_t front_face; + uint32_t cull_face; + uint32_t cull_face_en; + + uint32_t point_sprite; + + const struct pipe_rasterizer_state *templ; +}; + +struct nv10_vertex_program_exec { + uint32_t data[4]; + boolean has_branch_offset; + int const_index; +}; + +struct nv10_vertex_program_data { + int index; /* immediates == -1 */ + float value[4]; +}; + +struct nv10_vertex_program { + const struct pipe_shader_state *pipe; + + boolean translated; + struct nv10_vertex_program_exec *insns; + unsigned nr_insns; + struct nv10_vertex_program_data *consts; + unsigned nr_consts; + + struct nouveau_resource *exec; + unsigned exec_start; + struct nouveau_resource *data; + unsigned data_start; + unsigned data_start_min; + + uint32_t ir; + uint32_t or; +}; + +struct nv10_fragment_program_data { + unsigned offset; + unsigned index; +}; + +struct nv10_fragment_program { + struct pipe_shader_state pipe; + struct tgsi_shader_info info; + + boolean translated; + boolean on_hw; + unsigned samplers; + + uint32_t *insn; + int insn_len; + + struct nv10_fragment_program_data *consts; + unsigned nr_consts; + + struct pipe_buffer *buffer; + + uint32_t fp_control; + uint32_t fp_reg_control; +}; + + +struct nv10_depth_stencil_alpha_state { + struct { + uint32_t func; + uint32_t write_enable; + uint32_t test_enable; + } depth; + + struct { + uint32_t enable; + uint32_t wmask; + uint32_t func; + uint32_t ref; + uint32_t vmask; + uint32_t fail; + uint32_t zfail; + uint32_t zpass; + } stencil; + + struct { + uint32_t enabled; + uint32_t func; + uint32_t ref; + } alpha; +}; + +struct nv10_miptree { + struct pipe_texture base; + + struct pipe_buffer *buffer; + uint total_size; + + struct { + uint pitch; + uint *image_offset; + } level[PIPE_MAX_TEXTURE_LEVELS]; +}; + +#endif diff --git a/src/gallium/drivers/nv10/nv10_state_emit.c b/src/gallium/drivers/nv10/nv10_state_emit.c new file mode 100644 index 0000000000..46c7e1d753 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_state_emit.c @@ -0,0 +1,303 @@ +#include "nv10_context.h" +#include "nv10_state.h" + +static void nv10_state_emit_blend(struct nv10_context* nv10) +{ + struct nv10_blend_state *b = nv10->blend; + + BEGIN_RING(celsius, NV10TCL_DITHER_ENABLE, 1); + OUT_RING (b->d_enable); + + BEGIN_RING(celsius, NV10TCL_BLEND_FUNC_ENABLE, 3); + OUT_RING (b->b_enable); + OUT_RING (b->b_srcfunc); + OUT_RING (b->b_dstfunc); + + BEGIN_RING(celsius, NV10TCL_COLOR_MASK, 1); + OUT_RING (b->c_mask); +} + +static void nv10_state_emit_blend_color(struct nv10_context* nv10) +{ + struct pipe_blend_color *c = nv10->blend_color; + + BEGIN_RING(celsius, NV10TCL_BLEND_COLOR, 1); + OUT_RING ((float_to_ubyte(c->color[3]) << 24)| + (float_to_ubyte(c->color[0]) << 16)| + (float_to_ubyte(c->color[1]) << 8) | + (float_to_ubyte(c->color[2]) << 0)); +} + +static void nv10_state_emit_rast(struct nv10_context* nv10) +{ + struct nv10_rasterizer_state *r = nv10->rast; + + BEGIN_RING(celsius, NV10TCL_SHADE_MODEL, 2); + OUT_RING (r->shade_model); + OUT_RING (r->line_width); + + + BEGIN_RING(celsius, NV10TCL_POINT_SIZE, 1); + OUT_RING (r->point_size); + + BEGIN_RING(celsius, NV10TCL_POLYGON_MODE_FRONT, 2); + OUT_RING (r->poly_mode_front); + OUT_RING (r->poly_mode_back); + + + BEGIN_RING(celsius, NV10TCL_CULL_FACE, 2); + OUT_RING (r->cull_face); + OUT_RING (r->front_face); + + BEGIN_RING(celsius, NV10TCL_LINE_SMOOTH_ENABLE, 2); + 
OUT_RING (r->line_smooth_en); + OUT_RING (r->poly_smooth_en); + + BEGIN_RING(celsius, NV10TCL_CULL_FACE_ENABLE, 1); + OUT_RING (r->cull_face_en); +} + +static void nv10_state_emit_dsa(struct nv10_context* nv10) +{ + struct nv10_depth_stencil_alpha_state *d = nv10->dsa; + + BEGIN_RING(celsius, NV10TCL_DEPTH_FUNC, 1); + OUT_RING (d->depth.func); + + BEGIN_RING(celsius, NV10TCL_DEPTH_WRITE_ENABLE, 1); + OUT_RING (d->depth.write_enable); + + BEGIN_RING(celsius, NV10TCL_DEPTH_TEST_ENABLE, 1); + OUT_RING (d->depth.test_enable); + +#if 0 + BEGIN_RING(celsius, NV10TCL_STENCIL_ENABLE, 1); + OUT_RING (d->stencil.enable); + BEGIN_RING(celsius, NV10TCL_STENCIL_MASK, 7); + OUT_RINGp ((uint32_t *)&(d->stencil.wmask), 7); +#endif + + BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_ENABLE, 1); + OUT_RING (d->alpha.enabled); + + BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_FUNC, 1); + OUT_RING (d->alpha.func); + + BEGIN_RING(celsius, NV10TCL_ALPHA_FUNC_REF, 1); + OUT_RING (d->alpha.ref); +} + +static void nv10_state_emit_viewport(struct nv10_context* nv10) +{ +} + +static void nv10_state_emit_scissor(struct nv10_context* nv10) +{ + // XXX this is so not working +/* struct pipe_scissor_state *s = nv10->scissor; + BEGIN_RING(celsius, NV10TCL_SCISSOR_HORIZ, 2); + OUT_RING (((s->maxx - s->minx) << 16) | s->minx); + OUT_RING (((s->maxy - s->miny) << 16) | s->miny);*/ +} + +static void nv10_state_emit_framebuffer(struct nv10_context* nv10) +{ + struct pipe_framebuffer_state* fb = nv10->framebuffer; + struct pipe_surface *rt, *zeta = NULL; + uint32_t rt_format, w, h; + int colour_format = 0, zeta_format = 0; + + w = fb->cbufs[0]->width; + h = fb->cbufs[0]->height; + colour_format = fb->cbufs[0]->format; + rt = fb->cbufs[0]; + + if (fb->zsbuf) { + if (colour_format) { + assert(w == fb->zsbuf->width); + assert(h == fb->zsbuf->height); + } else { + w = fb->zsbuf->width; + h = fb->zsbuf->height; + } + + zeta_format = fb->zsbuf->format; + zeta = fb->zsbuf; + } + + rt_format = NV10TCL_RT_FORMAT_TYPE_LINEAR; + + switch (colour_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case 0: + rt_format |= NV10TCL_RT_FORMAT_COLOR_A8R8G8B8; + break; + case PIPE_FORMAT_R5G6B5_UNORM: + rt_format |= NV10TCL_RT_FORMAT_COLOR_R5G6B5; + break; + default: + assert(0); + } + + if (zeta) { + BEGIN_RING(celsius, NV10TCL_RT_PITCH, 1); + OUT_RING (rt->stride | (zeta->stride << 16)); + } else { + BEGIN_RING(celsius, NV10TCL_RT_PITCH, 1); + OUT_RING (rt->stride | (rt->stride << 16)); + } + + nv10->rt[0] = rt->buffer; + + if (zeta_format) + { + nv10->zeta = zeta->buffer; + } + + BEGIN_RING(celsius, NV10TCL_RT_HORIZ, 3); + OUT_RING ((w << 16) | 0); + OUT_RING ((h << 16) | 0); + OUT_RING (rt_format); + BEGIN_RING(celsius, NV10TCL_VIEWPORT_CLIP_HORIZ(0), 2); + OUT_RING (((w - 1) << 16) | 0 | 0x08000800); + OUT_RING (((h - 1) << 16) | 0 | 0x08000800); +} + +static void nv10_vertex_layout(struct nv10_context *nv10) +{ + struct nv10_fragment_program *fp = nv10->fragprog.current; + uint32_t src = 0; + int i; + struct vertex_info vinfo; + + memset(&vinfo, 0, sizeof(vinfo)); + + for (i = 0; i < fp->info.num_inputs; i++) { + switch (fp->info.input_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++); + break; + case TGSI_SEMANTIC_COLOR: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src++); + break; + default: + case TGSI_SEMANTIC_GENERIC: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src++); + break; + case TGSI_SEMANTIC_FOG: + draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, 
src++); + break; + } + } + draw_compute_vertex_size(&vinfo); +} + +void +nv10_emit_hw_state(struct nv10_context *nv10) +{ + int i; + + if (nv10->dirty & NV10_NEW_VERTPROG) { + //nv10_vertprog_bind(nv10, nv10->vertprog.current); + nv10->dirty &= ~NV10_NEW_VERTPROG; + } + + if (nv10->dirty & NV10_NEW_FRAGPROG) { + nv10_fragprog_bind(nv10, nv10->fragprog.current); + /*XXX: clear NV10_NEW_FRAGPROG if no new program uploaded */ + nv10->dirty_samplers |= (1<<10); + nv10->dirty_samplers = 0; + } + + if (nv10->dirty_samplers || (nv10->dirty & NV10_NEW_FRAGPROG)) { + nv10_fragtex_bind(nv10); + nv10->dirty &= ~NV10_NEW_FRAGPROG; + } + + if (nv10->dirty & NV10_NEW_VTXARRAYS) { + nv10->dirty &= ~NV10_NEW_VTXARRAYS; + nv10_vertex_layout(nv10); + nv10_vtxbuf_bind(nv10); + } + + if (nv10->dirty & NV10_NEW_BLEND) { + nv10->dirty &= ~NV10_NEW_BLEND; + nv10_state_emit_blend(nv10); + } + + if (nv10->dirty & NV10_NEW_BLENDCOL) { + nv10->dirty &= ~NV10_NEW_BLENDCOL; + nv10_state_emit_blend_color(nv10); + } + + if (nv10->dirty & NV10_NEW_RAST) { + nv10->dirty &= ~NV10_NEW_RAST; + nv10_state_emit_rast(nv10); + } + + if (nv10->dirty & NV10_NEW_DSA) { + nv10->dirty &= ~NV10_NEW_DSA; + nv10_state_emit_dsa(nv10); + } + + if (nv10->dirty & NV10_NEW_VIEWPORT) { + nv10->dirty &= ~NV10_NEW_VIEWPORT; + nv10_state_emit_viewport(nv10); + } + + if (nv10->dirty & NV10_NEW_SCISSOR) { + nv10->dirty &= ~NV10_NEW_SCISSOR; + nv10_state_emit_scissor(nv10); + } + + if (nv10->dirty & NV10_NEW_FRAMEBUFFER) { + nv10->dirty &= ~NV10_NEW_FRAMEBUFFER; + nv10_state_emit_framebuffer(nv10); + } + + /* Emit relocs for every referenced buffer. + * This is to ensure the bufmgr has an accurate idea of how + * the buffer is used. This isn't very efficient, but we don't + * seem to take a significant performance hit. Will be improved + * at some point. 
Vertex arrays are emitted by nv10_vbo.c + */ + + /* Render target */ +// XXX figre out who's who for NV10TCL_DMA_* and fill accordingly +// BEGIN_RING(celsius, NV10TCL_DMA_COLOR0, 1); +// OUT_RELOCo(nv10->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(celsius, NV10TCL_COLOR_OFFSET, 1); + OUT_RELOCl(nv10->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + if (nv10->zeta) { +// XXX +// BEGIN_RING(celsius, NV10TCL_DMA_ZETA, 1); +// OUT_RELOCo(nv10->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(celsius, NV10TCL_ZETA_OFFSET, 1); + OUT_RELOCl(nv10->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + /* XXX for when we allocate LMA on nv17 */ +/* BEGIN_RING(celsius, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1); + OUT_RELOCl(nv10->zeta + lma_offset);*/ + } + + /* Vertex buffer */ + BEGIN_RING(celsius, NV10TCL_DMA_VTXBUF0, 1); + OUT_RELOCo(nv10->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(celsius, NV10TCL_COLOR_OFFSET, 1); + OUT_RELOCl(nv10->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + /* Texture images */ + for (i = 0; i < 2; i++) { + if (!(nv10->fp_samplers & (1 << i))) + continue; + BEGIN_RING(celsius, NV10TCL_TX_OFFSET(i), 1); + OUT_RELOCl(nv10->tex[i].buffer, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_GART | NOUVEAU_BO_RD); + BEGIN_RING(celsius, NV10TCL_TX_FORMAT(i), 1); + OUT_RELOCd(nv10->tex[i].buffer, nv10->tex[i].format, + NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD | + NOUVEAU_BO_OR, NV10TCL_TX_FORMAT_DMA0, + NV10TCL_TX_FORMAT_DMA1); + } +} + diff --git a/src/gallium/drivers/nv10/nv10_surface.c b/src/gallium/drivers/nv10/nv10_surface.c new file mode 100644 index 0000000000..be44c7bed5 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_surface.c @@ -0,0 +1,64 @@ + +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "nv10_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_winsys.h" +#include "pipe/p_inlines.h" +#include "util/u_tile.h" + +static void +nv10_surface_copy(struct pipe_context *pipe, boolean do_flip, + struct pipe_surface *dest, unsigned destx, unsigned desty, + struct pipe_surface *src, unsigned srcx, unsigned srcy, + unsigned width, unsigned height) +{ + struct nv10_context *nv10 = nv10_context(pipe); + struct nouveau_winsys *nvws = nv10->nvws; + + nvws->surface_copy(nvws, dest, destx, desty, src, srcx, srcy, + width, height); +} + +static void +nv10_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, + unsigned destx, unsigned desty, unsigned width, + unsigned height, unsigned value) +{ + struct nv10_context *nv10 = nv10_context(pipe); + struct nouveau_winsys *nvws = nv10->nvws; + + nvws->surface_fill(nvws, dest, destx, desty, width, height, value); +} + +void +nv10_init_surface_functions(struct nv10_context *nv10) +{ + nv10->pipe.surface_copy = nv10_surface_copy; + nv10->pipe.surface_fill = nv10_surface_fill; +} diff --git a/src/gallium/drivers/nv10/nv10_vbo.c b/src/gallium/drivers/nv10/nv10_vbo.c new file mode 100644 index 0000000000..d0e788ac03 --- /dev/null +++ b/src/gallium/drivers/nv10/nv10_vbo.c @@ -0,0 +1,77 @@ +#include "draw/draw_context.h" +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv10_context.h" +#include "nv10_state.h" + +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_pushbuf.h" + +boolean nv10_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned prim, unsigned start, unsigned count) +{ + struct nv10_context *nv10 = nv10_context( pipe ); + struct draw_context *draw = nv10->draw; + unsigned i; + + nv10_emit_hw_state(nv10); + + /* + * Map vertex buffers + */ + for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { + if (nv10->vtxbuf[i].buffer) { + void *buf + = pipe->winsys->buffer_map(pipe->winsys, + nv10->vtxbuf[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_vertex_buffer(draw, i, buf); + } + } + /* Map index buffer, if present */ + if (indexBuffer) { + void *mapped_indexes + = pipe->winsys->buffer_map(pipe->winsys, indexBuffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes); + } + else { + /* no index/element buffer */ + draw_set_mapped_element_buffer(draw, 0, NULL); + } + + draw_set_mapped_constant_buffer(draw, + nv10->constbuf[PIPE_SHADER_VERTEX], + nv10->constbuf_nr[PIPE_SHADER_VERTEX]); + + /* draw! */ + draw_arrays(nv10->draw, prim, start, count); + + /* + * unmap vertex/index buffers + */ + for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { + if (nv10->vtxbuf[i].buffer) { + pipe->winsys->buffer_unmap(pipe->winsys, nv10->vtxbuf[i].buffer); + draw_set_mapped_vertex_buffer(draw, i, NULL); + } + } + if (indexBuffer) { + pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer); + draw_set_mapped_element_buffer(draw, 0, NULL); + } + + return TRUE; +} + +boolean nv10_draw_arrays( struct pipe_context *pipe, + unsigned prim, unsigned start, unsigned count) +{ + return nv10_draw_elements(pipe, NULL, 0, prim, start, count); +} + + + diff --git a/src/gallium/drivers/nv20/Makefile b/src/gallium/drivers/nv20/Makefile new file mode 100644 index 0000000000..d777fd3d8b --- /dev/null +++ b/src/gallium/drivers/nv20/Makefile @@ -0,0 +1,29 @@ +TOP = ../../../.. 
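+# Note: nv20_vertprog.c is not built yet (it is commented out in
+# DRIVER_SOURCES below); vertex programs currently go through the
+# draw module instead.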
+include $(TOP)/configs/current + +LIBNAME = nv20 + +DRIVER_SOURCES = \ + nv20_clear.c \ + nv20_context.c \ + nv20_fragprog.c \ + nv20_fragtex.c \ + nv20_miptree.c \ + nv20_prim_vbuf.c \ + nv20_screen.c \ + nv20_state.c \ + nv20_state_emit.c \ + nv20_surface.c \ + nv20_vbo.c +# nv20_vertprog.c + +C_SOURCES = \ + $(COMMON_SOURCES) \ + $(DRIVER_SOURCES) + +ASM_SOURCES = + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/nv20/nv20_clear.c b/src/gallium/drivers/nv20/nv20_clear.c new file mode 100644 index 0000000000..81b6f3e78a --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_clear.c @@ -0,0 +1,12 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv20_context.h" + +void +nv20_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); +} diff --git a/src/gallium/drivers/nv20/nv20_context.c b/src/gallium/drivers/nv20/nv20_context.c new file mode 100644 index 0000000000..9a17f4af57 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_context.c @@ -0,0 +1,419 @@ +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_winsys.h" + +#include "nv20_context.h" +#include "nv20_screen.h" + +static void +nv20_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + draw_flush(nv20->draw); + + FIRE_RING(fence); +} + +static void +nv20_destroy(struct pipe_context *pipe) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + if (nv20->draw) + draw_destroy(nv20->draw); + + FREE(nv20); +} + +static void nv20_init_hwctx(struct nv20_context *nv20) +{ + struct nv20_screen *screen = nv20->screen; + struct nouveau_winsys *nvws = screen->nvws; + int i; + float projectionmatrix[16]; + const boolean is_nv25tcl = (nv20->screen->kelvin->grclass == NV25TCL); + + BEGIN_RING(kelvin, NV20TCL_DMA_NOTIFY, 1); + OUT_RING (screen->sync->handle); + BEGIN_RING(kelvin, NV20TCL_DMA_TEXTURE0, 2); + OUT_RING (nvws->channel->vram->handle); + OUT_RING (nvws->channel->gart->handle); /* TEXTURE1 */ + BEGIN_RING(kelvin, NV20TCL_DMA_COLOR, 2); + OUT_RING (nvws->channel->vram->handle); + OUT_RING (nvws->channel->vram->handle); /* ZETA */ + + BEGIN_RING(kelvin, NV20TCL_DMA_QUERY, 1); + OUT_RING (0); /* renouveau: beef0351, unique */ + + BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 2); + OUT_RING (0); + OUT_RING (0); + + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 1); + OUT_RING ((0xfff << 16) | 0x0); + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_VERT(0), 1); + OUT_RING ((0xfff << 16) | 0x0); + + for (i = 1; i < NV20TCL_VIEWPORT_CLIP_HORIZ__SIZE; i++) { + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(i), 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_VERT(i), 1); + OUT_RING (0); + } + + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_MODE, 1); + OUT_RING (0); + + BEGIN_RING(kelvin, 0x17e0, 3); + OUT_RINGf (0.0); + OUT_RINGf (0.0); + OUT_RINGf (1.0); + + if (is_nv25tcl) { + BEGIN_RING(kelvin, NV20TCL_TX_RCOMP, 1); + OUT_RING (NV20TCL_TX_RCOMP_LEQUAL | 0xdb0); + } else { + BEGIN_RING(kelvin, 0x1e68, 1); + OUT_RING (0x4b800000); /* 16777216.000000 */ + BEGIN_RING(kelvin, NV20TCL_TX_RCOMP, 1); + OUT_RING (NV20TCL_TX_RCOMP_LEQUAL); + } + + BEGIN_RING(kelvin, 0x290, 1); + OUT_RING ((0x10 << 16) | 1); + BEGIN_RING(kelvin, 0x9fc, 1); + OUT_RING (0); + BEGIN_RING(kelvin, 0x1d80, 1); + OUT_RING (1); + BEGIN_RING(kelvin, 0x9f8, 1); + OUT_RING (4); + 
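+	/* The raw 0x#### methods poked in this init sequence have no
+	 * symbolic names in the headers; the values presumably mirror the
+	 * initial state captured in renouveau dumps of the binary driver. */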
BEGIN_RING(kelvin, 0x17ec, 3); + OUT_RINGf (0.0); + OUT_RINGf (1.0); + OUT_RINGf (0.0); + + if (is_nv25tcl) { + BEGIN_RING(kelvin, 0x1d88, 1); + OUT_RING (3); + + BEGIN_RING(kelvin, NV25TCL_DMA_IN_MEMORY9, 1); + OUT_RING (nvws->channel->vram->handle); + BEGIN_RING(kelvin, NV25TCL_DMA_IN_MEMORY8, 1); + OUT_RING (nvws->channel->vram->handle); + } + BEGIN_RING(kelvin, NV20TCL_DMA_FENCE, 1); + OUT_RING (0); /* renouveau: beef1e10 */ + + BEGIN_RING(kelvin, 0x1e98, 1); + OUT_RING (0); +#if 0 + if (is_nv25tcl) { + BEGIN_RING(NvSub3D, NV25TCL_DMA_IN_MEMORY4, 2); + OUT_RING (NvDmaTT); /* renouveau: beef0202 */ + OUT_RING (NvDmaFB); /* renouveau: beef0201 */ + + BEGIN_RING(NvSub3D, NV20TCL_DMA_TEXTURE1, 1); + OUT_RING (NvDmaTT); /* renouveau: beef0202 */ + } +#endif + BEGIN_RING(kelvin, NV20TCL_NOTIFY, 1); + OUT_RING (0); + + BEGIN_RING(kelvin, 0x120, 3); + OUT_RING (0); + OUT_RING (1); + OUT_RING (2); + +/* error: ILLEGAL_MTHD, PROTECTION_FAULT + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_TRANSLATE_X, 4); + OUT_RINGf (0.0); + OUT_RINGf (512.0); + OUT_RINGf (0.0); + OUT_RINGf (0.0); +*/ + + if (is_nv25tcl) { + BEGIN_RING(kelvin, 0x022c, 2); + OUT_RING (0x280); + OUT_RING (0x07d28000); + } + +/* * illegal method, protection fault + BEGIN_RING(NvSub3D, 0x1c2c, 1); + OUT_RING (0); */ + + if (is_nv25tcl) { + BEGIN_RING(kelvin, 0x1da4, 1); + OUT_RING (0); + } + +/* * crashes with illegal method, protection fault + BEGIN_RING(NvSub3D, 0x1c18, 1); + OUT_RING (0x200); */ + + BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 2); + OUT_RING ((0 << 16) | 0); + OUT_RING ((0 << 16) | 0); + + /* *** Set state *** */ + + BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_FUNC, 2); + OUT_RING (NV20TCL_ALPHA_FUNC_FUNC_ALWAYS); + OUT_RING (0); /* NV20TCL_ALPHA_FUNC_REF */ + + for (i = 0; i < NV20TCL_TX_ENABLE__SIZE; ++i) { + BEGIN_RING(kelvin, NV20TCL_TX_ENABLE(i), 1); + OUT_RING (0); + } + BEGIN_RING(kelvin, NV20TCL_TX_SHADER_OP, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_TX_SHADER_CULL_MODE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_RC_IN_ALPHA(0), 4); + OUT_RING (0x30d410d0); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_RC_OUT_RGB(0), 4); + OUT_RING (0x00000c00); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_RC_ENABLE, 1); + OUT_RING (0x00011101); + BEGIN_RING(kelvin, NV20TCL_RC_FINAL0, 2); + OUT_RING (0x130e0300); + OUT_RING (0x0c091c80); + BEGIN_RING(kelvin, NV20TCL_RC_OUT_ALPHA(0), 4); + OUT_RING (0x00000c00); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_RC_IN_RGB(0), 4); + OUT_RING (0x20c400c0); + OUT_RING (0); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_RC_COLOR0, 2); + OUT_RING (0); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_RC_CONSTANT_COLOR0(0), 4); + OUT_RING (0x035125a0); + OUT_RING (0); + OUT_RING (0x40002000); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_MULTISAMPLE_CONTROL, 1); + OUT_RING (0xffff0000); + + BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_DITHER_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_STENCIL_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_SRC, 4); + OUT_RING (NV20TCL_BLEND_FUNC_SRC_ONE); + OUT_RING (NV20TCL_BLEND_FUNC_DST_ZERO); + OUT_RING (0); /* NV20TCL_BLEND_COLOR */ + OUT_RING (NV20TCL_BLEND_EQUATION_FUNC_ADD); + BEGIN_RING(kelvin, NV20TCL_STENCIL_MASK, 7); + OUT_RING (0xff); + OUT_RING (NV20TCL_STENCIL_FUNC_FUNC_ALWAYS); + OUT_RING (0); /* 
NV20TCL_STENCIL_FUNC_REF */ + OUT_RING (0xff); /* NV20TCL_STENCIL_FUNC_MASK */ + OUT_RING (NV20TCL_STENCIL_OP_FAIL_KEEP); + OUT_RING (NV20TCL_STENCIL_OP_ZFAIL_KEEP); + OUT_RING (NV20TCL_STENCIL_OP_ZPASS_KEEP); + + BEGIN_RING(kelvin, NV20TCL_COLOR_LOGIC_OP_ENABLE, 2); + OUT_RING (0); + OUT_RING (NV20TCL_COLOR_LOGIC_OP_OP_COPY); + BEGIN_RING(kelvin, 0x17cc, 1); + OUT_RING (0); + if (is_nv25tcl) { + BEGIN_RING(kelvin, 0x1d84, 1); + OUT_RING (1); + } + BEGIN_RING(kelvin, NV20TCL_LIGHTING_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_LIGHT_CONTROL, 1); + OUT_RING (0x00020000); + BEGIN_RING(kelvin, NV20TCL_SEPARATE_SPECULAR_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_LIGHT_MODEL_TWO_SIDE_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_ENABLED_LIGHTS, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_NORMALIZE_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_POLYGON_STIPPLE_PATTERN(0), + NV20TCL_POLYGON_STIPPLE_PATTERN__SIZE); + for (i = 0; i < NV20TCL_POLYGON_STIPPLE_PATTERN__SIZE; ++i) { + OUT_RING(0xffffffff); + } + + BEGIN_RING(kelvin, NV20TCL_POLYGON_OFFSET_POINT_ENABLE, 3); + OUT_RING (0); + OUT_RING (0); /* NV20TCL.POLYGON_OFFSET_LINE_ENABLE */ + OUT_RING (0); /* NV20TCL.POLYGON_OFFSET_FILL_ENABLE */ + BEGIN_RING(kelvin, NV20TCL_DEPTH_FUNC, 1); + OUT_RING (NV20TCL_DEPTH_FUNC_LESS); + BEGIN_RING(kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1); + OUT_RING (1); + BEGIN_RING(kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_POLYGON_OFFSET_FACTOR, 2); + OUT_RINGf (0.0); + OUT_RINGf (0.0); /* NV20TCL.POLYGON_OFFSET_UNITS */ + BEGIN_RING(kelvin, NV20TCL_DEPTH_UNK17D8, 1); + OUT_RING (1); + if (!is_nv25tcl) { + BEGIN_RING(kelvin, 0x1d84, 1); + OUT_RING (3); + } + BEGIN_RING(kelvin, NV20TCL_POINT_SIZE, 1); + if (!is_nv25tcl) { + OUT_RING (8); + } else { + OUT_RINGf (1.0); + } + if (!is_nv25tcl) { + BEGIN_RING(kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 2); + OUT_RING (0); + OUT_RING (0); /* NV20TCL.POINT_SMOOTH_ENABLE */ + } else { + BEGIN_RING(kelvin, NV20TCL_POINT_PARAMETERS_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, 0x0a1c, 1); + OUT_RING (0x800); + } + BEGIN_RING(kelvin, NV20TCL_LINE_WIDTH, 1); + OUT_RING (8); + BEGIN_RING(kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_POLYGON_MODE_FRONT, 2); + OUT_RING (NV20TCL_POLYGON_MODE_FRONT_FILL); + OUT_RING (NV20TCL_POLYGON_MODE_BACK_FILL); + BEGIN_RING(kelvin, NV20TCL_CULL_FACE, 2); + OUT_RING (NV20TCL_CULL_FACE_BACK); + OUT_RING (NV20TCL_FRONT_FACE_CCW); + BEGIN_RING(kelvin, NV20TCL_POLYGON_SMOOTH_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_CULL_FACE_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_SHADE_MODEL, 1); + OUT_RING (NV20TCL_SHADE_MODEL_SMOOTH); + BEGIN_RING(kelvin, NV20TCL_POLYGON_STIPPLE_ENABLE, 1); + OUT_RING (0); + BEGIN_RING(kelvin, NV20TCL_TX_GEN_S(0), 4 * NV20TCL_TX_GEN_S__SIZE); + for (i=0; i < 4 * NV20TCL_TX_GEN_S__SIZE; ++i) { + OUT_RING(0); + } + BEGIN_RING(kelvin, NV20TCL_FOG_EQUATION_CONSTANT, 3); + OUT_RINGf (1.5); + OUT_RINGf (-0.090168); /* NV20TCL.FOG_EQUATION_LINEAR */ + OUT_RINGf (0.0); /* NV20TCL.FOG_EQUATION_QUADRATIC */ + BEGIN_RING(kelvin, NV20TCL_FOG_MODE, 2); + OUT_RING (NV20TCL_FOG_MODE_EXP_2); + OUT_RING (NV20TCL_FOG_COORD_DIST_COORD_FOG); + BEGIN_RING(kelvin, NV20TCL_FOG_ENABLE, 2); + OUT_RING (0); + OUT_RING (0); /* NV20TCL.FOG_COLOR */ + BEGIN_RING(kelvin, NV20TCL_ENGINE, 1); + OUT_RING (NV20TCL_ENGINE_FIXED); + + for (i = 0; i < NV20TCL_TX_MATRIX_ENABLE__SIZE; ++i) { + 
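+		/* leave every texture-coordinate matrix disabled */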
BEGIN_RING(kelvin, NV20TCL_TX_MATRIX_ENABLE(i), 1); + OUT_RING (0); + } + + BEGIN_RING(kelvin, NV20TCL_VTX_ATTR_4F_X(1), 4 * 15); + OUT_RINGf(1.0); OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(1.0); + OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(1.0); OUT_RINGf(1.0); + OUT_RINGf(1.0); OUT_RINGf(1.0); OUT_RINGf(1.0); OUT_RINGf(1.0); + for (i = 4; i < 16; ++i) { + OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(0.0); OUT_RINGf(1.0); + } + + BEGIN_RING(kelvin, NV20TCL_EDGEFLAG_ENABLE, 1); + OUT_RING (1); + BEGIN_RING(kelvin, NV20TCL_COLOR_MASK, 1); + OUT_RING (0x00010101); + BEGIN_RING(kelvin, NV20TCL_CLEAR_VALUE, 1); + OUT_RING (0); + + memset(projectionmatrix, 0, sizeof(projectionmatrix)); + projectionmatrix[0*4+0] = 1.0; + projectionmatrix[1*4+1] = 1.0; + projectionmatrix[2*4+2] = 1.0; + projectionmatrix[3*4+3] = 1.0; + BEGIN_RING(kelvin, NV20TCL_PROJECTION_MATRIX(0), 16); + for (i = 0; i < 16; i++) { + OUT_RINGf (projectionmatrix[i]); + } + + BEGIN_RING(kelvin, NV20TCL_DEPTH_RANGE_NEAR, 2); + OUT_RINGf (0.0); + OUT_RINGf (16777216.0); /* bpp dependant? */ + + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_SCALE0_X, 4); + OUT_RINGf (-2048.0); + OUT_RINGf (-2048.0); + OUT_RINGf (16777215.0 * 0.5); + OUT_RING (0); + + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_SCALE1_X, 4); + OUT_RINGf (-2048.0); + OUT_RINGf (-2048.0); + OUT_RINGf (16777215.0 * 0.5); + OUT_RING (0); + + FIRE_RING (NULL); +} + +static void +nv20_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ +} + +struct pipe_context * +nv20_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ + struct nv20_screen *screen = nv20_screen(pscreen); + struct pipe_winsys *ws = pscreen->winsys; + struct nv20_context *nv20; + struct nouveau_winsys *nvws = screen->nvws; + + nv20 = CALLOC(1, sizeof(struct nv20_context)); + if (!nv20) + return NULL; + nv20->screen = screen; + nv20->pctx_id = pctx_id; + + nv20->nvws = nvws; + + nv20->pipe.winsys = ws; + nv20->pipe.screen = pscreen; + nv20->pipe.destroy = nv20_destroy; + nv20->pipe.set_edgeflags = nv20_set_edgeflags; + nv20->pipe.draw_arrays = nv20_draw_arrays; + nv20->pipe.draw_elements = nv20_draw_elements; + nv20->pipe.clear = nv20_clear; + nv20->pipe.flush = nv20_flush; + + nv20_init_surface_functions(nv20); + nv20_init_state_functions(nv20); + + nv20->draw = draw_create(); + assert(nv20->draw); + draw_set_rasterize_stage(nv20->draw, nv20_draw_vbuf_stage(nv20)); + + nv20_init_hwctx(nv20); + + return &nv20->pipe; +} + diff --git a/src/gallium/drivers/nv20/nv20_context.h b/src/gallium/drivers/nv20/nv20_context.h new file mode 100644 index 0000000000..8ad926db20 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_context.h @@ -0,0 +1,153 @@ +#ifndef __NV20_CONTEXT_H__ +#define __NV20_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx) \ + struct nv20_screen *ctx = nv20->screen +#include "nouveau/nouveau_push.h" + +#include "nv20_state.h" + +#define NOUVEAU_ERR(fmt, args...) \ + fprintf(stderr, "%s:%d - "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) 
\ + fprintf(stderr, "nouveau: "fmt, ##args); + +#define NV20_NEW_VERTPROG (1 << 0) +#define NV20_NEW_FRAGPROG (1 << 1) +#define NV20_NEW_VTXARRAYS (1 << 2) +#define NV20_NEW_BLEND (1 << 3) +#define NV20_NEW_BLENDCOL (1 << 4) +#define NV20_NEW_RAST (1 << 5) +#define NV20_NEW_DSA (1 << 6) +#define NV20_NEW_VIEWPORT (1 << 7) +#define NV20_NEW_SCISSOR (1 << 8) +#define NV20_NEW_FRAMEBUFFER (1 << 9) + +#include "nv20_screen.h" + +struct nv20_context { + struct pipe_context pipe; + + struct nouveau_winsys *nvws; + struct nv20_screen *screen; + unsigned pctx_id; + + struct draw_context *draw; + + uint32_t dirty; + + struct nv20_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS]; + struct nv20_miptree *tex_miptree[PIPE_MAX_SAMPLERS]; + unsigned dirty_samplers; + unsigned fp_samplers; + unsigned vp_samplers; + + uint32_t rt_enable; + struct pipe_buffer *rt[4]; + struct pipe_buffer *zeta; + uint32_t lma_offset; + + struct nv20_blend_state *blend; + struct pipe_blend_color *blend_color; + struct nv20_rasterizer_state *rast; + struct nv20_depth_stencil_alpha_state *dsa; + struct pipe_viewport_state *viewport; + struct pipe_scissor_state *scissor; + struct pipe_framebuffer_state *framebuffer; + + //struct pipe_buffer *constbuf[PIPE_SHADER_TYPES]; + float *constbuf[PIPE_SHADER_TYPES][32][4]; + unsigned constbuf_nr[PIPE_SHADER_TYPES]; + + struct vertex_info vertex_info; + + struct { + struct pipe_buffer *buffer; + uint32_t format; + } tex[2]; + + unsigned vb_enable; + struct { + struct pipe_buffer *buffer; + unsigned delta; + } vb[16]; + +/* struct { + + struct nouveau_resource *exec_heap; + struct nouveau_resource *data_heap; + + struct nv20_vertex_program *active; + + struct nv20_vertex_program *current; + } vertprog; +*/ + struct { + struct nv20_fragment_program *active; + + struct nv20_fragment_program *current; + struct pipe_buffer *constant_buf; + } fragprog; + + struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; + struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; +}; + +static INLINE struct nv20_context * +nv20_context(struct pipe_context *pipe) +{ + return (struct nv20_context *)pipe; +} + +extern void nv20_init_state_functions(struct nv20_context *nv20); +extern void nv20_init_surface_functions(struct nv20_context *nv20); + +extern void nv20_screen_init_miptree_functions(struct pipe_screen *pscreen); + +/* nv20_clear.c */ +extern void nv20_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + +/* nv20_draw.c */ +extern struct draw_stage *nv20_draw_render_stage(struct nv20_context *nv20); + +/* nv20_fragprog.c */ +extern void nv20_fragprog_bind(struct nv20_context *, + struct nv20_fragment_program *); +extern void nv20_fragprog_destroy(struct nv20_context *, + struct nv20_fragment_program *); + +/* nv20_fragtex.c */ +extern void nv20_fragtex_bind(struct nv20_context *); + +/* nv20_prim_vbuf.c */ +struct draw_stage *nv20_draw_vbuf_stage( struct nv20_context *nv20 ); +extern void nv20_vtxbuf_bind(struct nv20_context* nv20); + +/* nv20_state.c and friends */ +extern void nv20_emit_hw_state(struct nv20_context *nv20); +extern void nv20_state_tex_update(struct nv20_context *nv20); + +/* nv20_vbo.c */ +extern boolean nv20_draw_arrays(struct pipe_context *, unsigned mode, + unsigned start, unsigned count); +extern boolean nv20_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned prim, unsigned start, unsigned count); + + +#endif diff --git a/src/gallium/drivers/nv20/nv20_fragprog.c 
b/src/gallium/drivers/nv20/nv20_fragprog.c new file mode 100644 index 0000000000..4f496369dd --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_fragprog.c @@ -0,0 +1,21 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv20_context.h" + +void +nv20_fragprog_bind(struct nv20_context *nv20, struct nv20_fragment_program *fp) +{ +} + +void +nv20_fragprog_destroy(struct nv20_context *nv20, + struct nv20_fragment_program *fp) +{ +} + diff --git a/src/gallium/drivers/nv20/nv20_fragtex.c b/src/gallium/drivers/nv20/nv20_fragtex.c new file mode 100644 index 0000000000..495a7be912 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_fragtex.c @@ -0,0 +1,124 @@ +#include "nv20_context.h" +#include "nouveau/nouveau_util.h" + +#define _(m,tf) \ +{ \ + TRUE, \ + PIPE_FORMAT_##m, \ + NV20TCL_TX_FORMAT_FORMAT_##tf, \ +} + +struct nv20_texture_format { + boolean defined; + uint pipe; + int format; +}; + +static struct nv20_texture_format +nv20_texture_formats[] = { + _(A8R8G8B8_UNORM, A8R8G8B8), + _(A1R5G5B5_UNORM, A1R5G5B5), + _(A4R4G4B4_UNORM, A4R4G4B4), + _(L8_UNORM , L8 ), + _(A8_UNORM , A8 ), + _(A8L8_UNORM , A8L8 ), +/* _(RGB_DXT1 , DXT1, ), */ +/* _(RGBA_DXT1 , DXT1, ), */ +/* _(RGBA_DXT3 , DXT3, ), */ +/* _(RGBA_DXT5 , DXT5, ), */ + {}, +}; + +static struct nv20_texture_format * +nv20_fragtex_format(uint pipe_format) +{ + struct nv20_texture_format *tf = nv20_texture_formats; + + while (tf->defined) { + if (tf->pipe == pipe_format) + return tf; + tf++; + } + + return NULL; +} + + +static void +nv20_fragtex_build(struct nv20_context *nv20, int unit) +{ +#if 0 + struct nv20_sampler_state *ps = nv20->tex_sampler[unit]; + struct nv20_miptree *nv20mt = nv20->tex_miptree[unit]; + struct pipe_texture *pt = &nv20mt->base; + struct nv20_texture_format *tf; + uint32_t txf, txs, txp; + + tf = nv20_fragtex_format(pt->format); + if (!tf || !tf->defined) { + NOUVEAU_ERR("Unsupported texture format: 0x%x\n", pt->format); + return; + } + + txf = tf->format << 8; + txf |= (pt->last_level + 1) << 16; + txf |= log2i(pt->width[0]) << 20; + txf |= log2i(pt->height[0]) << 24; + txf |= log2i(pt->depth[0]) << 28; + txf |= 8; + + switch (pt->target) { + case PIPE_TEXTURE_CUBE: + txf |= NV10TCL_TX_FORMAT_CUBE_MAP; + /* fall-through */ + case PIPE_TEXTURE_2D: + txf |= (2<<4); + break; + case PIPE_TEXTURE_1D: + txf |= (1<<4); + break; + default: + NOUVEAU_ERR("Unknown target %d\n", pt->target); + return; + } + + BEGIN_RING(kelvin, NV10TCL_TX_OFFSET(unit), 8); + OUT_RELOCl(nv20mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); + OUT_RELOCd(nv20mt->buffer,txf,NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/); + OUT_RING (ps->wrap); + OUT_RING (0x40000000); /* enable */ + OUT_RING (txs); + OUT_RING (ps->filt | 0x2000 /* magic */); + OUT_RING ((pt->width[0] << 16) | pt->height[0]); + OUT_RING (ps->bcol); +#endif +} + +void +nv20_fragtex_bind(struct nv20_context *nv20) +{ +#if 0 + struct nv20_fragment_program *fp = nv20->fragprog.active; + unsigned samplers, unit; + + samplers = nv20->fp_samplers & ~fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + BEGIN_RING(kelvin, NV10TCL_TX_ENABLE(unit), 1); + OUT_RING (0); + } + + samplers = nv20->dirty_samplers & fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + nv20_fragtex_build(nv20, unit); + } + + 
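+	/* remember the set of samplers referenced by the now-active
+	 * fragment program */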
nv20->fp_samplers = fp->samplers; +#endif +} + diff --git a/src/gallium/drivers/nv20/nv20_miptree.c b/src/gallium/drivers/nv20/nv20_miptree.c new file mode 100644 index 0000000000..c6106d58c4 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_miptree.c @@ -0,0 +1,156 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv20_context.h" +#include "nv20_screen.h" + +static void +nv20_miptree_layout(struct nv20_miptree *nv20mt) +{ + struct pipe_texture *pt = &nv20mt->base; + boolean swizzled = FALSE; + uint width = pt->width[0], height = pt->height[0]; + uint offset = 0; + int nr_faces, l, f; + + if (pt->target == PIPE_TEXTURE_CUBE) { + nr_faces = 6; + } else { + nr_faces = 1; + } + + for (l = 0; l <= pt->last_level; l++) { + pt->width[l] = width; + pt->height[l] = height; + pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); + pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + + if (swizzled) + nv20mt->level[l].pitch = pt->nblocksx[l] * pt->block.size; + else + nv20mt->level[l].pitch = pt->nblocksx[0] * pt->block.size; + nv20mt->level[l].pitch = (nv20mt->level[l].pitch + 63) & ~63; + + nv20mt->level[l].image_offset = + CALLOC(nr_faces, sizeof(unsigned)); + + width = MAX2(1, width >> 1); + height = MAX2(1, height >> 1); + + } + + for (f = 0; f < nr_faces; f++) { + for (l = 0; l <= pt->last_level; l++) { + nv20mt->level[l].image_offset[f] = offset; + offset += nv20mt->level[l].pitch * pt->height[l]; + } + } + + nv20mt->total_size = offset; +} + +static struct pipe_texture * +nv20_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt) +{ + struct pipe_winsys *ws = screen->winsys; + struct nv20_miptree *mt; + + mt = MALLOC(sizeof(struct nv20_miptree)); + if (!mt) + return NULL; + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = screen; + + nv20_miptree_layout(mt); + + mt->buffer = ws->buffer_create(ws, 256, PIPE_BUFFER_USAGE_PIXEL, + mt->total_size); + if (!mt->buffer) { + FREE(mt); + return NULL; + } + + return &mt->base; +} + +static void +nv20_miptree_release(struct pipe_screen *screen, struct pipe_texture **pt) +{ + struct pipe_texture *mt = *pt; + + *pt = NULL; + if (--mt->refcount <= 0) { + struct nv20_miptree *nv20mt = (struct nv20_miptree *)mt; + int l; + + pipe_buffer_reference(screen, &nv20mt->buffer, NULL); + for (l = 0; l <= mt->last_level; l++) { + if (nv20mt->level[l].image_offset) + FREE(nv20mt->level[l].image_offset); + } + FREE(nv20mt); + } +} + +static struct pipe_surface * +nv20_miptree_surface_get(struct pipe_screen *screen, struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned flags) +{ + struct nv20_miptree *nv20mt = (struct nv20_miptree *)pt; + struct pipe_surface *ps; + + ps = CALLOC_STRUCT(pipe_surface); + if (!ps) + return NULL; + pipe_texture_reference(&ps->texture, pt); + pipe_buffer_reference(screen, &ps->buffer, nv20mt->buffer); + ps->format = pt->format; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->block = pt->block; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = nv20mt->level[level].pitch; + ps->usage = flags; + ps->status = PIPE_SURFACE_STATUS_DEFINED; + ps->refcount = 1; + ps->winsys = screen->winsys; + + if (pt->target == PIPE_TEXTURE_CUBE) { + ps->offset = nv20mt->level[level].image_offset[face]; + } else + if (pt->target == PIPE_TEXTURE_3D) { + ps->offset = nv20mt->level[level].image_offset[zslice]; + } else { + ps->offset = nv20mt->level[level].image_offset[0]; + } + + 
return ps; +} + +static void +nv20_miptree_surface_release(struct pipe_screen *pscreen, + struct pipe_surface **psurface) +{ + struct pipe_surface *ps = *psurface; + + *psurface = NULL; + if (--ps->refcount > 0) + return; + + pipe_texture_reference(&ps->texture, NULL); + pipe_buffer_reference(pscreen, &ps->buffer, NULL); + FREE(ps); +} + +void nv20_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ + pscreen->texture_create = nv20_miptree_create; + pscreen->texture_release = nv20_miptree_release; + pscreen->get_tex_surface = nv20_miptree_surface_get; + pscreen->tex_surface_release = nv20_miptree_surface_release; +} + diff --git a/src/gallium/drivers/nv20/nv20_prim_vbuf.c b/src/gallium/drivers/nv20/nv20_prim_vbuf.c new file mode 100644 index 0000000000..74540845a8 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_prim_vbuf.c @@ -0,0 +1,402 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * Build post-transformation, post-clipping vertex buffers and element + * lists by hooking into the end of the primitive pipeline and + * manipulating the vertex_id field in the vertex headers. + * + * XXX: work in progress + * + * \author José Fonseca <jrfonseca@tungstengraphics.com> + * \author Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "pipe/p_debug.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" + +#include "nv20_context.h" +#include "nv20_state.h" + +#include "draw/draw_vbuf.h" + +/** + * Primitive renderer for nv20. + */ +struct nv20_vbuf_render { + struct vbuf_render base; + + struct nv20_context *nv20; + + /** Vertex buffer in VRAM */ + struct pipe_buffer *pbuffer; + + /** Vertex buffer in normal memory */ + void *mbuffer; + + /** Vertex size in bytes */ + /*unsigned vertex_size;*/ + + /** Hardware primitive */ + unsigned hwprim; +}; + +/** + * Basically a cast wrapper. 
+ */ +static INLINE struct nv20_vbuf_render * +nv20_vbuf_render(struct vbuf_render *render) +{ + assert(render); + return (struct nv20_vbuf_render *)render; +} + +void nv20_vtxbuf_bind( struct nv20_context* nv20 ) +{ +#if 0 + int i; + for(i = 0; i < NV20TCL_VTXBUF_ADDRESS__SIZE; i++) { + BEGIN_RING(kelvin, NV20TCL_VTXBUF_ADDRESS(i), 1); + OUT_RING(0/*nv20->vtxbuf*/); + BEGIN_RING(kelvin, NV20TCL_VTXFMT(i) ,1); + OUT_RING(0/*XXX*/); + } +#endif +} + +static const struct vertex_info * +nv20_vbuf_render_get_vertex_info( struct vbuf_render *render ) +{ + struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + struct nv20_context *nv20 = nv20_render->nv20; + + nv20_emit_hw_state(nv20); + + return &nv20->vertex_info; +} + +static void * +nv20__allocate_mbuffer(struct nv20_vbuf_render *nv20_render, size_t size) +{ + nv20_render->mbuffer = MALLOC(size); + return nv20_render->mbuffer; +} + +static void * +nv20__allocate_pbuffer(struct nv20_vbuf_render *nv20_render, size_t size) +{ + struct pipe_winsys *winsys = nv20_render->nv20->pipe.winsys; + nv20_render->pbuffer = winsys->buffer_create(winsys, 64, + PIPE_BUFFER_USAGE_VERTEX, size); + return winsys->buffer_map(winsys, + nv20_render->pbuffer, + PIPE_BUFFER_USAGE_CPU_WRITE); +} + +static void * +nv20_vbuf_render_allocate_vertices( struct vbuf_render *render, + ushort vertex_size, + ushort nr_vertices ) +{ + struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + size_t size = (size_t)vertex_size * (size_t)nr_vertices; + void *buf; + + assert(!nv20_render->pbuffer); + assert(!nv20_render->mbuffer); + + /* + * For small amount of vertices, don't bother with pipe vertex + * buffer, the data will be passed directly via the fifo. + */ + /* XXX: Pipe vertex buffers don't work. */ + if (0 && size > 16 * 1024) + buf = nv20__allocate_pbuffer(nv20_render, size); + else + buf = nv20__allocate_mbuffer(nv20_render, size); + + if (buf) + nv20_render->nv20->dirty |= NV20_NEW_VTXARRAYS; + + return buf; +} + +static boolean +nv20_vbuf_render_set_primitive( struct vbuf_render *render, + unsigned prim ) +{ + struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + unsigned hwp = nvgl_primitive(prim); + if (hwp == 0) + return FALSE; + + nv20_render->hwprim = hwp; + return TRUE; +} + +static uint32_t +nv20__vtxhwformat(unsigned stride, unsigned fields, unsigned type) +{ + return (stride << NV20TCL_VTXFMT_STRIDE_SHIFT) | + (fields << NV20TCL_VTXFMT_SIZE_SHIFT) | + (type << NV20TCL_VTXFMT_TYPE_SHIFT); +} + +static unsigned +nv20__emit_format(struct nv20_context *nv20, enum attrib_emit type, int hwattr) +{ + uint32_t hwfmt = 0; + unsigned fields; + + switch (type) { + case EMIT_OMIT: + hwfmt = nv20__vtxhwformat(0, 0, 2); + fields = 0; + break; + case EMIT_1F: + hwfmt = nv20__vtxhwformat(4, 1, 2); + fields = 1; + break; + case EMIT_2F: + hwfmt = nv20__vtxhwformat(8, 2, 2); + fields = 2; + break; + case EMIT_3F: + hwfmt = nv20__vtxhwformat(12, 3, 2); + fields = 3; + break; + case EMIT_4F: + hwfmt = nv20__vtxhwformat(16, 4, 2); + fields = 4; + break; + default: + NOUVEAU_ERR("unhandled attrib_emit %d\n", type); + return 0; + } + + BEGIN_RING(kelvin, NV20TCL_VTXFMT(hwattr), 1); + OUT_RING(hwfmt); + return fields; +} + +static unsigned +nv20__emit_vertex_array_format(struct nv20_context *nv20) +{ + struct vertex_info *vinfo = &nv20->vertex_info; + int hwattr = NV20TCL_VTXFMT__SIZE; + int attr = 0; + unsigned nr_fields = 0; + + while (hwattr-- > 0) { + if (vinfo->hwfmt[0] & (1 << hwattr)) { + nr_fields += nv20__emit_format(nv20, + 
vinfo->attrib[attr].emit, hwattr); + attr++; + } else + nv20__emit_format(nv20, EMIT_OMIT, hwattr); + } + + return nr_fields; +} + +static void +nv20__draw_mbuffer(struct nv20_vbuf_render *nv20_render, + const ushort *indices, + uint nr_indices) +{ + struct nv20_context *nv20 = nv20_render->nv20; + struct vertex_info *vinfo = &nv20->vertex_info; + unsigned nr_fields; + int max_push; + ubyte *data = nv20_render->mbuffer; + int vsz = 4 * vinfo->size; + + nr_fields = nv20__emit_vertex_array_format(nv20); + + BEGIN_RING(kelvin, NV20TCL_VERTEX_BEGIN_END, 1); + OUT_RING(nv20_render->hwprim); + + max_push = 1200 / nr_fields; + while (nr_indices) { + int i; + int push = MIN2(nr_indices, max_push); + + BEGIN_RING_NI(kelvin, NV20TCL_VERTEX_DATA, push * nr_fields); + for (i = 0; i < push; i++) { + /* XXX: fixme to handle other than floats? */ + int f = nr_fields; + float *attrv = (float*)&data[indices[i] * vsz]; + while (f-- > 0) + OUT_RINGf(*attrv++); + } + + nr_indices -= push; + indices += push; + } + + BEGIN_RING(kelvin, NV20TCL_VERTEX_BEGIN_END, 1); + OUT_RING(NV20TCL_VERTEX_BEGIN_END_STOP); +} + +static void +nv20__draw_pbuffer(struct nv20_vbuf_render *nv20_render, + const ushort *indices, + uint nr_indices) +{ + struct nv20_context *nv20 = nv20_render->nv20; + int push, i; + + NOUVEAU_ERR("nv20__draw_pbuffer: this path is broken.\n"); + + BEGIN_RING(kelvin, NV10TCL_VERTEX_ARRAY_OFFSET_POS, 1); + OUT_RELOCl(nv20_render->pbuffer, 0, + NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); + + BEGIN_RING(kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1); + OUT_RING(nv20_render->hwprim); + + if (nr_indices & 1) { + BEGIN_RING(kelvin, NV10TCL_VB_ELEMENT_U32, 1); + OUT_RING (indices[0]); + indices++; nr_indices--; + } + + while (nr_indices) { + // XXX too big/small ? check the size + push = MIN2(nr_indices, 1200 * 2); + + BEGIN_RING_NI(kelvin, NV10TCL_VB_ELEMENT_U16, push >> 1); + for (i = 0; i < push; i+=2) + OUT_RING((indices[i+1] << 16) | indices[i]); + + nr_indices -= push; + indices += push; + } + + BEGIN_RING(kelvin, NV10TCL_VERTEX_BUFFER_BEGIN_END, 1); + OUT_RING (0); +} + +static void +nv20_vbuf_render_draw( struct vbuf_render *render, + const ushort *indices, + uint nr_indices) +{ + struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + + nv20_emit_hw_state(nv20_render->nv20); + + if (nv20_render->pbuffer) + nv20__draw_pbuffer(nv20_render, indices, nr_indices); + else if (nv20_render->mbuffer) + nv20__draw_mbuffer(nv20_render, indices, nr_indices); + else + assert(0); +} + + +static void +nv20_vbuf_render_release_vertices( struct vbuf_render *render, + void *vertices, + unsigned vertex_size, + unsigned vertices_used ) +{ + struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + struct nv20_context *nv20 = nv20_render->nv20; + struct pipe_winsys *winsys = nv20->pipe.winsys; + struct pipe_screen *pscreen = &nv20->screen->pipe; + + if (nv20_render->pbuffer) { + winsys->buffer_unmap(winsys, nv20_render->pbuffer); + pipe_buffer_reference(pscreen, &nv20_render->pbuffer, NULL); + } else if (nv20_render->mbuffer) { + FREE(nv20_render->mbuffer); + nv20_render->mbuffer = NULL; + } else + assert(0); +} + + +static void +nv20_vbuf_render_destroy( struct vbuf_render *render ) +{ + struct nv20_vbuf_render *nv20_render = nv20_vbuf_render(render); + + assert(!nv20_render->pbuffer); + assert(!nv20_render->mbuffer); + + FREE(nv20_render); +} + + +/** + * Create a new primitive render. 
+ */ +static struct vbuf_render * +nv20_vbuf_render_create( struct nv20_context *nv20 ) +{ + struct nv20_vbuf_render *nv20_render = CALLOC_STRUCT(nv20_vbuf_render); + + nv20_render->nv20 = nv20; + + nv20_render->base.max_vertex_buffer_bytes = 16*1024; + nv20_render->base.max_indices = 1024; + nv20_render->base.get_vertex_info = nv20_vbuf_render_get_vertex_info; + nv20_render->base.allocate_vertices = + nv20_vbuf_render_allocate_vertices; + nv20_render->base.set_primitive = nv20_vbuf_render_set_primitive; + nv20_render->base.draw = nv20_vbuf_render_draw; + nv20_render->base.release_vertices = nv20_vbuf_render_release_vertices; + nv20_render->base.destroy = nv20_vbuf_render_destroy; + + return &nv20_render->base; +} + + +/** + * Create a new primitive vbuf/render stage. + */ +struct draw_stage *nv20_draw_vbuf_stage( struct nv20_context *nv20 ) +{ + struct vbuf_render *render; + struct draw_stage *stage; + + render = nv20_vbuf_render_create(nv20); + if(!render) + return NULL; + + stage = draw_vbuf_stage( nv20->draw, render ); + if(!stage) { + render->destroy(render); + return NULL; + } + + return stage; +} diff --git a/src/gallium/drivers/nv20/nv20_screen.c b/src/gallium/drivers/nv20/nv20_screen.c new file mode 100644 index 0000000000..c0a90f6c58 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_screen.c @@ -0,0 +1,204 @@ +#include "pipe/p_screen.h" + +#include "nv20_context.h" +#include "nv20_screen.h" + +static const char * +nv20_screen_get_name(struct pipe_screen *screen) +{ + struct nv20_screen *nv20screen = nv20_screen(screen); + struct nouveau_device *dev = nv20screen->nvws->channel->device; + static char buffer[128]; + + snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); + return buffer; +} + +static const char * +nv20_screen_get_vendor(struct pipe_screen *screen) +{ + return "nouveau"; +} + +static int +nv20_screen_get_param(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return 2; + case PIPE_CAP_NPOT_TEXTURES: + return 0; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 0; + case PIPE_CAP_GLSL: + return 0; + case PIPE_CAP_S3TC: + return 0; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 1; + case PIPE_CAP_POINT_SPRITE: + return 0; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 1; + case PIPE_CAP_OCCLUSION_QUERY: + return 0; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 0; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return 12; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 0; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 12; + case NOUVEAU_CAP_HW_VTXBUF: + case NOUVEAU_CAP_HW_IDXBUF: + return 0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0; + } +} + +static float +nv20_screen_get_paramf(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 10.0; + case PIPE_CAP_MAX_POINT_WIDTH: + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 64.0; + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 2.0; + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 4.0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0.0; + } +} + +static boolean +nv20_screen_is_format_supported(struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, unsigned geom_flags) +{ + if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + return TRUE; + default: + 
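+			/* no other formats are exposed as render targets */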
break; + } + } else { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_A1R5G5B5_UNORM: + case PIPE_FORMAT_A4R4G4B4_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_A8_UNORM: + case PIPE_FORMAT_I8_UNORM: + return TRUE; + default: + break; + } + } + + return FALSE; +} + +static void * +nv20_surface_map(struct pipe_screen *screen, struct pipe_surface *surface, + unsigned flags ) +{ + struct pipe_winsys *ws = screen->winsys; + void *map; + + map = ws->buffer_map(ws, surface->buffer, flags); + if (!map) + return NULL; + + return map + surface->offset; +} + +static void +nv20_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface) +{ + struct pipe_winsys *ws = screen->winsys; + + ws->buffer_unmap(ws, surface->buffer); +} + +static void +nv20_screen_destroy(struct pipe_screen *pscreen) +{ + struct nv20_screen *screen = nv20_screen(pscreen); + struct nouveau_winsys *nvws = screen->nvws; + + nvws->notifier_free(&screen->sync); + nvws->grobj_free(&screen->kelvin); + + FREE(pscreen); +} + +struct pipe_screen * +nv20_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ + struct nv20_screen *screen = CALLOC_STRUCT(nv20_screen); + unsigned kelvin_class = 0; + unsigned chipset = nvws->channel->device->chipset; + int ret; + + if (!screen) + return NULL; + screen->nvws = nvws; + + /* 3D object */ + if (chipset >= 0x25) + kelvin_class = NV25TCL; + else if (chipset >= 0x20) + kelvin_class = NV20TCL; + + if (!kelvin_class || chipset >= 0x30) { + NOUVEAU_ERR("Unknown nv2x chipset: nv%02x\n", chipset); + return NULL; + } + + ret = nvws->grobj_alloc(nvws, kelvin_class, &screen->kelvin); + if (ret) { + NOUVEAU_ERR("Error creating 3D object: %d\n", ret); + return FALSE; + } + + /* Notifier for sync purposes */ + ret = nvws->notifier_alloc(nvws, 1, &screen->sync); + if (ret) { + NOUVEAU_ERR("Error creating notifier object: %d\n", ret); + nv20_screen_destroy(&screen->pipe); + return NULL; + } + + screen->pipe.winsys = ws; + screen->pipe.destroy = nv20_screen_destroy; + + screen->pipe.get_name = nv20_screen_get_name; + screen->pipe.get_vendor = nv20_screen_get_vendor; + screen->pipe.get_param = nv20_screen_get_param; + screen->pipe.get_paramf = nv20_screen_get_paramf; + + screen->pipe.is_format_supported = nv20_screen_is_format_supported; + + screen->pipe.surface_map = nv20_surface_map; + screen->pipe.surface_unmap = nv20_surface_unmap; + + nv20_screen_init_miptree_functions(&screen->pipe); + + return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv20/nv20_screen.h b/src/gallium/drivers/nv20/nv20_screen.h new file mode 100644 index 0000000000..8f2f2e341d --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_screen.h @@ -0,0 +1,22 @@ +#ifndef __NV20_SCREEN_H__ +#define __NV20_SCREEN_H__ + +#include "pipe/p_screen.h" + +struct nv20_screen { + struct pipe_screen pipe; + + struct nouveau_winsys *nvws; + + /* HW graphics objects */ + struct nouveau_grobj *kelvin; + struct nouveau_notifier *sync; +}; + +static INLINE struct nv20_screen * +nv20_screen(struct pipe_screen *screen) +{ + return (struct nv20_screen *)screen; +} + +#endif diff --git a/src/gallium/drivers/nv20/nv20_state.c b/src/gallium/drivers/nv20/nv20_state.c new file mode 100644 index 0000000000..21bde5b81f --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_state.c @@ -0,0 +1,581 @@ +#include "draw/draw_context.h" +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv20_context.h" 
+#include "nv20_state.h" + +static void * +nv20_blend_state_create(struct pipe_context *pipe, + const struct pipe_blend_state *cso) +{ + struct nv20_blend_state *cb; + + cb = MALLOC(sizeof(struct nv20_blend_state)); + + cb->b_enable = cso->blend_enable ? 1 : 0; + cb->b_srcfunc = ((nvgl_blend_func(cso->alpha_src_factor)<<16) | + (nvgl_blend_func(cso->rgb_src_factor))); + cb->b_dstfunc = ((nvgl_blend_func(cso->alpha_dst_factor)<<16) | + (nvgl_blend_func(cso->rgb_dst_factor))); + + cb->c_mask = (((cso->colormask & PIPE_MASK_A) ? (0x01<<24) : 0) | + ((cso->colormask & PIPE_MASK_R) ? (0x01<<16) : 0) | + ((cso->colormask & PIPE_MASK_G) ? (0x01<< 8) : 0) | + ((cso->colormask & PIPE_MASK_B) ? (0x01<< 0) : 0)); + + cb->d_enable = cso->dither ? 1 : 0; + + return (void *)cb; +} + +static void +nv20_blend_state_bind(struct pipe_context *pipe, void *blend) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + nv20->blend = (struct nv20_blend_state*)blend; + + nv20->dirty |= NV20_NEW_BLEND; +} + +static void +nv20_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + + +static INLINE unsigned +wrap_mode(unsigned wrap) { + unsigned ret; + + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + ret = NV20TCL_TX_WRAP_S_REPEAT; + break; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + ret = NV20TCL_TX_WRAP_S_MIRRORED_REPEAT; + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + ret = NV20TCL_TX_WRAP_S_CLAMP_TO_EDGE; + break; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + ret = NV20TCL_TX_WRAP_S_CLAMP_TO_BORDER; + break; + case PIPE_TEX_WRAP_CLAMP: + ret = NV20TCL_TX_WRAP_S_CLAMP; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + case PIPE_TEX_WRAP_MIRROR_CLAMP: + default: + NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); + ret = NV20TCL_TX_WRAP_S_REPEAT; + break; + } + + return (ret >> NV20TCL_TX_WRAP_S_SHIFT); +} + +static void * +nv20_sampler_state_create(struct pipe_context *pipe, + const struct pipe_sampler_state *cso) +{ + struct nv20_sampler_state *ps; + uint32_t filter = 0; + + ps = MALLOC(sizeof(struct nv20_sampler_state)); + + ps->wrap = ((wrap_mode(cso->wrap_s) << NV20TCL_TX_WRAP_S_SHIFT) | + (wrap_mode(cso->wrap_t) << NV20TCL_TX_WRAP_T_SHIFT)); + + ps->en = 0; + if (cso->max_anisotropy > 1.0) { + /* no idea, binary driver sets it, works without it.. meh.. 
*/ + ps->wrap |= (1 << 5); + +/* if (cso->max_anisotropy >= 8.0) { + ps->en |= NV20TCL_TX_ENABLE_ANISO_8X; + } else + if (cso->max_anisotropy >= 4.0) { + ps->en |= NV20TCL_TX_ENABLE_ANISO_4X; + } else { + ps->en |= NV20TCL_TX_ENABLE_ANISO_2X; + }*/ + } + + switch (cso->mag_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + filter |= NV20TCL_TX_FILTER_MAGNIFY_LINEAR; + break; + case PIPE_TEX_FILTER_NEAREST: + default: + filter |= NV20TCL_TX_FILTER_MAGNIFY_NEAREST; + break; + } + + switch (cso->min_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= + NV20TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV20TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV20TCL_TX_FILTER_MINIFY_LINEAR; + break; + } + break; + case PIPE_TEX_FILTER_NEAREST: + default: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= + NV20TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= + NV20TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV20TCL_TX_FILTER_MINIFY_NEAREST; + break; + } + break; + } + + ps->filt = filter; + +/* if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + switch (cso->compare_func) { + case PIPE_FUNC_NEVER: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NEVER; + break; + case PIPE_FUNC_GREATER: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GREATER; + break; + case PIPE_FUNC_EQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_EQUAL; + break; + case PIPE_FUNC_GEQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_GEQUAL; + break; + case PIPE_FUNC_LESS: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LESS; + break; + case PIPE_FUNC_NOTEQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_NOTEQUAL; + break; + case PIPE_FUNC_LEQUAL: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_LEQUAL; + break; + case PIPE_FUNC_ALWAYS: + ps->wrap |= NV10TCL_TX_WRAP_RCOMP_ALWAYS; + break; + default: + break; + } + }*/ + + ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) | + (float_to_ubyte(cso->border_color[0]) << 16) | + (float_to_ubyte(cso->border_color[1]) << 8) | + (float_to_ubyte(cso->border_color[2]) << 0)); + + return (void *)ps; +} + +static void +nv20_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ + struct nv20_context *nv20 = nv20_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv20->tex_sampler[unit] = sampler[unit]; + nv20->dirty_samplers |= (1 << unit); + } +} + +static void +nv20_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void +nv20_set_sampler_texture(struct pipe_context *pipe, unsigned nr, + struct pipe_texture **miptree) +{ + struct nv20_context *nv20 = nv20_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv20->tex_miptree[unit] = (struct nv20_miptree *)miptree[unit]; + nv20->dirty_samplers |= (1 << unit); + } +} + +static void * +nv20_rasterizer_state_create(struct pipe_context *pipe, + const struct pipe_rasterizer_state *cso) +{ + struct nv20_rasterizer_state *rs; + int i; + + /*XXX: ignored: + * light_twoside + * offset_cw/ccw -nohw + * scissor + * point_smooth -nohw + * multisample + * offset_units / offset_scale + */ + rs = MALLOC(sizeof(struct nv20_rasterizer_state)); + + rs->templ = cso; + + rs->shade_model = cso->flatshade ? 
NV20TCL_SHADE_MODEL_FLAT : + NV20TCL_SHADE_MODEL_SMOOTH; + + rs->line_width = (unsigned char)(cso->line_width * 8.0) & 0xff; + rs->line_smooth_en = cso->line_smooth ? 1 : 0; + + /* XXX: nv20 and nv25 different! */ + rs->point_size = *(uint32_t*)&cso->point_size; + + rs->poly_smooth_en = cso->poly_smooth ? 1 : 0; + + if (cso->front_winding == PIPE_WINDING_CCW) { + rs->front_face = NV20TCL_FRONT_FACE_CCW; + rs->poly_mode_front = nvgl_polygon_mode(cso->fill_ccw); + rs->poly_mode_back = nvgl_polygon_mode(cso->fill_cw); + } else { + rs->front_face = NV20TCL_FRONT_FACE_CW; + rs->poly_mode_front = nvgl_polygon_mode(cso->fill_cw); + rs->poly_mode_back = nvgl_polygon_mode(cso->fill_ccw); + } + + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + rs->cull_face_en = 1; + if (cso->front_winding == PIPE_WINDING_CCW) + rs->cull_face = NV20TCL_CULL_FACE_FRONT; + else + rs->cull_face = NV20TCL_CULL_FACE_BACK; + break; + case PIPE_WINDING_CW: + rs->cull_face_en = 1; + if (cso->front_winding == PIPE_WINDING_CW) + rs->cull_face = NV20TCL_CULL_FACE_FRONT; + else + rs->cull_face = NV20TCL_CULL_FACE_BACK; + break; + case PIPE_WINDING_BOTH: + rs->cull_face_en = 1; + rs->cull_face = NV20TCL_CULL_FACE_FRONT_AND_BACK; + break; + case PIPE_WINDING_NONE: + default: + rs->cull_face_en = 0; + rs->cull_face = 0; + break; + } + + if (cso->point_sprite) { + rs->point_sprite = (1 << 0); + for (i = 0; i < 8; i++) { + if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE) + rs->point_sprite |= (1 << (8 + i)); + } + } else { + rs->point_sprite = 0; + } + + return (void *)rs; +} + +static void +nv20_rasterizer_state_bind(struct pipe_context *pipe, void *rast) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + nv20->rast = (struct nv20_rasterizer_state*)rast; + + draw_set_rasterizer_state(nv20->draw, (nv20->rast ? nv20->rast->templ : NULL)); + + nv20->dirty |= NV20_NEW_RAST; +} + +static void +nv20_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void * +nv20_depth_stencil_alpha_state_create(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *cso) +{ + struct nv20_depth_stencil_alpha_state *hw; + + hw = MALLOC(sizeof(struct nv20_depth_stencil_alpha_state)); + + hw->depth.func = nvgl_comparison_op(cso->depth.func); + hw->depth.write_enable = cso->depth.writemask ? 1 : 0; + hw->depth.test_enable = cso->depth.enabled ? 1 : 0; + + hw->stencil.enable = cso->stencil[0].enabled ? 1 : 0; + hw->stencil.wmask = cso->stencil[0].write_mask; + hw->stencil.func = nvgl_comparison_op(cso->stencil[0].func); + hw->stencil.ref = cso->stencil[0].ref_value; + hw->stencil.vmask = cso->stencil[0].value_mask; + hw->stencil.fail = nvgl_stencil_op(cso->stencil[0].fail_op); + hw->stencil.zfail = nvgl_stencil_op(cso->stencil[0].zfail_op); + hw->stencil.zpass = nvgl_stencil_op(cso->stencil[0].zpass_op); + + hw->alpha.enabled = cso->alpha.enabled ? 
1 : 0; + hw->alpha.func = nvgl_comparison_op(cso->alpha.func); + hw->alpha.ref = float_to_ubyte(cso->alpha.ref); + + return (void *)hw; +} + +static void +nv20_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *dsa) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + nv20->dsa = (struct nv20_depth_stencil_alpha_state*)dsa; + + nv20->dirty |= NV20_NEW_DSA; +} + +static void +nv20_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void * +nv20_vp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + return draw_create_vertex_shader(nv20->draw, templ); +} + +static void +nv20_vp_state_bind(struct pipe_context *pipe, void *shader) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + draw_bind_vertex_shader(nv20->draw, (struct draw_vertex_shader *) shader); + + nv20->dirty |= NV20_NEW_VERTPROG; +} + +static void +nv20_vp_state_delete(struct pipe_context *pipe, void *shader) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + draw_delete_vertex_shader(nv20->draw, (struct draw_vertex_shader *) shader); +} + +static void * +nv20_fp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv20_fragment_program *fp; + + fp = CALLOC(1, sizeof(struct nv20_fragment_program)); + fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + + tgsi_scan_shader(cso->tokens, &fp->info); + + return (void *)fp; +} + +static void +nv20_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv20_context *nv20 = nv20_context(pipe); + struct nv20_fragment_program *fp = hwcso; + + nv20->fragprog.current = fp; + nv20->dirty |= NV20_NEW_FRAGPROG; +} + +static void +nv20_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv20_context *nv20 = nv20_context(pipe); + struct nv20_fragment_program *fp = hwcso; + + nv20_fragprog_destroy(nv20, fp); + FREE((void*)fp->pipe.tokens); + FREE(fp); +} + +static void +nv20_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *bcol) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + nv20->blend_color = (struct pipe_blend_color*)bcol; + + nv20->dirty |= NV20_NEW_BLENDCOL; +} + +static void +nv20_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + draw_set_clip_state(nv20->draw, clip); +} + +static void +nv20_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, + const struct pipe_constant_buffer *buf ) +{ + struct nv20_context *nv20 = nv20_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + + assert(shader < PIPE_SHADER_TYPES); + assert(index == 0); + + if (buf) { + void *mapped; + if (buf->size && (mapped = ws->buffer_map(ws, buf->buffer, PIPE_BUFFER_USAGE_CPU_READ))) + { + memcpy(nv20->constbuf[shader], mapped, buf->size); + nv20->constbuf_nr[shader] = + buf->size / (4 * sizeof(float)); + ws->buffer_unmap(ws, buf->buffer); + } + } +} + +static void +nv20_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + nv20->framebuffer = (struct pipe_framebuffer_state*)fb; + + nv20->dirty |= NV20_NEW_FRAMEBUFFER; +} + +static void +nv20_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + NOUVEAU_ERR("line stipple hahaha\n"); +} + +static void +nv20_set_scissor_state(struct pipe_context *pipe, + const struct 
pipe_scissor_state *s) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + nv20->scissor = (struct pipe_scissor_state*)s; + + nv20->dirty |= NV20_NEW_SCISSOR; +} + +static void +nv20_set_viewport_state(struct pipe_context *pipe, + const struct pipe_viewport_state *vpt) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + nv20->viewport = (struct pipe_viewport_state*)vpt; + + draw_set_viewport_state(nv20->draw, nv20->viewport); + + nv20->dirty |= NV20_NEW_VIEWPORT; +} + +static void +nv20_set_vertex_buffers(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_buffer *vb) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + memcpy(nv20->vtxbuf, vb, sizeof(*vb) * count); + nv20->dirty |= NV20_NEW_VTXARRAYS; + + draw_set_vertex_buffers(nv20->draw, count, vb); +} + +static void +nv20_set_vertex_elements(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_element *ve) +{ + struct nv20_context *nv20 = nv20_context(pipe); + + memcpy(nv20->vtxelt, ve, sizeof(*ve) * count); + nv20->dirty |= NV20_NEW_VTXARRAYS; + + draw_set_vertex_elements(nv20->draw, count, ve); +} + +void +nv20_init_state_functions(struct nv20_context *nv20) +{ + nv20->pipe.create_blend_state = nv20_blend_state_create; + nv20->pipe.bind_blend_state = nv20_blend_state_bind; + nv20->pipe.delete_blend_state = nv20_blend_state_delete; + + nv20->pipe.create_sampler_state = nv20_sampler_state_create; + nv20->pipe.bind_sampler_states = nv20_sampler_state_bind; + nv20->pipe.delete_sampler_state = nv20_sampler_state_delete; + nv20->pipe.set_sampler_textures = nv20_set_sampler_texture; + + nv20->pipe.create_rasterizer_state = nv20_rasterizer_state_create; + nv20->pipe.bind_rasterizer_state = nv20_rasterizer_state_bind; + nv20->pipe.delete_rasterizer_state = nv20_rasterizer_state_delete; + + nv20->pipe.create_depth_stencil_alpha_state = + nv20_depth_stencil_alpha_state_create; + nv20->pipe.bind_depth_stencil_alpha_state = + nv20_depth_stencil_alpha_state_bind; + nv20->pipe.delete_depth_stencil_alpha_state = + nv20_depth_stencil_alpha_state_delete; + + nv20->pipe.create_vs_state = nv20_vp_state_create; + nv20->pipe.bind_vs_state = nv20_vp_state_bind; + nv20->pipe.delete_vs_state = nv20_vp_state_delete; + + nv20->pipe.create_fs_state = nv20_fp_state_create; + nv20->pipe.bind_fs_state = nv20_fp_state_bind; + nv20->pipe.delete_fs_state = nv20_fp_state_delete; + + nv20->pipe.set_blend_color = nv20_set_blend_color; + nv20->pipe.set_clip_state = nv20_set_clip_state; + nv20->pipe.set_constant_buffer = nv20_set_constant_buffer; + nv20->pipe.set_framebuffer_state = nv20_set_framebuffer_state; + nv20->pipe.set_polygon_stipple = nv20_set_polygon_stipple; + nv20->pipe.set_scissor_state = nv20_set_scissor_state; + nv20->pipe.set_viewport_state = nv20_set_viewport_state; + + nv20->pipe.set_vertex_buffers = nv20_set_vertex_buffers; + nv20->pipe.set_vertex_elements = nv20_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv20/nv20_state.h b/src/gallium/drivers/nv20/nv20_state.h new file mode 100644 index 0000000000..34f402fdcb --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_state.h @@ -0,0 +1,139 @@ +#ifndef __NV20_STATE_H__ +#define __NV20_STATE_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv20_blend_state { + uint32_t b_enable; + uint32_t b_srcfunc; + uint32_t b_dstfunc; + + uint32_t c_mask; + + uint32_t d_enable; +}; + +struct nv20_sampler_state { + uint32_t wrap; + uint32_t en; + uint32_t filt; + uint32_t bcol; +}; + +struct nv20_rasterizer_state { + uint32_t 
shade_model; + + uint32_t line_width; + uint32_t line_smooth_en; + + uint32_t point_size; + + uint32_t poly_smooth_en; + + uint32_t poly_mode_front; + uint32_t poly_mode_back; + + uint32_t front_face; + uint32_t cull_face; + uint32_t cull_face_en; + + uint32_t point_sprite; + + const struct pipe_rasterizer_state *templ; +}; + +struct nv20_vertex_program_exec { + uint32_t data[4]; + boolean has_branch_offset; + int const_index; +}; + +struct nv20_vertex_program_data { + int index; /* immediates == -1 */ + float value[4]; +}; + +struct nv20_vertex_program { + const struct pipe_shader_state *pipe; + + boolean translated; + struct nv20_vertex_program_exec *insns; + unsigned nr_insns; + struct nv20_vertex_program_data *consts; + unsigned nr_consts; + + struct nouveau_resource *exec; + unsigned exec_start; + struct nouveau_resource *data; + unsigned data_start; + unsigned data_start_min; + + uint32_t ir; + uint32_t or; +}; + +struct nv20_fragment_program_data { + unsigned offset; + unsigned index; +}; + +struct nv20_fragment_program { + struct pipe_shader_state pipe; + struct tgsi_shader_info info; + + boolean translated; + boolean on_hw; + unsigned samplers; + + uint32_t *insn; + int insn_len; + + struct nv20_fragment_program_data *consts; + unsigned nr_consts; + + struct pipe_buffer *buffer; + + uint32_t fp_control; + uint32_t fp_reg_control; +}; + + +struct nv20_depth_stencil_alpha_state { + struct { + uint32_t func; + uint32_t write_enable; + uint32_t test_enable; + } depth; + + struct { + uint32_t enable; + uint32_t wmask; + uint32_t func; + uint32_t ref; + uint32_t vmask; + uint32_t fail; + uint32_t zfail; + uint32_t zpass; + } stencil; + + struct { + uint32_t enabled; + uint32_t func; + uint32_t ref; + } alpha; +}; + +struct nv20_miptree { + struct pipe_texture base; + + struct pipe_buffer *buffer; + uint total_size; + + struct { + uint pitch; + uint *image_offset; + } level[PIPE_MAX_TEXTURE_LEVELS]; +}; + +#endif diff --git a/src/gallium/drivers/nv20/nv20_state_emit.c b/src/gallium/drivers/nv20/nv20_state_emit.c new file mode 100644 index 0000000000..5265bf3a31 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_state_emit.c @@ -0,0 +1,359 @@ +#include "nv20_context.h" +#include "nv20_state.h" +#include "draw/draw_context.h" + +static void nv20_state_emit_blend(struct nv20_context* nv20) +{ + struct nv20_blend_state *b = nv20->blend; + + BEGIN_RING(kelvin, NV20TCL_DITHER_ENABLE, 1); + OUT_RING (b->d_enable); + + BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_ENABLE, 1); + OUT_RING (b->b_enable); + + BEGIN_RING(kelvin, NV20TCL_BLEND_FUNC_SRC, 2); + OUT_RING (b->b_srcfunc); + OUT_RING (b->b_dstfunc); + + BEGIN_RING(kelvin, NV20TCL_COLOR_MASK, 1); + OUT_RING (b->c_mask); +} + +static void nv20_state_emit_blend_color(struct nv20_context* nv20) +{ + struct pipe_blend_color *c = nv20->blend_color; + + BEGIN_RING(kelvin, NV20TCL_BLEND_COLOR, 1); + OUT_RING ((float_to_ubyte(c->color[3]) << 24)| + (float_to_ubyte(c->color[0]) << 16)| + (float_to_ubyte(c->color[1]) << 8) | + (float_to_ubyte(c->color[2]) << 0)); +} + +static void nv20_state_emit_rast(struct nv20_context* nv20) +{ + struct nv20_rasterizer_state *r = nv20->rast; + + BEGIN_RING(kelvin, NV20TCL_SHADE_MODEL, 2); + OUT_RING (r->shade_model); + OUT_RING (r->line_width); + + + BEGIN_RING(kelvin, NV20TCL_POINT_SIZE, 1); + OUT_RING (r->point_size); + + BEGIN_RING(kelvin, NV20TCL_POLYGON_MODE_FRONT, 2); + OUT_RING (r->poly_mode_front); + OUT_RING (r->poly_mode_back); + + + BEGIN_RING(kelvin, NV20TCL_CULL_FACE, 2); + OUT_RING (r->cull_face); + OUT_RING 
(r->front_face); + + BEGIN_RING(kelvin, NV20TCL_LINE_SMOOTH_ENABLE, 2); + OUT_RING (r->line_smooth_en); + OUT_RING (r->poly_smooth_en); + + BEGIN_RING(kelvin, NV20TCL_CULL_FACE_ENABLE, 1); + OUT_RING (r->cull_face_en); +} + +static void nv20_state_emit_dsa(struct nv20_context* nv20) +{ + struct nv20_depth_stencil_alpha_state *d = nv20->dsa; + + BEGIN_RING(kelvin, NV20TCL_DEPTH_FUNC, 1); + OUT_RING (d->depth.func); + + BEGIN_RING(kelvin, NV20TCL_DEPTH_WRITE_ENABLE, 1); + OUT_RING (d->depth.write_enable); + + BEGIN_RING(kelvin, NV20TCL_DEPTH_TEST_ENABLE, 1); + OUT_RING (d->depth.test_enable); + +#if 0 + BEGIN_RING(kelvin, NV20TCL_STENCIL_ENABLE, 1); + OUT_RING (d->stencil.enable); + BEGIN_RING(kelvin, NV20TCL_STENCIL_MASK, 7); + OUT_RINGp ((uint32_t *)&(d->stencil.wmask), 7); +#endif + + BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_ENABLE, 1); + OUT_RING (d->alpha.enabled); + + BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_FUNC, 1); + OUT_RING (d->alpha.func); + + BEGIN_RING(kelvin, NV20TCL_ALPHA_FUNC_REF, 1); + OUT_RING (d->alpha.ref); +} + +static void nv20_state_emit_viewport(struct nv20_context* nv20) +{ +} + +static void nv20_state_emit_scissor(struct nv20_context* nv20) +{ + /* NV20TCL_SCISSOR_* is probably a software method */ +/* struct pipe_scissor_state *s = nv20->scissor; + BEGIN_RING(kelvin, NV20TCL_SCISSOR_HORIZ, 2); + OUT_RING (((s->maxx - s->minx) << 16) | s->minx); + OUT_RING (((s->maxy - s->miny) << 16) | s->miny);*/ +} + +static void nv20_state_emit_framebuffer(struct nv20_context* nv20) +{ + struct pipe_framebuffer_state* fb = nv20->framebuffer; + struct pipe_surface *rt, *zeta = NULL; + uint32_t rt_format, w, h; + int colour_format = 0, zeta_format = 0; + + w = fb->cbufs[0]->width; + h = fb->cbufs[0]->height; + colour_format = fb->cbufs[0]->format; + rt = fb->cbufs[0]; + + if (fb->zsbuf) { + if (colour_format) { + assert(w == fb->zsbuf->width); + assert(h == fb->zsbuf->height); + } else { + w = fb->zsbuf->width; + h = fb->zsbuf->height; + } + + zeta_format = fb->zsbuf->format; + zeta = fb->zsbuf; + } + + rt_format = NV20TCL_RT_FORMAT_TYPE_LINEAR | 0x20; + + switch (colour_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case 0: + rt_format |= NV20TCL_RT_FORMAT_COLOR_A8R8G8B8; + break; + case PIPE_FORMAT_R5G6B5_UNORM: + rt_format |= NV20TCL_RT_FORMAT_COLOR_R5G6B5; + break; + default: + assert(0); + } + + if (zeta) { + BEGIN_RING(kelvin, NV20TCL_RT_PITCH, 1); + OUT_RING (rt->stride | (zeta->stride << 16)); + } else { + BEGIN_RING(kelvin, NV20TCL_RT_PITCH, 1); + OUT_RING (rt->stride | (rt->stride << 16)); + } + + nv20->rt[0] = rt->buffer; + + if (zeta_format) + { + nv20->zeta = zeta->buffer; + } + + BEGIN_RING(kelvin, NV20TCL_RT_HORIZ, 3); + OUT_RING ((w << 16) | 0); + OUT_RING ((h << 16) | 0); + OUT_RING (rt_format); + BEGIN_RING(kelvin, NV20TCL_VIEWPORT_CLIP_HORIZ(0), 2); + OUT_RING (((w - 1) << 16) | 0); + OUT_RING (((h - 1) << 16) | 0); +} + +static void nv20_vertex_layout(struct nv20_context *nv20) +{ + struct nv20_fragment_program *fp = nv20->fragprog.current; + struct draw_context *dc = nv20->draw; + uint32_t src; + int i; + struct vertex_info *vinfo = &nv20->vertex_info; + const enum interp_mode colorInterp = INTERP_LINEAR; + boolean colors[2] = { FALSE }; + boolean generics[4] = { FALSE }; + boolean fog = FALSE; + + memset(vinfo, 0, sizeof(*vinfo)); + + /* + * NV10 hardware vertex attribute order: + * fog, weight, normal, tex1, tex0, 2nd color, color, position + * vinfo->hwfmt[0] has a used-bit corresponding to each of these. 
+ * relation to TGSI_SEMANTIC_*: + * - POSITION: position (always used) + * - COLOR: 2nd color, color + * - GENERIC: weight, normal, tex1, tex0 + * - FOG: fog + */ + + for (i = 0; i < fp->info.num_inputs; i++) { + int isn = fp->info.input_semantic_name[i]; + int isi = fp->info.input_semantic_index[i]; + switch (isn) { + case TGSI_SEMANTIC_POSITION: + break; + case TGSI_SEMANTIC_COLOR: + assert(isi < 2); + colors[isi] = TRUE; + break; + case TGSI_SEMANTIC_GENERIC: + assert(isi < 4); + generics[isi] = TRUE; + break; + case TGSI_SEMANTIC_FOG: + fog = TRUE; + break; + default: + assert(0 && "unknown input_semantic_name"); + } + } + + if (fog) { + int src = draw_find_vs_output(dc, TGSI_SEMANTIC_FOG, 0); + draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, src); + vinfo->hwfmt[0] |= (1 << 7); + } + + for (i = 3; i >= 0; i--) { + int src; + if (!generics[i]) + continue; + src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i); + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); + vinfo->hwfmt[0] |= (1 << (i + 3)); + } + + if (colors[1]) { + int src = draw_find_vs_output(dc, TGSI_SEMANTIC_COLOR, 1); + draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src); + vinfo->hwfmt[0] |= (1 << 2); + } + + if (colors[0]) { + int src = draw_find_vs_output(dc, TGSI_SEMANTIC_COLOR, 0); + draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src); + vinfo->hwfmt[0] |= (1 << 1); + } + + /* always do position */ + src = draw_find_vs_output(dc, TGSI_SEMANTIC_POSITION, 0); + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src); + vinfo->hwfmt[0] |= (1 << 0); + + draw_compute_vertex_size(vinfo); +} + +void +nv20_emit_hw_state(struct nv20_context *nv20) +{ + int i; + + if (nv20->dirty & NV20_NEW_VERTPROG) { + //nv20_vertprog_bind(nv20, nv20->vertprog.current); + nv20->dirty &= ~NV20_NEW_VERTPROG; + } + + if (nv20->dirty & NV20_NEW_FRAGPROG) { + nv20_fragprog_bind(nv20, nv20->fragprog.current); + /*XXX: clear NV20_NEW_FRAGPROG if no new program uploaded */ + nv20->dirty_samplers |= (1<<10); + nv20->dirty_samplers = 0; + } + + if (nv20->dirty_samplers || (nv20->dirty & NV20_NEW_FRAGPROG)) { + nv20_fragtex_bind(nv20); + nv20->dirty &= ~NV20_NEW_FRAGPROG; + } + + if (nv20->dirty & NV20_NEW_VTXARRAYS) { + nv20->dirty &= ~NV20_NEW_VTXARRAYS; + nv20_vertex_layout(nv20); + nv20_vtxbuf_bind(nv20); + } + + if (nv20->dirty & NV20_NEW_BLEND) { + nv20->dirty &= ~NV20_NEW_BLEND; + nv20_state_emit_blend(nv20); + } + + if (nv20->dirty & NV20_NEW_BLENDCOL) { + nv20->dirty &= ~NV20_NEW_BLENDCOL; + nv20_state_emit_blend_color(nv20); + } + + if (nv20->dirty & NV20_NEW_RAST) { + nv20->dirty &= ~NV20_NEW_RAST; + nv20_state_emit_rast(nv20); + } + + if (nv20->dirty & NV20_NEW_DSA) { + nv20->dirty &= ~NV20_NEW_DSA; + nv20_state_emit_dsa(nv20); + } + + if (nv20->dirty & NV20_NEW_VIEWPORT) { + nv20->dirty &= ~NV20_NEW_VIEWPORT; + nv20_state_emit_viewport(nv20); + } + + if (nv20->dirty & NV20_NEW_SCISSOR) { + nv20->dirty &= ~NV20_NEW_SCISSOR; + nv20_state_emit_scissor(nv20); + } + + if (nv20->dirty & NV20_NEW_FRAMEBUFFER) { + nv20->dirty &= ~NV20_NEW_FRAMEBUFFER; + nv20_state_emit_framebuffer(nv20); + } + + /* Emit relocs for every referenced buffer. + * This is to ensure the bufmgr has an accurate idea of how + * the buffer is used. This isn't very efficient, but we don't + * seem to take a significant performance hit. Will be improved + * at some point. 
Vertex arrays are emitted by nv20_vbo.c + */ + + /* Render target */ +/* XXX figre out who's who for NV10TCL_DMA_* and fill accordingly + * BEGIN_RING(kelvin, NV20TCL_DMA_COLOR0, 1); + * OUT_RELOCo(nv20->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); */ + BEGIN_RING(kelvin, NV20TCL_COLOR_OFFSET, 1); + OUT_RELOCl(nv20->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + if (nv20->zeta) { +/* XXX + * BEGIN_RING(kelvin, NV20TCL_DMA_ZETA, 1); + * OUT_RELOCo(nv20->zeta, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); */ + BEGIN_RING(kelvin, NV20TCL_ZETA_OFFSET, 1); + OUT_RELOCl(nv20->zeta, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + /* XXX for when we allocate LMA on nv17 */ +/* BEGIN_RING(kelvin, NV10TCL_LMA_DEPTH_BUFFER_OFFSET, 1); + OUT_RELOCl(nv20->zeta + lma_offset);*/ + } + + /* Vertex buffer */ + BEGIN_RING(kelvin, NV20TCL_DMA_VTXBUF0, 1); + OUT_RELOCo(nv20->rt[0], NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(kelvin, NV20TCL_COLOR_OFFSET, 1); + OUT_RELOCl(nv20->rt[0], 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + /* Texture images */ + for (i = 0; i < 2; i++) { + if (!(nv20->fp_samplers & (1 << i))) + continue; + BEGIN_RING(kelvin, NV20TCL_TX_OFFSET(i), 1); + OUT_RELOCl(nv20->tex[i].buffer, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_GART | NOUVEAU_BO_RD); + BEGIN_RING(kelvin, NV20TCL_TX_FORMAT(i), 1); + OUT_RELOCd(nv20->tex[i].buffer, nv20->tex[i].format, + NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD | + NOUVEAU_BO_OR, NV20TCL_TX_FORMAT_DMA0, + NV20TCL_TX_FORMAT_DMA1); + } +} + diff --git a/src/gallium/drivers/nv20/nv20_surface.c b/src/gallium/drivers/nv20/nv20_surface.c new file mode 100644 index 0000000000..7bc68d0ca2 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_surface.c @@ -0,0 +1,64 @@ + +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "nv20_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_winsys.h" +#include "pipe/p_inlines.h" +#include "util/u_tile.h" + +static void +nv20_surface_copy(struct pipe_context *pipe, boolean do_flip, + struct pipe_surface *dest, unsigned destx, unsigned desty, + struct pipe_surface *src, unsigned srcx, unsigned srcy, + unsigned width, unsigned height) +{ + struct nv20_context *nv20 = nv20_context(pipe); + struct nouveau_winsys *nvws = nv20->nvws; + + nvws->surface_copy(nvws, dest, destx, desty, src, srcx, srcy, + width, height); +} + +static void +nv20_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, + unsigned destx, unsigned desty, unsigned width, + unsigned height, unsigned value) +{ + struct nv20_context *nv20 = nv20_context(pipe); + struct nouveau_winsys *nvws = nv20->nvws; + + nvws->surface_fill(nvws, dest, destx, desty, width, height, value); +} + +void +nv20_init_surface_functions(struct nv20_context *nv20) +{ + nv20->pipe.surface_copy = nv20_surface_copy; + nv20->pipe.surface_fill = nv20_surface_fill; +} diff --git a/src/gallium/drivers/nv20/nv20_vbo.c b/src/gallium/drivers/nv20/nv20_vbo.c new file mode 100644 index 0000000000..4edc4efebd --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_vbo.c @@ -0,0 +1,77 @@ +#include "draw/draw_context.h" +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv20_context.h" +#include "nv20_state.h" + +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_pushbuf.h" + +boolean nv20_draw_elements( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned prim, unsigned start, unsigned count) +{ + struct nv20_context *nv20 = nv20_context( pipe ); + struct draw_context *draw = nv20->draw; + unsigned i; + + nv20_emit_hw_state(nv20); + + /* + * Map vertex buffers + */ + for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { + if (nv20->vtxbuf[i].buffer) { + void *buf + = pipe->winsys->buffer_map(pipe->winsys, + nv20->vtxbuf[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_vertex_buffer(draw, i, buf); + } + } + /* Map index buffer, if present */ + if (indexBuffer) { + void *mapped_indexes + = pipe->winsys->buffer_map(pipe->winsys, indexBuffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes); + } + else { + /* no index/element buffer */ + draw_set_mapped_element_buffer(draw, 0, NULL); + } + + draw_set_mapped_constant_buffer(draw, + nv20->constbuf[PIPE_SHADER_VERTEX], + nv20->constbuf_nr[PIPE_SHADER_VERTEX]); + + /* draw! 
*/ + draw_arrays(nv20->draw, prim, start, count); + + /* + * unmap vertex/index buffers + */ + for (i = 0; i < PIPE_MAX_ATTRIBS; i++) { + if (nv20->vtxbuf[i].buffer) { + pipe->winsys->buffer_unmap(pipe->winsys, nv20->vtxbuf[i].buffer); + draw_set_mapped_vertex_buffer(draw, i, NULL); + } + } + if (indexBuffer) { + pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer); + draw_set_mapped_element_buffer(draw, 0, NULL); + } + + return TRUE; +} + +boolean nv20_draw_arrays( struct pipe_context *pipe, + unsigned prim, unsigned start, unsigned count) +{ + return nv20_draw_elements(pipe, NULL, 0, prim, start, count); +} + + + diff --git a/src/gallium/drivers/nv20/nv20_vertprog.c b/src/gallium/drivers/nv20/nv20_vertprog.c new file mode 100644 index 0000000000..a885fcd7a5 --- /dev/null +++ b/src/gallium/drivers/nv20/nv20_vertprog.c @@ -0,0 +1,838 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_dump.h" + +#include "nv20_context.h" +#include "nv20_state.h" + +/* TODO (at least...): + * 1. Indexed consts + ARL + * 2. Arb. swz/negation + * 3. NV_vp11, NV_vp2, NV_vp3 features + * - extra arith opcodes + * - branching + * - texture sampling + * - indexed attribs + * - indexed results + * 4. bugs + */ + +#define SWZ_X 0 +#define SWZ_Y 1 +#define SWZ_Z 2 +#define SWZ_W 3 +#define MASK_X 8 +#define MASK_Y 4 +#define MASK_Z 2 +#define MASK_W 1 +#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W) +#define DEF_SCALE 0 +#define DEF_CTEST 0 +#include "nv20_shader.h" + +#define swz(s,x,y,z,w) nv20_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w) +#define neg(s) nv20_sr_neg((s)) +#define abs(s) nv20_sr_abs((s)) + +struct nv20_vpc { + struct nv20_vertex_program *vp; + + struct nv20_vertex_program_exec *vpi; + + unsigned output_map[PIPE_MAX_SHADER_OUTPUTS]; + + int high_temp; + int temp_temp_count; + + struct nv20_sreg *imm; + unsigned nr_imm; +}; + +static struct nv20_sreg +temp(struct nv20_vpc *vpc) +{ + int idx; + + idx = vpc->temp_temp_count++; + idx += vpc->high_temp + 1; + return nv20_sr(NV30SR_TEMP, idx); +} + +static struct nv20_sreg +constant(struct nv20_vpc *vpc, int pipe, float x, float y, float z, float w) +{ + struct nv20_vertex_program *vp = vpc->vp; + struct nv20_vertex_program_data *vpd; + int idx; + + if (pipe >= 0) { + for (idx = 0; idx < vp->nr_consts; idx++) { + if (vp->consts[idx].index == pipe) + return nv20_sr(NV30SR_CONST, idx); + } + } + + idx = vp->nr_consts++; + vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts); + vpd = &vp->consts[idx]; + + vpd->index = pipe; + vpd->value[0] = x; + vpd->value[1] = y; + vpd->value[2] = z; + vpd->value[3] = w; + return nv20_sr(NV30SR_CONST, idx); +} + +#define arith(cc,s,o,d,m,s0,s1,s2) \ + nv20_vp_arith((cc), (s), NV30_VP_INST_##o, (d), (m), (s0), (s1), (s2)) + +static void +emit_src(struct nv20_vpc *vpc, uint32_t *hw, int pos, struct nv20_sreg src) +{ + struct nv20_vertex_program *vp = vpc->vp; + uint32_t sr = 0; + + switch (src.type) { + case NV30SR_TEMP: + sr |= (NV30_VP_SRC_REG_TYPE_TEMP << NV30_VP_SRC_REG_TYPE_SHIFT); + sr |= (src.index << NV30_VP_SRC_TEMP_SRC_SHIFT); + break; + case NV30SR_INPUT: + sr |= (NV30_VP_SRC_REG_TYPE_INPUT << + NV30_VP_SRC_REG_TYPE_SHIFT); + vp->ir |= (1 << src.index); + hw[1] |= (src.index << NV30_VP_INST_INPUT_SRC_SHIFT); + break; + case NV30SR_CONST: + sr |= (NV30_VP_SRC_REG_TYPE_CONST << + NV30_VP_SRC_REG_TYPE_SHIFT); + assert(vpc->vpi->const_index == -1 || + vpc->vpi->const_index == 
src.index); + vpc->vpi->const_index = src.index; + break; + case NV30SR_NONE: + sr |= (NV30_VP_SRC_REG_TYPE_INPUT << + NV30_VP_SRC_REG_TYPE_SHIFT); + break; + default: + assert(0); + } + + if (src.negate) + sr |= NV30_VP_SRC_NEGATE; + + if (src.abs) + hw[0] |= (1 << (21 + pos)); + + sr |= ((src.swz[0] << NV30_VP_SRC_SWZ_X_SHIFT) | + (src.swz[1] << NV30_VP_SRC_SWZ_Y_SHIFT) | + (src.swz[2] << NV30_VP_SRC_SWZ_Z_SHIFT) | + (src.swz[3] << NV30_VP_SRC_SWZ_W_SHIFT)); + +/* + * |VVV| + * d°.°b + * \u/ + * + */ + + switch (pos) { + case 0: + hw[1] |= ((sr & NV30_VP_SRC0_HIGH_MASK) >> + NV30_VP_SRC0_HIGH_SHIFT) << NV30_VP_INST_SRC0H_SHIFT; + hw[2] |= (sr & NV30_VP_SRC0_LOW_MASK) << + NV30_VP_INST_SRC0L_SHIFT; + break; + case 1: + hw[2] |= sr << NV30_VP_INST_SRC1_SHIFT; + break; + case 2: + hw[2] |= ((sr & NV30_VP_SRC2_HIGH_MASK) >> + NV30_VP_SRC2_HIGH_SHIFT) << NV30_VP_INST_SRC2H_SHIFT; + hw[3] |= (sr & NV30_VP_SRC2_LOW_MASK) << + NV30_VP_INST_SRC2L_SHIFT; + break; + default: + assert(0); + } +} + +static void +emit_dst(struct nv20_vpc *vpc, uint32_t *hw, int slot, struct nv20_sreg dst) +{ + struct nv20_vertex_program *vp = vpc->vp; + + switch (dst.type) { + case NV30SR_TEMP: + hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT); + break; + case NV30SR_OUTPUT: + switch (dst.index) { + case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break; + case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break; + case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break; + case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break; + case NV30_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break; + case NV30_VP_INST_DEST_PSZ : vp->or |= (1 << 5); break; + case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break; + case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break; + case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break; + case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break; + case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break; + case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break; + case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break; + case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break; + default: + break; + } + + hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT); + hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20); + + /*XXX: no way this is entirely correct, someone needs to + * figure out what exactly it is.
+ */ + hw[3] |= 0x800; + break; + default: + assert(0); + } +} + +static void +nv20_vp_arith(struct nv20_vpc *vpc, int slot, int op, + struct nv20_sreg dst, int mask, + struct nv20_sreg s0, struct nv20_sreg s1, + struct nv20_sreg s2) +{ + struct nv20_vertex_program *vp = vpc->vp; + uint32_t *hw; + + vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi)); + vpc->vpi = &vp->insns[vp->nr_insns - 1]; + memset(vpc->vpi, 0, sizeof(*vpc->vpi)); + vpc->vpi->const_index = -1; + + hw = vpc->vpi->data; + + hw[0] |= (NV30_VP_INST_COND_TR << NV30_VP_INST_COND_SHIFT); + hw[0] |= ((0 << NV30_VP_INST_COND_SWZ_X_SHIFT) | + (1 << NV30_VP_INST_COND_SWZ_Y_SHIFT) | + (2 << NV30_VP_INST_COND_SWZ_Z_SHIFT) | + (3 << NV30_VP_INST_COND_SWZ_W_SHIFT)); + + hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT); +// hw[3] |= NV30_VP_INST_SCA_DEST_TEMP_MASK; +// hw[3] |= (mask << NV30_VP_INST_VEC_WRITEMASK_SHIFT); + + if (dst.type == NV30SR_OUTPUT) { + if (slot) + hw[3] |= (mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT); + else + hw[3] |= (mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT); + } else { + if (slot) + hw[3] |= (mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT); + else + hw[3] |= (mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT); + } + + emit_dst(vpc, hw, slot, dst); + emit_src(vpc, hw, 0, s0); + emit_src(vpc, hw, 1, s1); + emit_src(vpc, hw, 2, s2); +} + +static INLINE struct nv20_sreg +tgsi_src(struct nv20_vpc *vpc, const struct tgsi_full_src_register *fsrc) { + struct nv20_sreg src; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + src = nv20_sr(NV30SR_INPUT, fsrc->SrcRegister.Index); + break; + case TGSI_FILE_CONSTANT: + src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0); + break; + case TGSI_FILE_IMMEDIATE: + src = vpc->imm[fsrc->SrcRegister.Index]; + break; + case TGSI_FILE_TEMPORARY: + if (vpc->high_temp < fsrc->SrcRegister.Index) + vpc->high_temp = fsrc->SrcRegister.Index; + src = nv20_sr(NV30SR_TEMP, fsrc->SrcRegister.Index); + break; + default: + NOUVEAU_ERR("bad src file\n"); + break; + } + + src.abs = fsrc->SrcRegisterExtMod.Absolute; + src.negate = fsrc->SrcRegister.Negate; + src.swz[0] = fsrc->SrcRegister.SwizzleX; + src.swz[1] = fsrc->SrcRegister.SwizzleY; + src.swz[2] = fsrc->SrcRegister.SwizzleZ; + src.swz[3] = fsrc->SrcRegister.SwizzleW; + return src; +} + +static INLINE struct nv20_sreg +tgsi_dst(struct nv20_vpc *vpc, const struct tgsi_full_dst_register *fdst) { + struct nv20_sreg dst; + + switch (fdst->DstRegister.File) { + case TGSI_FILE_OUTPUT: + dst = nv20_sr(NV30SR_OUTPUT, + vpc->output_map[fdst->DstRegister.Index]); + + break; + case TGSI_FILE_TEMPORARY: + dst = nv20_sr(NV30SR_TEMP, fdst->DstRegister.Index); + if (vpc->high_temp < dst.index) + vpc->high_temp = dst.index; + break; + default: + NOUVEAU_ERR("bad dst file\n"); + break; + } + + return dst; +} + +static INLINE int +tgsi_mask(uint tgsi) +{ + int mask = 0; + + if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X; + if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y; + if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z; + if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W; + return mask; +} + +static boolean +nv20_vertprog_parse_instruction(struct nv20_vpc *vpc, + const struct tgsi_full_instruction *finst) +{ + struct nv20_sreg src[3], dst, tmp; + struct nv20_sreg none = nv20_sr(NV30SR_NONE, 0); + int mask; + int ai = -1, ci = -1; + int i; + + if (finst->Instruction.Opcode == TGSI_OPCODE_END) + return TRUE; + + vpc->temp_temp_count = 0; + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = 
&finst->FullSrcRegisters[i]; + if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) { + src[i] = tgsi_src(vpc, fsrc); + } + } + + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + if (ai == -1 || ai == fsrc->SrcRegister.Index) { + ai = fsrc->SrcRegister.Index; + src[i] = tgsi_src(vpc, fsrc); + } else { + src[i] = temp(vpc); + arith(vpc, 0, OP_MOV, src[i], MASK_ALL, + tgsi_src(vpc, fsrc), none, none); + } + break; + /*XXX: index comparison is broken now that consts come from + * two different register files. + */ + case TGSI_FILE_CONSTANT: + case TGSI_FILE_IMMEDIATE: + if (ci == -1 || ci == fsrc->SrcRegister.Index) { + ci = fsrc->SrcRegister.Index; + src[i] = tgsi_src(vpc, fsrc); + } else { + src[i] = temp(vpc); + arith(vpc, 0, OP_MOV, src[i], MASK_ALL, + tgsi_src(vpc, fsrc), none, none); + } + break; + case TGSI_FILE_TEMPORARY: + /* handled above */ + break; + default: + NOUVEAU_ERR("bad src file\n"); + return FALSE; + } + } + + dst = tgsi_dst(vpc, &finst->FullDstRegisters[0]); + mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask); + + switch (finst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none); + break; + case TGSI_OPCODE_ADD: + arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]); + break; + case TGSI_OPCODE_ARL: + arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_DP3: + arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DP4: + arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DPH: + arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DST: + arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_EX2: + arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_EXP: + arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_FLR: + arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_FRC: + arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_LG2: + arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_LIT: + arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_LOG: + arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_MAD: + arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]); + break; + case TGSI_OPCODE_MAX: + arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MIN: + arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MOV: + arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_MUL: + arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_POW: + tmp = temp(vpc); + arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none, + swz(src[0], X, X, X, X)); + arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X), + swz(src[1], X, X, X, X), none); + arith(vpc, 1, OP_EX2, dst, mask, none, none, + swz(tmp, X, X, X, X)); + break; + case TGSI_OPCODE_RCP: + arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_RET: + break; + case TGSI_OPCODE_RSQ: + arith(vpc, 1, OP_RSQ, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_SGE: + arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], 
none); + break; + case TGSI_OPCODE_SGT: + arith(vpc, 0, OP_SGT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SLT: + arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SUB: + arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1])); + break; + case TGSI_OPCODE_XPD: + tmp = temp(vpc); + arith(vpc, 0, OP_MUL, tmp, mask, + swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); + arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W), + swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), + neg(tmp)); + break; + default: + NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); + return FALSE; + } + + return TRUE; +} + +static boolean +nv20_vertprog_parse_decl_output(struct nv20_vpc *vpc, + const struct tgsi_full_declaration *fdec) +{ + int hw; + + switch (fdec->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + hw = NV30_VP_INST_DEST_POS; + break; + case TGSI_SEMANTIC_COLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV30_VP_INST_DEST_COL0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV30_VP_INST_DEST_COL1; + } else { + NOUVEAU_ERR("bad colour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_BCOLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV30_VP_INST_DEST_BFC0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV30_VP_INST_DEST_BFC1; + } else { + NOUVEAU_ERR("bad bcolour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_FOG: + hw = NV30_VP_INST_DEST_FOGC; + break; + case TGSI_SEMANTIC_PSIZE: + hw = NV30_VP_INST_DEST_PSZ; + break; + case TGSI_SEMANTIC_GENERIC: + if (fdec->Semantic.SemanticIndex <= 7) { + hw = NV30_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex); + } else { + NOUVEAU_ERR("bad generic semantic index\n"); + return FALSE; + } + break; + default: + NOUVEAU_ERR("bad output semantic\n"); + return FALSE; + } + + vpc->output_map[fdec->DeclarationRange.First] = hw; + return TRUE; +} + +static boolean +nv20_vertprog_prepare(struct nv20_vpc *vpc) +{ + struct tgsi_parse_context p; + int nr_imm = 0; + + tgsi_parse_init(&p, vpc->vp->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&p)) { + const union tgsi_full_token *tok = &p.FullToken; + + tgsi_parse_token(&p); + switch(tok->Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + nr_imm++; + break; + default: + break; + } + } + tgsi_parse_free(&p); + + if (nr_imm) { + vpc->imm = CALLOC(nr_imm, sizeof(struct nv20_sreg)); + assert(vpc->imm); + } + + return TRUE; +} + +static void +nv20_vertprog_translate(struct nv20_context *nv20, + struct nv20_vertex_program *vp) +{ + struct tgsi_parse_context parse; + struct nv20_vpc *vpc = NULL; + + tgsi_dump(vp->pipe.tokens,0); + + vpc = CALLOC(1, sizeof(struct nv20_vpc)); + if (!vpc) + return; + vpc->vp = vp; + vpc->high_temp = -1; + + if (!nv20_vertprog_prepare(vpc)) { + FREE(vpc); + return; + } + + tgsi_parse_init(&parse, vp->pipe.tokens); + + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: + { + const struct tgsi_full_declaration *fdec; + fdec = &parse.FullToken.FullDeclaration; + switch (fdec->Declaration.File) { + case TGSI_FILE_OUTPUT: + if (!nv20_vertprog_parse_decl_output(vpc, fdec)) + goto out_err; + break; + default: + break; + } + } + break; + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + const struct tgsi_full_immediate *imm; + + imm = &parse.FullToken.FullImmediate; + assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); +// assert(imm->Immediate.Size == 4); + 
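/* A TGSI immediate is lowered to a program constant with pipe index -1: + * its value is baked into vp->consts here and, unlike a real constant, + * is never refreshed from the bound constant buffer at validate time. + */ + 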
vpc->imm[vpc->nr_imm++] = + constant(vpc, -1, + imm->u.ImmediateFloat32[0].Float, + imm->u.ImmediateFloat32[1].Float, + imm->u.ImmediateFloat32[2].Float, + imm->u.ImmediateFloat32[3].Float); + } + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + const struct tgsi_full_instruction *finst; + finst = &parse.FullToken.FullInstruction; + if (!nv20_vertprog_parse_instruction(vpc, finst)) + goto out_err; + } + break; + default: + break; + } + } + + vp->insns[vp->nr_insns - 1].data[3] |= NV30_VP_INST_LAST; + vp->translated = TRUE; +out_err: + tgsi_parse_free(&parse); + FREE(vpc); +} + +static boolean +nv20_vertprog_validate(struct nv20_context *nv20) +{ + struct nouveau_winsys *nvws = nv20->nvws; + struct pipe_winsys *ws = nv20->pipe.winsys; + struct nouveau_grobj *rankine = nv20->screen->rankine; + struct nv20_vertex_program *vp; + struct pipe_buffer *constbuf; + boolean upload_code = FALSE, upload_data = FALSE; + int i; + + vp = nv20->vertprog; + constbuf = nv20->constbuf[PIPE_SHADER_VERTEX]; + + /* Translate TGSI shader into hw bytecode */ + if (!vp->translated) { + nv20_vertprog_translate(nv20, vp); + if (!vp->translated) + return FALSE; + } + + /* Allocate hw vtxprog exec slots */ + if (!vp->exec) { + struct nouveau_resource *heap = nv20->screen->vp_exec_heap; + struct nouveau_stateobj *so; + uint vplen = vp->nr_insns; + + if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) { + while (heap->next && heap->size < vplen) { + struct nv20_vertex_program *evict; + + evict = heap->next->priv; + nvws->res_free(&evict->exec); + } + + if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) + assert(0); + } + + so = so_new(2, 0); + so_method(so, rankine, NV34TCL_VP_START_FROM_ID, 1); + so_data (so, vp->exec->start); + so_ref(so, &vp->so); + + upload_code = TRUE; + } + + /* Allocate hw vtxprog const slots */ + if (vp->nr_consts && !vp->data) { + struct nouveau_resource *heap = nv20->screen->vp_data_heap; + + if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) { + while (heap->next && heap->size < vp->nr_consts) { + struct nv20_vertex_program *evict; + + evict = heap->next->priv; + nvws->res_free(&evict->data); + } + + if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) + assert(0); + } + + /*XXX: handle this some day */ + assert(vp->data->start >= vp->data_start_min); + + upload_data = TRUE; + if (vp->data_start != vp->data->start) + upload_code = TRUE; + } + + /* If exec or data segments moved we need to patch the program to + * fixup offsets and register IDs. 
+ */ + if (vp->exec_start != vp->exec->start) { + for (i = 0; i < vp->nr_insns; i++) { + struct nv20_vertex_program_exec *vpi = &vp->insns[i]; + + if (vpi->has_branch_offset) { + assert(0); + } + } + + vp->exec_start = vp->exec->start; + } + + if (vp->nr_consts && vp->data_start != vp->data->start) { + for (i = 0; i < vp->nr_insns; i++) { + struct nv20_vertex_program_exec *vpi = &vp->insns[i]; + + if (vpi->const_index >= 0) { + vpi->data[1] &= ~NV30_VP_INST_CONST_SRC_MASK; + vpi->data[1] |= + (vpi->const_index + vp->data->start) << + NV30_VP_INST_CONST_SRC_SHIFT; + + } + } + + vp->data_start = vp->data->start; + } + + /* Update + Upload constant values */ + if (vp->nr_consts) { + float *map = NULL; + + if (constbuf) { + map = ws->buffer_map(ws, constbuf, + PIPE_BUFFER_USAGE_CPU_READ); + } + + for (i = 0; i < vp->nr_consts; i++) { + struct nv20_vertex_program_data *vpd = &vp->consts[i]; + + if (vpd->index >= 0) { + if (!upload_data && + !memcmp(vpd->value, &map[vpd->index * 4], + 4 * sizeof(float))) + continue; + memcpy(vpd->value, &map[vpd->index * 4], + 4 * sizeof(float)); + } + + BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5); + OUT_RING (i + vp->data->start); + OUT_RINGp ((uint32_t *)vpd->value, 4); + } + + if (constbuf) { + ws->buffer_unmap(ws, constbuf); + } + } + + /* Upload vtxprog */ + if (upload_code) { +#if 0 + for (i = 0; i < vp->nr_insns; i++) { + NOUVEAU_MSG("VP inst %d: 0x%08x 0x%08x 0x%08x 0x%08x\n", + i, vp->insns[i].data[0], vp->insns[i].data[1], + vp->insns[i].data[2], vp->insns[i].data[3]); + } +#endif + BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1); + OUT_RING (vp->exec->start); + for (i = 0; i < vp->nr_insns; i++) { + BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_INST(0), 4); + OUT_RINGp (vp->insns[i].data, 4); + } + } + + if (vp->so != nv20->state.hw[NV30_STATE_VERTPROG]) { + so_ref(vp->so, &nv20->state.hw[NV30_STATE_VERTPROG]); + return TRUE; + } + + return FALSE; +} + +void +nv20_vertprog_destroy(struct nv20_context *nv20, struct nv20_vertex_program *vp) +{ + struct nouveau_winsys *nvws = nv20->screen->nvws; + + vp->translated = FALSE; + + if (vp->nr_insns) { + FREE(vp->insns); + vp->insns = NULL; + vp->nr_insns = 0; + } + + if (vp->nr_consts) { + FREE(vp->consts); + vp->consts = NULL; + vp->nr_consts = 0; + } + + nvws->res_free(&vp->exec); + vp->exec_start = 0; + nvws->res_free(&vp->data); + vp->data_start = 0; + vp->data_start_min = 0; + + vp->ir = vp->or = 0; + so_ref(NULL, &vp->so); +} + +struct nv20_state_entry nv20_state_vertprog = { + .validate = nv20_vertprog_validate, + .dirty = { + .pipe = NV30_NEW_VERTPROG /*| NV30_NEW_UCP*/, + .hw = NV30_STATE_VERTPROG, + } +}; diff --git a/src/gallium/drivers/nv30/Makefile b/src/gallium/drivers/nv30/Makefile new file mode 100644 index 0000000000..69f2790dfe --- /dev/null +++ b/src/gallium/drivers/nv30/Makefile @@ -0,0 +1,37 @@ +TOP = ../../../.. 
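+# nv30 Gallium pipe driver; the objects below are built into the driver library by the shared rules in ../../Makefile.template.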
+include $(TOP)/configs/current + +LIBNAME = nv30 + +DRIVER_SOURCES = \ + nv30_clear.c \ + nv30_context.c \ + nv30_draw.c \ + nv30_fragprog.c \ + nv30_fragtex.c \ + nv30_miptree.c \ + nv30_query.c \ + nv30_screen.c \ + nv30_state.c \ + nv30_state_blend.c \ + nv30_state_emit.c \ + nv30_state_fb.c \ + nv30_state_rasterizer.c \ + nv30_state_scissor.c \ + nv30_state_stipple.c \ + nv30_state_viewport.c \ + nv30_state_zsa.c \ + nv30_surface.c \ + nv30_vbo.c \ + nv30_vertprog.c + +C_SOURCES = \ + $(COMMON_SOURCES) \ + $(DRIVER_SOURCES) + +ASM_SOURCES = + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/nv30/nv30_clear.c b/src/gallium/drivers/nv30/nv30_clear.c new file mode 100644 index 0000000000..8c3ca204d5 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_clear.c @@ -0,0 +1,13 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv30_context.h" + +void +nv30_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); + ps->status = PIPE_SURFACE_STATUS_CLEAR; +} diff --git a/src/gallium/drivers/nv30/nv30_context.c b/src/gallium/drivers/nv30/nv30_context.c new file mode 100644 index 0000000000..2bff28aca9 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_context.c @@ -0,0 +1,72 @@ +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_winsys.h" + +#include "nv30_context.h" +#include "nv30_screen.h" + +static void +nv30_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + if (flags & PIPE_FLUSH_TEXTURE_CACHE) { + BEGIN_RING(rankine, 0x1fd8, 1); + OUT_RING (2); + BEGIN_RING(rankine, 0x1fd8, 1); + OUT_RING (1); + } + + FIRE_RING(fence); +} + +static void +nv30_destroy(struct pipe_context *pipe) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + if (nv30->draw) + draw_destroy(nv30->draw); + FREE(nv30); +} + +struct pipe_context * +nv30_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ + struct nv30_screen *screen = nv30_screen(pscreen); + struct pipe_winsys *ws = pscreen->winsys; + struct nv30_context *nv30; + struct nouveau_winsys *nvws = screen->nvws; + + nv30 = CALLOC(1, sizeof(struct nv30_context)); + if (!nv30) + return NULL; + nv30->screen = screen; + nv30->pctx_id = pctx_id; + + nv30->nvws = nvws; + + nv30->pipe.winsys = ws; + nv30->pipe.screen = pscreen; + nv30->pipe.destroy = nv30_destroy; + nv30->pipe.draw_arrays = nv30_draw_arrays; + nv30->pipe.draw_elements = nv30_draw_elements; + nv30->pipe.clear = nv30_clear; + nv30->pipe.flush = nv30_flush; + + nv30_init_query_functions(nv30); + nv30_init_surface_functions(nv30); + nv30_init_state_functions(nv30); + + /* Create, configure, and install fallback swtnl path */ + nv30->draw = draw_create(); + draw_wide_point_threshold(nv30->draw, 9999999.0); + draw_wide_line_threshold(nv30->draw, 9999999.0); + draw_enable_line_stipple(nv30->draw, FALSE); + draw_enable_point_sprites(nv30->draw, FALSE); + draw_set_rasterize_stage(nv30->draw, nv30_draw_render_stage(nv30)); + + return &nv30->pipe; +} + diff --git a/src/gallium/drivers/nv30/nv30_context.h b/src/gallium/drivers/nv30/nv30_context.h new file mode 100644 index 0000000000..b933769700 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_context.h @@ -0,0 +1,212 @@ +#ifndef __NV30_CONTEXT_H__ +#define __NV30_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" 
+#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx) \ + struct nv30_screen *ctx = nv30->screen +#include "nouveau/nouveau_push.h" +#include "nouveau/nouveau_stateobj.h" + +#include "nv30_state.h" + +#define NOUVEAU_ERR(fmt, args...) \ + fprintf(stderr, "%s:%d - "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) \ + fprintf(stderr, "nouveau: "fmt, ##args); + +enum nv30_state_index { + NV30_STATE_FB = 0, + NV30_STATE_VIEWPORT = 1, + NV30_STATE_BLEND = 2, + NV30_STATE_RAST = 3, + NV30_STATE_ZSA = 4, + NV30_STATE_BCOL = 5, + NV30_STATE_CLIP = 6, + NV30_STATE_SCISSOR = 7, + NV30_STATE_STIPPLE = 8, + NV30_STATE_FRAGPROG = 9, + NV30_STATE_VERTPROG = 10, + NV30_STATE_FRAGTEX0 = 11, + NV30_STATE_FRAGTEX1 = 12, + NV30_STATE_FRAGTEX2 = 13, + NV30_STATE_FRAGTEX3 = 14, + NV30_STATE_FRAGTEX4 = 15, + NV30_STATE_FRAGTEX5 = 16, + NV30_STATE_FRAGTEX6 = 17, + NV30_STATE_FRAGTEX7 = 18, + NV30_STATE_FRAGTEX8 = 19, + NV30_STATE_FRAGTEX9 = 20, + NV30_STATE_FRAGTEX10 = 21, + NV30_STATE_FRAGTEX11 = 22, + NV30_STATE_FRAGTEX12 = 23, + NV30_STATE_FRAGTEX13 = 24, + NV30_STATE_FRAGTEX14 = 25, + NV30_STATE_FRAGTEX15 = 26, + NV30_STATE_VERTTEX0 = 27, + NV30_STATE_VERTTEX1 = 28, + NV30_STATE_VERTTEX2 = 29, + NV30_STATE_VERTTEX3 = 30, + NV30_STATE_VTXBUF = 31, + NV30_STATE_VTXFMT = 32, + NV30_STATE_VTXATTR = 33, + NV30_STATE_MAX = 34 +}; + +#include "nv30_screen.h" + +#define NV30_NEW_BLEND (1 << 0) +#define NV30_NEW_RAST (1 << 1) +#define NV30_NEW_ZSA (1 << 2) +#define NV30_NEW_SAMPLER (1 << 3) +#define NV30_NEW_FB (1 << 4) +#define NV30_NEW_STIPPLE (1 << 5) +#define NV30_NEW_SCISSOR (1 << 6) +#define NV30_NEW_VIEWPORT (1 << 7) +#define NV30_NEW_BCOL (1 << 8) +#define NV30_NEW_VERTPROG (1 << 9) +#define NV30_NEW_FRAGPROG (1 << 10) +#define NV30_NEW_ARRAYS (1 << 11) +#define NV30_NEW_UCP (1 << 12) + +struct nv30_rasterizer_state { + struct pipe_rasterizer_state pipe; + struct nouveau_stateobj *so; +}; + +struct nv30_zsa_state { + struct pipe_depth_stencil_alpha_state pipe; + struct nouveau_stateobj *so; +}; + +struct nv30_blend_state { + struct pipe_blend_state pipe; + struct nouveau_stateobj *so; +}; + + +struct nv30_state { + unsigned scissor_enabled; + unsigned stipple_enabled; + unsigned viewport_bypass; + unsigned fp_samplers; + + uint64_t dirty; + struct nouveau_stateobj *hw[NV30_STATE_MAX]; +}; + +struct nv30_context { + struct pipe_context pipe; + + struct nouveau_winsys *nvws; + struct nv30_screen *screen; + unsigned pctx_id; + + struct draw_context *draw; + + /* HW state derived from pipe states */ + struct nv30_state state; + + /* Context state */ + unsigned dirty; + struct pipe_scissor_state scissor; + unsigned stipple[32]; + struct nv30_vertex_program *vertprog; + struct nv30_fragment_program *fragprog; + struct pipe_buffer *constbuf[PIPE_SHADER_TYPES]; + unsigned constbuf_nr[PIPE_SHADER_TYPES]; + struct nv30_rasterizer_state *rasterizer; + struct nv30_zsa_state *zsa; + struct nv30_blend_state *blend; + struct pipe_blend_color blend_colour; + struct pipe_viewport_state viewport; + struct pipe_framebuffer_state framebuffer; + struct pipe_buffer *idxbuf; + unsigned idxbuf_format; + struct nv30_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS]; + struct nv30_miptree *tex_miptree[PIPE_MAX_SAMPLERS]; + unsigned nr_samplers; + unsigned nr_textures; + unsigned dirty_samplers; + struct pipe_vertex_buffer 
vtxbuf[PIPE_MAX_ATTRIBS]; + unsigned vtxbuf_nr; + struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; + unsigned vtxelt_nr; + const unsigned *edgeflags; +}; + +static INLINE struct nv30_context * +nv30_context(struct pipe_context *pipe) +{ + return (struct nv30_context *)pipe; +} + +struct nv30_state_entry { + boolean (*validate)(struct nv30_context *nv30); + struct { + unsigned pipe; + unsigned hw; + } dirty; +}; + +extern void nv30_init_state_functions(struct nv30_context *nv30); +extern void nv30_init_surface_functions(struct nv30_context *nv30); +extern void nv30_init_query_functions(struct nv30_context *nv30); + +extern void nv30_screen_init_miptree_functions(struct pipe_screen *pscreen); + +/* nv30_draw.c */ +extern struct draw_stage *nv30_draw_render_stage(struct nv30_context *nv30); + +/* nv30_vertprog.c */ +extern void nv30_vertprog_destroy(struct nv30_context *, + struct nv30_vertex_program *); + +/* nv30_fragprog.c */ +extern void nv30_fragprog_destroy(struct nv30_context *, + struct nv30_fragment_program *); + +/* nv30_fragtex.c */ +extern void nv30_fragtex_bind(struct nv30_context *); + +/* nv30_state.c and friends */ +extern boolean nv30_state_validate(struct nv30_context *nv30); +extern void nv30_state_emit(struct nv30_context *nv30); +extern struct nv30_state_entry nv30_state_rasterizer; +extern struct nv30_state_entry nv30_state_scissor; +extern struct nv30_state_entry nv30_state_stipple; +extern struct nv30_state_entry nv30_state_fragprog; +extern struct nv30_state_entry nv30_state_vertprog; +extern struct nv30_state_entry nv30_state_blend; +extern struct nv30_state_entry nv30_state_blend_colour; +extern struct nv30_state_entry nv30_state_zsa; +extern struct nv30_state_entry nv30_state_viewport; +extern struct nv30_state_entry nv30_state_framebuffer; +extern struct nv30_state_entry nv30_state_fragtex; +extern struct nv30_state_entry nv30_state_vbo; + +/* nv30_vbo.c */ +extern boolean nv30_draw_arrays(struct pipe_context *, unsigned mode, + unsigned start, unsigned count); +extern boolean nv30_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned mode, unsigned start, + unsigned count); + +/* nv30_clear.c */ +extern void nv30_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + +#endif diff --git a/src/gallium/drivers/nv30/nv30_draw.c b/src/gallium/drivers/nv30/nv30_draw.c new file mode 100644 index 0000000000..74fc138c05 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_draw.c @@ -0,0 +1,61 @@ +#include "draw/draw_pipe.h" + +#include "nv30_context.h" + +struct nv30_draw_stage { + struct draw_stage draw; + struct nv30_context *nv30; +}; + +static void +nv30_draw_point(struct draw_stage *draw, struct prim_header *prim) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv30_draw_line(struct draw_stage *draw, struct prim_header *prim) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv30_draw_tri(struct draw_stage *draw, struct prim_header *prim) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv30_draw_flush(struct draw_stage *draw, unsigned flags) +{ +} + +static void +nv30_draw_reset_stipple_counter(struct draw_stage *draw) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv30_draw_destroy(struct draw_stage *draw) +{ + FREE(draw); +} + +struct draw_stage * +nv30_draw_render_stage(struct nv30_context *nv30) +{ + struct nv30_draw_stage *nv30draw = CALLOC_STRUCT(nv30_draw_stage); + + nv30draw->nv30 = nv30; + nv30draw->draw.draw = nv30->draw; + nv30draw->draw.point = nv30_draw_point; + nv30draw->draw.line = 
nv30_draw_line; + nv30draw->draw.tri = nv30_draw_tri; + nv30draw->draw.flush = nv30_draw_flush; + nv30draw->draw.reset_stipple_counter = nv30_draw_reset_stipple_counter; + nv30draw->draw.destroy = nv30_draw_destroy; + + return &nv30draw->draw; +} + diff --git a/src/gallium/drivers/nv30/nv30_fragprog.c b/src/gallium/drivers/nv30/nv30_fragprog.c new file mode 100644 index 0000000000..320ba3f4bf --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_fragprog.c @@ -0,0 +1,911 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv30_context.h" + +#define SWZ_X 0 +#define SWZ_Y 1 +#define SWZ_Z 2 +#define SWZ_W 3 +#define MASK_X 1 +#define MASK_Y 2 +#define MASK_Z 4 +#define MASK_W 8 +#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W) +#define DEF_SCALE NV30_FP_OP_DST_SCALE_1X +#define DEF_CTEST NV30_FP_OP_COND_TR +#include "nv30_shader.h" + +#define swz(s,x,y,z,w) nv30_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w) +#define neg(s) nv30_sr_neg((s)) +#define abs(s) nv30_sr_abs((s)) +#define scale(s,v) nv30_sr_scale((s), NV30_FP_OP_DST_SCALE_##v) + +#define MAX_CONSTS 128 +#define MAX_IMM 32 +struct nv30_fpc { + struct nv30_fragment_program *fp; + + uint attrib_map[PIPE_MAX_SHADER_INPUTS]; + + int high_temp; + int temp_temp_count; + int num_regs; + + uint depth_id; + uint colour_id; + + unsigned inst_offset; + + struct { + int pipe; + float vals[4]; + } consts[MAX_CONSTS]; + int nr_consts; + + struct nv30_sreg imm[MAX_IMM]; + unsigned nr_imm; +}; + +static INLINE struct nv30_sreg +temp(struct nv30_fpc *fpc) +{ + int idx; + + idx = fpc->temp_temp_count++; + idx += fpc->high_temp + 1; + return nv30_sr(NV30SR_TEMP, idx); +} + +static INLINE struct nv30_sreg +constant(struct nv30_fpc *fpc, int pipe, float vals[4]) +{ + int idx; + + if (fpc->nr_consts == MAX_CONSTS) + assert(0); + idx = fpc->nr_consts++; + + fpc->consts[idx].pipe = pipe; + if (pipe == -1) + memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float)); + return nv30_sr(NV30SR_CONST, idx); +} + +#define arith(cc,s,o,d,m,s0,s1,s2) \ + nv30_fp_arith((cc), (s), NV30_FP_OP_OPCODE_##o, \ + (d), (m), (s0), (s1), (s2)) +#define tex(cc,s,o,u,d,m,s0,s1,s2) \ + nv30_fp_tex((cc), (s), NV30_FP_OP_OPCODE_##o, (u), \ + (d), (m), (s0), none, none) + +static void +grow_insns(struct nv30_fpc *fpc, int size) +{ + struct nv30_fragment_program *fp = fpc->fp; + + fp->insn_len += size; + fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len); +} + +static void +emit_src(struct nv30_fpc *fpc, int pos, struct nv30_sreg src) +{ + struct nv30_fragment_program *fp = fpc->fp; + uint32_t *hw = &fp->insn[fpc->inst_offset]; + uint32_t sr = 0; + + switch (src.type) { + case NV30SR_INPUT: + sr |= (NV30_FP_REG_TYPE_INPUT << NV30_FP_REG_TYPE_SHIFT); + hw[0] |= (src.index << NV30_FP_OP_INPUT_SRC_SHIFT); + break; + case NV30SR_OUTPUT: + sr |= NV30_FP_REG_SRC_HALF; + /* fall-through */ + case NV30SR_TEMP: + sr |= (NV30_FP_REG_TYPE_TEMP << NV30_FP_REG_TYPE_SHIFT); + sr |= (src.index << NV30_FP_REG_SRC_SHIFT); + break; + case NV30SR_CONST: + grow_insns(fpc, 4); + hw = &fp->insn[fpc->inst_offset]; + if (fpc->consts[src.index].pipe >= 0) { + struct nv30_fragment_program_data *fpd; + + fp->consts = realloc(fp->consts, ++fp->nr_consts * + sizeof(*fpd)); + fpd = &fp->consts[fp->nr_consts - 1]; + fpd->offset = fpc->inst_offset + 4; + fpd->index = fpc->consts[src.index].pipe; + memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4); + } else { + 
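/* Immediate constant: the four float values are copied straight into + * the extra instruction slot allocated above; no patch-up entry is + * added to fp->consts, so the value is baked in at translation time. + */ + 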
memcpy(&fp->insn[fpc->inst_offset + 4], + fpc->consts[src.index].vals, + sizeof(uint32_t) * 4); + } + + sr |= (NV30_FP_REG_TYPE_CONST << NV30_FP_REG_TYPE_SHIFT); + break; + case NV30SR_NONE: + sr |= (NV30_FP_REG_TYPE_INPUT << NV30_FP_REG_TYPE_SHIFT); + break; + default: + assert(0); + } + + if (src.negate) + sr |= NV30_FP_REG_NEGATE; + + if (src.abs) + hw[1] |= (1 << (29 + pos)); + + sr |= ((src.swz[0] << NV30_FP_REG_SWZ_X_SHIFT) | + (src.swz[1] << NV30_FP_REG_SWZ_Y_SHIFT) | + (src.swz[2] << NV30_FP_REG_SWZ_Z_SHIFT) | + (src.swz[3] << NV30_FP_REG_SWZ_W_SHIFT)); + + hw[pos + 1] |= sr; +} + +static void +emit_dst(struct nv30_fpc *fpc, struct nv30_sreg dst) +{ + struct nv30_fragment_program *fp = fpc->fp; + uint32_t *hw = &fp->insn[fpc->inst_offset]; + + switch (dst.type) { + case NV30SR_TEMP: + if (fpc->num_regs < (dst.index + 1)) + fpc->num_regs = dst.index + 1; + break; + case NV30SR_OUTPUT: + if (dst.index == 1) { + fp->fp_control |= 0xe; + } else { + hw[0] |= NV30_FP_OP_OUT_REG_HALF; + } + break; + case NV30SR_NONE: + hw[0] |= (1 << 30); + break; + default: + assert(0); + } + + hw[0] |= (dst.index << NV30_FP_OP_OUT_REG_SHIFT); +} + +static void +nv30_fp_arith(struct nv30_fpc *fpc, int sat, int op, + struct nv30_sreg dst, int mask, + struct nv30_sreg s0, struct nv30_sreg s1, struct nv30_sreg s2) +{ + struct nv30_fragment_program *fp = fpc->fp; + uint32_t *hw; + + fpc->inst_offset = fp->insn_len; + grow_insns(fpc, 4); + hw = &fp->insn[fpc->inst_offset]; + memset(hw, 0, sizeof(uint32_t) * 4); + + if (op == NV30_FP_OP_OPCODE_KIL) + fp->fp_control |= NV34TCL_FP_CONTROL_USES_KIL; + hw[0] |= (op << NV30_FP_OP_OPCODE_SHIFT); + hw[0] |= (mask << NV30_FP_OP_OUTMASK_SHIFT); + hw[2] |= (dst.dst_scale << NV30_FP_OP_DST_SCALE_SHIFT); + + if (sat) + hw[0] |= NV30_FP_OP_OUT_SAT; + + if (dst.cc_update) + hw[0] |= NV30_FP_OP_COND_WRITE_ENABLE; + hw[1] |= (dst.cc_test << NV30_FP_OP_COND_SHIFT); + hw[1] |= ((dst.cc_swz[0] << NV30_FP_OP_COND_SWZ_X_SHIFT) | + (dst.cc_swz[1] << NV30_FP_OP_COND_SWZ_Y_SHIFT) | + (dst.cc_swz[2] << NV30_FP_OP_COND_SWZ_Z_SHIFT) | + (dst.cc_swz[3] << NV30_FP_OP_COND_SWZ_W_SHIFT)); + + emit_dst(fpc, dst); + emit_src(fpc, 0, s0); + emit_src(fpc, 1, s1); + emit_src(fpc, 2, s2); +} + +static void +nv30_fp_tex(struct nv30_fpc *fpc, int sat, int op, int unit, + struct nv30_sreg dst, int mask, + struct nv30_sreg s0, struct nv30_sreg s1, struct nv30_sreg s2) +{ + struct nv30_fragment_program *fp = fpc->fp; + + nv30_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2); + + fp->insn[fpc->inst_offset] |= (unit << NV30_FP_OP_TEX_UNIT_SHIFT); + fp->samplers |= (1 << unit); +} + +static INLINE struct nv30_sreg +tgsi_src(struct nv30_fpc *fpc, const struct tgsi_full_src_register *fsrc) +{ + struct nv30_sreg src; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + src = nv30_sr(NV30SR_INPUT, + fpc->attrib_map[fsrc->SrcRegister.Index]); + break; + case TGSI_FILE_CONSTANT: + src = constant(fpc, fsrc->SrcRegister.Index, NULL); + break; + case TGSI_FILE_IMMEDIATE: + assert(fsrc->SrcRegister.Index < fpc->nr_imm); + src = fpc->imm[fsrc->SrcRegister.Index]; + break; + case TGSI_FILE_TEMPORARY: + src = nv30_sr(NV30SR_TEMP, fsrc->SrcRegister.Index + 1); + if (fpc->high_temp < src.index) + fpc->high_temp = src.index; + break; + /* This is clearly insane, but gallium hands us shaders like this. + * Luckily fragprog results are just temp regs.. 
+ */ + case TGSI_FILE_OUTPUT: + if (fsrc->SrcRegister.Index == fpc->colour_id) + return nv30_sr(NV30SR_OUTPUT, 0); + else + return nv30_sr(NV30SR_OUTPUT, 1); + break; + default: + NOUVEAU_ERR("bad src file\n"); + break; + } + + src.abs = fsrc->SrcRegisterExtMod.Absolute; + src.negate = fsrc->SrcRegister.Negate; + src.swz[0] = fsrc->SrcRegister.SwizzleX; + src.swz[1] = fsrc->SrcRegister.SwizzleY; + src.swz[2] = fsrc->SrcRegister.SwizzleZ; + src.swz[3] = fsrc->SrcRegister.SwizzleW; + return src; +} + +static INLINE struct nv30_sreg +tgsi_dst(struct nv30_fpc *fpc, const struct tgsi_full_dst_register *fdst) { + int idx; + + switch (fdst->DstRegister.File) { + case TGSI_FILE_OUTPUT: + if (fdst->DstRegister.Index == fpc->colour_id) + return nv30_sr(NV30SR_OUTPUT, 0); + else + return nv30_sr(NV30SR_OUTPUT, 1); + break; + case TGSI_FILE_TEMPORARY: + idx = fdst->DstRegister.Index + 1; + if (fpc->high_temp < idx) + fpc->high_temp = idx; + return nv30_sr(NV30SR_TEMP, idx); + case TGSI_FILE_NULL: + return nv30_sr(NV30SR_NONE, 0); + default: + NOUVEAU_ERR("bad dst file %d\n", fdst->DstRegister.File); + return nv30_sr(NV30SR_NONE, 0); + } +} + +static INLINE int +tgsi_mask(uint tgsi) +{ + int mask = 0; + + if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X; + if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y; + if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z; + if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W; + return mask; +} + +static boolean +src_native_swz(struct nv30_fpc *fpc, const struct tgsi_full_src_register *fsrc, + struct nv30_sreg *src) +{ + const struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0); + struct nv30_sreg tgsi = tgsi_src(fpc, fsrc); + uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0; + uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX, + fsrc->SrcRegisterExtSwz.NegateY, + fsrc->SrcRegisterExtSwz.NegateZ, + fsrc->SrcRegisterExtSwz.NegateW }; + uint c; + + for (c = 0; c < 4; c++) { + switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + mask |= (1 << c); + break; + case TGSI_EXTSWIZZLE_ZERO: + zero_mask |= (1 << c); + tgsi.swz[c] = SWZ_X; + break; + case TGSI_EXTSWIZZLE_ONE: + one_mask |= (1 << c); + tgsi.swz[c] = SWZ_X; + break; + default: + assert(0); + } + + if (!tgsi.negate && neg[c]) + neg_mask |= (1 << c); + } + + if (mask == MASK_ALL && !neg_mask) + return TRUE; + + *src = temp(fpc); + + if (mask) + arith(fpc, 0, MOV, *src, mask, tgsi, none, none); + + if (zero_mask) + arith(fpc, 0, SFL, *src, zero_mask, *src, none, none); + + if (one_mask) + arith(fpc, 0, STR, *src, one_mask, *src, none, none); + + if (neg_mask) { + struct nv30_sreg one = temp(fpc); + arith(fpc, 0, STR, one, neg_mask, one, none, none); + arith(fpc, 0, MUL, *src, neg_mask, *src, neg(one), none); + } + + return FALSE; +} + +static boolean +nv30_fragprog_parse_instruction(struct nv30_fpc *fpc, + const struct tgsi_full_instruction *finst) +{ + const struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0); + struct nv30_sreg src[3], dst, tmp; + int mask, sat, unit = 0; + int ai = -1, ci = -1; + int i; + + if (finst->Instruction.Opcode == TGSI_OPCODE_END) + return TRUE; + + fpc->temp_temp_count = 0; + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) { + src[i] = tgsi_src(fpc, fsrc); + } + } + + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register 
*fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + case TGSI_FILE_CONSTANT: + case TGSI_FILE_TEMPORARY: + if (!src_native_swz(fpc, fsrc, &src[i])) + continue; + break; + default: + break; + } + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + if (ai == -1 || ai == fsrc->SrcRegister.Index) { + ai = fsrc->SrcRegister.Index; + src[i] = tgsi_src(fpc, fsrc); + } else { + NOUVEAU_MSG("extra src attr %d\n", + fsrc->SrcRegister.Index); + src[i] = temp(fpc); + arith(fpc, 0, MOV, src[i], MASK_ALL, + tgsi_src(fpc, fsrc), none, none); + } + break; + case TGSI_FILE_CONSTANT: + case TGSI_FILE_IMMEDIATE: + if (ci == -1 || ci == fsrc->SrcRegister.Index) { + ci = fsrc->SrcRegister.Index; + src[i] = tgsi_src(fpc, fsrc); + } else { + src[i] = temp(fpc); + arith(fpc, 0, MOV, src[i], MASK_ALL, + tgsi_src(fpc, fsrc), none, none); + } + break; + case TGSI_FILE_TEMPORARY: + /* handled above */ + break; + case TGSI_FILE_SAMPLER: + unit = fsrc->SrcRegister.Index; + break; + case TGSI_FILE_OUTPUT: + break; + default: + NOUVEAU_ERR("bad src file\n"); + return FALSE; + } + } + + dst = tgsi_dst(fpc, &finst->FullDstRegisters[0]); + mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask); + sat = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE); + + switch (finst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none); + break; + case TGSI_OPCODE_ADD: + arith(fpc, sat, ADD, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_CMP: + tmp = temp(fpc); + arith(fpc, sat, MOV, dst, mask, src[2], none, none); + tmp.cc_update = 1; + arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none); + dst.cc_test = NV30_VP_INST_COND_LT; + arith(fpc, sat, MOV, dst, mask, src[1], none, none); + break; + case TGSI_OPCODE_COS: + arith(fpc, sat, COS, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_DP3: + arith(fpc, sat, DP3, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DP4: + arith(fpc, sat, DP4, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DPH: + tmp = temp(fpc); + arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[1], none); + arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X), + swz(src[1], W, W, W, W), none); + break; + case TGSI_OPCODE_DST: + arith(fpc, sat, DST, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_EX2: + arith(fpc, sat, EX2, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_FLR: + arith(fpc, sat, FLR, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_FRC: + arith(fpc, sat, FRC, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_KILP: + arith(fpc, 0, KIL, none, 0, none, none, none); + break; + case TGSI_OPCODE_KIL: + dst = nv30_sr(NV30SR_NONE, 0); + dst.cc_update = 1; + arith(fpc, 0, MOV, dst, MASK_ALL, src[0], none, none); + dst.cc_update = 0; dst.cc_test = NV30_FP_OP_COND_LT; + arith(fpc, 0, KIL, dst, 0, none, none, none); + break; + case TGSI_OPCODE_LG2: + arith(fpc, sat, LG2, dst, mask, src[0], none, none); + break; +// case TGSI_OPCODE_LIT: + case TGSI_OPCODE_LRP: + arith(fpc, sat, LRP, dst, mask, src[0], src[1], src[2]); + break; + case TGSI_OPCODE_MAD: + arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]); + break; + case TGSI_OPCODE_MAX: + arith(fpc, sat, MAX, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MIN: + arith(fpc, sat, MIN, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MOV: + arith(fpc, sat, MOV, dst, mask, src[0], none, none); + break; + case 
TGSI_OPCODE_MUL: + arith(fpc, sat, MUL, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_NOISE1: + case TGSI_OPCODE_NOISE2: + case TGSI_OPCODE_NOISE3: + case TGSI_OPCODE_NOISE4: + arith(fpc, sat, SFL, dst, mask, none, none, none); + break; + case TGSI_OPCODE_POW: + arith(fpc, sat, POW, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_RCP: + arith(fpc, sat, RCP, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_RET: + assert(0); + break; + case TGSI_OPCODE_RFL: + arith(fpc, 0, RFL, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_RSQ: + arith(fpc, sat, RSQ, dst, mask, abs(swz(src[0], X, X, X, X)), none, none); + break; + case TGSI_OPCODE_SCS: + if (mask & MASK_X) { + arith(fpc, sat, COS, dst, MASK_X, + swz(src[0], X, X, X, X), none, none); + } + if (mask & MASK_Y) { + arith(fpc, sat, SIN, dst, MASK_Y, + swz(src[0], X, X, X, X), none, none); + } + break; + case TGSI_OPCODE_SIN: + arith(fpc, sat, SIN, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_SGE: + arith(fpc, sat, SGE, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SGT: + arith(fpc, sat, SGT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SLT: + arith(fpc, sat, SLT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SUB: + arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none); + break; + case TGSI_OPCODE_TEX: + tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_TXB: + tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_TXP: + tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_XPD: + tmp = temp(fpc); + arith(fpc, 0, MUL, tmp, mask, + swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); + arith(fpc, sat, MAD, dst, (mask & ~MASK_W), + swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), + neg(tmp)); + break; + default: + NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); + return FALSE; + } + + return TRUE; +} + +static boolean +nv30_fragprog_parse_decl_attrib(struct nv30_fpc *fpc, + const struct tgsi_full_declaration *fdec) +{ + int hw; + + switch (fdec->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + hw = NV30_FP_OP_INPUT_SRC_POSITION; + break; + case TGSI_SEMANTIC_COLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV30_FP_OP_INPUT_SRC_COL0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV30_FP_OP_INPUT_SRC_COL1; + } else { + NOUVEAU_ERR("bad colour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_FOG: + hw = NV30_FP_OP_INPUT_SRC_FOGC; + break; + case TGSI_SEMANTIC_GENERIC: + if (fdec->Semantic.SemanticIndex <= 7) { + hw = NV30_FP_OP_INPUT_SRC_TC(fdec->Semantic. 
+ SemanticIndex); + } else { + NOUVEAU_ERR("bad generic semantic index\n"); + return FALSE; + } + break; + default: + NOUVEAU_ERR("bad input semantic\n"); + return FALSE; + } + + fpc->attrib_map[fdec->DeclarationRange.First] = hw; + return TRUE; +} + +static boolean +nv30_fragprog_parse_decl_output(struct nv30_fpc *fpc, + const struct tgsi_full_declaration *fdec) +{ + switch (fdec->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + fpc->depth_id = fdec->DeclarationRange.First; + break; + case TGSI_SEMANTIC_COLOR: + fpc->colour_id = fdec->DeclarationRange.First; + break; + default: + NOUVEAU_ERR("bad output semantic\n"); + return FALSE; + } + + return TRUE; +} + +static boolean +nv30_fragprog_prepare(struct nv30_fpc *fpc) +{ + struct tgsi_parse_context p; + /*int high_temp = -1, i;*/ + + tgsi_parse_init(&p, fpc->fp->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&p)) { + const union tgsi_full_token *tok = &p.FullToken; + + tgsi_parse_token(&p); + switch(tok->Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: + { + const struct tgsi_full_declaration *fdec; + fdec = &p.FullToken.FullDeclaration; + switch (fdec->Declaration.File) { + case TGSI_FILE_INPUT: + if (!nv30_fragprog_parse_decl_attrib(fpc, fdec)) + goto out_err; + break; + case TGSI_FILE_OUTPUT: + if (!nv30_fragprog_parse_decl_output(fpc, fdec)) + goto out_err; + break; + /*case TGSI_FILE_TEMPORARY: + if (fdec->DeclarationRange.Last > high_temp) { + high_temp = + fdec->DeclarationRange.Last; + } + break;*/ + default: + break; + } + } + break; + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + struct tgsi_full_immediate *imm; + float vals[4]; + + imm = &p.FullToken.FullImmediate; + assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); + assert(fpc->nr_imm < MAX_IMM); + + vals[0] = imm->u.ImmediateFloat32[0].Float; + vals[1] = imm->u.ImmediateFloat32[1].Float; + vals[2] = imm->u.ImmediateFloat32[2].Float; + vals[3] = imm->u.ImmediateFloat32[3].Float; + fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals); + } + break; + default: + break; + } + } + tgsi_parse_free(&p); + + /*if (++high_temp) { + fpc->r_temp = CALLOC(high_temp, sizeof(struct nv30_sreg)); + for (i = 0; i < high_temp; i++) + fpc->r_temp[i] = temp(fpc); + fpc->r_temps_discard = 0; + }*/ + + return TRUE; + +out_err: + /*if (fpc->r_temp) + FREE(fpc->r_temp);*/ + tgsi_parse_free(&p); + return FALSE; +} + +static void +nv30_fragprog_translate(struct nv30_context *nv30, + struct nv30_fragment_program *fp) +{ + struct tgsi_parse_context parse; + struct nv30_fpc *fpc = NULL; + + tgsi_dump(fp->pipe.tokens,0); + + fpc = CALLOC(1, sizeof(struct nv30_fpc)); + if (!fpc) + return; + fpc->fp = fp; + fpc->high_temp = -1; + fpc->num_regs = 2; + + if (!nv30_fragprog_prepare(fpc)) { + FREE(fpc); + return; + } + + tgsi_parse_init(&parse, fp->pipe.tokens); + + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + const struct tgsi_full_instruction *finst; + + finst = &parse.FullToken.FullInstruction; + if (!nv30_fragprog_parse_instruction(fpc, finst)) + goto out_err; + } + break; + default: + break; + } + } + + fp->fp_control |= (fpc->num_regs-1)/2; + fp->fp_reg_control = (1<<16)|0x4; + + /* Terminate final instruction */ + fp->insn[fpc->inst_offset] |= 0x00000001; + + /* Append NOP + END instruction, may or may not be necessary. 
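+	 * (The 0x00000001 written below is simply NV30_FP_OP_PROGRAM_END set
+	 *  on an otherwise-NOP opcode dword; it is the same bit used to
+	 *  terminate the final real instruction above.)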
*/ + fpc->inst_offset = fp->insn_len; + grow_insns(fpc, 4); + fp->insn[fpc->inst_offset + 0] = 0x00000001; + fp->insn[fpc->inst_offset + 1] = 0x00000000; + fp->insn[fpc->inst_offset + 2] = 0x00000000; + fp->insn[fpc->inst_offset + 3] = 0x00000000; + + fp->translated = TRUE; + fp->on_hw = FALSE; +out_err: + tgsi_parse_free(&parse); + FREE(fpc); +} + +static void +nv30_fragprog_upload(struct nv30_context *nv30, + struct nv30_fragment_program *fp) +{ + struct pipe_winsys *ws = nv30->pipe.winsys; + const uint32_t le = 1; + uint32_t *map; + int i; + + map = ws->buffer_map(ws, fp->buffer, PIPE_BUFFER_USAGE_CPU_WRITE); + +#if 0 + for (i = 0; i < fp->insn_len; i++) { + fflush(stdout); fflush(stderr); + NOUVEAU_ERR("%d 0x%08x\n", i, fp->insn[i]); + fflush(stdout); fflush(stderr); + } +#endif + + if ((*(const uint8_t *)&le)) { + for (i = 0; i < fp->insn_len; i++) { + map[i] = fp->insn[i]; + } + } else { + /* Weird swapping for big-endian chips */ + for (i = 0; i < fp->insn_len; i++) { + map[i] = ((fp->insn[i] & 0xffff) << 16) | + ((fp->insn[i] >> 16) & 0xffff); + } + } + + ws->buffer_unmap(ws, fp->buffer); +} + +static boolean +nv30_fragprog_validate(struct nv30_context *nv30) +{ + struct nv30_fragment_program *fp = nv30->fragprog; + struct pipe_buffer *constbuf = + nv30->constbuf[PIPE_SHADER_FRAGMENT]; + struct pipe_winsys *ws = nv30->pipe.winsys; + struct nouveau_stateobj *so; + boolean new_consts = FALSE; + int i; + + if (fp->translated) + goto update_constants; + + /*nv30->fallback_swrast &= ~NV30_NEW_FRAGPROG;*/ + nv30_fragprog_translate(nv30, fp); + if (!fp->translated) { + /*nv30->fallback_swrast |= NV30_NEW_FRAGPROG;*/ + return FALSE; + } + + fp->buffer = ws->buffer_create(ws, 0x100, 0, fp->insn_len * 4); + nv30_fragprog_upload(nv30, fp); + + so = so_new(8, 1); + so_method(so, nv30->screen->rankine, NV34TCL_FP_ACTIVE_PROGRAM, 1); + so_reloc (so, fp->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, + NV34TCL_FP_ACTIVE_PROGRAM_DMA0, NV34TCL_FP_ACTIVE_PROGRAM_DMA1); + so_method(so, nv30->screen->rankine, NV34TCL_FP_CONTROL, 1); + so_data (so, fp->fp_control); + so_method(so, nv30->screen->rankine, NV34TCL_FP_REG_CONTROL, 1); + so_data (so, fp->fp_reg_control); + so_method(so, nv30->screen->rankine, NV34TCL_TX_UNITS_ENABLE, 1); + so_data (so, fp->samplers); + so_ref(so, &fp->so); + +update_constants: + if (fp->nr_consts) { + float *map; + + map = ws->buffer_map(ws, constbuf, PIPE_BUFFER_USAGE_CPU_READ); + for (i = 0; i < fp->nr_consts; i++) { + struct nv30_fragment_program_data *fpd = &fp->consts[i]; + uint32_t *p = &fp->insn[fpd->offset]; + uint32_t *cb = (uint32_t *)&map[fpd->index * 4]; + + if (!memcmp(p, cb, 4 * sizeof(float))) + continue; + memcpy(p, cb, 4 * sizeof(float)); + new_consts = TRUE; + } + ws->buffer_unmap(ws, constbuf); + + if (new_consts) + nv30_fragprog_upload(nv30, fp); + } + + if (new_consts || fp->so != nv30->state.hw[NV30_STATE_FRAGPROG]) { + so_ref(fp->so, &nv30->state.hw[NV30_STATE_FRAGPROG]); + return TRUE; + } + + return FALSE; +} + +void +nv30_fragprog_destroy(struct nv30_context *nv30, + struct nv30_fragment_program *fp) +{ + if (fp->insn_len) + FREE(fp->insn); +} + +struct nv30_state_entry nv30_state_fragprog = { + .validate = nv30_fragprog_validate, + .dirty = { + .pipe = NV30_NEW_FRAGPROG, + .hw = NV30_STATE_FRAGPROG + } +}; diff --git a/src/gallium/drivers/nv30/nv30_fragtex.c b/src/gallium/drivers/nv30/nv30_fragtex.c new file mode 100644 index 0000000000..b1d2663af3 --- /dev/null +++ 
b/src/gallium/drivers/nv30/nv30_fragtex.c @@ -0,0 +1,163 @@ +#include "nv30_context.h" +#include "nouveau/nouveau_util.h" + +#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w) \ +{ \ + TRUE, \ + PIPE_FORMAT_##m, \ + NV34TCL_TX_FORMAT_FORMAT_##tf, \ + (NV34TCL_TX_SWIZZLE_S0_X_##ts0x | NV34TCL_TX_SWIZZLE_S0_Y_##ts0y | \ + NV34TCL_TX_SWIZZLE_S0_Z_##ts0z | NV34TCL_TX_SWIZZLE_S0_W_##ts0w | \ + NV34TCL_TX_SWIZZLE_S1_X_##ts1x | NV34TCL_TX_SWIZZLE_S1_Y_##ts1y | \ + NV34TCL_TX_SWIZZLE_S1_Z_##ts1z | NV34TCL_TX_SWIZZLE_S1_W_##ts1w) \ +} + +struct nv30_texture_format { + boolean defined; + uint pipe; + int format; + int swizzle; +}; + +static struct nv30_texture_format +nv30_texture_formats[] = { + _(A8R8G8B8_UNORM, A8R8G8B8, S1, S1, S1, S1, X, Y, Z, W), + _(A1R5G5B5_UNORM, A1R5G5B5, S1, S1, S1, S1, X, Y, Z, W), + _(A4R4G4B4_UNORM, A4R4G4B4, S1, S1, S1, S1, X, Y, Z, W), + _(R5G6B5_UNORM , R5G6B5 , S1, S1, S1, ONE, X, Y, Z, W), + _(L8_UNORM , L8 , S1, S1, S1, ONE, X, X, X, X), + _(A8_UNORM , L8 , ZERO, ZERO, ZERO, S1, X, X, X, X), + _(I8_UNORM , L8 , S1, S1, S1, S1, X, X, X, X), + _(A8L8_UNORM , A8L8 , S1, S1, S1, S1, X, X, X, Y), +// _(Z16_UNORM , Z16 , S1, S1, S1, ONE, X, X, X, X), +// _(Z24S8_UNORM , Z24 , S1, S1, S1, ONE, X, X, X, X), + _(DXT1_RGB , DXT1 , S1, S1, S1, ONE, X, Y, Z, W), + _(DXT1_RGBA , DXT1 , S1, S1, S1, S1, X, Y, Z, W), + _(DXT3_RGBA , DXT3 , S1, S1, S1, S1, X, Y, Z, W), + _(DXT5_RGBA , DXT5 , S1, S1, S1, S1, X, Y, Z, W), + {}, +}; + +static struct nv30_texture_format * +nv30_fragtex_format(uint pipe_format) +{ + struct nv30_texture_format *tf = nv30_texture_formats; + char fs[128]; + + while (tf->defined) { + if (tf->pipe == pipe_format) + return tf; + tf++; + } + + NOUVEAU_ERR("unknown texture format %s\n", pf_name(pipe_format)); + return NULL; +} + + +static struct nouveau_stateobj * +nv30_fragtex_build(struct nv30_context *nv30, int unit) +{ + struct nv30_sampler_state *ps = nv30->tex_sampler[unit]; + struct nv30_miptree *nv30mt = nv30->tex_miptree[unit]; + struct pipe_texture *pt = &nv30mt->base; + struct nv30_texture_format *tf; + struct nouveau_stateobj *so; + uint32_t txf, txs , txp; + unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD; + + tf = nv30_fragtex_format(pt->format); + if (!tf) + assert(0); + + txf = tf->format; + txf |= ((pt->last_level>0) ? NV34TCL_TX_FORMAT_MIPMAP : 0); + txf |= log2i(pt->width[0]) << 20; + txf |= log2i(pt->height[0]) << 24; + txf |= log2i(pt->depth[0]) << 28; + txf |= NV34TCL_TX_FORMAT_NO_BORDER | 0x10000; + + switch (pt->target) { + case PIPE_TEXTURE_CUBE: + txf |= NV34TCL_TX_FORMAT_CUBIC; + /* fall-through */ + case PIPE_TEXTURE_2D: + txf |= NV34TCL_TX_FORMAT_DIMS_2D; + break; + case PIPE_TEXTURE_3D: + txf |= NV34TCL_TX_FORMAT_DIMS_3D; + break; + case PIPE_TEXTURE_1D: + txf |= NV34TCL_TX_FORMAT_DIMS_1D; + break; + default: + NOUVEAU_ERR("Unknown target %d\n", pt->target); + return NULL; + } + + if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + txp = 0; + } else { + txp = nv30mt->level[0].pitch; + txf |= (1<<13) /*FIXME: NV34TCL_TX_FORMAT_LINEAR ? 
*/; + } + + txs = tf->swizzle; + + so = so_new(16, 2); + so_method(so, nv30->screen->rankine, NV34TCL_TX_OFFSET(unit), 8); + so_reloc (so, nv30mt->buffer, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0); + so_reloc (so, nv30mt->buffer, txf, tex_flags | NOUVEAU_BO_OR, + NV34TCL_TX_FORMAT_DMA0, NV34TCL_TX_FORMAT_DMA1); + so_data (so, ps->wrap); + so_data (so, NV34TCL_TX_ENABLE_ENABLE | ps->en); + so_data (so, txs); + so_data (so, ps->filt | 0x2000 /*voodoo*/); + so_data (so, (pt->width[0] << NV34TCL_TX_NPOT_SIZE_W_SHIFT) | + pt->height[0]); + so_data (so, ps->bcol); + + return so; +} + +static boolean +nv30_fragtex_validate(struct nv30_context *nv30) +{ + struct nv30_fragment_program *fp = nv30->fragprog; + struct nv30_state *state = &nv30->state; + struct nouveau_stateobj *so; + unsigned samplers, unit; + + samplers = state->fp_samplers & ~fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + so = so_new(2, 0); + so_method(so, nv30->screen->rankine, NV34TCL_TX_ENABLE(unit), 1); + so_data (so, 0); + so_ref(so, &nv30->state.hw[NV30_STATE_FRAGTEX0 + unit]); + state->dirty |= (1ULL << (NV30_STATE_FRAGTEX0 + unit)); + } + + samplers = nv30->dirty_samplers & fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + so = nv30_fragtex_build(nv30, unit); + so_ref(so, &nv30->state.hw[NV30_STATE_FRAGTEX0 + unit]); + state->dirty |= (1ULL << (NV30_STATE_FRAGTEX0 + unit)); + } + + nv30->state.fp_samplers = fp->samplers; + return FALSE; +} + +struct nv30_state_entry nv30_state_fragtex = { + .validate = nv30_fragtex_validate, + .dirty = { + .pipe = NV30_NEW_SAMPLER | NV30_NEW_FRAGPROG, + .hw = 0 + } +}; diff --git a/src/gallium/drivers/nv30/nv30_miptree.c b/src/gallium/drivers/nv30/nv30_miptree.c new file mode 100644 index 0000000000..aa670b9a45 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_miptree.c @@ -0,0 +1,195 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv30_context.h" + +static void +nv30_miptree_layout(struct nv30_miptree *nv30mt) +{ + struct pipe_texture *pt = &nv30mt->base; + boolean swizzled = FALSE; + uint width = pt->width[0], height = pt->height[0], depth = pt->depth[0]; + uint offset = 0; + int nr_faces, l, f, pitch; + + if (pt->target == PIPE_TEXTURE_CUBE) { + nr_faces = 6; + } else + if (pt->target == PIPE_TEXTURE_3D) { + nr_faces = pt->depth[0]; + } else { + nr_faces = 1; + } + + pitch = pt->width[0]; + for (l = 0; l <= pt->last_level; l++) { + pt->width[l] = width; + pt->height[l] = height; + pt->depth[l] = depth; + pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); + pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + + if (swizzled) + pitch = pt->nblocksx[l]; + pitch = align(pitch, 64); + + nv30mt->level[l].pitch = pitch * pt->block.size; + nv30mt->level[l].image_offset = + CALLOC(nr_faces, sizeof(unsigned)); + + width = MAX2(1, width >> 1); + height = MAX2(1, height >> 1); + depth = MAX2(1, depth >> 1); + } + + for (f = 0; f < nr_faces; f++) { + for (l = 0; l <= pt->last_level; l++) { + nv30mt->level[l].image_offset[f] = offset; + offset += nv30mt->level[l].pitch * pt->height[l]; + } + } + + nv30mt->total_size = offset; +} + +static struct pipe_texture * +nv30_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt) +{ + struct pipe_winsys *ws = pscreen->winsys; + struct nv30_miptree *mt; + + mt = MALLOC(sizeof(struct nv30_miptree)); + if (!mt) + return NULL; + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = pscreen; + 
mt->shadow_tex = NULL; + mt->shadow_surface = NULL; + + /* Swizzled textures must be POT */ + if (pt->width[0] & (pt->width[0] - 1) || + pt->height[0] & (pt->height[0] - 1)) + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + else + if (pt->tex_usage & (PIPE_TEXTURE_USAGE_PRIMARY | + PIPE_TEXTURE_USAGE_DISPLAY_TARGET)) + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + else { + switch (pt->format) { + /* TODO: Figure out which formats can be swizzled */ + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_X8R8G8B8_UNORM: + /* XXX: Re-enable when SIFM size limits are fixed */ + /*case PIPE_FORMAT_R16_SNORM:*/ + break; + default: + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + } + } + + nv30_miptree_layout(mt); + + mt->buffer = ws->buffer_create(ws, 256, + PIPE_BUFFER_USAGE_PIXEL | + NOUVEAU_BUFFER_USAGE_TEXTURE, + mt->total_size); + if (!mt->buffer) { + FREE(mt); + return NULL; + } + + return &mt->base; +} + +static void +nv30_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt) +{ + struct pipe_texture *pt = *ppt; + struct nv30_miptree *mt = (struct nv30_miptree *)pt; + int l; + + *ppt = NULL; + if (--pt->refcount) + return; + + pipe_buffer_reference(pscreen, &mt->buffer, NULL); + for (l = 0; l <= pt->last_level; l++) { + if (mt->level[l].image_offset) + FREE(mt->level[l].image_offset); + } + + if (mt->shadow_tex) { + assert(mt->shadow_surface); + pscreen->tex_surface_release(pscreen, &mt->shadow_surface); + nv30_miptree_release(pscreen, &mt->shadow_tex); + } + + FREE(mt); +} + +static struct pipe_surface * +nv30_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned flags) +{ + struct nv30_miptree *nv30mt = (struct nv30_miptree *)pt; + struct pipe_surface *ps; + + ps = CALLOC_STRUCT(pipe_surface); + if (!ps) + return NULL; + pipe_texture_reference(&ps->texture, pt); + pipe_buffer_reference(pscreen, &ps->buffer, nv30mt->buffer); + ps->format = pt->format; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->block = pt->block; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = nv30mt->level[level].pitch; + ps->usage = flags; + ps->status = PIPE_SURFACE_STATUS_DEFINED; + ps->refcount = 1; + ps->winsys = pscreen->winsys; + ps->face = face; + ps->level = level; + ps->zslice = zslice; + + if (pt->target == PIPE_TEXTURE_CUBE) { + ps->offset = nv30mt->level[level].image_offset[face]; + } else + if (pt->target == PIPE_TEXTURE_3D) { + ps->offset = nv30mt->level[level].image_offset[zslice]; + } else { + ps->offset = nv30mt->level[level].image_offset[0]; + } + + return ps; +} + +static void +nv30_miptree_surface_del(struct pipe_screen *pscreen, + struct pipe_surface **psurface) +{ + struct pipe_surface *ps = *psurface; + + *psurface = NULL; + if (--ps->refcount > 0) + return; + + pipe_texture_reference(&ps->texture, NULL); + pipe_buffer_reference(pscreen->winsys, &ps->buffer, NULL); + FREE(ps); +} + +void +nv30_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ + pscreen->texture_create = nv30_miptree_create; + pscreen->texture_release = nv30_miptree_release; + pscreen->get_tex_surface = nv30_miptree_surface_new; + pscreen->tex_surface_release = nv30_miptree_surface_del; +} + diff --git a/src/gallium/drivers/nv30/nv30_query.c b/src/gallium/drivers/nv30/nv30_query.c new file mode 100644 index 0000000000..d40d75f264 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_query.c @@ -0,0 +1,122 @@ +#include "pipe/p_context.h" 
+ +#include "nv30_context.h" + +struct nv30_query { + struct nouveau_resource *object; + unsigned type; + boolean ready; + uint64_t result; +}; + +static INLINE struct nv30_query * +nv30_query(struct pipe_query *pipe) +{ + return (struct nv30_query *)pipe; +} + +static struct pipe_query * +nv30_query_create(struct pipe_context *pipe, unsigned query_type) +{ + struct nv30_query *q; + + q = CALLOC(1, sizeof(struct nv30_query)); + q->type = query_type; + + return (struct pipe_query *)q; +} + +static void +nv30_query_destroy(struct pipe_context *pipe, struct pipe_query *pq) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_query *q = nv30_query(pq); + + if (q->object) + nv30->nvws->res_free(&q->object); + FREE(q); +} + +static void +nv30_query_begin(struct pipe_context *pipe, struct pipe_query *pq) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_query *q = nv30_query(pq); + + assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER); + + /* Happens when end_query() is called, then another begin_query() + * without querying the result in-between. For now we'll wait for + * the existing query to notify completion, but it could be better. + */ + if (q->object) { + uint64 tmp; + pipe->get_query_result(pipe, pq, 1, &tmp); + } + + if (nv30->nvws->res_alloc(nv30->screen->query_heap, 1, NULL, &q->object)) + assert(0); + nv30->nvws->notifier_reset(nv30->screen->query, q->object->start); + + BEGIN_RING(rankine, NV34TCL_QUERY_RESET, 1); + OUT_RING (1); + BEGIN_RING(rankine, NV34TCL_QUERY_UNK17CC, 1); + OUT_RING (1); + + q->ready = FALSE; +} + +static void +nv30_query_end(struct pipe_context *pipe, struct pipe_query *pq) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_query *q = nv30_query(pq); + + BEGIN_RING(rankine, NV34TCL_QUERY_GET, 1); + OUT_RING ((0x01 << NV34TCL_QUERY_GET_UNK24_SHIFT) | + ((q->object->start * 32) << NV34TCL_QUERY_GET_OFFSET_SHIFT)); + FIRE_RING(NULL); +} + +static boolean +nv30_query_result(struct pipe_context *pipe, struct pipe_query *pq, + boolean wait, uint64 *result) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_query *q = nv30_query(pq); + struct nouveau_winsys *nvws = nv30->nvws; + + assert(q->object && q->type == PIPE_QUERY_OCCLUSION_COUNTER); + + if (!q->ready) { + unsigned status; + + status = nvws->notifier_status(nv30->screen->query, + q->object->start); + if (status != NV_NOTIFY_STATE_STATUS_COMPLETED) { + if (wait == FALSE) + return FALSE; + nvws->notifier_wait(nv30->screen->query, q->object->start, + NV_NOTIFY_STATE_STATUS_COMPLETED, + 0); + } + + q->result = nvws->notifier_retval(nv30->screen->query, + q->object->start); + q->ready = TRUE; + nvws->res_free(&q->object); + } + + *result = q->result; + return TRUE; +} + +void +nv30_init_query_functions(struct nv30_context *nv30) +{ + nv30->pipe.create_query = nv30_query_create; + nv30->pipe.destroy_query = nv30_query_destroy; + nv30->pipe.begin_query = nv30_query_begin; + nv30->pipe.end_query = nv30_query_end; + nv30->pipe.get_query_result = nv30_query_result; +} diff --git a/src/gallium/drivers/nv30/nv30_screen.c b/src/gallium/drivers/nv30/nv30_screen.c new file mode 100644 index 0000000000..910a3c456d --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_screen.c @@ -0,0 +1,383 @@ +#include "pipe/p_screen.h" + +#include "nv30_context.h" +#include "nv30_screen.h" + +#define NV30TCL_CHIPSET_3X_MASK 0x00000003 +#define NV34TCL_CHIPSET_3X_MASK 0x00000010 +#define NV35TCL_CHIPSET_3X_MASK 0x000001e0 + +static const char * +nv30_screen_get_name(struct pipe_screen 
*pscreen) +{ + struct nv30_screen *screen = nv30_screen(pscreen); + struct nouveau_device *dev = screen->nvws->channel->device; + static char buffer[128]; + + snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); + return buffer; +} + +static const char * +nv30_screen_get_vendor(struct pipe_screen *pscreen) +{ + return "nouveau"; +} + +static int +nv30_screen_get_param(struct pipe_screen *pscreen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return 16; + case PIPE_CAP_NPOT_TEXTURES: + return 0; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 1; + case PIPE_CAP_GLSL: + return 0; + case PIPE_CAP_S3TC: + return 0; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 1; + case PIPE_CAP_POINT_SPRITE: + return 1; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 2; + case PIPE_CAP_OCCLUSION_QUERY: + return 1; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 1; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return 13; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 10; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 13; + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + return 0; + case PIPE_CAP_TEXTURE_MIRROR_REPEAT: + return 1; + case NOUVEAU_CAP_HW_VTXBUF: + case NOUVEAU_CAP_HW_IDXBUF: + return 1; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0; + } +} + +static float +nv30_screen_get_paramf(struct pipe_screen *pscreen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 10.0; + case PIPE_CAP_MAX_POINT_WIDTH: + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 64.0; + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 8.0; + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 4.0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0.0; + } +} + +static boolean +nv30_screen_surface_format_supported(struct pipe_screen *pscreen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, unsigned geom_flags) +{ + if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + return TRUE; + default: + break; + } + } else { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_A1R5G5B5_UNORM: + case PIPE_FORMAT_A4R4G4B4_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_A8_UNORM: + case PIPE_FORMAT_I8_UNORM: + case PIPE_FORMAT_A8L8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + return TRUE; + default: + break; + } + } + + return FALSE; +} + +static void * +nv30_surface_map(struct pipe_screen *screen, struct pipe_surface *surface, + unsigned flags ) +{ + struct pipe_winsys *ws = screen->winsys; + struct pipe_surface *surface_to_map; + void *map; + + if (!(surface->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + struct nv30_miptree *mt = (struct nv30_miptree *)surface->texture; + + if (!mt->shadow_tex) { + unsigned old_tex_usage = surface->texture->tex_usage; + surface->texture->tex_usage = NOUVEAU_TEXTURE_USAGE_LINEAR; + mt->shadow_tex = screen->texture_create(screen, surface->texture); + surface->texture->tex_usage = old_tex_usage; + + assert(mt->shadow_tex->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR); + mt->shadow_surface = screen->get_tex_surface + ( + screen, mt->shadow_tex, + surface->face, surface->level, surface->zslice, + surface->usage + ); + } + + surface_to_map = mt->shadow_surface; + } + else + surface_to_map = surface; + + assert(surface_to_map); + + map = 
ws->buffer_map(ws, surface_to_map->buffer, flags); + if (!map) + return NULL; + + return map + surface_to_map->offset; +} + +static void +nv30_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface) +{ + struct pipe_winsys *ws = screen->winsys; + struct pipe_surface *surface_to_unmap; + + /* TODO: Copy from shadow just before push buffer is flushed instead. + There are probably some programs that map/unmap excessively + before rendering. */ + if (!(surface->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + struct nv30_miptree *mt = (struct nv30_miptree *)surface->texture; + + assert(mt->shadow_tex); + + surface_to_unmap = mt->shadow_surface; + } + else + surface_to_unmap = surface; + + assert(surface_to_unmap); + + ws->buffer_unmap(ws, surface_to_unmap->buffer); + + if (surface_to_unmap != surface) { + struct nv30_screen *nvscreen = nv30_screen(screen); + + nvscreen->nvws->surface_copy(nvscreen->nvws, + surface, 0, 0, + surface_to_unmap, 0, 0, + surface->width, surface->height); + } +} + +static void +nv30_screen_destroy(struct pipe_screen *pscreen) +{ + struct nv30_screen *screen = nv30_screen(pscreen); + struct nouveau_winsys *nvws = screen->nvws; + + nvws->res_free(&screen->vp_exec_heap); + nvws->res_free(&screen->vp_data_heap); + nvws->res_free(&screen->query_heap); + nvws->notifier_free(&screen->query); + nvws->notifier_free(&screen->sync); + nvws->grobj_free(&screen->rankine); + + FREE(pscreen); +} + +struct pipe_screen * +nv30_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ + struct nv30_screen *screen = CALLOC_STRUCT(nv30_screen); + struct nouveau_stateobj *so; + unsigned rankine_class = 0; + unsigned chipset = nvws->channel->device->chipset; + int ret, i; + + if (!screen) + return NULL; + screen->nvws = nvws; + + /* 3D object */ + switch (chipset & 0xf0) { + case 0x30: + if (NV30TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f))) + rankine_class = 0x0397; + else + if (NV34TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f))) + rankine_class = 0x0697; + else + if (NV35TCL_CHIPSET_3X_MASK & (1 << (chipset & 0x0f))) + rankine_class = 0x0497; + break; + default: + break; + } + + if (!rankine_class) { + NOUVEAU_ERR("Unknown nv3x chipset: nv%02x\n", chipset); + return NULL; + } + + ret = nvws->grobj_alloc(nvws, rankine_class, &screen->rankine); + if (ret) { + NOUVEAU_ERR("Error creating 3D object: %d\n", ret); + return FALSE; + } + + /* Notifier for sync purposes */ + ret = nvws->notifier_alloc(nvws, 1, &screen->sync); + if (ret) { + NOUVEAU_ERR("Error creating notifier object: %d\n", ret); + nv30_screen_destroy(&screen->pipe); + return NULL; + } + + /* Query objects */ + ret = nvws->notifier_alloc(nvws, 32, &screen->query); + if (ret) { + NOUVEAU_ERR("Error initialising query objects: %d\n", ret); + nv30_screen_destroy(&screen->pipe); + return NULL; + } + + ret = nvws->res_init(&screen->query_heap, 0, 32); + if (ret) { + NOUVEAU_ERR("Error initialising query object heap: %d\n", ret); + nv30_screen_destroy(&screen->pipe); + return NULL; + } + + /* Vtxprog resources */ + if (nvws->res_init(&screen->vp_exec_heap, 0, 256) || + nvws->res_init(&screen->vp_data_heap, 0, 256)) { + nv30_screen_destroy(&screen->pipe); + return NULL; + } + + /* Static rankine initialisation */ + so = so_new(128, 0); + so_method(so, screen->rankine, NV34TCL_DMA_NOTIFY, 1); + so_data (so, screen->sync->handle); + so_method(so, screen->rankine, NV34TCL_DMA_TEXTURE0, 2); + so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->gart->handle); + so_method(so, 
screen->rankine, NV34TCL_DMA_COLOR1, 1); + so_data (so, nvws->channel->vram->handle); + so_method(so, screen->rankine, NV34TCL_DMA_COLOR0, 2); + so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->vram->handle); + so_method(so, screen->rankine, NV34TCL_DMA_VTXBUF0, 2); + so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->gart->handle); +/* so_method(so, screen->rankine, NV34TCL_DMA_FENCE, 2); + so_data (so, 0); + so_data (so, screen->query->handle);*/ + so_method(so, screen->rankine, NV34TCL_DMA_IN_MEMORY7, 1); + so_data (so, nvws->channel->vram->handle); + so_method(so, screen->rankine, NV34TCL_DMA_IN_MEMORY8, 1); + so_data (so, nvws->channel->vram->handle); + + for (i=1; i<8; i++) { + so_method(so, screen->rankine, NV34TCL_VIEWPORT_CLIP_HORIZ(i), 1); + so_data (so, 0); + so_method(so, screen->rankine, NV34TCL_VIEWPORT_CLIP_VERT(i), 1); + so_data (so, 0); + } + + so_method(so, screen->rankine, 0x220, 1); + so_data (so, 1); + + so_method(so, screen->rankine, 0x03b0, 1); + so_data (so, 0x00100000); + so_method(so, screen->rankine, 0x1454, 1); + so_data (so, 0); + so_method(so, screen->rankine, 0x1d80, 1); + so_data (so, 3); + so_method(so, screen->rankine, 0x1450, 1); + so_data (so, 0x00030004); + + /* NEW */ + so_method(so, screen->rankine, 0x1e98, 1); + so_data (so, 0); + so_method(so, screen->rankine, 0x17e0, 3); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(1.0)); + so_method(so, screen->rankine, 0x1f80, 16); + for (i=0; i<16; i++) { + so_data (so, (i==8) ? 0x0000ffff : 0); + } + + so_method(so, screen->rankine, 0x120, 3); + so_data (so, 0); + so_data (so, 1); + so_data (so, 2); + + so_method(so, screen->rankine, 0x1d88, 1); + so_data (so, 0x00001200); + + so_method(so, screen->rankine, NV34TCL_RC_ENABLE, 1); + so_data (so, 0); + + so_method(so, screen->rankine, NV34TCL_DEPTH_RANGE_NEAR, 2); + so_data (so, fui(0.0)); + so_data (so, fui(1.0)); + + so_method(so, screen->rankine, NV34TCL_MULTISAMPLE_CONTROL, 1); + so_data (so, 0xffff0000); + + /* enables use of vp rather than fixed-function somehow */ + so_method(so, screen->rankine, 0x1e94, 1); + so_data (so, 0x13); + + so_emit(nvws, so); + so_ref(NULL, &so); + nvws->push_flush(nvws, 0, NULL); + + screen->pipe.winsys = ws; + screen->pipe.destroy = nv30_screen_destroy; + + screen->pipe.get_name = nv30_screen_get_name; + screen->pipe.get_vendor = nv30_screen_get_vendor; + screen->pipe.get_param = nv30_screen_get_param; + screen->pipe.get_paramf = nv30_screen_get_paramf; + + screen->pipe.is_format_supported = nv30_screen_surface_format_supported; + + screen->pipe.surface_map = nv30_surface_map; + screen->pipe.surface_unmap = nv30_surface_unmap; + + nv30_screen_init_miptree_functions(&screen->pipe); + + return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv30/nv30_screen.h b/src/gallium/drivers/nv30/nv30_screen.h new file mode 100644 index 0000000000..b7ddc2a959 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_screen.h @@ -0,0 +1,35 @@ +#ifndef __NV30_SCREEN_H__ +#define __NV30_SCREEN_H__ + +#include "pipe/p_screen.h" + +struct nv30_screen { + struct pipe_screen pipe; + + struct nouveau_winsys *nvws; + + unsigned cur_pctx; + + /* HW graphics objects */ + struct nouveau_grobj *rankine; + struct nouveau_notifier *sync; + + /* Query object resources */ + struct nouveau_notifier *query; + struct nouveau_resource *query_heap; + + /* Vtxprog resources */ + struct nouveau_resource *vp_exec_heap; + struct nouveau_resource *vp_data_heap; + + /* Current 3D state of channel */ 
+ struct nouveau_stateobj *state[NV30_STATE_MAX]; +}; + +static INLINE struct nv30_screen * +nv30_screen(struct pipe_screen *screen) +{ + return (struct nv30_screen *)screen; +} + +#endif diff --git a/src/gallium/drivers/nv30/nv30_shader.h b/src/gallium/drivers/nv30/nv30_shader.h new file mode 100644 index 0000000000..dd3a36f78f --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_shader.h @@ -0,0 +1,490 @@ +#ifndef __NV30_SHADER_H__ +#define __NV30_SHADER_H__ + +/* Vertex programs instruction set + * + * 128bit opcodes, split into 4 32-bit ones for ease of use. + * + * Non-native instructions + * ABS - MOV + NV40_VP_INST0_DEST_ABS + * POW - EX2 + MUL + LG2 + * SUB - ADD, second source negated + * SWZ - MOV + * XPD - + * + * Register access + * - Only one INPUT can be accessed per-instruction (move extras into TEMPs) + * - Only one CONST can be accessed per-instruction (move extras into TEMPs) + * + * Relative Addressing + * According to the value returned for + * MAX_PROGRAM_NATIVE_ADDRESS_REGISTERS_ARB + * + * there are only two address registers available. The destination in the + * ARL instruction is set to TEMP <n> (The temp isn't actually written). + * + * When using vanilla ARB_v_p, the proprietary driver will squish both the + * available ADDRESS regs into the first hardware reg in the X and Y + * components. + * + * To use an address reg as an index into consts, the CONST_SRC is set to + * (const_base + offset) and INDEX_CONST is set. + * + * To access the second address reg use ADDR_REG_SELECT_1. A particular + * component of the address regs is selected with ADDR_SWZ. + * + * Only one address register can be accessed per instruction. + * + * Conditional execution (see NV_vertex_program{2,3} for details) Conditional + * execution of an instruction is enabled by setting COND_TEST_ENABLE, and + * selecting the condition which will allow the test to pass with + * COND_{FL,LT,...}. It is possible to swizzle the values in the condition + * register, which allows for testing against an individual component. + * + * Branching: + * + * The BRA/CAL instructions seem to follow a slightly different opcode + * layout. The destination instruction ID (IADDR) overlaps a source field. + * Instruction ID's seem to be numbered based on the UPLOAD_FROM_ID FIFO + * command, and is incremented automatically on each UPLOAD_INST FIFO + * command. + * + * Conditional branching is achieved by using the condition tests described + * above. There doesn't appear to be dedicated looping instructions, but + * this can be done using a temp reg + conditional branching. + * + * Subroutines may be uploaded before the main program itself, but the first + * executed instruction is determined by the PROGRAM_START_ID FIFO command. 
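+ *
+ * As a rough sketch of how the bitfields below combine: if inst[0..3] are
+ * the four dwords of one instruction, a vector MOV into TEMP[2] that only
+ * executes when the condition register tests greater-than would set,
+ * among other fields,
+ *
+ *   inst[0] |= NV30_VP_INST_COND_TEST_ENABLE
+ *           |  (NV30_VP_INST_COND_GT << NV30_VP_INST_COND_SHIFT)
+ *           |  (2 << NV30_VP_INST_DEST_TEMP_ID_SHIFT);
+ *   inst[1] |= (NV30_VP_INST_OP_MOV << NV30_VP_INST_VEC_OPCODE_SHIFT);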
+ * + */ + +/* DWORD 0 */ + +#define NV30_VP_INST_ADDR_REG_SELECT_1 (1 << 24) +#define NV30_VP_INST_SRC2_ABS (1 << 23) /* guess */ +#define NV30_VP_INST_SRC1_ABS (1 << 22) /* guess */ +#define NV30_VP_INST_SRC0_ABS (1 << 21) /* guess */ +#define NV30_VP_INST_VEC_RESULT (1 << 20) +#define NV30_VP_INST_DEST_TEMP_ID_SHIFT 16 +#define NV30_VP_INST_DEST_TEMP_ID_MASK (0x0F << 16) +#define NV30_VP_INST_COND_UPDATE_ENABLE (1<<15) +#define NV30_VP_INST_VEC_DEST_TEMP_MASK (0xF << 16) +#define NV30_VP_INST_COND_TEST_ENABLE (1<<14) +#define NV30_VP_INST_COND_SHIFT 11 +#define NV30_VP_INST_COND_MASK (0x07 << 11) +# define NV30_VP_INST_COND_FL 0 /* guess */ +# define NV30_VP_INST_COND_LT 1 +# define NV30_VP_INST_COND_EQ 2 +# define NV30_VP_INST_COND_LE 3 +# define NV30_VP_INST_COND_GT 4 +# define NV30_VP_INST_COND_NE 5 +# define NV30_VP_INST_COND_GE 6 +# define NV30_VP_INST_COND_TR 7 /* guess */ +#define NV30_VP_INST_COND_SWZ_X_SHIFT 9 +#define NV30_VP_INST_COND_SWZ_X_MASK (0x03 << 9) +#define NV30_VP_INST_COND_SWZ_Y_SHIFT 7 +#define NV30_VP_INST_COND_SWZ_Y_MASK (0x03 << 7) +#define NV30_VP_INST_COND_SWZ_Z_SHIFT 5 +#define NV30_VP_INST_COND_SWZ_Z_MASK (0x03 << 5) +#define NV30_VP_INST_COND_SWZ_W_SHIFT 3 +#define NV30_VP_INST_COND_SWZ_W_MASK (0x03 << 3) +#define NV30_VP_INST_COND_SWZ_ALL_SHIFT 3 +#define NV30_VP_INST_COND_SWZ_ALL_MASK (0xFF << 3) +#define NV30_VP_INST_ADDR_SWZ_SHIFT 1 +#define NV30_VP_INST_ADDR_SWZ_MASK (0x03 << 1) +#define NV30_VP_INST_SCA_OPCODEH_SHIFT 0 +#define NV30_VP_INST_SCA_OPCODEH_MASK (0x01 << 0) + +/* DWORD 1 */ +#define NV30_VP_INST_SCA_OPCODEL_SHIFT 28 +#define NV30_VP_INST_SCA_OPCODEL_MASK (0x0F << 28) +# define NV30_VP_INST_OP_NOP 0x00 +# define NV30_VP_INST_OP_RCP 0x02 +# define NV30_VP_INST_OP_RCC 0x03 +# define NV30_VP_INST_OP_RSQ 0x04 +# define NV30_VP_INST_OP_EXP 0x05 +# define NV30_VP_INST_OP_LOG 0x06 +# define NV30_VP_INST_OP_LIT 0x07 +# define NV30_VP_INST_OP_BRA 0x09 +# define NV30_VP_INST_OP_CAL 0x0B +# define NV30_VP_INST_OP_RET 0x0C +# define NV30_VP_INST_OP_LG2 0x0D +# define NV30_VP_INST_OP_EX2 0x0E +# define NV30_VP_INST_OP_SIN 0x0F +# define NV30_VP_INST_OP_COS 0x10 +#define NV30_VP_INST_VEC_OPCODE_SHIFT 23 +#define NV30_VP_INST_VEC_OPCODE_MASK (0x1F << 23) +# define NV30_VP_INST_OP_NOPV 0x00 +# define NV30_VP_INST_OP_MOV 0x01 +# define NV30_VP_INST_OP_MUL 0x02 +# define NV30_VP_INST_OP_ADD 0x03 +# define NV30_VP_INST_OP_MAD 0x04 +# define NV30_VP_INST_OP_DP3 0x05 +# define NV30_VP_INST_OP_DP4 0x07 +# define NV30_VP_INST_OP_DPH 0x06 +# define NV30_VP_INST_OP_DST 0x08 +# define NV30_VP_INST_OP_MIN 0x09 +# define NV30_VP_INST_OP_MAX 0x0A +# define NV30_VP_INST_OP_SLT 0x0B +# define NV30_VP_INST_OP_SGE 0x0C +# define NV30_VP_INST_OP_ARL 0x0D +# define NV30_VP_INST_OP_FRC 0x0E +# define NV30_VP_INST_OP_FLR 0x0F +# define NV30_VP_INST_OP_SEQ 0x10 +# define NV30_VP_INST_OP_SFL 0x11 +# define NV30_VP_INST_OP_SGT 0x12 +# define NV30_VP_INST_OP_SLE 0x13 +# define NV30_VP_INST_OP_SNE 0x14 +# define NV30_VP_INST_OP_STR 0x15 +# define NV30_VP_INST_OP_SSG 0x16 +# define NV30_VP_INST_OP_ARR 0x17 +# define NV30_VP_INST_OP_ARA 0x18 +#define NV30_VP_INST_CONST_SRC_SHIFT 14 +#define NV30_VP_INST_CONST_SRC_MASK (0xFF << 14) +#define NV30_VP_INST_INPUT_SRC_SHIFT 9 /*NV20*/ +#define NV30_VP_INST_INPUT_SRC_MASK (0x0F << 9) /*NV20*/ +# define NV30_VP_INST_IN_POS 0 /* These seem to match the bindings specified in */ +# define NV30_VP_INST_IN_WEIGHT 1 /* the ARB_v_p spec (2.14.3.1) */ +# define NV30_VP_INST_IN_NORMAL 2 +# define NV30_VP_INST_IN_COL0 3 /* Should probably confirm 
them all though */ +# define NV30_VP_INST_IN_COL1 4 +# define NV30_VP_INST_IN_FOGC 5 +# define NV30_VP_INST_IN_TC0 8 +# define NV30_VP_INST_IN_TC(n) (8+n) +#define NV30_VP_INST_SRC0H_SHIFT 0 /*NV20*/ +#define NV30_VP_INST_SRC0H_MASK (0x1FF << 0) /*NV20*/ + +/* Please note: the IADDR fields overlap other fields because they are used + * only for branch instructions. See Branching: label above + * + * DWORD 2 + */ +#define NV30_VP_INST_SRC0L_SHIFT 26 /*NV20*/ +#define NV30_VP_INST_SRC0L_MASK (0x3F <<26) /* NV30_VP_SRC0_LOW_MASK << 26 */ +#define NV30_VP_INST_SRC1_SHIFT 11 /*NV20*/ +#define NV30_VP_INST_SRC1_MASK (0x7FFF<<11) /*NV20*/ +#define NV30_VP_INST_SRC2H_SHIFT 0 /*NV20*/ +#define NV30_VP_INST_SRC2H_MASK (0x7FF << 0) /* NV30_VP_SRC2_HIGH_MASK >> 4*/ +#define NV30_VP_INST_IADDR_SHIFT 2 +#define NV30_VP_INST_IADDR_MASK (0xF << 28) /* NV30_VP_SRC2_LOW_MASK << 28 */ + +/* DWORD 3 */ +#define NV30_VP_INST_SRC2L_SHIFT 28 /*NV20*/ +#define NV30_VP_INST_SRC2L_MASK (0x0F <<28) /*NV20*/ +#define NV30_VP_INST_STEMP_WRITEMASK_SHIFT 24 +#define NV30_VP_INST_STEMP_WRITEMASK_MASK (0x0F << 24) +#define NV30_VP_INST_VTEMP_WRITEMASK_SHIFT 20 +#define NV30_VP_INST_VTEMP_WRITEMASK_MASK (0x0F << 20) +#define NV30_VP_INST_SDEST_WRITEMASK_SHIFT 16 +#define NV30_VP_INST_SDEST_WRITEMASK_MASK (0x0F << 16) +#define NV30_VP_INST_VDEST_WRITEMASK_SHIFT 12 /*NV20*/ +#define NV30_VP_INST_VDEST_WRITEMASK_MASK (0x0F << 12) /*NV20*/ +#define NV30_VP_INST_DEST_SHIFT 2 +#define NV30_VP_INST_DEST_MASK (0x0F << 2) +# define NV30_VP_INST_DEST_POS 0 +# define NV30_VP_INST_DEST_BFC0 1 +# define NV30_VP_INST_DEST_BFC1 2 +# define NV30_VP_INST_DEST_COL0 3 +# define NV30_VP_INST_DEST_COL1 4 +# define NV30_VP_INST_DEST_FOGC 5 +# define NV30_VP_INST_DEST_PSZ 6 +# define NV30_VP_INST_DEST_TC(n) (8+n) + +#define NV30_VP_INST_LAST (1 << 0) + +/* Useful to split the source selection regs into their pieces */ +#define NV30_VP_SRC0_HIGH_SHIFT 6 +#define NV30_VP_SRC0_HIGH_MASK 0x00007FC0 +#define NV30_VP_SRC0_LOW_MASK 0x0000003F +#define NV30_VP_SRC2_HIGH_SHIFT 4 +#define NV30_VP_SRC2_HIGH_MASK 0x00007FF0 +#define NV30_VP_SRC2_LOW_MASK 0x0000000F + + +/* Source-register definition - matches NV20 exactly */ +#define NV30_VP_SRC_NEGATE (1<<14) +#define NV30_VP_SRC_SWZ_X_SHIFT 12 +#define NV30_VP_SRC_REG_SWZ_X_MASK (0x03 <<12) +#define NV30_VP_SRC_SWZ_Y_SHIFT 10 +#define NV30_VP_SRC_REG_SWZ_Y_MASK (0x03 <<10) +#define NV30_VP_SRC_SWZ_Z_SHIFT 8 +#define NV30_VP_SRC_REG_SWZ_Z_MASK (0x03 << 8) +#define NV30_VP_SRC_SWZ_W_SHIFT 6 +#define NV30_VP_SRC_REG_SWZ_W_MASK (0x03 << 6) +#define NV30_VP_SRC_REG_SWZ_ALL_SHIFT 6 +#define NV30_VP_SRC_REG_SWZ_ALL_MASK (0xFF << 6) +#define NV30_VP_SRC_TEMP_SRC_SHIFT 2 +#define NV30_VP_SRC_REG_TEMP_ID_MASK (0x0F << 0) +#define NV30_VP_SRC_REG_TYPE_SHIFT 0 +#define NV30_VP_SRC_REG_TYPE_MASK (0x03 << 0) +#define NV30_VP_SRC_REG_TYPE_TEMP 1 +#define NV30_VP_SRC_REG_TYPE_INPUT 2 +#define NV30_VP_SRC_REG_TYPE_CONST 3 /* guess */ + +/* + * Each fragment program opcode appears to be comprised of 4 32-bit values. + * + * 0 - Opcode, output reg/mask, ATTRIB source + * 1 - Source 0 + * 2 - Source 1 + * 3 - Source 2 + * + * There appears to be no special difference between result regs and temp regs. + * result.color == R0.xyzw + * result.depth == R1.z + * When the fragprog contains instructions to write depth, NV30_TCL_PRIMITIVE_3D_UNK1D78=0 + * otherwise it is set to 1. + * + * Constants are inserted directly after the instruction that uses them. 
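+ * (emit_src() in nv30_fragprog.c implements this: for an NV30SR_CONST
+ *  source it grows the program by four extra dwords, so the stream is
+ *
+ *    insn[i + 0..3]   instruction dwords
+ *    insn[i + 4..7]   the referenced vec4 constant
+ *
+ *  and nv30_fragprog_validate() later patches insn[i + 4..7] in place
+ *  when the constant buffer contents change.)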
+ * + * It appears that it's not possible to use two input registers in one + * instruction as the input sourcing is done in the instruction dword + * and not the source selection dwords. As such instructions such as: + * + * ADD result.color, fragment.color, fragment.texcoord[0]; + * + * must be split into two MOV's and then an ADD (nvidia does this) but + * I'm not sure why it's not just one MOV and then source the second input + * in the ADD instruction.. + * + * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary + * negation requires multiplication with a const. + * + * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO/SWIZZLE_ONE + * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as SWIZZLE_ZERO + * is implemented simply by not writing to the relevant components of the destination. + * + * Conditional execution + * TODO + * + * Non-native instructions: + * LIT + * LRP - MAD+MAD + * SUB - ADD, negate second source + * RSQ - LG2 + EX2 + * POW - LG2 + MUL + EX2 + * SCS - COS + SIN + * XPD + */ + +//== Opcode / Destination selection == +#define NV30_FP_OP_PROGRAM_END (1 << 0) +#define NV30_FP_OP_OUT_REG_SHIFT 1 +#define NV30_FP_OP_OUT_REG_MASK (31 << 1) /* uncertain */ +/* Needs to be set when writing outputs to get expected result.. */ +#define NV30_FP_OP_OUT_REG_HALF (1 << 7) +#define NV30_FP_OP_COND_WRITE_ENABLE (1 << 8) +#define NV30_FP_OP_OUTMASK_SHIFT 9 +#define NV30_FP_OP_OUTMASK_MASK (0xF << 9) +# define NV30_FP_OP_OUT_X (1<<9) +# define NV30_FP_OP_OUT_Y (1<<10) +# define NV30_FP_OP_OUT_Z (1<<11) +# define NV30_FP_OP_OUT_W (1<<12) +/* Uncertain about these, especially the input_src values.. it's possible that + * they can be dynamically changed. + */ +#define NV30_FP_OP_INPUT_SRC_SHIFT 13 +#define NV30_FP_OP_INPUT_SRC_MASK (15 << 13) +# define NV30_FP_OP_INPUT_SRC_POSITION 0x0 +# define NV30_FP_OP_INPUT_SRC_COL0 0x1 +# define NV30_FP_OP_INPUT_SRC_COL1 0x2 +# define NV30_FP_OP_INPUT_SRC_FOGC 0x3 +# define NV30_FP_OP_INPUT_SRC_TC0 0x4 +# define NV30_FP_OP_INPUT_SRC_TC(n) (0x4 + n) +#define NV30_FP_OP_TEX_UNIT_SHIFT 17 +#define NV30_FP_OP_TEX_UNIT_MASK (0xF << 17) /* guess */ +#define NV30_FP_OP_PRECISION_SHIFT 22 +#define NV30_FP_OP_PRECISION_MASK (3 << 22) +# define NV30_FP_PRECISION_FP32 0 +# define NV30_FP_PRECISION_FP16 1 +# define NV30_FP_PRECISION_FX12 2 +#define NV30_FP_OP_OPCODE_SHIFT 24 +#define NV30_FP_OP_OPCODE_MASK (0x3F << 24) +# define NV30_FP_OP_OPCODE_NOP 0x00 +# define NV30_FP_OP_OPCODE_MOV 0x01 +# define NV30_FP_OP_OPCODE_MUL 0x02 +# define NV30_FP_OP_OPCODE_ADD 0x03 +# define NV30_FP_OP_OPCODE_MAD 0x04 +# define NV30_FP_OP_OPCODE_DP3 0x05 +# define NV30_FP_OP_OPCODE_DP4 0x06 +# define NV30_FP_OP_OPCODE_DST 0x07 +# define NV30_FP_OP_OPCODE_MIN 0x08 +# define NV30_FP_OP_OPCODE_MAX 0x09 +# define NV30_FP_OP_OPCODE_SLT 0x0A +# define NV30_FP_OP_OPCODE_SGE 0x0B +# define NV30_FP_OP_OPCODE_SLE 0x0C +# define NV30_FP_OP_OPCODE_SGT 0x0D +# define NV30_FP_OP_OPCODE_SNE 0x0E +# define NV30_FP_OP_OPCODE_SEQ 0x0F +# define NV30_FP_OP_OPCODE_FRC 0x10 +# define NV30_FP_OP_OPCODE_FLR 0x11 +# define NV30_FP_OP_OPCODE_KIL 0x12 +# define NV30_FP_OP_OPCODE_PK4B 0x13 +# define NV30_FP_OP_OPCODE_UP4B 0x14 +# define NV30_FP_OP_OPCODE_DDX 0x15 /* can only write XY */ +# define NV30_FP_OP_OPCODE_DDY 0x16 /* can only write XY */ +# define NV30_FP_OP_OPCODE_TEX 0x17 +# define NV30_FP_OP_OPCODE_TXP 0x18 +# define NV30_FP_OP_OPCODE_TXD 0x19 +# define NV30_FP_OP_OPCODE_RCP 0x1A +# define NV30_FP_OP_OPCODE_RSQ 0x1B +# define 
NV30_FP_OP_OPCODE_EX2 0x1C +# define NV30_FP_OP_OPCODE_LG2 0x1D +# define NV30_FP_OP_OPCODE_LIT 0x1E +# define NV30_FP_OP_OPCODE_LRP 0x1F +# define NV30_FP_OP_OPCODE_STR 0x20 +# define NV30_FP_OP_OPCODE_SFL 0x21 +# define NV30_FP_OP_OPCODE_COS 0x22 +# define NV30_FP_OP_OPCODE_SIN 0x23 +# define NV30_FP_OP_OPCODE_PK2H 0x24 +# define NV30_FP_OP_OPCODE_UP2H 0x25 +# define NV30_FP_OP_OPCODE_POW 0x26 +# define NV30_FP_OP_OPCODE_PK4UB 0x27 +# define NV30_FP_OP_OPCODE_UP4UB 0x28 +# define NV30_FP_OP_OPCODE_PK2US 0x29 +# define NV30_FP_OP_OPCODE_UP2US 0x2A +# define NV30_FP_OP_OPCODE_DP2A 0x2E +# define NV30_FP_OP_OPCODE_TXB 0x31 +# define NV30_FP_OP_OPCODE_RFL 0x36 +# define NV30_FP_OP_OPCODE_DIV 0x3A +#define NV30_FP_OP_OUT_SAT (1 << 31) + +/* high order bits of SRC0 */ +#define NV30_FP_OP_OUT_ABS (1 << 29) +#define NV30_FP_OP_COND_SWZ_W_SHIFT 27 +#define NV30_FP_OP_COND_SWZ_W_MASK (3 << 27) +#define NV30_FP_OP_COND_SWZ_Z_SHIFT 25 +#define NV30_FP_OP_COND_SWZ_Z_MASK (3 << 25) +#define NV30_FP_OP_COND_SWZ_Y_SHIFT 23 +#define NV30_FP_OP_COND_SWZ_Y_MASK (3 << 23) +#define NV30_FP_OP_COND_SWZ_X_SHIFT 21 +#define NV30_FP_OP_COND_SWZ_X_MASK (3 << 21) +#define NV30_FP_OP_COND_SWZ_ALL_SHIFT 21 +#define NV30_FP_OP_COND_SWZ_ALL_MASK (0xFF << 21) +#define NV30_FP_OP_COND_SHIFT 18 +#define NV30_FP_OP_COND_MASK (0x07 << 18) +# define NV30_FP_OP_COND_FL 0 +# define NV30_FP_OP_COND_LT 1 +# define NV30_FP_OP_COND_EQ 2 +# define NV30_FP_OP_COND_LE 3 +# define NV30_FP_OP_COND_GT 4 +# define NV30_FP_OP_COND_NE 5 +# define NV30_FP_OP_COND_GE 6 +# define NV30_FP_OP_COND_TR 7 + +/* high order bits of SRC1 */ +#define NV30_FP_OP_DST_SCALE_SHIFT 28 +#define NV30_FP_OP_DST_SCALE_MASK (3 << 28) +#define NV30_FP_OP_DST_SCALE_1X 0 +#define NV30_FP_OP_DST_SCALE_2X 1 +#define NV30_FP_OP_DST_SCALE_4X 2 +#define NV30_FP_OP_DST_SCALE_8X 3 +#define NV30_FP_OP_DST_SCALE_INV_2X 5 +#define NV30_FP_OP_DST_SCALE_INV_4X 6 +#define NV30_FP_OP_DST_SCALE_INV_8X 7 + + +/* high order bits of SRC2 */ +#define NV30_FP_OP_INDEX_INPUT (1 << 30) + +//== Register selection == +#define NV30_FP_REG_TYPE_SHIFT 0 +#define NV30_FP_REG_TYPE_MASK (3 << 0) +# define NV30_FP_REG_TYPE_TEMP 0 +# define NV30_FP_REG_TYPE_INPUT 1 +# define NV30_FP_REG_TYPE_CONST 2 +#define NV30_FP_REG_SRC_SHIFT 2 /* uncertain */ +#define NV30_FP_REG_SRC_MASK (31 << 2) +#define NV30_FP_REG_SRC_HALF (1 << 8) +#define NV30_FP_REG_SWZ_ALL_SHIFT 9 +#define NV30_FP_REG_SWZ_ALL_MASK (255 << 9) +#define NV30_FP_REG_SWZ_X_SHIFT 9 +#define NV30_FP_REG_SWZ_X_MASK (3 << 9) +#define NV30_FP_REG_SWZ_Y_SHIFT 11 +#define NV30_FP_REG_SWZ_Y_MASK (3 << 11) +#define NV30_FP_REG_SWZ_Z_SHIFT 13 +#define NV30_FP_REG_SWZ_Z_MASK (3 << 13) +#define NV30_FP_REG_SWZ_W_SHIFT 15 +#define NV30_FP_REG_SWZ_W_MASK (3 << 15) +# define NV30_FP_SWIZZLE_X 0 +# define NV30_FP_SWIZZLE_Y 1 +# define NV30_FP_SWIZZLE_Z 2 +# define NV30_FP_SWIZZLE_W 3 +#define NV30_FP_REG_NEGATE (1 << 17) + +#define NV30SR_NONE 0 +#define NV30SR_OUTPUT 1 +#define NV30SR_INPUT 2 +#define NV30SR_TEMP 3 +#define NV30SR_CONST 4 + +struct nv30_sreg { + int type; + int index; + + int dst_scale; + + int negate; + int abs; + int swz[4]; + + int cc_update; + int cc_update_reg; + int cc_test; + int cc_test_reg; + int cc_swz[4]; +}; + +static INLINE struct nv30_sreg +nv30_sr(int type, int index) +{ + struct nv30_sreg temp = { + .type = type, + .index = index, + .dst_scale = DEF_SCALE, + .abs = 0, + .negate = 0, + .swz = { 0, 1, 2, 3 }, + .cc_update = 0, + .cc_update_reg = 0, + .cc_test = DEF_CTEST, + .cc_test_reg = 0, + .cc_swz = { 0, 1, 2, 3 }, 
+ }; + return temp; +} + +static INLINE struct nv30_sreg +nv30_sr_swz(struct nv30_sreg src, int x, int y, int z, int w) +{ + struct nv30_sreg dst = src; + + dst.swz[SWZ_X] = src.swz[x]; + dst.swz[SWZ_Y] = src.swz[y]; + dst.swz[SWZ_Z] = src.swz[z]; + dst.swz[SWZ_W] = src.swz[w]; + return dst; +} + +static INLINE struct nv30_sreg +nv30_sr_neg(struct nv30_sreg src) +{ + src.negate = !src.negate; + return src; +} + +static INLINE struct nv30_sreg +nv30_sr_abs(struct nv30_sreg src) +{ + src.abs = 1; + return src; +} + +static INLINE struct nv30_sreg +nv30_sr_scale(struct nv30_sreg src, int scale) +{ + src.dst_scale = scale; + return src; +} + +#endif diff --git a/src/gallium/drivers/nv30/nv30_state.c b/src/gallium/drivers/nv30/nv30_state.c new file mode 100644 index 0000000000..fc66075c83 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state.c @@ -0,0 +1,724 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv30_context.h" +#include "nv30_state.h" + +static void * +nv30_blend_state_create(struct pipe_context *pipe, + const struct pipe_blend_state *cso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nouveau_grobj *rankine = nv30->screen->rankine; + struct nv30_blend_state *bso = CALLOC(1, sizeof(*bso)); + struct nouveau_stateobj *so = so_new(16, 0); + + if (cso->blend_enable) { + so_method(so, rankine, NV34TCL_BLEND_FUNC_ENABLE, 3); + so_data (so, 1); + so_data (so, (nvgl_blend_func(cso->alpha_src_factor) << 16) | + nvgl_blend_func(cso->rgb_src_factor)); + so_data (so, nvgl_blend_func(cso->alpha_dst_factor) << 16 | + nvgl_blend_func(cso->rgb_dst_factor)); + so_method(so, rankine, NV34TCL_BLEND_EQUATION, 1); + so_data (so, nvgl_blend_eqn(cso->alpha_func) << 16 | + nvgl_blend_eqn(cso->rgb_func)); + } else { + so_method(so, rankine, NV34TCL_BLEND_FUNC_ENABLE, 1); + so_data (so, 0); + } + + so_method(so, rankine, NV34TCL_COLOR_MASK, 1); + so_data (so, (((cso->colormask & PIPE_MASK_A) ? (0x01 << 24) : 0) | + ((cso->colormask & PIPE_MASK_R) ? (0x01 << 16) : 0) | + ((cso->colormask & PIPE_MASK_G) ? (0x01 << 8) : 0) | + ((cso->colormask & PIPE_MASK_B) ? (0x01 << 0) : 0))); + + if (cso->logicop_enable) { + so_method(so, rankine, NV34TCL_COLOR_LOGIC_OP_ENABLE, 2); + so_data (so, 1); + so_data (so, nvgl_logicop_func(cso->logicop_func)); + } else { + so_method(so, rankine, NV34TCL_COLOR_LOGIC_OP_ENABLE, 1); + so_data (so, 0); + } + + so_method(so, rankine, NV34TCL_DITHER_ENABLE, 1); + so_data (so, cso->dither ? 
1 : 0); + + so_ref(so, &bso->so); + bso->pipe = *cso; + return (void *)bso; +} + +static void +nv30_blend_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->blend = hwcso; + nv30->dirty |= NV30_NEW_BLEND; +} + +static void +nv30_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_blend_state *bso = hwcso; + + so_ref(NULL, &bso->so); + FREE(bso); +} + + +static INLINE unsigned +wrap_mode(unsigned wrap) { + unsigned ret; + + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + ret = NV34TCL_TX_WRAP_S_REPEAT; + break; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + ret = NV34TCL_TX_WRAP_S_MIRRORED_REPEAT; + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + ret = NV34TCL_TX_WRAP_S_CLAMP_TO_EDGE; + break; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + ret = NV34TCL_TX_WRAP_S_CLAMP_TO_BORDER; + break; + case PIPE_TEX_WRAP_CLAMP: + ret = NV34TCL_TX_WRAP_S_CLAMP; + break; +/* case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + ret = NV34TCL_TX_WRAP_S_MIRROR_CLAMP_TO_EDGE; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + ret = NV34TCL_TX_WRAP_S_MIRROR_CLAMP_TO_BORDER; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + ret = NV34TCL_TX_WRAP_S_MIRROR_CLAMP; + break;*/ + default: + NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); + ret = NV34TCL_TX_WRAP_S_REPEAT; + break; + } + + return ret >> NV34TCL_TX_WRAP_S_SHIFT; +} + +static void * +nv30_sampler_state_create(struct pipe_context *pipe, + const struct pipe_sampler_state *cso) +{ + struct nv30_sampler_state *ps; + uint32_t filter = 0; + + ps = MALLOC(sizeof(struct nv30_sampler_state)); + + ps->fmt = 0; + /* TODO: Not all RECTs formats have this bit set, bits 15-8 of format + are the tx format to use. We should store normalized coord flag + in sampler state structure, and set appropriate format in + nvxx_fragtex_build() + */ + /*NV34TCL_TX_FORMAT_RECT*/ + /*if (!cso->normalized_coords) { + ps->fmt |= (1<<14) ; + }*/ + + ps->wrap = ((wrap_mode(cso->wrap_s) << NV34TCL_TX_WRAP_S_SHIFT) | + (wrap_mode(cso->wrap_t) << NV34TCL_TX_WRAP_T_SHIFT) | + (wrap_mode(cso->wrap_r) << NV34TCL_TX_WRAP_R_SHIFT)); + + ps->en = 0; + + if (cso->max_anisotropy >= 8.0) { + ps->en |= NV34TCL_TX_ENABLE_ANISO_8X; + } else + if (cso->max_anisotropy >= 4.0) { + ps->en |= NV34TCL_TX_ENABLE_ANISO_4X; + } else + if (cso->max_anisotropy >= 2.0) { + ps->en |= NV34TCL_TX_ENABLE_ANISO_2X; + } + + switch (cso->mag_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + filter |= NV34TCL_TX_FILTER_MAGNIFY_LINEAR; + break; + case PIPE_TEX_FILTER_NEAREST: + default: + filter |= NV34TCL_TX_FILTER_MAGNIFY_NEAREST; + break; + } + + switch (cso->min_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV34TCL_TX_FILTER_MINIFY_LINEAR; + break; + } + break; + case PIPE_TEX_FILTER_NEAREST: + default: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV34TCL_TX_FILTER_MINIFY_NEAREST; + break; + } + break; + } + + ps->filt = filter; + + { + float limit; + + limit = CLAMP(cso->lod_bias, -16.0, 15.0); + ps->filt |= (int)(cso->lod_bias * 
256.0) & 0x1fff; + + limit = CLAMP(cso->max_lod, 0.0, 15.0); + ps->en |= (int)(limit) << 14 /*NV34TCL_TX_ENABLE_MIPMAP_MAX_LOD_SHIFT*/; + + limit = CLAMP(cso->min_lod, 0.0, 15.0); + ps->en |= (int)(limit) << 26 /*NV34TCL_TX_ENABLE_MIPMAP_MIN_LOD_SHIFT*/; + } + + if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + switch (cso->compare_func) { + case PIPE_FUNC_NEVER: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_NEVER; + break; + case PIPE_FUNC_GREATER: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_GREATER; + break; + case PIPE_FUNC_EQUAL: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_EQUAL; + break; + case PIPE_FUNC_GEQUAL: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_GEQUAL; + break; + case PIPE_FUNC_LESS: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_LESS; + break; + case PIPE_FUNC_NOTEQUAL: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_NOTEQUAL; + break; + case PIPE_FUNC_LEQUAL: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_LEQUAL; + break; + case PIPE_FUNC_ALWAYS: + ps->wrap |= NV34TCL_TX_WRAP_RCOMP_ALWAYS; + break; + default: + break; + } + } + + ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) | + (float_to_ubyte(cso->border_color[0]) << 16) | + (float_to_ubyte(cso->border_color[1]) << 8) | + (float_to_ubyte(cso->border_color[2]) << 0)); + + return (void *)ps; +} + +static void +nv30_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ + struct nv30_context *nv30 = nv30_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv30->tex_sampler[unit] = sampler[unit]; + nv30->dirty_samplers |= (1 << unit); + } + + for (unit = nr; unit < nv30->nr_samplers; unit++) { + nv30->tex_sampler[unit] = NULL; + nv30->dirty_samplers |= (1 << unit); + } + + nv30->nr_samplers = nr; + nv30->dirty |= NV30_NEW_SAMPLER; +} + +static void +nv30_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void +nv30_set_sampler_texture(struct pipe_context *pipe, unsigned nr, + struct pipe_texture **miptree) +{ + struct nv30_context *nv30 = nv30_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + pipe_texture_reference((struct pipe_texture **) + &nv30->tex_miptree[unit], miptree[unit]); + nv30->dirty_samplers |= (1 << unit); + } + + for (unit = nr; unit < nv30->nr_textures; unit++) { + pipe_texture_reference((struct pipe_texture **) + &nv30->tex_miptree[unit], NULL); + nv30->dirty_samplers |= (1 << unit); + } + + nv30->nr_textures = nr; + nv30->dirty |= NV30_NEW_SAMPLER; +} + +static void * +nv30_rasterizer_state_create(struct pipe_context *pipe, + const struct pipe_rasterizer_state *cso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso)); + struct nouveau_stateobj *so = so_new(32, 0); + struct nouveau_grobj *rankine = nv30->screen->rankine; + + /*XXX: ignored: + * light_twoside + * point_smooth -nohw + * multisample + */ + + so_method(so, rankine, NV34TCL_SHADE_MODEL, 1); + so_data (so, cso->flatshade ? NV34TCL_SHADE_MODEL_FLAT : + NV34TCL_SHADE_MODEL_SMOOTH); + + so_method(so, rankine, NV34TCL_LINE_WIDTH, 2); + so_data (so, (unsigned char)(cso->line_width * 8.0) & 0xff); + so_data (so, cso->line_smooth ? 1 : 0); + so_method(so, rankine, NV34TCL_LINE_STIPPLE_ENABLE, 2); + so_data (so, cso->line_stipple_enable ? 
1 : 0); + so_data (so, (cso->line_stipple_pattern << 16) | + cso->line_stipple_factor); + + so_method(so, rankine, NV34TCL_POINT_SIZE, 1); + so_data (so, fui(cso->point_size)); + + so_method(so, rankine, NV34TCL_POLYGON_MODE_FRONT, 6); + if (cso->front_winding == PIPE_WINDING_CCW) { + so_data(so, nvgl_polygon_mode(cso->fill_ccw)); + so_data(so, nvgl_polygon_mode(cso->fill_cw)); + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + so_data(so, NV34TCL_CULL_FACE_FRONT); + break; + case PIPE_WINDING_CW: + so_data(so, NV34TCL_CULL_FACE_BACK); + break; + case PIPE_WINDING_BOTH: + so_data(so, NV34TCL_CULL_FACE_FRONT_AND_BACK); + break; + default: + so_data(so, NV34TCL_CULL_FACE_BACK); + break; + } + so_data(so, NV34TCL_FRONT_FACE_CCW); + } else { + so_data(so, nvgl_polygon_mode(cso->fill_cw)); + so_data(so, nvgl_polygon_mode(cso->fill_ccw)); + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + so_data(so, NV34TCL_CULL_FACE_BACK); + break; + case PIPE_WINDING_CW: + so_data(so, NV34TCL_CULL_FACE_FRONT); + break; + case PIPE_WINDING_BOTH: + so_data(so, NV34TCL_CULL_FACE_FRONT_AND_BACK); + break; + default: + so_data(so, NV34TCL_CULL_FACE_BACK); + break; + } + so_data(so, NV34TCL_FRONT_FACE_CW); + } + so_data(so, cso->poly_smooth ? 1 : 0); + so_data(so, (cso->cull_mode != PIPE_WINDING_NONE) ? 1 : 0); + + so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1); + so_data (so, cso->poly_stipple_enable ? 1 : 0); + + so_method(so, rankine, NV34TCL_POLYGON_OFFSET_POINT_ENABLE, 3); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_POINT) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_POINT)) + so_data(so, 1); + else + so_data(so, 0); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_LINE) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_LINE)) + so_data(so, 1); + else + so_data(so, 0); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_FILL) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_FILL)) + so_data(so, 1); + else + so_data(so, 0); + if (cso->offset_cw || cso->offset_ccw) { + so_method(so, rankine, NV34TCL_POLYGON_OFFSET_FACTOR, 2); + so_data (so, fui(cso->offset_scale)); + so_data (so, fui(cso->offset_units * 2)); + } + + so_method(so, rankine, NV34TCL_POINT_SPRITE, 1); + if (cso->point_sprite) { + unsigned psctl = (1 << 0), i; + + for (i = 0; i < 8; i++) { + if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE) + psctl |= (1 << (8 + i)); + } + + so_data(so, psctl); + } else { + so_data(so, 0); + } + + so_ref(so, &rsso->so); + rsso->pipe = *cso; + return (void *)rsso; +} + +static void +nv30_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->rasterizer = hwcso; + nv30->dirty |= NV30_NEW_RAST; + /*nv30->draw_dirty |= NV30_NEW_RAST;*/ +} + +static void +nv30_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_rasterizer_state *rsso = hwcso; + + so_ref(NULL, &rsso->so); + FREE(rsso); +} + +static void * +nv30_depth_stencil_alpha_state_create(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *cso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso)); + struct nouveau_stateobj *so = so_new(32, 0); + struct nouveau_grobj *rankine = nv30->screen->rankine; + + so_method(so, rankine, NV34TCL_DEPTH_FUNC, 3); + so_data (so, nvgl_comparison_op(cso->depth.func)); + so_data (so, cso->depth.writemask ? 1 : 0); + so_data (so, cso->depth.enabled ? 
1 : 0); + + so_method(so, rankine, NV34TCL_ALPHA_FUNC_ENABLE, 3); + so_data (so, cso->alpha.enabled ? 1 : 0); + so_data (so, nvgl_comparison_op(cso->alpha.func)); + so_data (so, float_to_ubyte(cso->alpha.ref)); + + if (cso->stencil[0].enabled) { + so_method(so, rankine, NV34TCL_STENCIL_FRONT_ENABLE, 8); + so_data (so, cso->stencil[0].enabled ? 1 : 0); + so_data (so, cso->stencil[0].write_mask); + so_data (so, nvgl_comparison_op(cso->stencil[0].func)); + so_data (so, cso->stencil[0].ref_value); + so_data (so, cso->stencil[0].value_mask); + so_data (so, nvgl_stencil_op(cso->stencil[0].fail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[0].zfail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[0].zpass_op)); + } else { + so_method(so, rankine, NV34TCL_STENCIL_FRONT_ENABLE, 1); + so_data (so, 0); + } + + if (cso->stencil[1].enabled) { + so_method(so, rankine, NV34TCL_STENCIL_BACK_ENABLE, 8); + so_data (so, cso->stencil[1].enabled ? 1 : 0); + so_data (so, cso->stencil[1].write_mask); + so_data (so, nvgl_comparison_op(cso->stencil[1].func)); + so_data (so, cso->stencil[1].ref_value); + so_data (so, cso->stencil[1].value_mask); + so_data (so, nvgl_stencil_op(cso->stencil[1].fail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[1].zfail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[1].zpass_op)); + } else { + so_method(so, rankine, NV34TCL_STENCIL_BACK_ENABLE, 1); + so_data (so, 0); + } + + so_ref(so, &zsaso->so); + zsaso->pipe = *cso; + return (void *)zsaso; +} + +static void +nv30_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->zsa = hwcso; + nv30->dirty |= NV30_NEW_ZSA; +} + +static void +nv30_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_zsa_state *zsaso = hwcso; + + so_ref(NULL, &zsaso->so); + FREE(zsaso); +} + +static void * +nv30_vp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + /*struct nv30_context *nv30 = nv30_context(pipe);*/ + struct nv30_vertex_program *vp; + + vp = CALLOC(1, sizeof(struct nv30_vertex_program)); + vp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + /*vp->draw = draw_create_vertex_shader(nv30->draw, &vp->pipe);*/ + + return (void *)vp; +} + +static void +nv30_vp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->vertprog = hwcso; + nv30->dirty |= NV30_NEW_VERTPROG; + /*nv30->draw_dirty |= NV30_NEW_VERTPROG;*/ +} + +static void +nv30_vp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_vertex_program *vp = hwcso; + + /*draw_delete_vertex_shader(nv30->draw, vp->draw);*/ + nv30_vertprog_destroy(nv30, vp); + FREE((void*)vp->pipe.tokens); + FREE(vp); +} + +static void * +nv30_fp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv30_fragment_program *fp; + + fp = CALLOC(1, sizeof(struct nv30_fragment_program)); + fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + + tgsi_scan_shader(fp->pipe.tokens, &fp->info); + + return (void *)fp; +} + +static void +nv30_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->fragprog = hwcso; + nv30->dirty |= NV30_NEW_FRAGPROG; +} + +static void +nv30_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nv30_fragment_program *fp = hwcso; + + nv30_fragprog_destroy(nv30, fp); 
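+	/* fp->pipe.tokens was duplicated with tgsi_dup_tokens() in
+	 * nv30_fp_state_create(), so the copy is released here together with
+	 * the hardware program freed by nv30_fragprog_destroy() above. */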
+ FREE((void*)fp->pipe.tokens); + FREE(fp); +} + +static void +nv30_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *bcol) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->blend_colour = *bcol; + nv30->dirty |= NV30_NEW_BCOL; +} + +static void +nv30_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ +} + +static void +nv30_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, + const struct pipe_constant_buffer *buf ) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->constbuf[shader] = buf->buffer; + nv30->constbuf_nr[shader] = buf->size / (4 * sizeof(float)); + + if (shader == PIPE_SHADER_VERTEX) { + nv30->dirty |= NV30_NEW_VERTPROG; + } else + if (shader == PIPE_SHADER_FRAGMENT) { + nv30->dirty |= NV30_NEW_FRAGPROG; + } +} + +static void +nv30_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->framebuffer = *fb; + nv30->dirty |= NV30_NEW_FB; +} + +static void +nv30_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + memcpy(nv30->stipple, stipple->stipple, 4 * 32); + nv30->dirty |= NV30_NEW_STIPPLE; +} + +static void +nv30_set_scissor_state(struct pipe_context *pipe, + const struct pipe_scissor_state *s) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->scissor = *s; + nv30->dirty |= NV30_NEW_SCISSOR; +} + +static void +nv30_set_viewport_state(struct pipe_context *pipe, + const struct pipe_viewport_state *vpt) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->viewport = *vpt; + nv30->dirty |= NV30_NEW_VIEWPORT; + /*nv30->draw_dirty |= NV30_NEW_VIEWPORT;*/ +} + +static void +nv30_set_vertex_buffers(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_buffer *vb) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + memcpy(nv30->vtxbuf, vb, sizeof(*vb) * count); + nv30->vtxbuf_nr = count; + + nv30->dirty |= NV30_NEW_ARRAYS; + /*nv30->draw_dirty |= NV30_NEW_ARRAYS;*/ +} + +static void +nv30_set_vertex_elements(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_element *ve) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + memcpy(nv30->vtxelt, ve, sizeof(*ve) * count); + nv30->vtxelt_nr = count; + + nv30->dirty |= NV30_NEW_ARRAYS; + /*nv30->draw_dirty |= NV30_NEW_ARRAYS;*/ +} + +static void +nv30_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ + struct nv30_context *nv30 = nv30_context(pipe); + + nv30->edgeflags = bitfield; + nv30->dirty |= NV30_NEW_ARRAYS; + /*nv30->draw_dirty |= NV30_NEW_ARRAYS;*/ +} + +void +nv30_init_state_functions(struct nv30_context *nv30) +{ + nv30->pipe.create_blend_state = nv30_blend_state_create; + nv30->pipe.bind_blend_state = nv30_blend_state_bind; + nv30->pipe.delete_blend_state = nv30_blend_state_delete; + + nv30->pipe.create_sampler_state = nv30_sampler_state_create; + nv30->pipe.bind_sampler_states = nv30_sampler_state_bind; + nv30->pipe.delete_sampler_state = nv30_sampler_state_delete; + nv30->pipe.set_sampler_textures = nv30_set_sampler_texture; + + nv30->pipe.create_rasterizer_state = nv30_rasterizer_state_create; + nv30->pipe.bind_rasterizer_state = nv30_rasterizer_state_bind; + nv30->pipe.delete_rasterizer_state = nv30_rasterizer_state_delete; + + nv30->pipe.create_depth_stencil_alpha_state = + nv30_depth_stencil_alpha_state_create; + 
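+	/* As with the blend and rasterizer hooks above, the depth/stencil/alpha
+	 * CSO is prebuilt as a nouveau_stateobj at create time; binding it only
+	 * records the pointer and raises NV30_NEW_ZSA for the next validate. */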
nv30->pipe.bind_depth_stencil_alpha_state = + nv30_depth_stencil_alpha_state_bind; + nv30->pipe.delete_depth_stencil_alpha_state = + nv30_depth_stencil_alpha_state_delete; + + nv30->pipe.create_vs_state = nv30_vp_state_create; + nv30->pipe.bind_vs_state = nv30_vp_state_bind; + nv30->pipe.delete_vs_state = nv30_vp_state_delete; + + nv30->pipe.create_fs_state = nv30_fp_state_create; + nv30->pipe.bind_fs_state = nv30_fp_state_bind; + nv30->pipe.delete_fs_state = nv30_fp_state_delete; + + nv30->pipe.set_blend_color = nv30_set_blend_color; + nv30->pipe.set_clip_state = nv30_set_clip_state; + nv30->pipe.set_constant_buffer = nv30_set_constant_buffer; + nv30->pipe.set_framebuffer_state = nv30_set_framebuffer_state; + nv30->pipe.set_polygon_stipple = nv30_set_polygon_stipple; + nv30->pipe.set_scissor_state = nv30_set_scissor_state; + nv30->pipe.set_viewport_state = nv30_set_viewport_state; + + nv30->pipe.set_edgeflags = nv30_set_edgeflags; + nv30->pipe.set_vertex_buffers = nv30_set_vertex_buffers; + nv30->pipe.set_vertex_elements = nv30_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv30/nv30_state.h b/src/gallium/drivers/nv30/nv30_state.h new file mode 100644 index 0000000000..2023278e37 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state.h @@ -0,0 +1,88 @@ +#ifndef __NV30_STATE_H__ +#define __NV30_STATE_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv30_sampler_state { + uint32_t fmt; + uint32_t wrap; + uint32_t en; + uint32_t filt; + uint32_t bcol; +}; + +struct nv30_vertex_program_exec { + uint32_t data[4]; + boolean has_branch_offset; + int const_index; +}; + +struct nv30_vertex_program_data { + int index; /* immediates == -1 */ + float value[4]; +}; + +struct nv30_vertex_program { + struct pipe_shader_state pipe; + + boolean translated; + + struct nv30_vertex_program_exec *insns; + unsigned nr_insns; + struct nv30_vertex_program_data *consts; + unsigned nr_consts; + + struct nouveau_resource *exec; + unsigned exec_start; + struct nouveau_resource *data; + unsigned data_start; + unsigned data_start_min; + + uint32_t ir; + uint32_t or; + struct nouveau_stateobj *so; +}; + +struct nv30_fragment_program_data { + unsigned offset; + unsigned index; +}; + +struct nv30_fragment_program { + struct pipe_shader_state pipe; + struct tgsi_shader_info info; + + boolean translated; + boolean on_hw; + unsigned samplers; + + uint32_t *insn; + int insn_len; + + struct nv30_fragment_program_data *consts; + unsigned nr_consts; + + struct pipe_buffer *buffer; + + uint32_t fp_control; + uint32_t fp_reg_control; + struct nouveau_stateobj *so; +}; + +struct nv30_miptree { + struct pipe_texture base; + + struct pipe_buffer *buffer; + uint total_size; + + struct pipe_texture *shadow_tex; + struct pipe_surface *shadow_surface; + + struct { + uint pitch; + uint *image_offset; + } level[PIPE_MAX_TEXTURE_LEVELS]; +}; + +#endif diff --git a/src/gallium/drivers/nv30/nv30_state_blend.c b/src/gallium/drivers/nv30/nv30_state_blend.c new file mode 100644 index 0000000000..44d43e132a --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_blend.c @@ -0,0 +1,40 @@ +#include "nv30_context.h" + +static boolean +nv30_state_blend_validate(struct nv30_context *nv30) +{ + so_ref(nv30->blend->so, &nv30->state.hw[NV30_STATE_BLEND]); + return TRUE; +} + +struct nv30_state_entry nv30_state_blend = { + .validate = nv30_state_blend_validate, + .dirty = { + .pipe = NV30_NEW_BLEND, + .hw = NV30_STATE_BLEND + } +}; + +static boolean +nv30_state_blend_colour_validate(struct nv30_context *nv30) +{ + 
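+	/* The blend colour is packed into the single ARGB8888 word expected by
+	 * NV34TCL_BLEND_COLOR: alpha in bits 31:24, then red, green and blue.
+	 * A blend colour of (1.0, 0.0, 0.0, 1.0), for example, packs to
+	 * 0xffff0000. */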
struct nouveau_stateobj *so = so_new(2, 0); + struct pipe_blend_color *bcol = &nv30->blend_colour; + + so_method(so, nv30->screen->rankine, NV34TCL_BLEND_COLOR, 1); + so_data (so, ((float_to_ubyte(bcol->color[3]) << 24) | + (float_to_ubyte(bcol->color[0]) << 16) | + (float_to_ubyte(bcol->color[1]) << 8) | + (float_to_ubyte(bcol->color[2]) << 0))); + + so_ref(so, &nv30->state.hw[NV30_STATE_BCOL]); + return TRUE; +} + +struct nv30_state_entry nv30_state_blend_colour = { + .validate = nv30_state_blend_colour_validate, + .dirty = { + .pipe = NV30_NEW_BCOL, + .hw = NV30_STATE_BCOL + } +}; diff --git a/src/gallium/drivers/nv30/nv30_state_emit.c b/src/gallium/drivers/nv30/nv30_state_emit.c new file mode 100644 index 0000000000..40fed621b2 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_emit.c @@ -0,0 +1,118 @@ +#include "nv30_context.h" +#include "nv30_state.h" + +static struct nv30_state_entry *render_states[] = { + &nv30_state_framebuffer, + &nv30_state_rasterizer, + &nv30_state_scissor, + &nv30_state_stipple, + &nv30_state_fragprog, + &nv30_state_fragtex, + &nv30_state_vertprog, + &nv30_state_blend, + &nv30_state_blend_colour, + &nv30_state_zsa, + &nv30_state_viewport, + &nv30_state_vbo, + NULL +}; + +static void +nv30_state_do_validate(struct nv30_context *nv30, + struct nv30_state_entry **states) +{ + const struct pipe_framebuffer_state *fb = &nv30->framebuffer; + unsigned i; + + for (i = 0; i < fb->num_cbufs; i++) + fb->cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED; + if (fb->zsbuf) + fb->zsbuf->status = PIPE_SURFACE_STATUS_DEFINED; + + while (*states) { + struct nv30_state_entry *e = *states; + + if (nv30->dirty & e->dirty.pipe) { + if (e->validate(nv30)) { + nv30->state.dirty |= (1ULL << e->dirty.hw); + } + } + + states++; + } + nv30->dirty = 0; +} + +void +nv30_state_emit(struct nv30_context *nv30) +{ + struct nv30_state *state = &nv30->state; + struct nv30_screen *screen = nv30->screen; + unsigned i, samplers; + uint64 states; + + if (nv30->pctx_id != screen->cur_pctx) { + for (i = 0; i < NV30_STATE_MAX; i++) { + if (state->hw[i] && screen->state[i] != state->hw[i]) + state->dirty |= (1ULL << i); + } + + screen->cur_pctx = nv30->pctx_id; + } + + for (i = 0, states = state->dirty; states; i++) { + if (!(states & (1ULL << i))) + continue; + so_ref (state->hw[i], &nv30->screen->state[i]); + if (state->hw[i]) + so_emit(nv30->nvws, nv30->screen->state[i]); + states &= ~(1ULL << i); + } + + state->dirty = 0; + + so_emit_reloc_markers(nv30->nvws, state->hw[NV30_STATE_FB]); + for (i = 0, samplers = state->fp_samplers; i < 16 && samplers; i++) { + if (!(samplers & (1 << i))) + continue; + so_emit_reloc_markers(nv30->nvws, + state->hw[NV30_STATE_FRAGTEX0+i]); + samplers &= ~(1ULL << i); + } + so_emit_reloc_markers(nv30->nvws, state->hw[NV30_STATE_FRAGPROG]); + if (state->hw[NV30_STATE_VTXBUF] /*&& nv30->render_mode == HW*/) + so_emit_reloc_markers(nv30->nvws, state->hw[NV30_STATE_VTXBUF]); +} + +boolean +nv30_state_validate(struct nv30_context *nv30) +{ +#if 0 + boolean was_sw = nv30->fallback_swtnl ? TRUE : FALSE; + + if (nv30->render_mode != HW) { + /* Don't even bother trying to go back to hw if none + * of the states that caused swtnl previously have changed. 
+ */ + if ((nv30->fallback_swtnl & nv30->dirty) + != nv30->fallback_swtnl) + return FALSE; + + /* Attempt to go to hwtnl again */ + nv30->pipe.flush(&nv30->pipe, 0, NULL); + nv30->dirty |= (NV30_NEW_VIEWPORT | + NV30_NEW_VERTPROG | + NV30_NEW_ARRAYS); + nv30->render_mode = HW; + } +#endif + nv30_state_do_validate(nv30, render_states); +#if 0 + if (nv30->fallback_swtnl || nv30->fallback_swrast) + return FALSE; + + if (was_sw) + NOUVEAU_ERR("swtnl->hw\n"); +#endif + return TRUE; +} diff --git a/src/gallium/drivers/nv30/nv30_state_fb.c b/src/gallium/drivers/nv30/nv30_state_fb.c new file mode 100644 index 0000000000..73bdf7e56c --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_fb.c @@ -0,0 +1,140 @@ +#include "nv30_context.h" +#include "nouveau/nouveau_util.h" + +static boolean +nv30_state_framebuffer_validate(struct nv30_context *nv30) +{ + struct pipe_framebuffer_state *fb = &nv30->framebuffer; + struct pipe_surface *rt[2], *zeta = NULL; + uint32_t rt_enable, rt_format; + int i, colour_format = 0, zeta_format = 0; + struct nouveau_stateobj *so = so_new(64, 10); + unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM; + unsigned w = fb->width; + unsigned h = fb->height; + + rt_enable = 0; + for (i = 0; i < fb->num_cbufs; i++) { + if (colour_format) { + assert(colour_format == fb->cbufs[i]->format); + } else { + colour_format = fb->cbufs[i]->format; + rt_enable |= (NV34TCL_RT_ENABLE_COLOR0 << i); + rt[i] = fb->cbufs[i]; + } + } + + if (rt_enable & NV34TCL_RT_ENABLE_COLOR1) + rt_enable |= NV34TCL_RT_ENABLE_MRT; + + if (fb->zsbuf) { + zeta_format = fb->zsbuf->format; + zeta = fb->zsbuf; + } + + if (!(rt[0]->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1))); + for (i = 1; i < fb->num_cbufs; i++) + assert(!(rt[i]->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)); + + /* FIXME: NV34TCL_RT_FORMAT_LOG2_[WIDTH/HEIGHT] */ + rt_format = NV34TCL_RT_FORMAT_TYPE_SWIZZLED | + log2i(fb->width) << 16 /*NV34TCL_RT_FORMAT_LOG2_WIDTH_SHIFT*/ | + log2i(fb->height) << 24 /*NV34TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT*/; + } + else + rt_format = NV34TCL_RT_FORMAT_TYPE_LINEAR; + + switch (colour_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case 0: + rt_format |= NV34TCL_RT_FORMAT_COLOR_A8R8G8B8; + break; + case PIPE_FORMAT_R5G6B5_UNORM: + rt_format |= NV34TCL_RT_FORMAT_COLOR_R5G6B5; + break; + default: + assert(0); + } + + switch (zeta_format) { + case PIPE_FORMAT_Z16_UNORM: + rt_format |= NV34TCL_RT_FORMAT_ZETA_Z16; + break; + case PIPE_FORMAT_Z24S8_UNORM: + case 0: + rt_format |= NV34TCL_RT_FORMAT_ZETA_Z24S8; + break; + default: + assert(0); + } + + if (rt_enable & NV34TCL_RT_ENABLE_COLOR0) { + uint32_t pitch = rt[0]->stride; + if (zeta) { + pitch |= (zeta->stride << 16); + } else { + pitch |= (pitch << 16); + } + + so_method(so, nv30->screen->rankine, NV34TCL_DMA_COLOR0, 1); + so_reloc (so, rt[0]->buffer, 0, rt_flags | NOUVEAU_BO_OR, + nv30->nvws->channel->vram->handle, + nv30->nvws->channel->gart->handle); + so_method(so, nv30->screen->rankine, NV34TCL_COLOR0_PITCH, 2); + so_data (so, pitch); + so_reloc (so, rt[0]->buffer, rt[0]->offset, rt_flags | + NOUVEAU_BO_LOW, 0, 0); + } + + if (rt_enable & NV34TCL_RT_ENABLE_COLOR1) { + so_method(so, nv30->screen->rankine, NV34TCL_DMA_COLOR1, 1); + so_reloc (so, rt[1]->buffer, 0, rt_flags | NOUVEAU_BO_OR, + nv30->nvws->channel->vram->handle, + nv30->nvws->channel->gart->handle); + so_method(so, nv30->screen->rankine, NV34TCL_COLOR1_OFFSET, 2); + so_reloc (so, rt[1]->buffer, rt[1]->offset, 
rt_flags | + NOUVEAU_BO_LOW, 0, 0); + so_data (so, rt[1]->stride); + } + + if (zeta_format) { + so_method(so, nv30->screen->rankine, NV34TCL_DMA_ZETA, 1); + so_reloc (so, zeta->buffer, 0, rt_flags | NOUVEAU_BO_OR, + nv30->nvws->channel->vram->handle, + nv30->nvws->channel->gart->handle); + so_method(so, nv30->screen->rankine, NV34TCL_ZETA_OFFSET, 1); + so_reloc (so, zeta->buffer, zeta->offset, rt_flags | + NOUVEAU_BO_LOW, 0, 0); + /* TODO: allocate LMA depth buffer */ + } + + so_method(so, nv30->screen->rankine, NV34TCL_RT_ENABLE, 1); + so_data (so, rt_enable); + so_method(so, nv30->screen->rankine, NV34TCL_RT_HORIZ, 3); + so_data (so, (w << 16) | 0); + so_data (so, (h << 16) | 0); + so_data (so, rt_format); + so_method(so, nv30->screen->rankine, NV34TCL_VIEWPORT_HORIZ, 2); + so_data (so, (w << 16) | 0); + so_data (so, (h << 16) | 0); + so_method(so, nv30->screen->rankine, NV34TCL_VIEWPORT_CLIP_HORIZ(0), 2); + so_data (so, ((w - 1) << 16) | 0); + so_data (so, ((h - 1) << 16) | 0); + so_method(so, nv30->screen->rankine, 0x1d88, 1); + so_data (so, (1 << 12) | h); + /* Wonder why this is needed, context should all be set to zero on init */ + so_method(so, nv30->screen->rankine, NV34TCL_VIEWPORT_TX_ORIGIN, 1); + so_data (so, 0); + + so_ref(so, &nv30->state.hw[NV30_STATE_FB]); + return TRUE; +} + +struct nv30_state_entry nv30_state_framebuffer = { + .validate = nv30_state_framebuffer_validate, + .dirty = { + .pipe = NV30_NEW_FB, + .hw = NV30_STATE_FB + } +}; diff --git a/src/gallium/drivers/nv30/nv30_state_rasterizer.c b/src/gallium/drivers/nv30/nv30_state_rasterizer.c new file mode 100644 index 0000000000..6d1b60e043 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_rasterizer.c @@ -0,0 +1,17 @@ +#include "nv30_context.h" + +static boolean +nv30_state_rasterizer_validate(struct nv30_context *nv30) +{ + so_ref(nv30->rasterizer->so, + &nv30->state.hw[NV30_STATE_RAST]); + return TRUE; +} + +struct nv30_state_entry nv30_state_rasterizer = { + .validate = nv30_state_rasterizer_validate, + .dirty = { + .pipe = NV30_NEW_RAST, + .hw = NV30_STATE_RAST + } +}; diff --git a/src/gallium/drivers/nv30/nv30_state_scissor.c b/src/gallium/drivers/nv30/nv30_state_scissor.c new file mode 100644 index 0000000000..1db9bc1795 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_scissor.c @@ -0,0 +1,35 @@ +#include "nv30_context.h" + +static boolean +nv30_state_scissor_validate(struct nv30_context *nv30) +{ + struct pipe_rasterizer_state *rast = &nv30->rasterizer->pipe; + struct pipe_scissor_state *s = &nv30->scissor; + struct nouveau_stateobj *so; + + if (nv30->state.hw[NV30_STATE_SCISSOR] && + (rast->scissor == 0 && nv30->state.scissor_enabled == 0)) + return FALSE; + nv30->state.scissor_enabled = rast->scissor; + + so = so_new(3, 0); + so_method(so, nv30->screen->rankine, NV34TCL_SCISSOR_HORIZ, 2); + if (nv30->state.scissor_enabled) { + so_data (so, ((s->maxx - s->minx) << 16) | s->minx); + so_data (so, ((s->maxy - s->miny) << 16) | s->miny); + } else { + so_data (so, 4096 << 16); + so_data (so, 4096 << 16); + } + + so_ref(so, &nv30->state.hw[NV30_STATE_SCISSOR]); + return TRUE; +} + +struct nv30_state_entry nv30_state_scissor = { + .validate = nv30_state_scissor_validate, + .dirty = { + .pipe = NV30_NEW_SCISSOR | NV30_NEW_RAST, + .hw = NV30_STATE_SCISSOR + } +}; diff --git a/src/gallium/drivers/nv30/nv30_state_stipple.c b/src/gallium/drivers/nv30/nv30_state_stipple.c new file mode 100644 index 0000000000..41b42813b4 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_stipple.c @@ -0,0 +1,39 @@ 
+#include "nv30_context.h" + +static boolean +nv30_state_stipple_validate(struct nv30_context *nv30) +{ + struct pipe_rasterizer_state *rast = &nv30->rasterizer->pipe; + struct nouveau_grobj *rankine = nv30->screen->rankine; + struct nouveau_stateobj *so; + + if (nv30->state.hw[NV30_STATE_STIPPLE] && + (rast->poly_stipple_enable == 0 && nv30->state.stipple_enabled == 0)) + return FALSE; + + if (rast->poly_stipple_enable) { + unsigned i; + + so = so_new(35, 0); + so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1); + so_data (so, 1); + so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_PATTERN(0), 32); + for (i = 0; i < 32; i++) + so_data(so, nv30->stipple[i]); + } else { + so = so_new(2, 0); + so_method(so, rankine, NV34TCL_POLYGON_STIPPLE_ENABLE, 1); + so_data (so, 0); + } + + so_ref(so, &nv30->state.hw[NV30_STATE_STIPPLE]); + return TRUE; +} + +struct nv30_state_entry nv30_state_stipple = { + .validate = nv30_state_stipple_validate, + .dirty = { + .pipe = NV30_NEW_STIPPLE | NV30_NEW_RAST, + .hw = NV30_STATE_STIPPLE, + } +}; diff --git a/src/gallium/drivers/nv30/nv30_state_viewport.c b/src/gallium/drivers/nv30/nv30_state_viewport.c new file mode 100644 index 0000000000..951d40ebfd --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_viewport.c @@ -0,0 +1,70 @@ +#include "nv30_context.h" + +static boolean +nv30_state_viewport_validate(struct nv30_context *nv30) +{ + struct pipe_viewport_state *vpt = &nv30->viewport; + struct nouveau_stateobj *so; + unsigned bypass; + + if (/*nv30->render_mode == HW &&*/ !nv30->rasterizer->pipe.bypass_clipping) + bypass = 0; + else + bypass = 1; + + if (nv30->state.hw[NV30_STATE_VIEWPORT] && + (bypass || !(nv30->dirty & NV30_NEW_VIEWPORT)) && + nv30->state.viewport_bypass == bypass) + return FALSE; + nv30->state.viewport_bypass = bypass; + + so = so_new(11, 0); + if (!bypass) { + so_method(so, nv30->screen->rankine, + NV34TCL_VIEWPORT_TRANSLATE_X, 8); + so_data (so, fui(vpt->translate[0])); + so_data (so, fui(vpt->translate[1])); + so_data (so, fui(vpt->translate[2])); + so_data (so, fui(vpt->translate[3])); + so_data (so, fui(vpt->scale[0])); + so_data (so, fui(vpt->scale[1])); + so_data (so, fui(vpt->scale[2])); + so_data (so, fui(vpt->scale[3])); +/* so_method(so, nv30->screen->rankine, 0x1d78, 1); + so_data (so, 1); +*/ } else { + so_method(so, nv30->screen->rankine, + NV34TCL_VIEWPORT_TRANSLATE_X, 8); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(1.0)); + so_data (so, fui(1.0)); + so_data (so, fui(1.0)); + so_data (so, fui(0.0)); + /* Not entirely certain what this is yet. The DDX uses this + * value also as it fixes rendering when you pass + * pre-transformed vertices to the GPU. My best gusss is that + * this bypasses some culling/clipping stage. Might be worth + * noting that points/lines are uneffected by whatever this + * value fixes, only filled polygons are effected. 
+ */ +/* so_method(so, nv30->screen->rankine, 0x1d78, 1); + so_data (so, 0x110); +*/ } + /* TODO/FIXME: never saw value 0x0110 in renouveau dumps, only 0x0001 */ + so_method(so, nv30->screen->rankine, 0x1d78, 1); + so_data (so, 1); + + so_ref(so, &nv30->state.hw[NV30_STATE_VIEWPORT]); + return TRUE; +} + +struct nv30_state_entry nv30_state_viewport = { + .validate = nv30_state_viewport_validate, + .dirty = { + .pipe = NV30_NEW_VIEWPORT | NV30_NEW_RAST, + .hw = NV30_STATE_VIEWPORT + } +}; diff --git a/src/gallium/drivers/nv30/nv30_state_zsa.c b/src/gallium/drivers/nv30/nv30_state_zsa.c new file mode 100644 index 0000000000..0940b7269b --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_state_zsa.c @@ -0,0 +1,17 @@ +#include "nv30_context.h" + +static boolean +nv30_state_zsa_validate(struct nv30_context *nv30) +{ + so_ref(nv30->zsa->so, + &nv30->state.hw[NV30_STATE_ZSA]); + return TRUE; +} + +struct nv30_state_entry nv30_state_zsa = { + .validate = nv30_state_zsa_validate, + .dirty = { + .pipe = NV30_NEW_ZSA, + .hw = NV30_STATE_ZSA + } +}; diff --git a/src/gallium/drivers/nv30/nv30_surface.c b/src/gallium/drivers/nv30/nv30_surface.c new file mode 100644 index 0000000000..d3376a73bf --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_surface.c @@ -0,0 +1,77 @@ + +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "nv30_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_winsys.h" +#include "pipe/p_inlines.h" + +#include "util/u_tile.h" + +static void +nv30_surface_copy(struct pipe_context *pipe, boolean do_flip, + struct pipe_surface *dest, unsigned destx, unsigned desty, + struct pipe_surface *src, unsigned srcx, unsigned srcy, + unsigned width, unsigned height) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nouveau_winsys *nvws = nv30->nvws; + + if (do_flip) { + /*XXX: This dodgyness will do for now for correctness. 
But, + * need to investigate whether the 2D engine is able to + * manage a flip (perhaps SIFM?), if not, use the 3D engine + */ + desty += height; + while (height--) { + nvws->surface_copy(nvws, dest, destx, desty--, src, + srcx, srcy++, width, 1); + } + } else { + nvws->surface_copy(nvws, dest, destx, desty, src, srcx, srcy, + width, height); + } +} + +static void +nv30_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, + unsigned destx, unsigned desty, unsigned width, + unsigned height, unsigned value) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nouveau_winsys *nvws = nv30->nvws; + + nvws->surface_fill(nvws, dest, destx, desty, width, height, value); +} + +void +nv30_init_surface_functions(struct nv30_context *nv30) +{ + nv30->pipe.surface_copy = nv30_surface_copy; + nv30->pipe.surface_fill = nv30_surface_fill; +} diff --git a/src/gallium/drivers/nv30/nv30_vbo.c b/src/gallium/drivers/nv30/nv30_vbo.c new file mode 100644 index 0000000000..556f981d4a --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_vbo.c @@ -0,0 +1,556 @@ +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv30_context.h" +#include "nv30_state.h" + +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_pushbuf.h" +#include "nouveau/nouveau_util.h" + +#define FORCE_SWTNL 0 + +static INLINE int +nv30_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp) +{ + switch (pipe) { + case PIPE_FORMAT_R32_FLOAT: + case PIPE_FORMAT_R32G32_FLOAT: + case PIPE_FORMAT_R32G32B32_FLOAT: + case PIPE_FORMAT_R32G32B32A32_FLOAT: + *fmt = NV34TCL_VTXFMT_TYPE_FLOAT; + break; + case PIPE_FORMAT_R8_UNORM: + case PIPE_FORMAT_R8G8_UNORM: + case PIPE_FORMAT_R8G8B8_UNORM: + case PIPE_FORMAT_R8G8B8A8_UNORM: + *fmt = NV34TCL_VTXFMT_TYPE_UBYTE; + break; + case PIPE_FORMAT_R16_SSCALED: + case PIPE_FORMAT_R16G16_SSCALED: + case PIPE_FORMAT_R16G16B16_SSCALED: + case PIPE_FORMAT_R16G16B16A16_SSCALED: + *fmt = NV34TCL_VTXFMT_TYPE_USHORT; + break; + default: + NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe)); + return 1; + } + + switch (pipe) { + case PIPE_FORMAT_R8_UNORM: + case PIPE_FORMAT_R32_FLOAT: + case PIPE_FORMAT_R16_SSCALED: + *ncomp = 1; + break; + case PIPE_FORMAT_R8G8_UNORM: + case PIPE_FORMAT_R32G32_FLOAT: + case PIPE_FORMAT_R16G16_SSCALED: + *ncomp = 2; + break; + case PIPE_FORMAT_R8G8B8_UNORM: + case PIPE_FORMAT_R32G32B32_FLOAT: + case PIPE_FORMAT_R16G16B16_SSCALED: + *ncomp = 3; + break; + case PIPE_FORMAT_R8G8B8A8_UNORM: + case PIPE_FORMAT_R32G32B32A32_FLOAT: + case PIPE_FORMAT_R16G16B16A16_SSCALED: + *ncomp = 4; + break; + default: + NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe)); + return 1; + } + + return 0; +} + +static boolean +nv30_vbo_set_idxbuf(struct nv30_context *nv30, struct pipe_buffer *ib, + unsigned ib_size) +{ + struct pipe_screen *pscreen = &nv30->screen->pipe; + unsigned type; + + if (!ib) { + nv30->idxbuf = NULL; + nv30->idxbuf_format = 0xdeadbeef; + return FALSE; + } + + if (!pscreen->get_param(pscreen, NOUVEAU_CAP_HW_IDXBUF) || ib_size == 1) + return FALSE; + + switch (ib_size) { + case 2: + type = NV34TCL_IDXBUF_FORMAT_TYPE_U16; + break; + case 4: + type = NV34TCL_IDXBUF_FORMAT_TYPE_U32; + break; + default: + return FALSE; + } + + if (ib != nv30->idxbuf || + type != nv30->idxbuf_format) { + nv30->dirty |= NV30_NEW_ARRAYS; + nv30->idxbuf = ib; + nv30->idxbuf_format = type; + } + + return TRUE; +} + +static boolean +nv30_vbo_static_attrib(struct nv30_context *nv30, struct nouveau_stateobj *so, + int attrib, struct pipe_vertex_element *ve, + 
struct pipe_vertex_buffer *vb) +{ + struct pipe_winsys *ws = nv30->pipe.winsys; + struct nouveau_grobj *rankine = nv30->screen->rankine; + unsigned type, ncomp; + void *map; + + if (nv30_vbo_format_to_hw(ve->src_format, &type, &ncomp)) + return FALSE; + + map = ws->buffer_map(ws, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ); + map += vb->buffer_offset + ve->src_offset; + + switch (type) { + case NV34TCL_VTXFMT_TYPE_FLOAT: + { + float *v = map; + + switch (ncomp) { + case 4: + so_method(so, rankine, NV34TCL_VTX_ATTR_4F_X(attrib), 4); + so_data (so, fui(v[0])); + so_data (so, fui(v[1])); + so_data (so, fui(v[2])); + so_data (so, fui(v[3])); + break; + case 3: + so_method(so, rankine, NV34TCL_VTX_ATTR_3F_X(attrib), 3); + so_data (so, fui(v[0])); + so_data (so, fui(v[1])); + so_data (so, fui(v[2])); + break; + case 2: + so_method(so, rankine, NV34TCL_VTX_ATTR_2F_X(attrib), 2); + so_data (so, fui(v[0])); + so_data (so, fui(v[1])); + break; + case 1: + so_method(so, rankine, NV34TCL_VTX_ATTR_1F(attrib), 1); + so_data (so, fui(v[0])); + break; + default: + ws->buffer_unmap(ws, vb->buffer); + return FALSE; + } + } + break; + default: + ws->buffer_unmap(ws, vb->buffer); + return FALSE; + } + + ws->buffer_unmap(ws, vb->buffer); + + return TRUE; +} + +boolean +nv30_draw_arrays(struct pipe_context *pipe, + unsigned mode, unsigned start, unsigned count) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nouveau_channel *chan = nv30->nvws->channel; + unsigned restart = 0; + + nv30_vbo_set_idxbuf(nv30, NULL, 0); + if (FORCE_SWTNL || !nv30_state_validate(nv30)) { + /*return nv30_draw_elements_swtnl(pipe, NULL, 0, + mode, start, count);*/ + return FALSE; + } + + while (count) { + unsigned vc, nr; + + nv30_state_emit(nv30); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256, + mode, start, count, &restart); + if (!vc) { + FIRE_RING(NULL); + continue; + } + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + nr = (vc & 0xff); + if (nr) { + BEGIN_RING(rankine, NV34TCL_VB_VERTEX_BATCH, 1); + OUT_RING (((nr - 1) << 24) | start); + start += nr; + } + + nr = vc >> 8; + while (nr) { + unsigned push = nr > 2047 ? 
2047 : nr; + + nr -= push; + + BEGIN_RING_NI(rankine, NV34TCL_VB_VERTEX_BATCH, push); + while (push--) { + OUT_RING(((0x100 - 1) << 24) | start); + start += 0x100; + } + } + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (0); + + count -= vc; + start = restart; + } + + pipe->flush(pipe, 0, NULL); + return TRUE; +} + +static INLINE void +nv30_draw_elements_u08(struct nv30_context *nv30, void *ib, + unsigned mode, unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv30->nvws->channel; + + while (count) { + uint8_t *elts = (uint8_t *)ib + start; + unsigned vc, push, restart = 0; + + nv30_state_emit(nv30); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2, + mode, start, count, &restart); + if (vc == 0) { + FIRE_RING(NULL); + continue; + } + count -= vc; + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + if (vc & 1) { + BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1); + OUT_RING (elts[0]); + elts++; vc--; + } + + while (vc) { + unsigned i; + + push = MIN2(vc, 2047 * 2); + + BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1); + for (i = 0; i < push; i+=2) + OUT_RING((elts[i+1] << 16) | elts[i]); + + vc -= push; + elts += push; + } + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (0); + + start = restart; + } +} + +static INLINE void +nv30_draw_elements_u16(struct nv30_context *nv30, void *ib, + unsigned mode, unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv30->nvws->channel; + + while (count) { + uint16_t *elts = (uint16_t *)ib + start; + unsigned vc, push, restart = 0; + + nv30_state_emit(nv30); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2, + mode, start, count, &restart); + if (vc == 0) { + FIRE_RING(NULL); + continue; + } + count -= vc; + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + if (vc & 1) { + BEGIN_RING(rankine, NV34TCL_VB_ELEMENT_U32, 1); + OUT_RING (elts[0]); + elts++; vc--; + } + + while (vc) { + unsigned i; + + push = MIN2(vc, 2047 * 2); + + BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U16, push >> 1); + for (i = 0; i < push; i+=2) + OUT_RING((elts[i+1] << 16) | elts[i]); + + vc -= push; + elts += push; + } + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (0); + + start = restart; + } +} + +static INLINE void +nv30_draw_elements_u32(struct nv30_context *nv30, void *ib, + unsigned mode, unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv30->nvws->channel; + + while (count) { + uint32_t *elts = (uint32_t *)ib + start; + unsigned vc, push, restart = 0; + + nv30_state_emit(nv30); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 5, 1, + mode, start, count, &restart); + if (vc == 0) { + FIRE_RING(NULL); + continue; + } + count -= vc; + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + while (vc) { + push = MIN2(vc, 2047); + + BEGIN_RING_NI(rankine, NV34TCL_VB_ELEMENT_U32, push); + OUT_RINGp (elts, push); + + vc -= push; + elts += push; + } + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (0); + + start = restart; + } +} + +static boolean +nv30_draw_elements_inline(struct pipe_context *pipe, + struct pipe_buffer *ib, unsigned ib_size, + unsigned mode, unsigned start, unsigned count) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + void *map; + + map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ); + if (!ib) { + NOUVEAU_ERR("failed mapping ib\n"); + return 
FALSE; + } + + switch (ib_size) { + case 1: + nv30_draw_elements_u08(nv30, map, mode, start, count); + break; + case 2: + nv30_draw_elements_u16(nv30, map, mode, start, count); + break; + case 4: + nv30_draw_elements_u32(nv30, map, mode, start, count); + break; + default: + NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size); + break; + } + + ws->buffer_unmap(ws, ib); + return TRUE; +} + +static boolean +nv30_draw_elements_vbo(struct pipe_context *pipe, + unsigned mode, unsigned start, unsigned count) +{ + struct nv30_context *nv30 = nv30_context(pipe); + struct nouveau_channel *chan = nv30->nvws->channel; + unsigned restart = 0; + + while (count) { + unsigned nr, vc; + + nv30_state_emit(nv30); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256, + mode, start, count, &restart); + if (!vc) { + FIRE_RING(NULL); + continue; + } + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + nr = (vc & 0xff); + if (nr) { + BEGIN_RING(rankine, NV34TCL_VB_INDEX_BATCH, 1); + OUT_RING (((nr - 1) << 24) | start); + start += nr; + } + + nr = vc >> 8; + while (nr) { + unsigned push = nr > 2047 ? 2047 : nr; + + nr -= push; + + BEGIN_RING_NI(rankine, NV34TCL_VB_INDEX_BATCH, push); + while (push--) { + OUT_RING(((0x100 - 1) << 24) | start); + start += 0x100; + } + } + + BEGIN_RING(rankine, NV34TCL_VERTEX_BEGIN_END, 1); + OUT_RING (0); + + count -= vc; + start = restart; + } + + return TRUE; +} + +boolean +nv30_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, unsigned indexSize, + unsigned mode, unsigned start, unsigned count) +{ + struct nv30_context *nv30 = nv30_context(pipe); + boolean idxbuf; + + idxbuf = nv30_vbo_set_idxbuf(nv30, indexBuffer, indexSize); + if (FORCE_SWTNL || !nv30_state_validate(nv30)) { + /*return nv30_draw_elements_swtnl(pipe, NULL, 0, + mode, start, count);*/ + return FALSE; + } + + if (idxbuf) { + nv30_draw_elements_vbo(pipe, mode, start, count); + } else { + nv30_draw_elements_inline(pipe, indexBuffer, indexSize, + mode, start, count); + } + + pipe->flush(pipe, 0, NULL); + return TRUE; +} + +static boolean +nv30_vbo_validate(struct nv30_context *nv30) +{ + struct nouveau_stateobj *vtxbuf, *vtxfmt, *sattr = NULL; + struct nouveau_grobj *rankine = nv30->screen->rankine; + struct pipe_buffer *ib = nv30->idxbuf; + unsigned ib_format = nv30->idxbuf_format; + unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD; + int hw; + + if (nv30->edgeflags) { + /*nv30->fallback_swtnl |= NV30_NEW_ARRAYS;*/ + return FALSE; + } + + vtxbuf = so_new(20, 18); + so_method(vtxbuf, rankine, NV34TCL_VTXBUF_ADDRESS(0), nv30->vtxelt_nr); + vtxfmt = so_new(17, 0); + so_method(vtxfmt, rankine, NV34TCL_VTXFMT(0), nv30->vtxelt_nr); + + for (hw = 0; hw < nv30->vtxelt_nr; hw++) { + struct pipe_vertex_element *ve; + struct pipe_vertex_buffer *vb; + unsigned type, ncomp; + + ve = &nv30->vtxelt[hw]; + vb = &nv30->vtxbuf[ve->vertex_buffer_index]; + + if (!vb->pitch) { + if (!sattr) + sattr = so_new(16 * 5, 0); + + if (nv30_vbo_static_attrib(nv30, sattr, hw, ve, vb)) { + so_data(vtxbuf, 0); + so_data(vtxfmt, NV34TCL_VTXFMT_TYPE_FLOAT); + continue; + } + } + + if (nv30_vbo_format_to_hw(ve->src_format, &type, &ncomp)) { + /*nv30->fallback_swtnl |= NV30_NEW_ARRAYS;*/ + so_ref(NULL, &vtxbuf); + so_ref(NULL, &vtxfmt); + return FALSE; + } + + so_reloc(vtxbuf, vb->buffer, vb->buffer_offset + ve->src_offset, + vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, + 0, NV34TCL_VTXBUF_ADDRESS_DMA1); + so_data (vtxfmt, ((vb->pitch << 
NV34TCL_VTXFMT_STRIDE_SHIFT) | + (ncomp << NV34TCL_VTXFMT_SIZE_SHIFT) | type)); + } + + if (ib) { + so_method(vtxbuf, rankine, NV34TCL_IDXBUF_ADDRESS, 2); + so_reloc (vtxbuf, ib, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0); + so_reloc (vtxbuf, ib, ib_format, vb_flags | NOUVEAU_BO_OR, + 0, NV34TCL_IDXBUF_FORMAT_DMA1); + } + + so_method(vtxbuf, rankine, 0x1710, 1); + so_data (vtxbuf, 0); + + so_ref(vtxbuf, &nv30->state.hw[NV30_STATE_VTXBUF]); + nv30->state.dirty |= (1ULL << NV30_STATE_VTXBUF); + so_ref(vtxfmt, &nv30->state.hw[NV30_STATE_VTXFMT]); + nv30->state.dirty |= (1ULL << NV30_STATE_VTXFMT); + so_ref(sattr, &nv30->state.hw[NV30_STATE_VTXATTR]); + nv30->state.dirty |= (1ULL << NV30_STATE_VTXATTR); + return FALSE; +} + +struct nv30_state_entry nv30_state_vbo = { + .validate = nv30_vbo_validate, + .dirty = { + .pipe = NV30_NEW_ARRAYS, + .hw = 0, + } +}; diff --git a/src/gallium/drivers/nv30/nv30_vertprog.c b/src/gallium/drivers/nv30/nv30_vertprog.c new file mode 100644 index 0000000000..72824559e8 --- /dev/null +++ b/src/gallium/drivers/nv30/nv30_vertprog.c @@ -0,0 +1,838 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_dump.h" + +#include "nv30_context.h" +#include "nv30_state.h" + +/* TODO (at least...): + * 1. Indexed consts + ARL + * 2. Arb. swz/negation + * 3. NV_vp11, NV_vp2, NV_vp3 features + * - extra arith opcodes + * - branching + * - texture sampling + * - indexed attribs + * - indexed results + * 4. bugs + */ + +#define SWZ_X 0 +#define SWZ_Y 1 +#define SWZ_Z 2 +#define SWZ_W 3 +#define MASK_X 8 +#define MASK_Y 4 +#define MASK_Z 2 +#define MASK_W 1 +#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W) +#define DEF_SCALE 0 +#define DEF_CTEST 0 +#include "nv30_shader.h" + +#define swz(s,x,y,z,w) nv30_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w) +#define neg(s) nv30_sr_neg((s)) +#define abs(s) nv30_sr_abs((s)) + +struct nv30_vpc { + struct nv30_vertex_program *vp; + + struct nv30_vertex_program_exec *vpi; + + unsigned output_map[PIPE_MAX_SHADER_OUTPUTS]; + + int high_temp; + int temp_temp_count; + + struct nv30_sreg *imm; + unsigned nr_imm; +}; + +static struct nv30_sreg +temp(struct nv30_vpc *vpc) +{ + int idx; + + idx = vpc->temp_temp_count++; + idx += vpc->high_temp + 1; + return nv30_sr(NV30SR_TEMP, idx); +} + +static struct nv30_sreg +constant(struct nv30_vpc *vpc, int pipe, float x, float y, float z, float w) +{ + struct nv30_vertex_program *vp = vpc->vp; + struct nv30_vertex_program_data *vpd; + int idx; + + if (pipe >= 0) { + for (idx = 0; idx < vp->nr_consts; idx++) { + if (vp->consts[idx].index == pipe) + return nv30_sr(NV30SR_CONST, idx); + } + } + + idx = vp->nr_consts++; + vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts); + vpd = &vp->consts[idx]; + + vpd->index = pipe; + vpd->value[0] = x; + vpd->value[1] = y; + vpd->value[2] = z; + vpd->value[3] = w; + return nv30_sr(NV30SR_CONST, idx); +} + +#define arith(cc,s,o,d,m,s0,s1,s2) \ + nv30_vp_arith((cc), (s), NV30_VP_INST_##o, (d), (m), (s0), (s1), (s2)) + +static void +emit_src(struct nv30_vpc *vpc, uint32_t *hw, int pos, struct nv30_sreg src) +{ + struct nv30_vertex_program *vp = vpc->vp; + uint32_t sr = 0; + + switch (src.type) { + case NV30SR_TEMP: + sr |= (NV30_VP_SRC_REG_TYPE_TEMP << NV30_VP_SRC_REG_TYPE_SHIFT); + sr |= (src.index << NV30_VP_SRC_TEMP_SRC_SHIFT); + break; + case NV30SR_INPUT: + sr |= (NV30_VP_SRC_REG_TYPE_INPUT << + NV30_VP_SRC_REG_TYPE_SHIFT); + vp->ir |= (1 << 
src.index); + hw[1] |= (src.index << NV30_VP_INST_INPUT_SRC_SHIFT); + break; + case NV30SR_CONST: + sr |= (NV30_VP_SRC_REG_TYPE_CONST << + NV30_VP_SRC_REG_TYPE_SHIFT); + assert(vpc->vpi->const_index == -1 || + vpc->vpi->const_index == src.index); + vpc->vpi->const_index = src.index; + break; + case NV30SR_NONE: + sr |= (NV30_VP_SRC_REG_TYPE_INPUT << + NV30_VP_SRC_REG_TYPE_SHIFT); + break; + default: + assert(0); + } + + if (src.negate) + sr |= NV30_VP_SRC_NEGATE; + + if (src.abs) + hw[0] |= (1 << (21 + pos)); + + sr |= ((src.swz[0] << NV30_VP_SRC_SWZ_X_SHIFT) | + (src.swz[1] << NV30_VP_SRC_SWZ_Y_SHIFT) | + (src.swz[2] << NV30_VP_SRC_SWZ_Z_SHIFT) | + (src.swz[3] << NV30_VP_SRC_SWZ_W_SHIFT)); + + switch (pos) { + case 0: + hw[1] |= ((sr & NV30_VP_SRC0_HIGH_MASK) >> + NV30_VP_SRC0_HIGH_SHIFT) << NV30_VP_INST_SRC0H_SHIFT; + hw[2] |= (sr & NV30_VP_SRC0_LOW_MASK) << + NV30_VP_INST_SRC0L_SHIFT; + break; + case 1: + hw[2] |= sr << NV30_VP_INST_SRC1_SHIFT; + break; + case 2: + hw[2] |= ((sr & NV30_VP_SRC2_HIGH_MASK) >> + NV30_VP_SRC2_HIGH_SHIFT) << NV30_VP_INST_SRC2H_SHIFT; + hw[3] |= (sr & NV30_VP_SRC2_LOW_MASK) << + NV30_VP_INST_SRC2L_SHIFT; + break; + default: + assert(0); + } +} + +static void +emit_dst(struct nv30_vpc *vpc, uint32_t *hw, int slot, struct nv30_sreg dst) +{ + struct nv30_vertex_program *vp = vpc->vp; + + switch (dst.type) { + case NV30SR_TEMP: + hw[0] |= (dst.index << NV30_VP_INST_DEST_TEMP_ID_SHIFT); + break; + case NV30SR_OUTPUT: + switch (dst.index) { + case NV30_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break; + case NV30_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break; + case NV30_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break; + case NV30_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break; + case NV30_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break; + case NV30_VP_INST_DEST_PSZ : vp->or |= (1 << 5); break; + case NV30_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break; + case NV30_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break; + case NV30_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break; + case NV30_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break; + case NV30_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break; + case NV30_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break; + case NV30_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break; + case NV30_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break; + default: + break; + } + + hw[3] |= (dst.index << NV30_VP_INST_DEST_SHIFT); + hw[0] |= NV30_VP_INST_VEC_DEST_TEMP_MASK | (1<<20); + + /*XXX: no way this is entirely correct, someone needs to + * figure out what exactly it is. 
+ */ + hw[3] |= 0x800; + break; + default: + assert(0); + } +} + +static void +nv30_vp_arith(struct nv30_vpc *vpc, int slot, int op, + struct nv30_sreg dst, int mask, + struct nv30_sreg s0, struct nv30_sreg s1, + struct nv30_sreg s2) +{ + struct nv30_vertex_program *vp = vpc->vp; + uint32_t *hw; + + vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi)); + vpc->vpi = &vp->insns[vp->nr_insns - 1]; + memset(vpc->vpi, 0, sizeof(*vpc->vpi)); + vpc->vpi->const_index = -1; + + hw = vpc->vpi->data; + + hw[0] |= (NV30_VP_INST_COND_TR << NV30_VP_INST_COND_SHIFT); + hw[0] |= ((0 << NV30_VP_INST_COND_SWZ_X_SHIFT) | + (1 << NV30_VP_INST_COND_SWZ_Y_SHIFT) | + (2 << NV30_VP_INST_COND_SWZ_Z_SHIFT) | + (3 << NV30_VP_INST_COND_SWZ_W_SHIFT)); + + hw[1] |= (op << NV30_VP_INST_VEC_OPCODE_SHIFT); +// hw[3] |= NV30_VP_INST_SCA_DEST_TEMP_MASK; +// hw[3] |= (mask << NV30_VP_INST_VEC_WRITEMASK_SHIFT); + + if (dst.type == NV30SR_OUTPUT) { + if (slot) + hw[3] |= (mask << NV30_VP_INST_SDEST_WRITEMASK_SHIFT); + else + hw[3] |= (mask << NV30_VP_INST_VDEST_WRITEMASK_SHIFT); + } else { + if (slot) + hw[3] |= (mask << NV30_VP_INST_STEMP_WRITEMASK_SHIFT); + else + hw[3] |= (mask << NV30_VP_INST_VTEMP_WRITEMASK_SHIFT); + } + + emit_dst(vpc, hw, slot, dst); + emit_src(vpc, hw, 0, s0); + emit_src(vpc, hw, 1, s1); + emit_src(vpc, hw, 2, s2); +} + +static INLINE struct nv30_sreg +tgsi_src(struct nv30_vpc *vpc, const struct tgsi_full_src_register *fsrc) { + struct nv30_sreg src; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + src = nv30_sr(NV30SR_INPUT, fsrc->SrcRegister.Index); + break; + case TGSI_FILE_CONSTANT: + src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0); + break; + case TGSI_FILE_IMMEDIATE: + src = vpc->imm[fsrc->SrcRegister.Index]; + break; + case TGSI_FILE_TEMPORARY: + if (vpc->high_temp < fsrc->SrcRegister.Index) + vpc->high_temp = fsrc->SrcRegister.Index; + src = nv30_sr(NV30SR_TEMP, fsrc->SrcRegister.Index); + break; + default: + NOUVEAU_ERR("bad src file\n"); + break; + } + + src.abs = fsrc->SrcRegisterExtMod.Absolute; + src.negate = fsrc->SrcRegister.Negate; + src.swz[0] = fsrc->SrcRegister.SwizzleX; + src.swz[1] = fsrc->SrcRegister.SwizzleY; + src.swz[2] = fsrc->SrcRegister.SwizzleZ; + src.swz[3] = fsrc->SrcRegister.SwizzleW; + return src; +} + +static INLINE struct nv30_sreg +tgsi_dst(struct nv30_vpc *vpc, const struct tgsi_full_dst_register *fdst) { + struct nv30_sreg dst; + + switch (fdst->DstRegister.File) { + case TGSI_FILE_OUTPUT: + dst = nv30_sr(NV30SR_OUTPUT, + vpc->output_map[fdst->DstRegister.Index]); + + break; + case TGSI_FILE_TEMPORARY: + dst = nv30_sr(NV30SR_TEMP, fdst->DstRegister.Index); + if (vpc->high_temp < dst.index) + vpc->high_temp = dst.index; + break; + default: + NOUVEAU_ERR("bad dst file\n"); + break; + } + + return dst; +} + +static INLINE int +tgsi_mask(uint tgsi) +{ + int mask = 0; + + if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X; + if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y; + if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z; + if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W; + return mask; +} + +static boolean +nv30_vertprog_parse_instruction(struct nv30_vpc *vpc, + const struct tgsi_full_instruction *finst) +{ + struct nv30_sreg src[3], dst, tmp; + struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0); + int mask; + int ai = -1, ci = -1; + int i; + + if (finst->Instruction.Opcode == TGSI_OPCODE_END) + return TRUE; + + vpc->temp_temp_count = 0; + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = 
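/*
 * Illustrative note: the NV30 vertex engine has a vector unit and a
 * scalar unit, selected by the "slot" argument of nv30_vp_arith() above
 * (0 = vector, 1 = scalar); the writemask is routed to the VDEST/VTEMP
 * or SDEST/STEMP fields accordingly.  By convention, the TGSI opcode
 * switch further below emits scalar ops (EX2, EXP, LG2, LIT, LOG, RCP,
 * RSQ) on slot 1 with their single operand passed as s2.  A hypothetical
 * helper (emit_scalar is not part of this file) making that convention
 * explicit would be:
 *
 *   static void
 *   emit_scalar(struct nv30_vpc *vpc, int op, struct nv30_sreg dst,
 *               int mask, struct nv30_sreg src)
 *   {
 *           struct nv30_sreg none = nv30_sr(NV30SR_NONE, 0);
 *           nv30_vp_arith(vpc, 1, op, dst, mask, none, none, src);
 *   }
 *
 * so that, for example, the RCP case reduces to
 *   emit_scalar(vpc, NV30_VP_INST_OP_RCP, dst, mask, src[0]);
 */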
&finst->FullSrcRegisters[i]; + if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) { + src[i] = tgsi_src(vpc, fsrc); + } + } + + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + if (ai == -1 || ai == fsrc->SrcRegister.Index) { + ai = fsrc->SrcRegister.Index; + src[i] = tgsi_src(vpc, fsrc); + } else { + src[i] = temp(vpc); + arith(vpc, 0, OP_MOV, src[i], MASK_ALL, + tgsi_src(vpc, fsrc), none, none); + } + break; + /*XXX: index comparison is broken now that consts come from + * two different register files. + */ + case TGSI_FILE_CONSTANT: + case TGSI_FILE_IMMEDIATE: + if (ci == -1 || ci == fsrc->SrcRegister.Index) { + ci = fsrc->SrcRegister.Index; + src[i] = tgsi_src(vpc, fsrc); + } else { + src[i] = temp(vpc); + arith(vpc, 0, OP_MOV, src[i], MASK_ALL, + tgsi_src(vpc, fsrc), none, none); + } + break; + case TGSI_FILE_TEMPORARY: + /* handled above */ + break; + default: + NOUVEAU_ERR("bad src file\n"); + return FALSE; + } + } + + dst = tgsi_dst(vpc, &finst->FullDstRegisters[0]); + mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask); + + switch (finst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none); + break; + case TGSI_OPCODE_ADD: + arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]); + break; + case TGSI_OPCODE_ARL: + arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_DP3: + arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DP4: + arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DPH: + arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DST: + arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_EX2: + arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_EXP: + arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_FLR: + arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_FRC: + arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_LG2: + arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_LIT: + arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_LOG: + arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_MAD: + arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]); + break; + case TGSI_OPCODE_MAX: + arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MIN: + arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MOV: + arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_MUL: + arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_POW: + tmp = temp(vpc); + arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none, + swz(src[0], X, X, X, X)); + arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X), + swz(src[1], X, X, X, X), none); + arith(vpc, 1, OP_EX2, dst, mask, none, none, + swz(tmp, X, X, X, X)); + break; + case TGSI_OPCODE_RCP: + arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_RET: + break; + case TGSI_OPCODE_RSQ: + arith(vpc, 1, OP_RSQ, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_SGE: + arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], 
none); + break; + case TGSI_OPCODE_SGT: + arith(vpc, 0, OP_SGT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SLT: + arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SUB: + arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1])); + break; + case TGSI_OPCODE_XPD: + tmp = temp(vpc); + arith(vpc, 0, OP_MUL, tmp, mask, + swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); + arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W), + swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), + neg(tmp)); + break; + default: + NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); + return FALSE; + } + + return TRUE; +} + +static boolean +nv30_vertprog_parse_decl_output(struct nv30_vpc *vpc, + const struct tgsi_full_declaration *fdec) +{ + int hw; + + switch (fdec->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + hw = NV30_VP_INST_DEST_POS; + break; + case TGSI_SEMANTIC_COLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV30_VP_INST_DEST_COL0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV30_VP_INST_DEST_COL1; + } else { + NOUVEAU_ERR("bad colour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_BCOLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV30_VP_INST_DEST_BFC0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV30_VP_INST_DEST_BFC1; + } else { + NOUVEAU_ERR("bad bcolour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_FOG: + hw = NV30_VP_INST_DEST_FOGC; + break; + case TGSI_SEMANTIC_PSIZE: + hw = NV30_VP_INST_DEST_PSZ; + break; + case TGSI_SEMANTIC_GENERIC: + if (fdec->Semantic.SemanticIndex <= 7) { + hw = NV30_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex); + } else { + NOUVEAU_ERR("bad generic semantic index\n"); + return FALSE; + } + break; + default: + NOUVEAU_ERR("bad output semantic\n"); + return FALSE; + } + + vpc->output_map[fdec->DeclarationRange.First] = hw; + return TRUE; +} + +static boolean +nv30_vertprog_prepare(struct nv30_vpc *vpc) +{ + struct tgsi_parse_context p; + int nr_imm = 0; + + tgsi_parse_init(&p, vpc->vp->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&p)) { + const union tgsi_full_token *tok = &p.FullToken; + + tgsi_parse_token(&p); + switch(tok->Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + nr_imm++; + break; + default: + break; + } + } + tgsi_parse_free(&p); + + if (nr_imm) { + vpc->imm = CALLOC(nr_imm, sizeof(struct nv30_sreg)); + assert(vpc->imm); + } + + return TRUE; +} + +static void +nv30_vertprog_translate(struct nv30_context *nv30, + struct nv30_vertex_program *vp) +{ + struct tgsi_parse_context parse; + struct nv30_vpc *vpc = NULL; + + tgsi_dump(vp->pipe.tokens,0); + + vpc = CALLOC(1, sizeof(struct nv30_vpc)); + if (!vpc) + return; + vpc->vp = vp; + vpc->high_temp = -1; + + if (!nv30_vertprog_prepare(vpc)) { + FREE(vpc); + return; + } + + tgsi_parse_init(&parse, vp->pipe.tokens); + + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: + { + const struct tgsi_full_declaration *fdec; + fdec = &parse.FullToken.FullDeclaration; + switch (fdec->Declaration.File) { + case TGSI_FILE_OUTPUT: + if (!nv30_vertprog_parse_decl_output(vpc, fdec)) + goto out_err; + break; + default: + break; + } + } + break; + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + const struct tgsi_full_immediate *imm; + + imm = &parse.FullToken.FullImmediate; + assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); +// assert(imm->Immediate.Size == 4); + 
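/*
 * Illustrative note: constant() distinguishes two kinds of slots by its
 * "pipe" argument.  A value >= 0 tracks a user constant and is refreshed
 * from the bound constant buffer in nv30_vertprog_validate() (the copy
 * from the mapped buffer only runs when vpd->index >= 0), while -1 marks
 * a literal whose stored value is uploaded as-is.  The TGSI immediates
 * handled just below use the literal form.  For example (hypothetical
 * local names, illustration only):
 *
 *   struct nv30_sreg half = constant(vpc, -1, 0.5, 0.5, 0.5, 0.5);
 *   struct nv30_sreg c3   = constant(vpc,  3, 0.0, 0.0, 0.0, 0.0);
 *
 * "half" keeps its literal value for the lifetime of the program, while
 * "c3" is re-read from element 3 of the constant buffer on validation.
 */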
vpc->imm[vpc->nr_imm++] = + constant(vpc, -1, + imm->u.ImmediateFloat32[0].Float, + imm->u.ImmediateFloat32[1].Float, + imm->u.ImmediateFloat32[2].Float, + imm->u.ImmediateFloat32[3].Float); + } + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + const struct tgsi_full_instruction *finst; + finst = &parse.FullToken.FullInstruction; + if (!nv30_vertprog_parse_instruction(vpc, finst)) + goto out_err; + } + break; + default: + break; + } + } + + vp->insns[vp->nr_insns - 1].data[3] |= NV30_VP_INST_LAST; + vp->translated = TRUE; +out_err: + tgsi_parse_free(&parse); + FREE(vpc); +} + +static boolean +nv30_vertprog_validate(struct nv30_context *nv30) +{ + struct nouveau_winsys *nvws = nv30->nvws; + struct pipe_winsys *ws = nv30->pipe.winsys; + struct nouveau_grobj *rankine = nv30->screen->rankine; + struct nv30_vertex_program *vp; + struct pipe_buffer *constbuf; + boolean upload_code = FALSE, upload_data = FALSE; + int i; + + vp = nv30->vertprog; + constbuf = nv30->constbuf[PIPE_SHADER_VERTEX]; + + /* Translate TGSI shader into hw bytecode */ + if (!vp->translated) { + nv30_vertprog_translate(nv30, vp); + if (!vp->translated) + return FALSE; + } + + /* Allocate hw vtxprog exec slots */ + if (!vp->exec) { + struct nouveau_resource *heap = nv30->screen->vp_exec_heap; + struct nouveau_stateobj *so; + uint vplen = vp->nr_insns; + + if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) { + while (heap->next && heap->size < vplen) { + struct nv30_vertex_program *evict; + + evict = heap->next->priv; + nvws->res_free(&evict->exec); + } + + if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) + assert(0); + } + + so = so_new(2, 0); + so_method(so, rankine, NV34TCL_VP_START_FROM_ID, 1); + so_data (so, vp->exec->start); + so_ref(so, &vp->so); + + upload_code = TRUE; + } + + /* Allocate hw vtxprog const slots */ + if (vp->nr_consts && !vp->data) { + struct nouveau_resource *heap = nv30->screen->vp_data_heap; + + if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) { + while (heap->next && heap->size < vp->nr_consts) { + struct nv30_vertex_program *evict; + + evict = heap->next->priv; + nvws->res_free(&evict->data); + } + + if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) + assert(0); + } + + /*XXX: handle this some day */ + assert(vp->data->start >= vp->data_start_min); + + upload_data = TRUE; + if (vp->data_start != vp->data->start) + upload_code = TRUE; + } + + /* If exec or data segments moved we need to patch the program to + * fixup offsets and register IDs. 
+ */ + if (vp->exec_start != vp->exec->start) { + for (i = 0; i < vp->nr_insns; i++) { + struct nv30_vertex_program_exec *vpi = &vp->insns[i]; + + if (vpi->has_branch_offset) { + assert(0); + } + } + + vp->exec_start = vp->exec->start; + } + + if (vp->nr_consts && vp->data_start != vp->data->start) { + for (i = 0; i < vp->nr_insns; i++) { + struct nv30_vertex_program_exec *vpi = &vp->insns[i]; + + if (vpi->const_index >= 0) { + vpi->data[1] &= ~NV30_VP_INST_CONST_SRC_MASK; + vpi->data[1] |= + (vpi->const_index + vp->data->start) << + NV30_VP_INST_CONST_SRC_SHIFT; + + } + } + + vp->data_start = vp->data->start; + } + + /* Update + Upload constant values */ + if (vp->nr_consts) { + float *map = NULL; + + if (constbuf) { + map = ws->buffer_map(ws, constbuf, + PIPE_BUFFER_USAGE_CPU_READ); + } + + for (i = 0; i < vp->nr_consts; i++) { + struct nv30_vertex_program_data *vpd = &vp->consts[i]; + + if (vpd->index >= 0) { + if (!upload_data && + !memcmp(vpd->value, &map[vpd->index * 4], + 4 * sizeof(float))) + continue; + memcpy(vpd->value, &map[vpd->index * 4], + 4 * sizeof(float)); + } + + BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_CONST_ID, 5); + OUT_RING (i + vp->data->start); + OUT_RINGp ((uint32_t *)vpd->value, 4); + } + + if (constbuf) { + ws->buffer_unmap(ws, constbuf); + } + } + + /* Upload vtxprog */ + if (upload_code) { +#if 0 + for (i = 0; i < vp->nr_insns; i++) { + NOUVEAU_MSG("VP inst %d: 0x%08x 0x%08x 0x%08x 0x%08x\n", + i, vp->insns[i].data[0], vp->insns[i].data[1], + vp->insns[i].data[2], vp->insns[i].data[3]); + } +#endif + BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_FROM_ID, 1); + OUT_RING (vp->exec->start); + for (i = 0; i < vp->nr_insns; i++) { + BEGIN_RING(rankine, NV34TCL_VP_UPLOAD_INST(0), 4); + OUT_RINGp (vp->insns[i].data, 4); + } + } + + if (vp->so != nv30->state.hw[NV30_STATE_VERTPROG]) { + so_ref(vp->so, &nv30->state.hw[NV30_STATE_VERTPROG]); + return TRUE; + } + + return FALSE; +} + +void +nv30_vertprog_destroy(struct nv30_context *nv30, struct nv30_vertex_program *vp) +{ + struct nouveau_winsys *nvws = nv30->screen->nvws; + + vp->translated = FALSE; + + if (vp->nr_insns) { + FREE(vp->insns); + vp->insns = NULL; + vp->nr_insns = 0; + } + + if (vp->nr_consts) { + FREE(vp->consts); + vp->consts = NULL; + vp->nr_consts = 0; + } + + nvws->res_free(&vp->exec); + vp->exec_start = 0; + nvws->res_free(&vp->data); + vp->data_start = 0; + vp->data_start_min = 0; + + vp->ir = vp->or = 0; + so_ref(NULL, &vp->so); +} + +struct nv30_state_entry nv30_state_vertprog = { + .validate = nv30_vertprog_validate, + .dirty = { + .pipe = NV30_NEW_VERTPROG /*| NV30_NEW_UCP*/, + .hw = NV30_STATE_VERTPROG, + } +}; diff --git a/src/gallium/drivers/nv40/Makefile b/src/gallium/drivers/nv40/Makefile new file mode 100644 index 0000000000..9c8eadf7e4 --- /dev/null +++ b/src/gallium/drivers/nv40/Makefile @@ -0,0 +1,37 @@ +TOP = ../../../.. 
+include $(TOP)/configs/current + +LIBNAME = nv40 + +DRIVER_SOURCES = \ + nv40_clear.c \ + nv40_context.c \ + nv40_draw.c \ + nv40_fragprog.c \ + nv40_fragtex.c \ + nv40_miptree.c \ + nv40_query.c \ + nv40_screen.c \ + nv40_state.c \ + nv40_state_blend.c \ + nv40_state_emit.c \ + nv40_state_fb.c \ + nv40_state_rasterizer.c \ + nv40_state_scissor.c \ + nv40_state_stipple.c \ + nv40_state_viewport.c \ + nv40_state_zsa.c \ + nv40_surface.c \ + nv40_vbo.c \ + nv40_vertprog.c + +C_SOURCES = \ + $(COMMON_SOURCES) \ + $(DRIVER_SOURCES) + +ASM_SOURCES = + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/nv40/nv40_clear.c b/src/gallium/drivers/nv40/nv40_clear.c new file mode 100644 index 0000000000..59efd620e3 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_clear.c @@ -0,0 +1,13 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv40_context.h" + +void +nv40_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); + ps->status = PIPE_SURFACE_STATUS_CLEAR; +} diff --git a/src/gallium/drivers/nv40/nv40_context.c b/src/gallium/drivers/nv40/nv40_context.c new file mode 100644 index 0000000000..cc63dd734b --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_context.c @@ -0,0 +1,72 @@ +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_winsys.h" + +#include "nv40_context.h" +#include "nv40_screen.h" + +static void +nv40_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + if (flags & PIPE_FLUSH_TEXTURE_CACHE) { + BEGIN_RING(curie, 0x1fd8, 1); + OUT_RING (2); + BEGIN_RING(curie, 0x1fd8, 1); + OUT_RING (1); + } + + FIRE_RING(fence); +} + +static void +nv40_destroy(struct pipe_context *pipe) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + if (nv40->draw) + draw_destroy(nv40->draw); + FREE(nv40); +} + +struct pipe_context * +nv40_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ + struct nv40_screen *screen = nv40_screen(pscreen); + struct pipe_winsys *ws = pscreen->winsys; + struct nv40_context *nv40; + struct nouveau_winsys *nvws = screen->nvws; + + nv40 = CALLOC(1, sizeof(struct nv40_context)); + if (!nv40) + return NULL; + nv40->screen = screen; + nv40->pctx_id = pctx_id; + + nv40->nvws = nvws; + + nv40->pipe.winsys = ws; + nv40->pipe.screen = pscreen; + nv40->pipe.destroy = nv40_destroy; + nv40->pipe.draw_arrays = nv40_draw_arrays; + nv40->pipe.draw_elements = nv40_draw_elements; + nv40->pipe.clear = nv40_clear; + nv40->pipe.flush = nv40_flush; + + nv40_init_query_functions(nv40); + nv40_init_surface_functions(nv40); + nv40_init_state_functions(nv40); + + /* Create, configure, and install fallback swtnl path */ + nv40->draw = draw_create(); + draw_wide_point_threshold(nv40->draw, 9999999.0); + draw_wide_line_threshold(nv40->draw, 9999999.0); + draw_enable_line_stipple(nv40->draw, FALSE); + draw_enable_point_sprites(nv40->draw, FALSE); + draw_set_rasterize_stage(nv40->draw, nv40_draw_render_stage(nv40)); + + return &nv40->pipe; +} + diff --git a/src/gallium/drivers/nv40/nv40_context.h b/src/gallium/drivers/nv40/nv40_context.h new file mode 100644 index 0000000000..adcfbdd85a --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_context.h @@ -0,0 +1,233 @@ +#ifndef __NV40_CONTEXT_H__ +#define __NV40_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" 
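/*
 * Illustrative note: this header declares the dirty-state machinery used
 * by the nv40 driver (nv30 carries a parallel copy).  Each piece of
 * hardware state is described by an nv40_state_entry: a validate()
 * callback plus a mask of NV40_NEW_* pipe-level dirty bits and an
 * NV40_STATE_* hardware slot.  validate() builds a nouveau_stateobj,
 * installs it in nv40->state.hw[] with so_ref(), and returns TRUE when
 * the slot changed so the generic validate/emit path re-emits it.  A
 * hypothetical entry would follow this pattern (EXAMPLE_METHOD,
 * NV40_STATE_EXAMPLE and NV40_NEW_EXAMPLE are placeholders, not real
 * identifiers):
 *
 *   static boolean
 *   nv40_example_validate(struct nv40_context *nv40)
 *   {
 *           struct nouveau_stateobj *so = so_new(2, 0);
 *
 *           so_method(so, nv40->screen->curie, EXAMPLE_METHOD, 1);
 *           so_data  (so, 0);
 *           so_ref(so, &nv40->state.hw[NV40_STATE_EXAMPLE]);
 *           return TRUE;
 *   }
 *
 *   struct nv40_state_entry nv40_state_example = {
 *           .validate = nv40_example_validate,
 *           .dirty = { .pipe = NV40_NEW_EXAMPLE, .hw = NV40_STATE_EXAMPLE }
 *   };
 */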
+#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx) \ + struct nv40_screen *ctx = nv40->screen +#include "nouveau/nouveau_push.h" +#include "nouveau/nouveau_stateobj.h" + +#include "nv40_state.h" + +#define NOUVEAU_ERR(fmt, args...) \ + fprintf(stderr, "%s:%d - "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) \ + fprintf(stderr, "nouveau: "fmt, ##args); + +enum nv40_state_index { + NV40_STATE_FB = 0, + NV40_STATE_VIEWPORT = 1, + NV40_STATE_BLEND = 2, + NV40_STATE_RAST = 3, + NV40_STATE_ZSA = 4, + NV40_STATE_BCOL = 5, + NV40_STATE_CLIP = 6, + NV40_STATE_SCISSOR = 7, + NV40_STATE_STIPPLE = 8, + NV40_STATE_FRAGPROG = 9, + NV40_STATE_VERTPROG = 10, + NV40_STATE_FRAGTEX0 = 11, + NV40_STATE_FRAGTEX1 = 12, + NV40_STATE_FRAGTEX2 = 13, + NV40_STATE_FRAGTEX3 = 14, + NV40_STATE_FRAGTEX4 = 15, + NV40_STATE_FRAGTEX5 = 16, + NV40_STATE_FRAGTEX6 = 17, + NV40_STATE_FRAGTEX7 = 18, + NV40_STATE_FRAGTEX8 = 19, + NV40_STATE_FRAGTEX9 = 20, + NV40_STATE_FRAGTEX10 = 21, + NV40_STATE_FRAGTEX11 = 22, + NV40_STATE_FRAGTEX12 = 23, + NV40_STATE_FRAGTEX13 = 24, + NV40_STATE_FRAGTEX14 = 25, + NV40_STATE_FRAGTEX15 = 26, + NV40_STATE_VERTTEX0 = 27, + NV40_STATE_VERTTEX1 = 28, + NV40_STATE_VERTTEX2 = 29, + NV40_STATE_VERTTEX3 = 30, + NV40_STATE_VTXBUF = 31, + NV40_STATE_VTXFMT = 32, + NV40_STATE_VTXATTR = 33, + NV40_STATE_MAX = 34 +}; + +#include "nv40_screen.h" + +#define NV40_NEW_BLEND (1 << 0) +#define NV40_NEW_RAST (1 << 1) +#define NV40_NEW_ZSA (1 << 2) +#define NV40_NEW_SAMPLER (1 << 3) +#define NV40_NEW_FB (1 << 4) +#define NV40_NEW_STIPPLE (1 << 5) +#define NV40_NEW_SCISSOR (1 << 6) +#define NV40_NEW_VIEWPORT (1 << 7) +#define NV40_NEW_BCOL (1 << 8) +#define NV40_NEW_VERTPROG (1 << 9) +#define NV40_NEW_FRAGPROG (1 << 10) +#define NV40_NEW_ARRAYS (1 << 11) +#define NV40_NEW_UCP (1 << 12) + +struct nv40_rasterizer_state { + struct pipe_rasterizer_state pipe; + struct nouveau_stateobj *so; +}; + +struct nv40_zsa_state { + struct pipe_depth_stencil_alpha_state pipe; + struct nouveau_stateobj *so; +}; + +struct nv40_blend_state { + struct pipe_blend_state pipe; + struct nouveau_stateobj *so; +}; + + +struct nv40_state { + unsigned scissor_enabled; + unsigned stipple_enabled; + unsigned viewport_bypass; + unsigned fp_samplers; + + uint64_t dirty; + struct nouveau_stateobj *hw[NV40_STATE_MAX]; +}; + +struct nv40_context { + struct pipe_context pipe; + + struct nouveau_winsys *nvws; + struct nv40_screen *screen; + unsigned pctx_id; + + struct draw_context *draw; + + /* HW state derived from pipe states */ + struct nv40_state state; + struct { + struct nv40_vertex_program *vertprog; + + unsigned nr_attribs; + unsigned hw[PIPE_MAX_SHADER_INPUTS]; + unsigned draw[PIPE_MAX_SHADER_INPUTS]; + unsigned emit[PIPE_MAX_SHADER_INPUTS]; + } swtnl; + + enum { + HW, SWTNL, SWRAST + } render_mode; + unsigned fallback_swtnl; + unsigned fallback_swrast; + + /* Context state */ + unsigned dirty, draw_dirty; + struct pipe_scissor_state scissor; + unsigned stipple[32]; + struct pipe_clip_state clip; + struct nv40_vertex_program *vertprog; + struct nv40_fragment_program *fragprog; + struct pipe_buffer *constbuf[PIPE_SHADER_TYPES]; + unsigned constbuf_nr[PIPE_SHADER_TYPES]; + struct nv40_rasterizer_state *rasterizer; + struct nv40_zsa_state *zsa; + struct nv40_blend_state *blend; + struct pipe_blend_color blend_colour; + struct 
pipe_viewport_state viewport; + struct pipe_framebuffer_state framebuffer; + struct pipe_buffer *idxbuf; + unsigned idxbuf_format; + struct nv40_sampler_state *tex_sampler[PIPE_MAX_SAMPLERS]; + struct nv40_miptree *tex_miptree[PIPE_MAX_SAMPLERS]; + unsigned nr_samplers; + unsigned nr_textures; + unsigned dirty_samplers; + struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; + unsigned vtxbuf_nr; + struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; + unsigned vtxelt_nr; + const unsigned *edgeflags; +}; + +static INLINE struct nv40_context * +nv40_context(struct pipe_context *pipe) +{ + return (struct nv40_context *)pipe; +} + +struct nv40_state_entry { + boolean (*validate)(struct nv40_context *nv40); + struct { + unsigned pipe; + unsigned hw; + } dirty; +}; + +extern void nv40_init_state_functions(struct nv40_context *nv40); +extern void nv40_init_surface_functions(struct nv40_context *nv40); +extern void nv40_init_query_functions(struct nv40_context *nv40); + +extern void nv40_screen_init_miptree_functions(struct pipe_screen *pscreen); + +/* nv40_draw.c */ +extern struct draw_stage *nv40_draw_render_stage(struct nv40_context *nv40); +extern boolean nv40_draw_elements_swtnl(struct pipe_context *pipe, + struct pipe_buffer *idxbuf, + unsigned ib_size, unsigned mode, + unsigned start, unsigned count); + +/* nv40_vertprog.c */ +extern void nv40_vertprog_destroy(struct nv40_context *, + struct nv40_vertex_program *); + +/* nv40_fragprog.c */ +extern void nv40_fragprog_destroy(struct nv40_context *, + struct nv40_fragment_program *); + +/* nv40_fragtex.c */ +extern void nv40_fragtex_bind(struct nv40_context *); + +/* nv40_state.c and friends */ +extern boolean nv40_state_validate(struct nv40_context *nv40); +extern boolean nv40_state_validate_swtnl(struct nv40_context *nv40); +extern void nv40_state_emit(struct nv40_context *nv40); +extern struct nv40_state_entry nv40_state_rasterizer; +extern struct nv40_state_entry nv40_state_scissor; +extern struct nv40_state_entry nv40_state_stipple; +extern struct nv40_state_entry nv40_state_fragprog; +extern struct nv40_state_entry nv40_state_vertprog; +extern struct nv40_state_entry nv40_state_blend; +extern struct nv40_state_entry nv40_state_blend_colour; +extern struct nv40_state_entry nv40_state_zsa; +extern struct nv40_state_entry nv40_state_viewport; +extern struct nv40_state_entry nv40_state_framebuffer; +extern struct nv40_state_entry nv40_state_fragtex; +extern struct nv40_state_entry nv40_state_vbo; +extern struct nv40_state_entry nv40_state_vtxfmt; + +/* nv40_vbo.c */ +extern boolean nv40_draw_arrays(struct pipe_context *, unsigned mode, + unsigned start, unsigned count); +extern boolean nv40_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned mode, unsigned start, + unsigned count); + +/* nv40_clear.c */ +extern void nv40_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + +#endif diff --git a/src/gallium/drivers/nv40/nv40_draw.c b/src/gallium/drivers/nv40/nv40_draw.c new file mode 100644 index 0000000000..8e56cdc2fe --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_draw.c @@ -0,0 +1,349 @@ +#include "pipe/p_shader_tokens.h" + +#include "util/u_pack_color.h" + +#include "draw/draw_context.h" +#include "draw/draw_vertex.h" +#include "draw/draw_pipe.h" + +#include "nv40_context.h" +#define NV40_SHADER_NO_FUCKEDNESS +#include "nv40_shader.h" + +/* Simple, but crappy, swtnl path, hopefully we wont need to hit this very + * often at all. 
Uses "quadro style" vertex submission + a fixed vertex + * layout to avoid the need to generate a vertex program or vtxfmt. + */ + +struct nv40_render_stage { + struct draw_stage stage; + struct nv40_context *nv40; + unsigned prim; +}; + +static INLINE struct nv40_render_stage * +nv40_render_stage(struct draw_stage *stage) +{ + return (struct nv40_render_stage *)stage; +} + +static INLINE void +nv40_render_vertex(struct nv40_context *nv40, const struct vertex_header *v) +{ + unsigned i; + + for (i = 0; i < nv40->swtnl.nr_attribs; i++) { + unsigned idx = nv40->swtnl.draw[i]; + unsigned hw = nv40->swtnl.hw[i]; + + switch (nv40->swtnl.emit[i]) { + case EMIT_OMIT: + break; + case EMIT_1F: + BEGIN_RING(curie, NV40TCL_VTX_ATTR_1F(hw), 1); + OUT_RING (fui(v->data[idx][0])); + break; + case EMIT_2F: + BEGIN_RING(curie, NV40TCL_VTX_ATTR_2F_X(hw), 2); + OUT_RING (fui(v->data[idx][0])); + OUT_RING (fui(v->data[idx][1])); + break; + case EMIT_3F: + BEGIN_RING(curie, NV40TCL_VTX_ATTR_3F_X(hw), 3); + OUT_RING (fui(v->data[idx][0])); + OUT_RING (fui(v->data[idx][1])); + OUT_RING (fui(v->data[idx][2])); + break; + case EMIT_4F: + BEGIN_RING(curie, NV40TCL_VTX_ATTR_4F_X(hw), 4); + OUT_RING (fui(v->data[idx][0])); + OUT_RING (fui(v->data[idx][1])); + OUT_RING (fui(v->data[idx][2])); + OUT_RING (fui(v->data[idx][3])); + break; + case EMIT_4UB: + BEGIN_RING(curie, NV40TCL_VTX_ATTR_4UB(hw), 1); + OUT_RING (pack_ub4(float_to_ubyte(v->data[idx][0]), + float_to_ubyte(v->data[idx][1]), + float_to_ubyte(v->data[idx][2]), + float_to_ubyte(v->data[idx][3]))); + break; + default: + assert(0); + break; + } + } +} + +static INLINE void +nv40_render_prim(struct draw_stage *stage, struct prim_header *prim, + unsigned mode, unsigned count) +{ + struct nv40_render_stage *rs = nv40_render_stage(stage); + struct nv40_context *nv40 = rs->nv40; + struct nouveau_pushbuf *pb = nv40->nvws->channel->pushbuf; + unsigned i; + + /* Ensure there's room for 4xfloat32 + potentially 3 begin/end */ + if (pb->remaining < ((count * 20) + 6)) { + if (rs->prim != NV40TCL_BEGIN_END_STOP) { + NOUVEAU_ERR("AIII, missed flush\n"); + assert(0); + } + FIRE_RING(NULL); + nv40_state_emit(nv40); + } + + /* Switch primitive modes if necessary */ + if (rs->prim != mode) { + if (rs->prim != NV40TCL_BEGIN_END_STOP) { + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (NV40TCL_BEGIN_END_STOP); + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (mode); + rs->prim = mode; + } + + /* Emit vertex data */ + for (i = 0; i < count; i++) + nv40_render_vertex(nv40, prim->v[i]); + + /* If it's likely we'll need to empty the push buffer soon, finish + * off the primitive now. 
+ */ + if (pb->remaining < ((count * 20) + 6)) { + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (NV40TCL_BEGIN_END_STOP); + rs->prim = NV40TCL_BEGIN_END_STOP; + } +} + +static void +nv40_render_point(struct draw_stage *draw, struct prim_header *prim) +{ + nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_POINTS, 1); +} + +static void +nv40_render_line(struct draw_stage *draw, struct prim_header *prim) +{ + nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_LINES, 2); +} + +static void +nv40_render_tri(struct draw_stage *draw, struct prim_header *prim) +{ + nv40_render_prim(draw, prim, NV40TCL_BEGIN_END_TRIANGLES, 3); +} + +static void +nv40_render_flush(struct draw_stage *draw, unsigned flags) +{ + struct nv40_render_stage *rs = nv40_render_stage(draw); + struct nv40_context *nv40 = rs->nv40; + + if (rs->prim != NV40TCL_BEGIN_END_STOP) { + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (NV40TCL_BEGIN_END_STOP); + rs->prim = NV40TCL_BEGIN_END_STOP; + } +} + +static void +nv40_render_reset_stipple_counter(struct draw_stage *draw) +{ +} + +static void +nv40_render_destroy(struct draw_stage *draw) +{ + FREE(draw); +} + +static INLINE void +emit_mov(struct nv40_vertex_program *vp, + unsigned dst, unsigned src, unsigned vor, unsigned mask) +{ + struct nv40_vertex_program_exec *inst; + + vp->insns = realloc(vp->insns, + sizeof(struct nv40_vertex_program_exec) * + ++vp->nr_insns); + inst = &vp->insns[vp->nr_insns - 1]; + + inst->data[0] = 0x401f9c6c; + inst->data[1] = 0x0040000d | (src << 8); + inst->data[2] = 0x8106c083; + inst->data[3] = 0x6041ff80 | (dst << 2) | (mask << 13); + inst->const_index = -1; + inst->has_branch_offset = FALSE; + + vp->ir |= (1 << src); + if (vor != ~0) + vp->or |= (1 << vor); +} + +static struct nv40_vertex_program * +create_drawvp(struct nv40_context *nv40) +{ + struct nv40_vertex_program *vp = CALLOC_STRUCT(nv40_vertex_program); + unsigned i; + + emit_mov(vp, NV40_VP_INST_DEST_POS, 0, ~0, 0xf); + emit_mov(vp, NV40_VP_INST_DEST_COL0, 3, 0, 0xf); + emit_mov(vp, NV40_VP_INST_DEST_COL1, 4, 1, 0xf); + emit_mov(vp, NV40_VP_INST_DEST_BFC0, 3, 2, 0xf); + emit_mov(vp, NV40_VP_INST_DEST_BFC1, 4, 3, 0xf); + emit_mov(vp, NV40_VP_INST_DEST_FOGC, 5, 4, 0x8); + for (i = 0; i < 8; i++) + emit_mov(vp, NV40_VP_INST_DEST_TC(i), 8 + i, 14 + i, 0xf); + + vp->insns[vp->nr_insns - 1].data[3] |= 1; + vp->translated = TRUE; + return vp; +} + +struct draw_stage * +nv40_draw_render_stage(struct nv40_context *nv40) +{ + struct nv40_render_stage *render = CALLOC_STRUCT(nv40_render_stage); + + if (!nv40->swtnl.vertprog) + nv40->swtnl.vertprog = create_drawvp(nv40); + + render->nv40 = nv40; + render->stage.draw = nv40->draw; + render->stage.point = nv40_render_point; + render->stage.line = nv40_render_line; + render->stage.tri = nv40_render_tri; + render->stage.flush = nv40_render_flush; + render->stage.reset_stipple_counter = nv40_render_reset_stipple_counter; + render->stage.destroy = nv40_render_destroy; + + return &render->stage; +} + +boolean +nv40_draw_elements_swtnl(struct pipe_context *pipe, + struct pipe_buffer *idxbuf, unsigned idxbuf_size, + unsigned mode, unsigned start, unsigned count) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + unsigned i; + void *map; + + if (!nv40_state_validate_swtnl(nv40)) + return FALSE; + nv40->state.dirty &= ~(1ULL << NV40_STATE_VTXBUF); + nv40_state_emit(nv40); + + for (i = 0; i < nv40->vtxbuf_nr; i++) { + map = ws->buffer_map(ws, nv40->vtxbuf[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + 
draw_set_mapped_vertex_buffer(nv40->draw, i, map); + } + + if (idxbuf) { + map = ws->buffer_map(ws, idxbuf, PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_element_buffer(nv40->draw, idxbuf_size, map); + } else { + draw_set_mapped_element_buffer(nv40->draw, 0, NULL); + } + + if (nv40->constbuf[PIPE_SHADER_VERTEX]) { + const unsigned nr = nv40->constbuf_nr[PIPE_SHADER_VERTEX]; + + map = ws->buffer_map(ws, nv40->constbuf[PIPE_SHADER_VERTEX], + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_constant_buffer(nv40->draw, map, nr); + } + + draw_arrays(nv40->draw, mode, start, count); + + for (i = 0; i < nv40->vtxbuf_nr; i++) + ws->buffer_unmap(ws, nv40->vtxbuf[i].buffer); + + if (idxbuf) + ws->buffer_unmap(ws, idxbuf); + + if (nv40->constbuf[PIPE_SHADER_VERTEX]) + ws->buffer_unmap(ws, nv40->constbuf[PIPE_SHADER_VERTEX]); + + draw_flush(nv40->draw); + pipe->flush(pipe, 0, NULL); + + return TRUE; +} + +static INLINE void +emit_attrib(struct nv40_context *nv40, unsigned hw, unsigned emit, + unsigned semantic, unsigned index) +{ + unsigned draw_out = draw_find_vs_output(nv40->draw, semantic, index); + unsigned a = nv40->swtnl.nr_attribs++; + + nv40->swtnl.hw[a] = hw; + nv40->swtnl.emit[a] = emit; + nv40->swtnl.draw[a] = draw_out; +} + +static boolean +nv40_state_vtxfmt_validate(struct nv40_context *nv40) +{ + struct nv40_fragment_program *fp = nv40->fragprog; + unsigned colour = 0, texcoords = 0, fog = 0, i; + + /* Determine needed fragprog inputs */ + for (i = 0; i < fp->info.num_inputs; i++) { + switch (fp->info.input_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + break; + case TGSI_SEMANTIC_COLOR: + colour |= (1 << fp->info.input_semantic_index[i]); + break; + case TGSI_SEMANTIC_GENERIC: + texcoords |= (1 << fp->info.input_semantic_index[i]); + break; + case TGSI_SEMANTIC_FOG: + fog = 1; + break; + default: + assert(0); + } + } + + nv40->swtnl.nr_attribs = 0; + + /* Map draw vtxprog output to hw attribute IDs */ + for (i = 0; i < 2; i++) { + if (!(colour & (1 << i))) + continue; + emit_attrib(nv40, 3 + i, EMIT_4UB, TGSI_SEMANTIC_COLOR, i); + } + + for (i = 0; i < 8; i++) { + if (!(texcoords & (1 << i))) + continue; + emit_attrib(nv40, 8 + i, EMIT_4F, TGSI_SEMANTIC_GENERIC, i); + } + + if (fog) { + emit_attrib(nv40, 5, EMIT_1F, TGSI_SEMANTIC_FOG, 0); + } + + emit_attrib(nv40, 0, EMIT_3F, TGSI_SEMANTIC_POSITION, 0); + + return FALSE; +} + +struct nv40_state_entry nv40_state_vtxfmt = { + .validate = nv40_state_vtxfmt_validate, + .dirty = { + .pipe = NV40_NEW_ARRAYS | NV40_NEW_FRAGPROG, + .hw = 0 + } +}; + diff --git a/src/gallium/drivers/nv40/nv40_fragprog.c b/src/gallium/drivers/nv40/nv40_fragprog.c new file mode 100644 index 0000000000..91dcbebda0 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_fragprog.c @@ -0,0 +1,991 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv40_context.h" + +#define SWZ_X 0 +#define SWZ_Y 1 +#define SWZ_Z 2 +#define SWZ_W 3 +#define MASK_X 1 +#define MASK_Y 2 +#define MASK_Z 4 +#define MASK_W 8 +#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W) +#define DEF_SCALE NV40_FP_OP_DST_SCALE_1X +#define DEF_CTEST NV40_FP_OP_COND_TR +#include "nv40_shader.h" + +#define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w) +#define neg(s) nv40_sr_neg((s)) +#define abs(s) nv40_sr_abs((s)) +#define scale(s,v) nv40_sr_scale((s), NV40_FP_OP_DST_SCALE_##v) + +#define MAX_CONSTS 128 +#define MAX_IMM 32 +struct nv40_fpc { + 
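/*
 * Illustrative note: hardware temporaries are tracked in the r_temps
 * bitmask declared below.  temp() grabs the lowest clear bit and also
 * records it in r_temps_discard, so release_temps(), called after each
 * TGSI instruction, frees per-instruction scratch registers while the
 * temps pre-allocated for declared TGSI temporaries in
 * nv40_fragprog_prepare() stay live.  The core of the allocator is
 * simply:
 *
 *   int idx = ffs(~fpc->r_temps) - 1;     1-based ffs(); idx < 0 means
 *                                         no free register is left
 *   fpc->r_temps         |= (1 << idx);
 *   fpc->r_temps_discard |= (1 << idx);
 */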
struct nv40_fragment_program *fp; + + uint attrib_map[PIPE_MAX_SHADER_INPUTS]; + + unsigned r_temps; + unsigned r_temps_discard; + struct nv40_sreg r_result[PIPE_MAX_SHADER_OUTPUTS]; + struct nv40_sreg *r_temp; + + int num_regs; + + unsigned inst_offset; + unsigned have_const; + + struct { + int pipe; + float vals[4]; + } consts[MAX_CONSTS]; + int nr_consts; + + struct nv40_sreg imm[MAX_IMM]; + unsigned nr_imm; +}; + +static INLINE struct nv40_sreg +temp(struct nv40_fpc *fpc) +{ + int idx = ffs(~fpc->r_temps) - 1; + + if (idx < 0) { + NOUVEAU_ERR("out of temps!!\n"); + assert(0); + return nv40_sr(NV40SR_TEMP, 0); + } + + fpc->r_temps |= (1 << idx); + fpc->r_temps_discard |= (1 << idx); + return nv40_sr(NV40SR_TEMP, idx); +} + +static INLINE void +release_temps(struct nv40_fpc *fpc) +{ + fpc->r_temps &= ~fpc->r_temps_discard; + fpc->r_temps_discard = 0; +} + +static INLINE struct nv40_sreg +constant(struct nv40_fpc *fpc, int pipe, float vals[4]) +{ + int idx; + + if (fpc->nr_consts == MAX_CONSTS) + assert(0); + idx = fpc->nr_consts++; + + fpc->consts[idx].pipe = pipe; + if (pipe == -1) + memcpy(fpc->consts[idx].vals, vals, 4 * sizeof(float)); + return nv40_sr(NV40SR_CONST, idx); +} + +#define arith(cc,s,o,d,m,s0,s1,s2) \ + nv40_fp_arith((cc), (s), NV40_FP_OP_OPCODE_##o, \ + (d), (m), (s0), (s1), (s2)) +#define tex(cc,s,o,u,d,m,s0,s1,s2) \ + nv40_fp_tex((cc), (s), NV40_FP_OP_OPCODE_##o, (u), \ + (d), (m), (s0), none, none) + +static void +grow_insns(struct nv40_fpc *fpc, int size) +{ + struct nv40_fragment_program *fp = fpc->fp; + + fp->insn_len += size; + fp->insn = realloc(fp->insn, sizeof(uint32_t) * fp->insn_len); +} + +static void +emit_src(struct nv40_fpc *fpc, int pos, struct nv40_sreg src) +{ + struct nv40_fragment_program *fp = fpc->fp; + uint32_t *hw = &fp->insn[fpc->inst_offset]; + uint32_t sr = 0; + + switch (src.type) { + case NV40SR_INPUT: + sr |= (NV40_FP_REG_TYPE_INPUT << NV40_FP_REG_TYPE_SHIFT); + hw[0] |= (src.index << NV40_FP_OP_INPUT_SRC_SHIFT); + break; + case NV40SR_OUTPUT: + sr |= NV40_FP_REG_SRC_HALF; + /* fall-through */ + case NV40SR_TEMP: + sr |= (NV40_FP_REG_TYPE_TEMP << NV40_FP_REG_TYPE_SHIFT); + sr |= (src.index << NV40_FP_REG_SRC_SHIFT); + break; + case NV40SR_CONST: + if (!fpc->have_const) { + grow_insns(fpc, 4); + fpc->have_const = 1; + } + + hw = &fp->insn[fpc->inst_offset]; + if (fpc->consts[src.index].pipe >= 0) { + struct nv40_fragment_program_data *fpd; + + fp->consts = realloc(fp->consts, ++fp->nr_consts * + sizeof(*fpd)); + fpd = &fp->consts[fp->nr_consts - 1]; + fpd->offset = fpc->inst_offset + 4; + fpd->index = fpc->consts[src.index].pipe; + memset(&fp->insn[fpd->offset], 0, sizeof(uint32_t) * 4); + } else { + memcpy(&fp->insn[fpc->inst_offset + 4], + fpc->consts[src.index].vals, + sizeof(uint32_t) * 4); + } + + sr |= (NV40_FP_REG_TYPE_CONST << NV40_FP_REG_TYPE_SHIFT); + break; + case NV40SR_NONE: + sr |= (NV40_FP_REG_TYPE_INPUT << NV40_FP_REG_TYPE_SHIFT); + break; + default: + assert(0); + } + + if (src.negate) + sr |= NV40_FP_REG_NEGATE; + + if (src.abs) + hw[1] |= (1 << (29 + pos)); + + sr |= ((src.swz[0] << NV40_FP_REG_SWZ_X_SHIFT) | + (src.swz[1] << NV40_FP_REG_SWZ_Y_SHIFT) | + (src.swz[2] << NV40_FP_REG_SWZ_Z_SHIFT) | + (src.swz[3] << NV40_FP_REG_SWZ_W_SHIFT)); + + hw[pos + 1] |= sr; +} + +static void +emit_dst(struct nv40_fpc *fpc, struct nv40_sreg dst) +{ + struct nv40_fragment_program *fp = fpc->fp; + uint32_t *hw = &fp->insn[fpc->inst_offset]; + + switch (dst.type) { + case NV40SR_TEMP: + if (fpc->num_regs < (dst.index + 1)) + fpc->num_regs 
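/*
 * Illustrative note: NV40 fragment program constants live inside the
 * instruction stream itself; the NV40SR_CONST case in emit_src() above
 * appends four extra dwords right after the instruction via
 * grow_insns(fpc, 4).  Literal values are copied in immediately, while
 * user constants only record a patch site:
 *
 *   fpd->offset = fpc->inst_offset + 4;            dwords to overwrite
 *   fpd->index  = fpc->consts[src.index].pipe;     constant buffer slot
 *
 * nv40_fragprog_validate() later copies the current constant-buffer
 * values over those patch sites and re-uploads the whole program
 * whenever any of them changed.
 */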
= dst.index + 1; + break; + case NV40SR_OUTPUT: + if (dst.index == 1) { + fp->fp_control |= 0xe; + } else { + hw[0] |= NV40_FP_OP_OUT_REG_HALF; + } + break; + case NV40SR_NONE: + hw[0] |= (1 << 30); + break; + default: + assert(0); + } + + hw[0] |= (dst.index << NV40_FP_OP_OUT_REG_SHIFT); +} + +static void +nv40_fp_arith(struct nv40_fpc *fpc, int sat, int op, + struct nv40_sreg dst, int mask, + struct nv40_sreg s0, struct nv40_sreg s1, struct nv40_sreg s2) +{ + struct nv40_fragment_program *fp = fpc->fp; + uint32_t *hw; + + fpc->inst_offset = fp->insn_len; + fpc->have_const = 0; + grow_insns(fpc, 4); + hw = &fp->insn[fpc->inst_offset]; + memset(hw, 0, sizeof(uint32_t) * 4); + + if (op == NV40_FP_OP_OPCODE_KIL) + fp->fp_control |= NV40TCL_FP_CONTROL_KIL; + hw[0] |= (op << NV40_FP_OP_OPCODE_SHIFT); + hw[0] |= (mask << NV40_FP_OP_OUTMASK_SHIFT); + hw[2] |= (dst.dst_scale << NV40_FP_OP_DST_SCALE_SHIFT); + + if (sat) + hw[0] |= NV40_FP_OP_OUT_SAT; + + if (dst.cc_update) + hw[0] |= NV40_FP_OP_COND_WRITE_ENABLE; + hw[1] |= (dst.cc_test << NV40_FP_OP_COND_SHIFT); + hw[1] |= ((dst.cc_swz[0] << NV40_FP_OP_COND_SWZ_X_SHIFT) | + (dst.cc_swz[1] << NV40_FP_OP_COND_SWZ_Y_SHIFT) | + (dst.cc_swz[2] << NV40_FP_OP_COND_SWZ_Z_SHIFT) | + (dst.cc_swz[3] << NV40_FP_OP_COND_SWZ_W_SHIFT)); + + emit_dst(fpc, dst); + emit_src(fpc, 0, s0); + emit_src(fpc, 1, s1); + emit_src(fpc, 2, s2); +} + +static void +nv40_fp_tex(struct nv40_fpc *fpc, int sat, int op, int unit, + struct nv40_sreg dst, int mask, + struct nv40_sreg s0, struct nv40_sreg s1, struct nv40_sreg s2) +{ + struct nv40_fragment_program *fp = fpc->fp; + + nv40_fp_arith(fpc, sat, op, dst, mask, s0, s1, s2); + + fp->insn[fpc->inst_offset] |= (unit << NV40_FP_OP_TEX_UNIT_SHIFT); + fp->samplers |= (1 << unit); +} + +static INLINE struct nv40_sreg +tgsi_src(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc) +{ + struct nv40_sreg src; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + src = nv40_sr(NV40SR_INPUT, + fpc->attrib_map[fsrc->SrcRegister.Index]); + break; + case TGSI_FILE_CONSTANT: + src = constant(fpc, fsrc->SrcRegister.Index, NULL); + break; + case TGSI_FILE_IMMEDIATE: + assert(fsrc->SrcRegister.Index < fpc->nr_imm); + src = fpc->imm[fsrc->SrcRegister.Index]; + break; + case TGSI_FILE_TEMPORARY: + src = fpc->r_temp[fsrc->SrcRegister.Index]; + break; + /* NV40 fragprog result regs are just temps, so this is simple */ + case TGSI_FILE_OUTPUT: + src = fpc->r_result[fsrc->SrcRegister.Index]; + break; + default: + NOUVEAU_ERR("bad src file\n"); + break; + } + + src.abs = fsrc->SrcRegisterExtMod.Absolute; + src.negate = fsrc->SrcRegister.Negate; + src.swz[0] = fsrc->SrcRegister.SwizzleX; + src.swz[1] = fsrc->SrcRegister.SwizzleY; + src.swz[2] = fsrc->SrcRegister.SwizzleZ; + src.swz[3] = fsrc->SrcRegister.SwizzleW; + return src; +} + +static INLINE struct nv40_sreg +tgsi_dst(struct nv40_fpc *fpc, const struct tgsi_full_dst_register *fdst) { + switch (fdst->DstRegister.File) { + case TGSI_FILE_OUTPUT: + return fpc->r_result[fdst->DstRegister.Index]; + case TGSI_FILE_TEMPORARY: + return fpc->r_temp[fdst->DstRegister.Index]; + case TGSI_FILE_NULL: + return nv40_sr(NV40SR_NONE, 0); + default: + NOUVEAU_ERR("bad dst file %d\n", fdst->DstRegister.File); + return nv40_sr(NV40SR_NONE, 0); + } +} + +static INLINE int +tgsi_mask(uint tgsi) +{ + int mask = 0; + + if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X; + if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y; + if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z; + if (tgsi & TGSI_WRITEMASK_W) mask |= 
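/*
 * Illustrative note: nv40_fp_arith() above encodes the dst.cc_update,
 * dst.cc_test and dst.cc_swz fields into the instruction words, which is
 * how one op can write the condition register and a later op can be
 * predicated on it.  The CMP and KIL lowerings further below rely on
 * exactly this; condensed from the KIL case (not new code):
 *
 *   dst = nv40_sr(NV40SR_NONE, 0);
 *   dst.cc_update = 1;
 *   arith(fpc, 0, MOV, dst, MASK_ALL, src[0], none, none);
 *   dst.cc_update = 0;
 *   dst.cc_test = NV40_FP_OP_COND_LT;
 *   arith(fpc, 0, KIL, dst, 0, none, none, none);
 *
 * i.e. load the condition codes from src[0], then kill only where a
 * component is negative.
 */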
MASK_W; + return mask; +} + +static boolean +src_native_swz(struct nv40_fpc *fpc, const struct tgsi_full_src_register *fsrc, + struct nv40_sreg *src) +{ + const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0); + struct nv40_sreg tgsi = tgsi_src(fpc, fsrc); + uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0; + uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX, + fsrc->SrcRegisterExtSwz.NegateY, + fsrc->SrcRegisterExtSwz.NegateZ, + fsrc->SrcRegisterExtSwz.NegateW }; + uint c; + + for (c = 0; c < 4; c++) { + switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + mask |= (1 << c); + break; + case TGSI_EXTSWIZZLE_ZERO: + zero_mask |= (1 << c); + tgsi.swz[c] = SWZ_X; + break; + case TGSI_EXTSWIZZLE_ONE: + one_mask |= (1 << c); + tgsi.swz[c] = SWZ_X; + break; + default: + assert(0); + } + + if (!tgsi.negate && neg[c]) + neg_mask |= (1 << c); + } + + if (mask == MASK_ALL && !neg_mask) + return TRUE; + + *src = temp(fpc); + + if (mask) + arith(fpc, 0, MOV, *src, mask, tgsi, none, none); + + if (zero_mask) + arith(fpc, 0, SFL, *src, zero_mask, *src, none, none); + + if (one_mask) + arith(fpc, 0, STR, *src, one_mask, *src, none, none); + + if (neg_mask) { + struct nv40_sreg one = temp(fpc); + arith(fpc, 0, STR, one, neg_mask, one, none, none); + arith(fpc, 0, MUL, *src, neg_mask, *src, neg(one), none); + } + + return FALSE; +} + +static boolean +nv40_fragprog_parse_instruction(struct nv40_fpc *fpc, + const struct tgsi_full_instruction *finst) +{ + const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0); + struct nv40_sreg src[3], dst, tmp; + int mask, sat, unit; + int ai = -1, ci = -1, ii = -1; + int i; + + if (finst->Instruction.Opcode == TGSI_OPCODE_END) + return TRUE; + + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) { + src[i] = tgsi_src(fpc, fsrc); + } + } + + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + case TGSI_FILE_CONSTANT: + case TGSI_FILE_TEMPORARY: + if (!src_native_swz(fpc, fsrc, &src[i])) + continue; + break; + default: + break; + } + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + if (ai == -1 || ai == fsrc->SrcRegister.Index) { + ai = fsrc->SrcRegister.Index; + src[i] = tgsi_src(fpc, fsrc); + } else { + src[i] = temp(fpc); + arith(fpc, 0, MOV, src[i], MASK_ALL, + tgsi_src(fpc, fsrc), none, none); + } + break; + case TGSI_FILE_CONSTANT: + if ((ci == -1 && ii == -1) || + ci == fsrc->SrcRegister.Index) { + ci = fsrc->SrcRegister.Index; + src[i] = tgsi_src(fpc, fsrc); + } else { + src[i] = temp(fpc); + arith(fpc, 0, MOV, src[i], MASK_ALL, + tgsi_src(fpc, fsrc), none, none); + } + break; + case TGSI_FILE_IMMEDIATE: + if ((ci == -1 && ii == -1) || + ii == fsrc->SrcRegister.Index) { + ii = fsrc->SrcRegister.Index; + src[i] = tgsi_src(fpc, fsrc); + } else { + src[i] = temp(fpc); + arith(fpc, 0, MOV, src[i], MASK_ALL, + tgsi_src(fpc, fsrc), none, none); + } + break; + case TGSI_FILE_TEMPORARY: + /* handled above */ + break; + case TGSI_FILE_SAMPLER: + unit = fsrc->SrcRegister.Index; + break; + case TGSI_FILE_OUTPUT: + break; + default: + NOUVEAU_ERR("bad src file\n"); + return FALSE; + } + } + + dst = tgsi_dst(fpc, &finst->FullDstRegisters[0]); + mask = 
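/*
 * Illustrative note: the hardware source swizzle can only select
 * X/Y/Z/W, so src_native_swz() above lowers TGSI extended swizzles into
 * a temporary whenever ZERO/ONE components or per-component negates are
 * used.  For a source like "src.xy01" the generated sequence is roughly:
 *
 *   MOV tmp.xy, src      components that map directly
 *   SFL tmp.z,  tmp      set-on-false writes 0.0
 *   STR tmp.w,  tmp      set-on-true  writes 1.0
 *
 * Per-component negates are handled by building 1.0 with STR in the
 * affected components and multiplying the temporary by its negation.
 */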
tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask); + sat = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE); + + switch (finst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + arith(fpc, sat, MOV, dst, mask, abs(src[0]), none, none); + break; + case TGSI_OPCODE_ADD: + arith(fpc, sat, ADD, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_CMP: + tmp = temp(fpc); + arith(fpc, sat, MOV, dst, mask, src[2], none, none); + tmp.cc_update = 1; + arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none); + dst.cc_test = NV40_VP_INST_COND_LT; + arith(fpc, sat, MOV, dst, mask, src[1], none, none); + break; + case TGSI_OPCODE_COS: + arith(fpc, sat, COS, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_DDX: + if (mask & (MASK_Z | MASK_W)) { + tmp = temp(fpc); + arith(fpc, sat, DDX, tmp, MASK_X | MASK_Y, + swz(src[0], Z, W, Z, W), none, none); + arith(fpc, 0, MOV, tmp, MASK_Z | MASK_W, + swz(tmp, X, Y, X, Y), none, none); + arith(fpc, sat, DDX, tmp, MASK_X | MASK_Y, src[0], + none, none); + arith(fpc, 0, MOV, dst, mask, tmp, none, none); + } else { + arith(fpc, sat, DDX, dst, mask, src[0], none, none); + } + break; + case TGSI_OPCODE_DDY: + if (mask & (MASK_Z | MASK_W)) { + tmp = temp(fpc); + arith(fpc, sat, DDY, tmp, MASK_X | MASK_Y, + swz(src[0], Z, W, Z, W), none, none); + arith(fpc, 0, MOV, tmp, MASK_Z | MASK_W, + swz(tmp, X, Y, X, Y), none, none); + arith(fpc, sat, DDY, tmp, MASK_X | MASK_Y, src[0], + none, none); + arith(fpc, 0, MOV, dst, mask, tmp, none, none); + } else { + arith(fpc, sat, DDY, dst, mask, src[0], none, none); + } + break; + case TGSI_OPCODE_DP3: + arith(fpc, sat, DP3, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DP4: + arith(fpc, sat, DP4, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DPH: + tmp = temp(fpc); + arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[1], none); + arith(fpc, sat, ADD, dst, mask, swz(tmp, X, X, X, X), + swz(src[1], W, W, W, W), none); + break; + case TGSI_OPCODE_DST: + arith(fpc, sat, DST, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_EX2: + arith(fpc, sat, EX2, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_FLR: + arith(fpc, sat, FLR, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_FRC: + arith(fpc, sat, FRC, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_KILP: + arith(fpc, 0, KIL, none, 0, none, none, none); + break; + case TGSI_OPCODE_KIL: + dst = nv40_sr(NV40SR_NONE, 0); + dst.cc_update = 1; + arith(fpc, 0, MOV, dst, MASK_ALL, src[0], none, none); + dst.cc_update = 0; dst.cc_test = NV40_FP_OP_COND_LT; + arith(fpc, 0, KIL, dst, 0, none, none, none); + break; + case TGSI_OPCODE_LG2: + arith(fpc, sat, LG2, dst, mask, src[0], none, none); + break; +// case TGSI_OPCODE_LIT: + case TGSI_OPCODE_LRP: + tmp = temp(fpc); + arith(fpc, 0, MAD, tmp, mask, neg(src[0]), src[2], src[2]); + arith(fpc, sat, MAD, dst, mask, src[0], src[1], tmp); + break; + case TGSI_OPCODE_MAD: + arith(fpc, sat, MAD, dst, mask, src[0], src[1], src[2]); + break; + case TGSI_OPCODE_MAX: + arith(fpc, sat, MAX, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MIN: + arith(fpc, sat, MIN, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MOV: + arith(fpc, sat, MOV, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_MUL: + arith(fpc, sat, MUL, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_NOISE1: + case TGSI_OPCODE_NOISE2: + case TGSI_OPCODE_NOISE3: + case TGSI_OPCODE_NOISE4: + arith(fpc, sat, SFL, dst, mask, none, none, none); + break; + 
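/*
 * Illustrative note: the POW and RSQ cases below are built from LG2/EX2
 * plus the per-destination scale modifier, using the identities
 *
 *   pow(x, y) = 2^(y * log2(x))       LG2, MUL, EX2 on the .x lane
 *   rsq(x)    = 2^(-0.5 * log2|x|)    LG2 with the INV_2X dst scale,
 *                                     then EX2 of the negated result
 *
 * The scale() modifier maps to NV40_FP_OP_DST_SCALE_* and supplies the
 * 0.5 factor without an extra multiply.
 */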
case TGSI_OPCODE_POW: + tmp = temp(fpc); + arith(fpc, 0, LG2, tmp, MASK_X, + swz(src[0], X, X, X, X), none, none); + arith(fpc, 0, MUL, tmp, MASK_X, swz(tmp, X, X, X, X), + swz(src[1], X, X, X, X), none); + arith(fpc, sat, EX2, dst, mask, + swz(tmp, X, X, X, X), none, none); + break; + case TGSI_OPCODE_RCP: + arith(fpc, sat, RCP, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_RET: + assert(0); + break; + case TGSI_OPCODE_RFL: + tmp = temp(fpc); + arith(fpc, 0, DP3, tmp, MASK_X, src[0], src[0], none); + arith(fpc, 0, DP3, tmp, MASK_Y, src[0], src[1], none); + arith(fpc, 0, DIV, scale(tmp, 2X), MASK_Z, + swz(tmp, Y, Y, Y, Y), swz(tmp, X, X, X, X), none); + arith(fpc, sat, MAD, dst, mask, + swz(tmp, Z, Z, Z, Z), src[0], neg(src[1])); + break; + case TGSI_OPCODE_RSQ: + tmp = temp(fpc); + arith(fpc, 0, LG2, scale(tmp, INV_2X), MASK_X, + abs(swz(src[0], X, X, X, X)), none, none); + arith(fpc, sat, EX2, dst, mask, + neg(swz(tmp, X, X, X, X)), none, none); + break; + case TGSI_OPCODE_SCS: + if (mask & MASK_X) { + arith(fpc, sat, COS, dst, MASK_X, + swz(src[0], X, X, X, X), none, none); + } + if (mask & MASK_Y) { + arith(fpc, sat, SIN, dst, MASK_Y, + swz(src[0], X, X, X, X), none, none); + } + break; + case TGSI_OPCODE_SEQ: + arith(fpc, sat, SEQ, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SFL: + arith(fpc, sat, SFL, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SGE: + arith(fpc, sat, SGE, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SGT: + arith(fpc, sat, SGT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SIN: + arith(fpc, sat, SIN, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_SLE: + arith(fpc, sat, SLE, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SLT: + arith(fpc, sat, SLT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SNE: + arith(fpc, sat, SNE, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_STR: + arith(fpc, sat, STR, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SUB: + arith(fpc, sat, ADD, dst, mask, src[0], neg(src[1]), none); + break; + case TGSI_OPCODE_TEX: + tex(fpc, sat, TEX, unit, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_TXB: + tex(fpc, sat, TXB, unit, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_TXP: + tex(fpc, sat, TXP, unit, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_XPD: + tmp = temp(fpc); + arith(fpc, 0, MUL, tmp, mask, + swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); + arith(fpc, sat, MAD, dst, (mask & ~MASK_W), + swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), + neg(tmp)); + break; + default: + NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); + return FALSE; + } + + release_temps(fpc); + return TRUE; +} + +static boolean +nv40_fragprog_parse_decl_attrib(struct nv40_fpc *fpc, + const struct tgsi_full_declaration *fdec) +{ + int hw; + + switch (fdec->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + hw = NV40_FP_OP_INPUT_SRC_POSITION; + break; + case TGSI_SEMANTIC_COLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV40_FP_OP_INPUT_SRC_COL0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV40_FP_OP_INPUT_SRC_COL1; + } else { + NOUVEAU_ERR("bad colour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_FOG: + hw = NV40_FP_OP_INPUT_SRC_FOGC; + break; + case TGSI_SEMANTIC_GENERIC: + if (fdec->Semantic.SemanticIndex <= 7) { + hw = NV40_FP_OP_INPUT_SRC_TC(fdec->Semantic. 
+ SemanticIndex); + } else { + NOUVEAU_ERR("bad generic semantic index\n"); + return FALSE; + } + break; + default: + NOUVEAU_ERR("bad input semantic\n"); + return FALSE; + } + + fpc->attrib_map[fdec->DeclarationRange.First] = hw; + return TRUE; +} + +static boolean +nv40_fragprog_parse_decl_output(struct nv40_fpc *fpc, + const struct tgsi_full_declaration *fdec) +{ + unsigned idx = fdec->DeclarationRange.First; + unsigned hw; + + switch (fdec->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + hw = 1; + break; + case TGSI_SEMANTIC_COLOR: + switch (fdec->Semantic.SemanticIndex) { + case 0: hw = 0; break; + case 1: hw = 2; break; + case 2: hw = 3; break; + case 3: hw = 4; break; + default: + NOUVEAU_ERR("bad rcol index\n"); + return FALSE; + } + break; + default: + NOUVEAU_ERR("bad output semantic\n"); + return FALSE; + } + + fpc->r_result[idx] = nv40_sr(NV40SR_OUTPUT, hw); + fpc->r_temps |= (1 << hw); + return TRUE; +} + +static boolean +nv40_fragprog_prepare(struct nv40_fpc *fpc) +{ + struct tgsi_parse_context p; + int high_temp = -1, i; + + tgsi_parse_init(&p, fpc->fp->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&p)) { + const union tgsi_full_token *tok = &p.FullToken; + + tgsi_parse_token(&p); + switch(tok->Token.Type) { + case TGSI_TOKEN_TYPE_DECLARATION: + { + const struct tgsi_full_declaration *fdec; + fdec = &p.FullToken.FullDeclaration; + switch (fdec->Declaration.File) { + case TGSI_FILE_INPUT: + if (!nv40_fragprog_parse_decl_attrib(fpc, fdec)) + goto out_err; + break; + case TGSI_FILE_OUTPUT: + if (!nv40_fragprog_parse_decl_output(fpc, fdec)) + goto out_err; + break; + case TGSI_FILE_TEMPORARY: + if (fdec->DeclarationRange.Last > high_temp) { + high_temp = + fdec->DeclarationRange.Last; + } + break; + default: + break; + } + } + break; + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + struct tgsi_full_immediate *imm; + float vals[4]; + + imm = &p.FullToken.FullImmediate; + assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); + assert(fpc->nr_imm < MAX_IMM); + + vals[0] = imm->u.ImmediateFloat32[0].Float; + vals[1] = imm->u.ImmediateFloat32[1].Float; + vals[2] = imm->u.ImmediateFloat32[2].Float; + vals[3] = imm->u.ImmediateFloat32[3].Float; + fpc->imm[fpc->nr_imm++] = constant(fpc, -1, vals); + } + break; + default: + break; + } + } + tgsi_parse_free(&p); + + if (++high_temp) { + fpc->r_temp = CALLOC(high_temp, sizeof(struct nv40_sreg)); + for (i = 0; i < high_temp; i++) + fpc->r_temp[i] = temp(fpc); + fpc->r_temps_discard = 0; + } + + return TRUE; + +out_err: + if (fpc->r_temp) + FREE(fpc->r_temp); + tgsi_parse_free(&p); + return FALSE; +} + +static void +nv40_fragprog_translate(struct nv40_context *nv40, + struct nv40_fragment_program *fp) +{ + struct tgsi_parse_context parse; + struct nv40_fpc *fpc = NULL; + + fpc = CALLOC(1, sizeof(struct nv40_fpc)); + if (!fpc) + return; + fpc->fp = fp; + fpc->num_regs = 2; + + if (!nv40_fragprog_prepare(fpc)) { + FREE(fpc); + return; + } + + tgsi_parse_init(&parse, fp->pipe.tokens); + + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + const struct tgsi_full_instruction *finst; + + finst = &parse.FullToken.FullInstruction; + if (!nv40_fragprog_parse_instruction(fpc, finst)) + goto out_err; + } + break; + default: + break; + } + } + + fp->fp_control |= fpc->num_regs << NV40TCL_FP_CONTROL_TEMP_COUNT_SHIFT; + + /* Terminate final instruction */ + fp->insn[fpc->inst_offset] |= 0x00000001; + + /* Append NOP + END instruction, may or may not 
be necessary. */ + fpc->inst_offset = fp->insn_len; + grow_insns(fpc, 4); + fp->insn[fpc->inst_offset + 0] = 0x00000001; + fp->insn[fpc->inst_offset + 1] = 0x00000000; + fp->insn[fpc->inst_offset + 2] = 0x00000000; + fp->insn[fpc->inst_offset + 3] = 0x00000000; + + fp->translated = TRUE; +out_err: + tgsi_parse_free(&parse); + if (fpc->r_temp) + FREE(fpc->r_temp); + FREE(fpc); +} + +static void +nv40_fragprog_upload(struct nv40_context *nv40, + struct nv40_fragment_program *fp) +{ + struct pipe_winsys *ws = nv40->pipe.winsys; + const uint32_t le = 1; + uint32_t *map; + int i; + + map = ws->buffer_map(ws, fp->buffer, PIPE_BUFFER_USAGE_CPU_WRITE); + +#if 0 + for (i = 0; i < fp->insn_len; i++) { + fflush(stdout); fflush(stderr); + NOUVEAU_ERR("%d 0x%08x\n", i, fp->insn[i]); + fflush(stdout); fflush(stderr); + } +#endif + + if ((*(const uint8_t *)&le)) { + for (i = 0; i < fp->insn_len; i++) { + map[i] = fp->insn[i]; + } + } else { + /* Weird swapping for big-endian chips */ + for (i = 0; i < fp->insn_len; i++) { + map[i] = ((fp->insn[i] & 0xffff) << 16) | + ((fp->insn[i] >> 16) & 0xffff); + } + } + + ws->buffer_unmap(ws, fp->buffer); +} + +static boolean +nv40_fragprog_validate(struct nv40_context *nv40) +{ + struct nv40_fragment_program *fp = nv40->fragprog; + struct pipe_buffer *constbuf = + nv40->constbuf[PIPE_SHADER_FRAGMENT]; + struct pipe_winsys *ws = nv40->pipe.winsys; + struct nouveau_stateobj *so; + boolean new_consts = FALSE; + int i; + + if (fp->translated) + goto update_constants; + + nv40->fallback_swrast &= ~NV40_NEW_FRAGPROG; + nv40_fragprog_translate(nv40, fp); + if (!fp->translated) { + nv40->fallback_swrast |= NV40_NEW_FRAGPROG; + return FALSE; + } + + fp->buffer = ws->buffer_create(ws, 0x100, 0, fp->insn_len * 4); + nv40_fragprog_upload(nv40, fp); + + so = so_new(4, 1); + so_method(so, nv40->screen->curie, NV40TCL_FP_ADDRESS, 1); + so_reloc (so, fp->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, + NV40TCL_FP_ADDRESS_DMA0, NV40TCL_FP_ADDRESS_DMA1); + so_method(so, nv40->screen->curie, NV40TCL_FP_CONTROL, 1); + so_data (so, fp->fp_control); + so_ref(so, &fp->so); + +update_constants: + if (fp->nr_consts) { + float *map; + + map = ws->buffer_map(ws, constbuf, PIPE_BUFFER_USAGE_CPU_READ); + for (i = 0; i < fp->nr_consts; i++) { + struct nv40_fragment_program_data *fpd = &fp->consts[i]; + uint32_t *p = &fp->insn[fpd->offset]; + uint32_t *cb = (uint32_t *)&map[fpd->index * 4]; + + if (!memcmp(p, cb, 4 * sizeof(float))) + continue; + memcpy(p, cb, 4 * sizeof(float)); + new_consts = TRUE; + } + ws->buffer_unmap(ws, constbuf); + + if (new_consts) + nv40_fragprog_upload(nv40, fp); + } + + if (new_consts || fp->so != nv40->state.hw[NV40_STATE_FRAGPROG]) { + so_ref(fp->so, &nv40->state.hw[NV40_STATE_FRAGPROG]); + return TRUE; + } + + return FALSE; +} + +void +nv40_fragprog_destroy(struct nv40_context *nv40, + struct nv40_fragment_program *fp) +{ + if (fp->insn_len) + FREE(fp->insn); +} + +struct nv40_state_entry nv40_state_fragprog = { + .validate = nv40_fragprog_validate, + .dirty = { + .pipe = NV40_NEW_FRAGPROG, + .hw = NV40_STATE_FRAGPROG + } +}; + diff --git a/src/gallium/drivers/nv40/nv40_fragtex.c b/src/gallium/drivers/nv40/nv40_fragtex.c new file mode 100644 index 0000000000..0227d22620 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_fragtex.c @@ -0,0 +1,168 @@ +#include "nv40_context.h" + +#define _(m,tf,ts0x,ts0y,ts0z,ts0w,ts1x,ts1y,ts1z,ts1w,sx,sy,sz,sw) \ +{ \ + TRUE, \ + PIPE_FORMAT_##m, \ + NV40TCL_TEX_FORMAT_FORMAT_##tf, \ 
+ (NV40TCL_TEX_SWIZZLE_S0_X_##ts0x | NV40TCL_TEX_SWIZZLE_S0_Y_##ts0y | \ + NV40TCL_TEX_SWIZZLE_S0_Z_##ts0z | NV40TCL_TEX_SWIZZLE_S0_W_##ts0w | \ + NV40TCL_TEX_SWIZZLE_S1_X_##ts1x | NV40TCL_TEX_SWIZZLE_S1_Y_##ts1y | \ + NV40TCL_TEX_SWIZZLE_S1_Z_##ts1z | NV40TCL_TEX_SWIZZLE_S1_W_##ts1w), \ + ((NV40TCL_TEX_FILTER_SIGNED_RED*sx) | (NV40TCL_TEX_FILTER_SIGNED_GREEN*sy) | \ + (NV40TCL_TEX_FILTER_SIGNED_BLUE*sz) | (NV40TCL_TEX_FILTER_SIGNED_ALPHA*sw)) \ +} + +struct nv40_texture_format { + boolean defined; + uint pipe; + int format; + int swizzle; + int sign; +}; + +static struct nv40_texture_format +nv40_texture_formats[] = { + _(A8R8G8B8_UNORM, A8R8G8B8, S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), + _(A1R5G5B5_UNORM, A1R5G5B5, S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), + _(A4R4G4B4_UNORM, A4R4G4B4, S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), + _(R5G6B5_UNORM , R5G6B5 , S1, S1, S1, ONE, X, Y, Z, W, 0, 0, 0, 0), + _(L8_UNORM , L8 , S1, S1, S1, ONE, X, X, X, X, 0, 0, 0, 0), + _(A8_UNORM , L8 , ZERO, ZERO, ZERO, S1, X, X, X, X, 0, 0, 0, 0), + _(R16_SNORM , A16 , ZERO, ZERO, S1, ONE, X, X, X, Y, 1, 1, 1, 1), + _(I8_UNORM , L8 , S1, S1, S1, S1, X, X, X, X, 0, 0, 0, 0), + _(A8L8_UNORM , A8L8 , S1, S1, S1, S1, X, X, X, Y, 0, 0, 0, 0), + _(Z16_UNORM , Z16 , S1, S1, S1, ONE, X, X, X, X, 0, 0, 0, 0), + _(Z24S8_UNORM , Z24 , S1, S1, S1, ONE, X, X, X, X, 0, 0, 0, 0), + _(DXT1_RGB , DXT1 , S1, S1, S1, ONE, X, Y, Z, W, 0, 0, 0, 0), + _(DXT1_RGBA , DXT1 , S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), + _(DXT3_RGBA , DXT3 , S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), + _(DXT5_RGBA , DXT5 , S1, S1, S1, S1, X, Y, Z, W, 0, 0, 0, 0), + {}, +}; + +static struct nv40_texture_format * +nv40_fragtex_format(uint pipe_format) +{ + struct nv40_texture_format *tf = nv40_texture_formats; + + while (tf->defined) { + if (tf->pipe == pipe_format) + return tf; + tf++; + } + + NOUVEAU_ERR("unknown texture format %s\n", pf_name(pipe_format)); + return NULL; +} + + +static struct nouveau_stateobj * +nv40_fragtex_build(struct nv40_context *nv40, int unit) +{ + struct nv40_sampler_state *ps = nv40->tex_sampler[unit]; + struct nv40_miptree *nv40mt = nv40->tex_miptree[unit]; + struct pipe_texture *pt = &nv40mt->base; + struct nv40_texture_format *tf; + struct nouveau_stateobj *so; + uint32_t txf, txs, txp; + unsigned tex_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD; + + tf = nv40_fragtex_format(pt->format); + if (!tf) + assert(0); + + txf = ps->fmt; + txf |= tf->format | 0x8000; + txf |= ((pt->last_level + 1) << NV40TCL_TEX_FORMAT_MIPMAP_COUNT_SHIFT); + + if (1) /* XXX */ + txf |= NV40TCL_TEX_FORMAT_NO_BORDER; + + switch (pt->target) { + case PIPE_TEXTURE_CUBE: + txf |= NV40TCL_TEX_FORMAT_CUBIC; + /* fall-through */ + case PIPE_TEXTURE_2D: + txf |= NV40TCL_TEX_FORMAT_DIMS_2D; + break; + case PIPE_TEXTURE_3D: + txf |= NV40TCL_TEX_FORMAT_DIMS_3D; + break; + case PIPE_TEXTURE_1D: + txf |= NV40TCL_TEX_FORMAT_DIMS_1D; + break; + default: + NOUVEAU_ERR("Unknown target %d\n", pt->target); + return NULL; + } + + if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + txp = 0; + } else { + txp = nv40mt->level[0].pitch; + txf |= NV40TCL_TEX_FORMAT_LINEAR; + } + + txs = tf->swizzle; + + so = so_new(16, 2); + so_method(so, nv40->screen->curie, NV40TCL_TEX_OFFSET(unit), 8); + so_reloc (so, nv40mt->buffer, 0, tex_flags | NOUVEAU_BO_LOW, 0, 0); + so_reloc (so, nv40mt->buffer, txf, tex_flags | NOUVEAU_BO_OR, + NV40TCL_TEX_FORMAT_DMA0, NV40TCL_TEX_FORMAT_DMA1); + so_data (so, ps->wrap); + so_data (so, NV40TCL_TEX_ENABLE_ENABLE | ps->en); + so_data (so, 
txs); + so_data (so, ps->filt | tf->sign | 0x2000 /*voodoo*/); + so_data (so, (pt->width[0] << NV40TCL_TEX_SIZE0_W_SHIFT) | + pt->height[0]); + so_data (so, ps->bcol); + so_method(so, nv40->screen->curie, NV40TCL_TEX_SIZE1(unit), 1); + so_data (so, (pt->depth[0] << NV40TCL_TEX_SIZE1_DEPTH_SHIFT) | txp); + + return so; +} + +static boolean +nv40_fragtex_validate(struct nv40_context *nv40) +{ + struct nv40_fragment_program *fp = nv40->fragprog; + struct nv40_state *state = &nv40->state; + struct nouveau_stateobj *so; + unsigned samplers, unit; + + samplers = state->fp_samplers & ~fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + so = so_new(2, 0); + so_method(so, nv40->screen->curie, NV40TCL_TEX_ENABLE(unit), 1); + so_data (so, 0); + so_ref(so, &nv40->state.hw[NV40_STATE_FRAGTEX0 + unit]); + state->dirty |= (1ULL << (NV40_STATE_FRAGTEX0 + unit)); + } + + samplers = nv40->dirty_samplers & fp->samplers; + while (samplers) { + unit = ffs(samplers) - 1; + samplers &= ~(1 << unit); + + so = nv40_fragtex_build(nv40, unit); + so_ref(so, &nv40->state.hw[NV40_STATE_FRAGTEX0 + unit]); + state->dirty |= (1ULL << (NV40_STATE_FRAGTEX0 + unit)); + } + + nv40->state.fp_samplers = fp->samplers; + return FALSE; +} + +struct nv40_state_entry nv40_state_fragtex = { + .validate = nv40_fragtex_validate, + .dirty = { + .pipe = NV40_NEW_SAMPLER | NV40_NEW_FRAGPROG, + .hw = 0 + } +}; + diff --git a/src/gallium/drivers/nv40/nv40_miptree.c b/src/gallium/drivers/nv40/nv40_miptree.c new file mode 100644 index 0000000000..b68967c07f --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_miptree.c @@ -0,0 +1,194 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv40_context.h" + +static void +nv40_miptree_layout(struct nv40_miptree *mt) +{ + struct pipe_texture *pt = &mt->base; + uint width = pt->width[0], height = pt->height[0], depth = pt->depth[0]; + uint offset = 0; + int nr_faces, l, f, pitch; + + if (pt->target == PIPE_TEXTURE_CUBE) { + nr_faces = 6; + } else + if (pt->target == PIPE_TEXTURE_3D) { + nr_faces = pt->depth[0]; + } else { + nr_faces = 1; + } + + pitch = pt->width[0]; + for (l = 0; l <= pt->last_level; l++) { + pt->width[l] = width; + pt->height[l] = height; + pt->depth[l] = depth; + pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width); + pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height); + + if (!(pt->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) + pitch = pt->nblocksx[l]; + pitch = align(pitch, 64); + + mt->level[l].pitch = pitch * pt->block.size; + mt->level[l].image_offset = + CALLOC(nr_faces, sizeof(unsigned)); + + width = MAX2(1, width >> 1); + height = MAX2(1, height >> 1); + depth = MAX2(1, depth >> 1); + } + + for (f = 0; f < nr_faces; f++) { + for (l = 0; l <= pt->last_level; l++) { + mt->level[l].image_offset[f] = offset; + offset += mt->level[l].pitch * pt->height[l]; + } + } + + mt->total_size = offset; +} + +static struct pipe_texture * +nv40_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt) +{ + struct pipe_winsys *ws = pscreen->winsys; + struct nv40_miptree *mt; + + mt = MALLOC(sizeof(struct nv40_miptree)); + if (!mt) + return NULL; + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = pscreen; + mt->shadow_tex = NULL; + mt->shadow_surface = NULL; + + /* Swizzled textures must be POT */ + if (pt->width[0] & (pt->width[0] - 1) || + pt->height[0] & (pt->height[0] - 1)) + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + else + if (pt->tex_usage & 
(PIPE_TEXTURE_USAGE_PRIMARY | + PIPE_TEXTURE_USAGE_DISPLAY_TARGET)) + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + else { + switch (pt->format) { + /* TODO: Figure out which formats can be swizzled */ + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_X8R8G8B8_UNORM: + /* XXX: Re-enable when SIFM size limits are fixed */ + /*case PIPE_FORMAT_R16_SNORM:*/ + break; + default: + mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR; + } + } + + nv40_miptree_layout(mt); + + mt->buffer = ws->buffer_create(ws, 256, + PIPE_BUFFER_USAGE_PIXEL | + NOUVEAU_BUFFER_USAGE_TEXTURE, + mt->total_size); + if (!mt->buffer) { + FREE(mt); + return NULL; + } + + return &mt->base; +} + +static void +nv40_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt) +{ + struct pipe_texture *pt = *ppt; + struct nv40_miptree *mt = (struct nv40_miptree *)pt; + int l; + + *ppt = NULL; + if (--pt->refcount) + return; + + pipe_buffer_reference(pscreen, &mt->buffer, NULL); + for (l = 0; l <= pt->last_level; l++) { + if (mt->level[l].image_offset) + FREE(mt->level[l].image_offset); + } + + if (mt->shadow_tex) { + assert(mt->shadow_surface); + pscreen->tex_surface_release(pscreen, &mt->shadow_surface); + nv40_miptree_release(pscreen, &mt->shadow_tex); + } + + FREE(mt); +} + +static struct pipe_surface * +nv40_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned flags) +{ + struct nv40_miptree *mt = (struct nv40_miptree *)pt; + struct pipe_surface *ps; + + ps = CALLOC_STRUCT(pipe_surface); + if (!ps) + return NULL; + pipe_texture_reference(&ps->texture, pt); + pipe_buffer_reference(pscreen, &ps->buffer, mt->buffer); + ps->format = pt->format; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->block = pt->block; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = mt->level[level].pitch; + ps->usage = flags; + ps->status = PIPE_SURFACE_STATUS_DEFINED; + ps->refcount = 1; + ps->winsys = pscreen->winsys; + ps->face = face; + ps->level = level; + ps->zslice = zslice; + + if (pt->target == PIPE_TEXTURE_CUBE) { + ps->offset = mt->level[level].image_offset[face]; + } else + if (pt->target == PIPE_TEXTURE_3D) { + ps->offset = mt->level[level].image_offset[zslice]; + } else { + ps->offset = mt->level[level].image_offset[0]; + } + + return ps; +} + +static void +nv40_miptree_surface_del(struct pipe_screen *pscreen, + struct pipe_surface **psurface) +{ + struct pipe_surface *ps = *psurface; + + *psurface = NULL; + if (--ps->refcount > 0) + return; + + pipe_texture_reference(&ps->texture, NULL); + pipe_buffer_reference(pscreen, &ps->buffer, NULL); + FREE(ps); +} + +void +nv40_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ + pscreen->texture_create = nv40_miptree_create; + pscreen->texture_release = nv40_miptree_release; + pscreen->get_tex_surface = nv40_miptree_surface_new; + pscreen->tex_surface_release = nv40_miptree_surface_del; +} + diff --git a/src/gallium/drivers/nv40/nv40_query.c b/src/gallium/drivers/nv40/nv40_query.c new file mode 100644 index 0000000000..57f39cfab0 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_query.c @@ -0,0 +1,122 @@ +#include "pipe/p_context.h" + +#include "nv40_context.h" + +struct nv40_query { + struct nouveau_resource *object; + unsigned type; + boolean ready; + uint64_t result; +}; + +static INLINE struct nv40_query * +nv40_query(struct pipe_query *pipe) +{ + return (struct nv40_query *)pipe; +} + +static struct pipe_query 
* +nv40_query_create(struct pipe_context *pipe, unsigned query_type) +{ + struct nv40_query *q; + + q = CALLOC(1, sizeof(struct nv40_query)); + q->type = query_type; + + return (struct pipe_query *)q; +} + +static void +nv40_query_destroy(struct pipe_context *pipe, struct pipe_query *pq) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_query *q = nv40_query(pq); + + if (q->object) + nv40->nvws->res_free(&q->object); + FREE(q); +} + +static void +nv40_query_begin(struct pipe_context *pipe, struct pipe_query *pq) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_query *q = nv40_query(pq); + + assert(q->type == PIPE_QUERY_OCCLUSION_COUNTER); + + /* Happens when end_query() is called, then another begin_query() + * without querying the result in-between. For now we'll wait for + * the existing query to notify completion, but it could be better. + */ + if (q->object) { + uint64 tmp; + pipe->get_query_result(pipe, pq, 1, &tmp); + } + + if (nv40->nvws->res_alloc(nv40->screen->query_heap, 1, NULL, &q->object)) + assert(0); + nv40->nvws->notifier_reset(nv40->screen->query, q->object->start); + + BEGIN_RING(curie, NV40TCL_QUERY_RESET, 1); + OUT_RING (1); + BEGIN_RING(curie, NV40TCL_QUERY_UNK17CC, 1); + OUT_RING (1); + + q->ready = FALSE; +} + +static void +nv40_query_end(struct pipe_context *pipe, struct pipe_query *pq) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_query *q = nv40_query(pq); + + BEGIN_RING(curie, NV40TCL_QUERY_GET, 1); + OUT_RING ((0x01 << NV40TCL_QUERY_GET_UNK24_SHIFT) | + ((q->object->start * 32) << NV40TCL_QUERY_GET_OFFSET_SHIFT)); + FIRE_RING(NULL); +} + +static boolean +nv40_query_result(struct pipe_context *pipe, struct pipe_query *pq, + boolean wait, uint64 *result) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_query *q = nv40_query(pq); + struct nouveau_winsys *nvws = nv40->nvws; + + assert(q->object && q->type == PIPE_QUERY_OCCLUSION_COUNTER); + + if (!q->ready) { + unsigned status; + + status = nvws->notifier_status(nv40->screen->query, + q->object->start); + if (status != NV_NOTIFY_STATE_STATUS_COMPLETED) { + if (wait == FALSE) + return FALSE; + nvws->notifier_wait(nv40->screen->query, q->object->start, + NV_NOTIFY_STATE_STATUS_COMPLETED, + 0); + } + + q->result = nvws->notifier_retval(nv40->screen->query, + q->object->start); + q->ready = TRUE; + nvws->res_free(&q->object); + } + + *result = q->result; + return TRUE; +} + +void +nv40_init_query_functions(struct nv40_context *nv40) +{ + nv40->pipe.create_query = nv40_query_create; + nv40->pipe.destroy_query = nv40_query_destroy; + nv40->pipe.begin_query = nv40_query_begin; + nv40->pipe.end_query = nv40_query_end; + nv40->pipe.get_query_result = nv40_query_result; +} diff --git a/src/gallium/drivers/nv40/nv40_screen.c b/src/gallium/drivers/nv40/nv40_screen.c new file mode 100644 index 0000000000..25c7868296 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_screen.c @@ -0,0 +1,365 @@ +#include "pipe/p_screen.h" + +#include "nv40_context.h" +#include "nv40_screen.h" + +#define NV4X_GRCLASS4097_CHIPSETS 0x00000baf +#define NV4X_GRCLASS4497_CHIPSETS 0x00005450 +#define NV6X_GRCLASS4497_CHIPSETS 0x00000088 + +static const char * +nv40_screen_get_name(struct pipe_screen *pscreen) +{ + struct nv40_screen *screen = nv40_screen(pscreen); + struct nouveau_device *dev = screen->nvws->channel->device; + static char buffer[128]; + + snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); + return buffer; +} + +static const char * 
+nv40_screen_get_vendor(struct pipe_screen *pscreen) +{ + return "nouveau"; +} + +static int +nv40_screen_get_param(struct pipe_screen *pscreen, int param) +{ + struct nv40_screen *screen = nv40_screen(pscreen); + + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return 16; + case PIPE_CAP_NPOT_TEXTURES: + return 1; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 1; + case PIPE_CAP_GLSL: + return 0; + case PIPE_CAP_S3TC: + return 1; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 1; + case PIPE_CAP_POINT_SPRITE: + return 1; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 4; + case PIPE_CAP_OCCLUSION_QUERY: + return 1; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 1; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return 13; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 10; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 13; + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + case PIPE_CAP_TEXTURE_MIRROR_REPEAT: + return 1; + case NOUVEAU_CAP_HW_VTXBUF: + return 1; + case NOUVEAU_CAP_HW_IDXBUF: + if (screen->curie->grclass == NV40TCL) + return 1; + return 0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0; + } +} + +static float +nv40_screen_get_paramf(struct pipe_screen *pscreen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 10.0; + case PIPE_CAP_MAX_POINT_WIDTH: + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 64.0; + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 16.0; + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 16.0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0.0; + } +} + +static boolean +nv40_screen_surface_format_supported(struct pipe_screen *pscreen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, unsigned geom_flags) +{ + if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + return TRUE; + default: + break; + } + } else { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_A1R5G5B5_UNORM: + case PIPE_FORMAT_A4R4G4B4_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_R16_SNORM: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_A8_UNORM: + case PIPE_FORMAT_I8_UNORM: + case PIPE_FORMAT_A8L8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_RGBA: + case PIPE_FORMAT_DXT3_RGBA: + case PIPE_FORMAT_DXT5_RGBA: + return TRUE; + default: + break; + } + } + + return FALSE; +} + +static void * +nv40_surface_map(struct pipe_screen *screen, struct pipe_surface *surface, + unsigned flags ) +{ + struct pipe_winsys *ws = screen->winsys; + struct pipe_surface *surface_to_map; + void *map; + + if (!(surface->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + struct nv40_miptree *mt = (struct nv40_miptree *)surface->texture; + + if (!mt->shadow_tex) { + unsigned old_tex_usage = surface->texture->tex_usage; + surface->texture->tex_usage = NOUVEAU_TEXTURE_USAGE_LINEAR; + mt->shadow_tex = screen->texture_create(screen, surface->texture); + surface->texture->tex_usage = old_tex_usage; + + assert(mt->shadow_tex->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR); + mt->shadow_surface = screen->get_tex_surface + ( + screen, mt->shadow_tex, + surface->face, surface->level, surface->zslice, + surface->usage + ); + } + + surface_to_map = mt->shadow_surface; + } + else + surface_to_map = surface; + + assert(surface_to_map); + + map = 
ws->buffer_map(ws, surface_to_map->buffer, flags); + if (!map) + return NULL; + + return map + surface_to_map->offset; +} + +static void +nv40_surface_unmap(struct pipe_screen *screen, struct pipe_surface *surface) +{ + struct pipe_winsys *ws = screen->winsys; + struct pipe_surface *surface_to_unmap; + + /* TODO: Copy from shadow just before push buffer is flushed instead. + There are probably some programs that map/unmap excessively + before rendering. */ + if (!(surface->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + struct nv40_miptree *mt = (struct nv40_miptree *)surface->texture; + + assert(mt->shadow_tex); + + surface_to_unmap = mt->shadow_surface; + } + else + surface_to_unmap = surface; + + assert(surface_to_unmap); + + ws->buffer_unmap(ws, surface_to_unmap->buffer); + + if (surface_to_unmap != surface) { + struct nv40_screen *nvscreen = nv40_screen(screen); + + nvscreen->nvws->surface_copy(nvscreen->nvws, + surface, 0, 0, + surface_to_unmap, 0, 0, + surface->width, surface->height); + } +} + +static void +nv40_screen_destroy(struct pipe_screen *pscreen) +{ + struct nv40_screen *screen = nv40_screen(pscreen); + struct nouveau_winsys *nvws = screen->nvws; + + nvws->res_free(&screen->vp_exec_heap); + nvws->res_free(&screen->vp_data_heap); + nvws->res_free(&screen->query_heap); + nvws->notifier_free(&screen->query); + nvws->notifier_free(&screen->sync); + nvws->grobj_free(&screen->curie); + + FREE(pscreen); +} + +struct pipe_screen * +nv40_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ + struct nv40_screen *screen = CALLOC_STRUCT(nv40_screen); + struct nouveau_stateobj *so; + unsigned curie_class; + unsigned chipset = nvws->channel->device->chipset; + int ret; + + if (!screen) + return NULL; + screen->nvws = nvws; + + /* 3D object */ + switch (chipset & 0xf0) { + case 0x40: + if (NV4X_GRCLASS4097_CHIPSETS & (1 << (chipset & 0x0f))) + curie_class = NV40TCL; + else + if (NV4X_GRCLASS4497_CHIPSETS & (1 << (chipset & 0x0f))) + curie_class = NV44TCL; + break; + case 0x60: + if (NV6X_GRCLASS4497_CHIPSETS & (1 << (chipset & 0x0f))) + curie_class = NV44TCL; + break; + default: + break; + } + + if (!curie_class) { + NOUVEAU_ERR("Unknown nv4x chipset: nv%02x\n", chipset); + return NULL; + } + + ret = nvws->grobj_alloc(nvws, curie_class, &screen->curie); + if (ret) { + NOUVEAU_ERR("Error creating 3D object: %d\n", ret); + return FALSE; + } + + /* Notifier for sync purposes */ + ret = nvws->notifier_alloc(nvws, 1, &screen->sync); + if (ret) { + NOUVEAU_ERR("Error creating notifier object: %d\n", ret); + nv40_screen_destroy(&screen->pipe); + return NULL; + } + + /* Query objects */ + ret = nvws->notifier_alloc(nvws, 32, &screen->query); + if (ret) { + NOUVEAU_ERR("Error initialising query objects: %d\n", ret); + nv40_screen_destroy(&screen->pipe); + return NULL; + } + + ret = nvws->res_init(&screen->query_heap, 0, 32); + if (ret) { + NOUVEAU_ERR("Error initialising query object heap: %d\n", ret); + nv40_screen_destroy(&screen->pipe); + return NULL; + } + + /* Vtxprog resources */ + if (nvws->res_init(&screen->vp_exec_heap, 0, 512) || + nvws->res_init(&screen->vp_data_heap, 0, 256)) { + nv40_screen_destroy(&screen->pipe); + return NULL; + } + + /* Static curie initialisation */ + so = so_new(128, 0); + so_method(so, screen->curie, NV40TCL_DMA_NOTIFY, 1); + so_data (so, screen->sync->handle); + so_method(so, screen->curie, NV40TCL_DMA_TEXTURE0, 2); + so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->gart->handle); + so_method(so, screen->curie, 
NV40TCL_DMA_COLOR1, 1); + so_data (so, nvws->channel->vram->handle); + so_method(so, screen->curie, NV40TCL_DMA_COLOR0, 2); + so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->vram->handle); + so_method(so, screen->curie, NV40TCL_DMA_VTXBUF0, 2); + so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->gart->handle); + so_method(so, screen->curie, NV40TCL_DMA_FENCE, 2); + so_data (so, 0); + so_data (so, screen->query->handle); + so_method(so, screen->curie, NV40TCL_DMA_UNK01AC, 2); + so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->vram->handle); + so_method(so, screen->curie, NV40TCL_DMA_COLOR2, 2); + so_data (so, nvws->channel->vram->handle); + so_data (so, nvws->channel->vram->handle); + + so_method(so, screen->curie, 0x1ea4, 3); + so_data (so, 0x00000010); + so_data (so, 0x01000100); + so_data (so, 0xff800006); + + /* vtxprog output routing */ + so_method(so, screen->curie, 0x1fc4, 1); + so_data (so, 0x06144321); + so_method(so, screen->curie, 0x1fc8, 2); + so_data (so, 0xedcba987); + so_data (so, 0x00000021); + so_method(so, screen->curie, 0x1fd0, 1); + so_data (so, 0x00171615); + so_method(so, screen->curie, 0x1fd4, 1); + so_data (so, 0x001b1a19); + + so_method(so, screen->curie, 0x1ef8, 1); + so_data (so, 0x0020ffff); + so_method(so, screen->curie, 0x1d64, 1); + so_data (so, 0x00d30000); + so_method(so, screen->curie, 0x1e94, 1); + so_data (so, 0x00000001); + + so_emit(nvws, so); + so_ref(NULL, &so); + nvws->push_flush(nvws, 0, NULL); + + screen->pipe.winsys = ws; + screen->pipe.destroy = nv40_screen_destroy; + + screen->pipe.get_name = nv40_screen_get_name; + screen->pipe.get_vendor = nv40_screen_get_vendor; + screen->pipe.get_param = nv40_screen_get_param; + screen->pipe.get_paramf = nv40_screen_get_paramf; + + screen->pipe.is_format_supported = nv40_screen_surface_format_supported; + + screen->pipe.surface_map = nv40_surface_map; + screen->pipe.surface_unmap = nv40_surface_unmap; + + nv40_screen_init_miptree_functions(&screen->pipe); + + return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv40/nv40_screen.h b/src/gallium/drivers/nv40/nv40_screen.h new file mode 100644 index 0000000000..c04a1275a0 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_screen.h @@ -0,0 +1,35 @@ +#ifndef __NV40_SCREEN_H__ +#define __NV40_SCREEN_H__ + +#include "pipe/p_screen.h" + +struct nv40_screen { + struct pipe_screen pipe; + + struct nouveau_winsys *nvws; + + unsigned cur_pctx; + + /* HW graphics objects */ + struct nouveau_grobj *curie; + struct nouveau_notifier *sync; + + /* Query object resources */ + struct nouveau_notifier *query; + struct nouveau_resource *query_heap; + + /* Vtxprog resources */ + struct nouveau_resource *vp_exec_heap; + struct nouveau_resource *vp_data_heap; + + /* Current 3D state of channel */ + struct nouveau_stateobj *state[NV40_STATE_MAX]; +}; + +static INLINE struct nv40_screen * +nv40_screen(struct pipe_screen *screen) +{ + return (struct nv40_screen *)screen; +} + +#endif diff --git a/src/gallium/drivers/nv40/nv40_shader.h b/src/gallium/drivers/nv40/nv40_shader.h new file mode 100644 index 0000000000..854dccf548 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_shader.h @@ -0,0 +1,556 @@ +#ifndef __NV40_SHADER_H__ +#define __NV40_SHADER_H__ + +/* Vertex programs instruction set + * + * The NV40 instruction set is very similar to NV30. Most fields are in + * a slightly different position in the instruction however. 
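 *
 * A minimal sketch of the convention all the NV40_VP_INST_* defines below
 * follow: a field value is shifted by its *_SHIFT and limited to its
 * *_MASK inside one of the four 32-bit instruction words.  Assuming a
 * hypothetical helper, nv40_vp_set_field(), not present in this code:
 *
 *     static INLINE uint32_t
 *     nv40_vp_set_field(uint32_t dword, uint32_t mask, unsigned shift,
 *                       uint32_t val)
 *     {
 *             return (dword & ~mask) | ((val << shift) & mask);
 *     }
 *
 *     hw[1] = nv40_vp_set_field(hw[1], NV40_VP_INST_VEC_OPCODE_MASK,
 *                               NV40_VP_INST_VEC_OPCODE_SHIFT,
 *                               NV40_VP_INST_OP_MOV);
 *
 * which would place the vector MOV opcode into data DWORD 1.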
+ * + * Merged instructions + * In some cases it is possible to put two instructions into one opcode + * slot. The rules for when this is OK is not entirely clear to me yet. + * + * There are separate writemasks and dest temp register fields for each + * grouping of instructions. There is however only one field with the + * ID of a result register. Writing to temp/result regs is selected by + * setting VEC_RESULT/SCA_RESULT. + * + * Temporary registers + * The source/dest temp register fields have been extended by 1 bit, to + * give a total of 32 temporary registers. + * + * Relative Addressing + * NV40 can use an address register to index into vertex attribute regs. + * This is done by putting the offset value into INPUT_SRC and setting + * the INDEX_INPUT flag. + * + * Conditional execution (see NV_vertex_program{2,3} for details) + * There is a second condition code register on NV40, it's use is enabled + * by setting the COND_REG_SELECT_1 flag. + * + * Texture lookup + * TODO + */ + +/* ---- OPCODE BITS 127:96 / data DWORD 0 --- */ +#define NV40_VP_INST_VEC_RESULT (1 << 30) +/* uncertain.. */ +#define NV40_VP_INST_COND_UPDATE_ENABLE ((1 << 14)|1<<29) +/* use address reg as index into attribs */ +#define NV40_VP_INST_INDEX_INPUT (1 << 27) +#define NV40_VP_INST_COND_REG_SELECT_1 (1 << 25) +#define NV40_VP_INST_ADDR_REG_SELECT_1 (1 << 24) +#define NV40_VP_INST_SRC2_ABS (1 << 23) +#define NV40_VP_INST_SRC1_ABS (1 << 22) +#define NV40_VP_INST_SRC0_ABS (1 << 21) +#define NV40_VP_INST_VEC_DEST_TEMP_SHIFT 15 +#define NV40_VP_INST_VEC_DEST_TEMP_MASK (0x1F << 15) +#define NV40_VP_INST_COND_TEST_ENABLE (1 << 13) +#define NV40_VP_INST_COND_SHIFT 10 +#define NV40_VP_INST_COND_MASK (0x7 << 10) +# define NV40_VP_INST_COND_FL 0 +# define NV40_VP_INST_COND_LT 1 +# define NV40_VP_INST_COND_EQ 2 +# define NV40_VP_INST_COND_LE 3 +# define NV40_VP_INST_COND_GT 4 +# define NV40_VP_INST_COND_NE 5 +# define NV40_VP_INST_COND_GE 6 +# define NV40_VP_INST_COND_TR 7 +#define NV40_VP_INST_COND_SWZ_X_SHIFT 8 +#define NV40_VP_INST_COND_SWZ_X_MASK (3 << 8) +#define NV40_VP_INST_COND_SWZ_Y_SHIFT 6 +#define NV40_VP_INST_COND_SWZ_Y_MASK (3 << 6) +#define NV40_VP_INST_COND_SWZ_Z_SHIFT 4 +#define NV40_VP_INST_COND_SWZ_Z_MASK (3 << 4) +#define NV40_VP_INST_COND_SWZ_W_SHIFT 2 +#define NV40_VP_INST_COND_SWZ_W_MASK (3 << 2) +#define NV40_VP_INST_COND_SWZ_ALL_SHIFT 2 +#define NV40_VP_INST_COND_SWZ_ALL_MASK (0xFF << 2) +#define NV40_VP_INST_ADDR_SWZ_SHIFT 0 +#define NV40_VP_INST_ADDR_SWZ_MASK (0x03 << 0) +#define NV40_VP_INST0_KNOWN ( \ + NV40_VP_INST_INDEX_INPUT | \ + NV40_VP_INST_COND_REG_SELECT_1 | \ + NV40_VP_INST_ADDR_REG_SELECT_1 | \ + NV40_VP_INST_SRC2_ABS | \ + NV40_VP_INST_SRC1_ABS | \ + NV40_VP_INST_SRC0_ABS | \ + NV40_VP_INST_VEC_DEST_TEMP_MASK | \ + NV40_VP_INST_COND_TEST_ENABLE | \ + NV40_VP_INST_COND_MASK | \ + NV40_VP_INST_COND_SWZ_ALL_MASK | \ + NV40_VP_INST_ADDR_SWZ_MASK) + +/* ---- OPCODE BITS 95:64 / data DWORD 1 --- */ +#define NV40_VP_INST_VEC_OPCODE_SHIFT 22 +#define NV40_VP_INST_VEC_OPCODE_MASK (0x1F << 22) +# define NV40_VP_INST_OP_NOP 0x00 +# define NV40_VP_INST_OP_MOV 0x01 +# define NV40_VP_INST_OP_MUL 0x02 +# define NV40_VP_INST_OP_ADD 0x03 +# define NV40_VP_INST_OP_MAD 0x04 +# define NV40_VP_INST_OP_DP3 0x05 +# define NV40_VP_INST_OP_DPH 0x06 +# define NV40_VP_INST_OP_DP4 0x07 +# define NV40_VP_INST_OP_DST 0x08 +# define NV40_VP_INST_OP_MIN 0x09 +# define NV40_VP_INST_OP_MAX 0x0A +# define NV40_VP_INST_OP_SLT 0x0B +# define NV40_VP_INST_OP_SGE 0x0C +# define NV40_VP_INST_OP_ARL 0x0D +# define 
NV40_VP_INST_OP_FRC 0x0E +# define NV40_VP_INST_OP_FLR 0x0F +# define NV40_VP_INST_OP_SEQ 0x10 +# define NV40_VP_INST_OP_SFL 0x11 +# define NV40_VP_INST_OP_SGT 0x12 +# define NV40_VP_INST_OP_SLE 0x13 +# define NV40_VP_INST_OP_SNE 0x14 +# define NV40_VP_INST_OP_STR 0x15 +# define NV40_VP_INST_OP_SSG 0x16 +# define NV40_VP_INST_OP_ARR 0x17 +# define NV40_VP_INST_OP_ARA 0x18 +# define NV40_VP_INST_OP_TXL 0x19 +#define NV40_VP_INST_SCA_OPCODE_SHIFT 27 +#define NV40_VP_INST_SCA_OPCODE_MASK (0x1F << 27) +# define NV40_VP_INST_OP_NOP 0x00 +# define NV40_VP_INST_OP_MOV 0x01 +# define NV40_VP_INST_OP_RCP 0x02 +# define NV40_VP_INST_OP_RCC 0x03 +# define NV40_VP_INST_OP_RSQ 0x04 +# define NV40_VP_INST_OP_EXP 0x05 +# define NV40_VP_INST_OP_LOG 0x06 +# define NV40_VP_INST_OP_LIT 0x07 +# define NV40_VP_INST_OP_BRA 0x09 +# define NV40_VP_INST_OP_CAL 0x0B +# define NV40_VP_INST_OP_RET 0x0C +# define NV40_VP_INST_OP_LG2 0x0D +# define NV40_VP_INST_OP_EX2 0x0E +# define NV40_VP_INST_OP_SIN 0x0F +# define NV40_VP_INST_OP_COS 0x10 +# define NV40_VP_INST_OP_PUSHA 0x13 +# define NV40_VP_INST_OP_POPA 0x14 +#define NV40_VP_INST_CONST_SRC_SHIFT 12 +#define NV40_VP_INST_CONST_SRC_MASK (0xFF << 12) +#define NV40_VP_INST_INPUT_SRC_SHIFT 8 +#define NV40_VP_INST_INPUT_SRC_MASK (0x0F << 8) +# define NV40_VP_INST_IN_POS 0 +# define NV40_VP_INST_IN_WEIGHT 1 +# define NV40_VP_INST_IN_NORMAL 2 +# define NV40_VP_INST_IN_COL0 3 +# define NV40_VP_INST_IN_COL1 4 +# define NV40_VP_INST_IN_FOGC 5 +# define NV40_VP_INST_IN_TC0 8 +# define NV40_VP_INST_IN_TC(n) (8+n) +#define NV40_VP_INST_SRC0H_SHIFT 0 +#define NV40_VP_INST_SRC0H_MASK (0xFF << 0) +#define NV40_VP_INST1_KNOWN ( \ + NV40_VP_INST_VEC_OPCODE_MASK | \ + NV40_VP_INST_SCA_OPCODE_MASK | \ + NV40_VP_INST_CONST_SRC_MASK | \ + NV40_VP_INST_INPUT_SRC_MASK | \ + NV40_VP_INST_SRC0H_MASK \ + ) + +/* ---- OPCODE BITS 63:32 / data DWORD 2 --- */ +#define NV40_VP_INST_SRC0L_SHIFT 23 +#define NV40_VP_INST_SRC0L_MASK (0x1FF << 23) +#define NV40_VP_INST_SRC1_SHIFT 6 +#define NV40_VP_INST_SRC1_MASK (0x1FFFF << 6) +#define NV40_VP_INST_SRC2H_SHIFT 0 +#define NV40_VP_INST_SRC2H_MASK (0x3F << 0) +#define NV40_VP_INST_IADDRH_SHIFT 0 +#define NV40_VP_INST_IADDRH_MASK (0x1F << 0) + +/* ---- OPCODE BITS 31:0 / data DWORD 3 --- */ +#define NV40_VP_INST_IADDRL_SHIFT 29 +#define NV40_VP_INST_IADDRL_MASK (7 << 29) +#define NV40_VP_INST_SRC2L_SHIFT 21 +#define NV40_VP_INST_SRC2L_MASK (0x7FF << 21) +#define NV40_VP_INST_SCA_WRITEMASK_SHIFT 17 +#define NV40_VP_INST_SCA_WRITEMASK_MASK (0xF << 17) +# define NV40_VP_INST_SCA_WRITEMASK_X (1 << 20) +# define NV40_VP_INST_SCA_WRITEMASK_Y (1 << 19) +# define NV40_VP_INST_SCA_WRITEMASK_Z (1 << 18) +# define NV40_VP_INST_SCA_WRITEMASK_W (1 << 17) +#define NV40_VP_INST_VEC_WRITEMASK_SHIFT 13 +#define NV40_VP_INST_VEC_WRITEMASK_MASK (0xF << 13) +# define NV40_VP_INST_VEC_WRITEMASK_X (1 << 16) +# define NV40_VP_INST_VEC_WRITEMASK_Y (1 << 15) +# define NV40_VP_INST_VEC_WRITEMASK_Z (1 << 14) +# define NV40_VP_INST_VEC_WRITEMASK_W (1 << 13) +#define NV40_VP_INST_SCA_RESULT (1 << 12) +#define NV40_VP_INST_SCA_DEST_TEMP_SHIFT 7 +#define NV40_VP_INST_SCA_DEST_TEMP_MASK (0x1F << 7) +#define NV40_VP_INST_DEST_SHIFT 2 +#define NV40_VP_INST_DEST_MASK (31 << 2) +# define NV40_VP_INST_DEST_POS 0 +# define NV40_VP_INST_DEST_COL0 1 +# define NV40_VP_INST_DEST_COL1 2 +# define NV40_VP_INST_DEST_BFC0 3 +# define NV40_VP_INST_DEST_BFC1 4 +# define NV40_VP_INST_DEST_FOGC 5 +# define NV40_VP_INST_DEST_PSZ 6 +# define NV40_VP_INST_DEST_TC0 7 +# define NV40_VP_INST_DEST_TC(n) (7+n) 
+# define NV40_VP_INST_DEST_TEMP 0x1F +#define NV40_VP_INST_INDEX_CONST (1 << 1) +#define NV40_VP_INST_LAST (1 << 0) +#define NV40_VP_INST3_KNOWN ( \ + NV40_VP_INST_SRC2L_MASK |\ + NV40_VP_INST_SCA_WRITEMASK_MASK |\ + NV40_VP_INST_VEC_WRITEMASK_MASK |\ + NV40_VP_INST_SCA_DEST_TEMP_MASK |\ + NV40_VP_INST_DEST_MASK |\ + NV40_VP_INST_INDEX_CONST) + +/* Useful to split the source selection regs into their pieces */ +#define NV40_VP_SRC0_HIGH_SHIFT 9 +#define NV40_VP_SRC0_HIGH_MASK 0x0001FE00 +#define NV40_VP_SRC0_LOW_MASK 0x000001FF +#define NV40_VP_SRC2_HIGH_SHIFT 11 +#define NV40_VP_SRC2_HIGH_MASK 0x0001F800 +#define NV40_VP_SRC2_LOW_MASK 0x000007FF + +/* Source selection - these are the bits you fill NV40_VP_INST_SRCn with */ +#define NV40_VP_SRC_NEGATE (1 << 16) +#define NV40_VP_SRC_SWZ_X_SHIFT 14 +#define NV40_VP_SRC_SWZ_X_MASK (3 << 14) +#define NV40_VP_SRC_SWZ_Y_SHIFT 12 +#define NV40_VP_SRC_SWZ_Y_MASK (3 << 12) +#define NV40_VP_SRC_SWZ_Z_SHIFT 10 +#define NV40_VP_SRC_SWZ_Z_MASK (3 << 10) +#define NV40_VP_SRC_SWZ_W_SHIFT 8 +#define NV40_VP_SRC_SWZ_W_MASK (3 << 8) +#define NV40_VP_SRC_SWZ_ALL_SHIFT 8 +#define NV40_VP_SRC_SWZ_ALL_MASK (0xFF << 8) +#define NV40_VP_SRC_TEMP_SRC_SHIFT 2 +#define NV40_VP_SRC_TEMP_SRC_MASK (0x1F << 2) +#define NV40_VP_SRC_REG_TYPE_SHIFT 0 +#define NV40_VP_SRC_REG_TYPE_MASK (3 << 0) +# define NV40_VP_SRC_REG_TYPE_UNK0 0 +# define NV40_VP_SRC_REG_TYPE_TEMP 1 +# define NV40_VP_SRC_REG_TYPE_INPUT 2 +# define NV40_VP_SRC_REG_TYPE_CONST 3 + + +/* + * Each fragment program opcode appears to be comprised of 4 32-bit values. + * + * 0 - Opcode, output reg/mask, ATTRIB source + * 1 - Source 0 + * 2 - Source 1 + * 3 - Source 2 + * + * There appears to be no special difference between result regs and temp regs. + * result.color == R0.xyzw + * result.depth == R1.z + * When the fragprog contains instructions to write depth, + * NV30_TCL_PRIMITIVE_3D_UNK1D78=0 otherwise it is set to 1. + * + * Constants are inserted directly after the instruction that uses them. + * + * It appears that it's not possible to use two input registers in one + * instruction as the input sourcing is done in the instruction dword + * and not the source selection dwords. As such instructions such as: + * + * ADD result.color, fragment.color, fragment.texcoord[0]; + * + * must be split into two MOV's and then an ADD (nvidia does this) but + * I'm not sure why it's not just one MOV and then source the second input + * in the ADD instruction.. + * + * Negation of the full source is done with NV30_FP_REG_NEGATE, arbitrary + * negation requires multiplication with a const. + * + * Arbitrary swizzling is supported with the exception of SWIZZLE_ZERO and + * SWIZZLE_ONE. + * + * The temp/result regs appear to be initialised to (0.0, 0.0, 0.0, 0.0) as + * SWIZZLE_ZERO is implemented simply by not writing to the relevant components + * of the destination. + * + * Looping + * Loops appear to be fairly expensive on NV40 at least, the proprietary + * driver goes to a lot of effort to avoid using the native looping + * instructions. If the total number of *executed* instructions between + * REP/ENDREP or LOOP/ENDLOOP is <=500, the driver will unroll the loop. + * The maximum loop count is 255. 
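 *
 * A minimal sketch of that heuristic, assuming hypothetical body_len and
 * count values and emit helpers (none of these exist in this code); the
 * native path is only taken when unrolling would exceed the 500 executed
 * instructions mentioned above, and its count field holds at most 255:
 *
 *     if (body_len * count <= 500)
 *             emit_unrolled(fpc, body, count);
 *     else
 *             emit_rep_loop(fpc, body, count);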
+ * + * Conditional execution + * TODO + * + * Non-native instructions: + * LIT + * LRP - MAD+MAD + * SUB - ADD, negate second source + * RSQ - LG2 + EX2 + * POW - LG2 + MUL + EX2 + * SCS - COS + SIN + * XPD + * DP2 - MUL + ADD + * NRM + */ + +//== Opcode / Destination selection == +#define NV40_FP_OP_PROGRAM_END (1 << 0) +#define NV40_FP_OP_OUT_REG_SHIFT 1 +#define NV40_FP_OP_OUT_REG_MASK (63 << 1) +/* Needs to be set when writing outputs to get expected result.. */ +#define NV40_FP_OP_OUT_REG_HALF (1 << 7) +#define NV40_FP_OP_COND_WRITE_ENABLE (1 << 8) +#define NV40_FP_OP_OUTMASK_SHIFT 9 +#define NV40_FP_OP_OUTMASK_MASK (0xF << 9) +# define NV40_FP_OP_OUT_X (1 << 9) +# define NV40_FP_OP_OUT_Y (1 <<10) +# define NV40_FP_OP_OUT_Z (1 <<11) +# define NV40_FP_OP_OUT_W (1 <<12) +/* Uncertain about these, especially the input_src values.. it's possible that + * they can be dynamically changed. + */ +#define NV40_FP_OP_INPUT_SRC_SHIFT 13 +#define NV40_FP_OP_INPUT_SRC_MASK (15 << 13) +# define NV40_FP_OP_INPUT_SRC_POSITION 0x0 +# define NV40_FP_OP_INPUT_SRC_COL0 0x1 +# define NV40_FP_OP_INPUT_SRC_COL1 0x2 +# define NV40_FP_OP_INPUT_SRC_FOGC 0x3 +# define NV40_FP_OP_INPUT_SRC_TC0 0x4 +# define NV40_FP_OP_INPUT_SRC_TC(n) (0x4 + n) +# define NV40_FP_OP_INPUT_SRC_FACING 0xE +#define NV40_FP_OP_TEX_UNIT_SHIFT 17 +#define NV40_FP_OP_TEX_UNIT_MASK (0xF << 17) +#define NV40_FP_OP_PRECISION_SHIFT 22 +#define NV40_FP_OP_PRECISION_MASK (3 << 22) +# define NV40_FP_PRECISION_FP32 0 +# define NV40_FP_PRECISION_FP16 1 +# define NV40_FP_PRECISION_FX12 2 +#define NV40_FP_OP_OPCODE_SHIFT 24 +#define NV40_FP_OP_OPCODE_MASK (0x3F << 24) +# define NV40_FP_OP_OPCODE_NOP 0x00 +# define NV40_FP_OP_OPCODE_MOV 0x01 +# define NV40_FP_OP_OPCODE_MUL 0x02 +# define NV40_FP_OP_OPCODE_ADD 0x03 +# define NV40_FP_OP_OPCODE_MAD 0x04 +# define NV40_FP_OP_OPCODE_DP3 0x05 +# define NV40_FP_OP_OPCODE_DP4 0x06 +# define NV40_FP_OP_OPCODE_DST 0x07 +# define NV40_FP_OP_OPCODE_MIN 0x08 +# define NV40_FP_OP_OPCODE_MAX 0x09 +# define NV40_FP_OP_OPCODE_SLT 0x0A +# define NV40_FP_OP_OPCODE_SGE 0x0B +# define NV40_FP_OP_OPCODE_SLE 0x0C +# define NV40_FP_OP_OPCODE_SGT 0x0D +# define NV40_FP_OP_OPCODE_SNE 0x0E +# define NV40_FP_OP_OPCODE_SEQ 0x0F +# define NV40_FP_OP_OPCODE_FRC 0x10 +# define NV40_FP_OP_OPCODE_FLR 0x11 +# define NV40_FP_OP_OPCODE_KIL 0x12 +# define NV40_FP_OP_OPCODE_PK4B 0x13 +# define NV40_FP_OP_OPCODE_UP4B 0x14 +/* DDX/DDY can only write to XY */ +# define NV40_FP_OP_OPCODE_DDX 0x15 +# define NV40_FP_OP_OPCODE_DDY 0x16 +# define NV40_FP_OP_OPCODE_TEX 0x17 +# define NV40_FP_OP_OPCODE_TXP 0x18 +# define NV40_FP_OP_OPCODE_TXD 0x19 +# define NV40_FP_OP_OPCODE_RCP 0x1A +# define NV40_FP_OP_OPCODE_EX2 0x1C +# define NV40_FP_OP_OPCODE_LG2 0x1D +# define NV40_FP_OP_OPCODE_STR 0x20 +# define NV40_FP_OP_OPCODE_SFL 0x21 +# define NV40_FP_OP_OPCODE_COS 0x22 +# define NV40_FP_OP_OPCODE_SIN 0x23 +# define NV40_FP_OP_OPCODE_PK2H 0x24 +# define NV40_FP_OP_OPCODE_UP2H 0x25 +# define NV40_FP_OP_OPCODE_PK4UB 0x27 +# define NV40_FP_OP_OPCODE_UP4UB 0x28 +# define NV40_FP_OP_OPCODE_PK2US 0x29 +# define NV40_FP_OP_OPCODE_UP2US 0x2A +# define NV40_FP_OP_OPCODE_DP2A 0x2E +# define NV40_FP_OP_OPCODE_TXL 0x2F +# define NV40_FP_OP_OPCODE_TXB 0x31 +# define NV40_FP_OP_OPCODE_DIV 0x3A +# define NV40_FP_OP_OPCODE_UNK_LIT 0x3C +/* The use of these instructions appears to be indicated by bit 31 of DWORD 2.*/ +# define NV40_FP_OP_BRA_OPCODE_BRK 0x0 +# define NV40_FP_OP_BRA_OPCODE_CAL 0x1 +# define NV40_FP_OP_BRA_OPCODE_IF 0x2 +# define 
NV40_FP_OP_BRA_OPCODE_LOOP 0x3 +# define NV40_FP_OP_BRA_OPCODE_REP 0x4 +# define NV40_FP_OP_BRA_OPCODE_RET 0x5 +#define NV40_FP_OP_OUT_SAT (1 << 31) + +/* high order bits of SRC0 */ +#define NV40_FP_OP_OUT_ABS (1 << 29) +#define NV40_FP_OP_COND_SWZ_W_SHIFT 27 +#define NV40_FP_OP_COND_SWZ_W_MASK (3 << 27) +#define NV40_FP_OP_COND_SWZ_Z_SHIFT 25 +#define NV40_FP_OP_COND_SWZ_Z_MASK (3 << 25) +#define NV40_FP_OP_COND_SWZ_Y_SHIFT 23 +#define NV40_FP_OP_COND_SWZ_Y_MASK (3 << 23) +#define NV40_FP_OP_COND_SWZ_X_SHIFT 21 +#define NV40_FP_OP_COND_SWZ_X_MASK (3 << 21) +#define NV40_FP_OP_COND_SWZ_ALL_SHIFT 21 +#define NV40_FP_OP_COND_SWZ_ALL_MASK (0xFF << 21) +#define NV40_FP_OP_COND_SHIFT 18 +#define NV40_FP_OP_COND_MASK (0x07 << 18) +# define NV40_FP_OP_COND_FL 0 +# define NV40_FP_OP_COND_LT 1 +# define NV40_FP_OP_COND_EQ 2 +# define NV40_FP_OP_COND_LE 3 +# define NV40_FP_OP_COND_GT 4 +# define NV40_FP_OP_COND_NE 5 +# define NV40_FP_OP_COND_GE 6 +# define NV40_FP_OP_COND_TR 7 + +/* high order bits of SRC1 */ +#define NV40_FP_OP_OPCODE_IS_BRANCH (1<<31) +#define NV40_FP_OP_DST_SCALE_SHIFT 28 +#define NV40_FP_OP_DST_SCALE_MASK (3 << 28) +#define NV40_FP_OP_DST_SCALE_1X 0 +#define NV40_FP_OP_DST_SCALE_2X 1 +#define NV40_FP_OP_DST_SCALE_4X 2 +#define NV40_FP_OP_DST_SCALE_8X 3 +#define NV40_FP_OP_DST_SCALE_INV_2X 5 +#define NV40_FP_OP_DST_SCALE_INV_4X 6 +#define NV40_FP_OP_DST_SCALE_INV_8X 7 + +/* SRC1 LOOP */ +#define NV40_FP_OP_LOOP_INCR_SHIFT 19 +#define NV40_FP_OP_LOOP_INCR_MASK (0xFF << 19) +#define NV40_FP_OP_LOOP_INDEX_SHIFT 10 +#define NV40_FP_OP_LOOP_INDEX_MASK (0xFF << 10) +#define NV40_FP_OP_LOOP_COUNT_SHIFT 2 +#define NV40_FP_OP_LOOP_COUNT_MASK (0xFF << 2) + +/* SRC1 IF */ +#define NV40_FP_OP_ELSE_ID_SHIFT 2 +#define NV40_FP_OP_ELSE_ID_MASK (0xFF << 2) + +/* SRC1 CAL */ +#define NV40_FP_OP_IADDR_SHIFT 2 +#define NV40_FP_OP_IADDR_MASK (0xFF << 2) + +/* SRC1 REP + * I have no idea why there are 3 count values here.. but they + * have always been filled with the same value in my tests so + * far.. 
+ */ +#define NV40_FP_OP_REP_COUNT1_SHIFT 2 +#define NV40_FP_OP_REP_COUNT1_MASK (0xFF << 2) +#define NV40_FP_OP_REP_COUNT2_SHIFT 10 +#define NV40_FP_OP_REP_COUNT2_MASK (0xFF << 10) +#define NV40_FP_OP_REP_COUNT3_SHIFT 19 +#define NV40_FP_OP_REP_COUNT3_MASK (0xFF << 19) + +/* SRC2 REP/IF */ +#define NV40_FP_OP_END_ID_SHIFT 2 +#define NV40_FP_OP_END_ID_MASK (0xFF << 2) + +// SRC2 high-order +#define NV40_FP_OP_INDEX_INPUT (1 << 30) +#define NV40_FP_OP_ADDR_INDEX_SHIFT 19 +#define NV40_FP_OP_ADDR_INDEX_MASK (0xF << 19) + +//== Register selection == +#define NV40_FP_REG_TYPE_SHIFT 0 +#define NV40_FP_REG_TYPE_MASK (3 << 0) +# define NV40_FP_REG_TYPE_TEMP 0 +# define NV40_FP_REG_TYPE_INPUT 1 +# define NV40_FP_REG_TYPE_CONST 2 +#define NV40_FP_REG_SRC_SHIFT 2 +#define NV40_FP_REG_SRC_MASK (63 << 2) +#define NV40_FP_REG_SRC_HALF (1 << 8) +#define NV40_FP_REG_SWZ_ALL_SHIFT 9 +#define NV40_FP_REG_SWZ_ALL_MASK (255 << 9) +#define NV40_FP_REG_SWZ_X_SHIFT 9 +#define NV40_FP_REG_SWZ_X_MASK (3 << 9) +#define NV40_FP_REG_SWZ_Y_SHIFT 11 +#define NV40_FP_REG_SWZ_Y_MASK (3 << 11) +#define NV40_FP_REG_SWZ_Z_SHIFT 13 +#define NV40_FP_REG_SWZ_Z_MASK (3 << 13) +#define NV40_FP_REG_SWZ_W_SHIFT 15 +#define NV40_FP_REG_SWZ_W_MASK (3 << 15) +# define NV40_FP_SWIZZLE_X 0 +# define NV40_FP_SWIZZLE_Y 1 +# define NV40_FP_SWIZZLE_Z 2 +# define NV40_FP_SWIZZLE_W 3 +#define NV40_FP_REG_NEGATE (1 << 17) + +#ifndef NV40_SHADER_NO_FUCKEDNESS +#define NV40SR_NONE 0 +#define NV40SR_OUTPUT 1 +#define NV40SR_INPUT 2 +#define NV40SR_TEMP 3 +#define NV40SR_CONST 4 + +struct nv40_sreg { + int type; + int index; + + int dst_scale; + + int negate; + int abs; + int swz[4]; + + int cc_update; + int cc_update_reg; + int cc_test; + int cc_test_reg; + int cc_swz[4]; +}; + +static INLINE struct nv40_sreg +nv40_sr(int type, int index) +{ + struct nv40_sreg temp = { + .type = type, + .index = index, + .dst_scale = DEF_SCALE, + .abs = 0, + .negate = 0, + .swz = { 0, 1, 2, 3 }, + .cc_update = 0, + .cc_update_reg = 0, + .cc_test = DEF_CTEST, + .cc_test_reg = 0, + .cc_swz = { 0, 1, 2, 3 }, + }; + return temp; +} + +static INLINE struct nv40_sreg +nv40_sr_swz(struct nv40_sreg src, int x, int y, int z, int w) +{ + struct nv40_sreg dst = src; + + dst.swz[SWZ_X] = src.swz[x]; + dst.swz[SWZ_Y] = src.swz[y]; + dst.swz[SWZ_Z] = src.swz[z]; + dst.swz[SWZ_W] = src.swz[w]; + return dst; +} + +static INLINE struct nv40_sreg +nv40_sr_neg(struct nv40_sreg src) +{ + src.negate = !src.negate; + return src; +} + +static INLINE struct nv40_sreg +nv40_sr_abs(struct nv40_sreg src) +{ + src.abs = 1; + return src; +} + +static INLINE struct nv40_sreg +nv40_sr_scale(struct nv40_sreg src, int scale) +{ + src.dst_scale = scale; + return src; +} +#endif + +#endif diff --git a/src/gallium/drivers/nv40/nv40_state.c b/src/gallium/drivers/nv40/nv40_state.c new file mode 100644 index 0000000000..255c4b294d --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state.c @@ -0,0 +1,740 @@ +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "draw/draw_context.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv40_context.h" +#include "nv40_state.h" + +static void * +nv40_blend_state_create(struct pipe_context *pipe, + const struct pipe_blend_state *cso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nouveau_grobj *curie = nv40->screen->curie; + struct nv40_blend_state *bso = CALLOC(1, sizeof(*bso)); + struct nouveau_stateobj *so = so_new(16, 0); + + if (cso->blend_enable) { + so_method(so, curie, NV40TCL_BLEND_ENABLE, 3); + 
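		/* State objects are built as method/data pairs: each so_method()
		 * call (here NV40TCL_BLEND_ENABLE with a length of 3) is followed
		 * by exactly that many so_data()/so_reloc() words -- the enable
		 * flag, then the packed source and destination blend factors. */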
so_data (so, 1); + so_data (so, (nvgl_blend_func(cso->alpha_src_factor) << 16) | + nvgl_blend_func(cso->rgb_src_factor)); + so_data (so, nvgl_blend_func(cso->alpha_dst_factor) << 16 | + nvgl_blend_func(cso->rgb_dst_factor)); + so_method(so, curie, NV40TCL_BLEND_EQUATION, 1); + so_data (so, nvgl_blend_eqn(cso->alpha_func) << 16 | + nvgl_blend_eqn(cso->rgb_func)); + } else { + so_method(so, curie, NV40TCL_BLEND_ENABLE, 1); + so_data (so, 0); + } + + so_method(so, curie, NV40TCL_COLOR_MASK, 1); + so_data (so, (((cso->colormask & PIPE_MASK_A) ? (0x01 << 24) : 0) | + ((cso->colormask & PIPE_MASK_R) ? (0x01 << 16) : 0) | + ((cso->colormask & PIPE_MASK_G) ? (0x01 << 8) : 0) | + ((cso->colormask & PIPE_MASK_B) ? (0x01 << 0) : 0))); + + if (cso->logicop_enable) { + so_method(so, curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 2); + so_data (so, 1); + so_data (so, nvgl_logicop_func(cso->logicop_func)); + } else { + so_method(so, curie, NV40TCL_COLOR_LOGIC_OP_ENABLE, 1); + so_data (so, 0); + } + + so_method(so, curie, NV40TCL_DITHER_ENABLE, 1); + so_data (so, cso->dither ? 1 : 0); + + so_ref(so, &bso->so); + bso->pipe = *cso; + return (void *)bso; +} + +static void +nv40_blend_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->blend = hwcso; + nv40->dirty |= NV40_NEW_BLEND; +} + +static void +nv40_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_blend_state *bso = hwcso; + + so_ref(NULL, &bso->so); + FREE(bso); +} + + +static INLINE unsigned +wrap_mode(unsigned wrap) { + unsigned ret; + + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + ret = NV40TCL_TEX_WRAP_S_REPEAT; + break; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + ret = NV40TCL_TEX_WRAP_S_MIRRORED_REPEAT; + break; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + ret = NV40TCL_TEX_WRAP_S_CLAMP_TO_EDGE; + break; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + ret = NV40TCL_TEX_WRAP_S_CLAMP_TO_BORDER; + break; + case PIPE_TEX_WRAP_CLAMP: + ret = NV40TCL_TEX_WRAP_S_CLAMP; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_EDGE; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP_TO_BORDER; + break; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + ret = NV40TCL_TEX_WRAP_S_MIRROR_CLAMP; + break; + default: + NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); + ret = NV40TCL_TEX_WRAP_S_REPEAT; + break; + } + + return ret >> NV40TCL_TEX_WRAP_S_SHIFT; +} + +static void * +nv40_sampler_state_create(struct pipe_context *pipe, + const struct pipe_sampler_state *cso) +{ + struct nv40_sampler_state *ps; + uint32_t filter = 0; + + ps = MALLOC(sizeof(struct nv40_sampler_state)); + + ps->fmt = 0; + if (!cso->normalized_coords) + ps->fmt |= NV40TCL_TEX_FORMAT_RECT; + + ps->wrap = ((wrap_mode(cso->wrap_s) << NV40TCL_TEX_WRAP_S_SHIFT) | + (wrap_mode(cso->wrap_t) << NV40TCL_TEX_WRAP_T_SHIFT) | + (wrap_mode(cso->wrap_r) << NV40TCL_TEX_WRAP_R_SHIFT)); + + ps->en = 0; + if (cso->max_anisotropy >= 2.0) { + /* no idea, binary driver sets it, works without it.. meh.. 
*/ + ps->wrap |= (1 << 5); + + if (cso->max_anisotropy >= 16.0) { + ps->en |= NV40TCL_TEX_ENABLE_ANISO_16X; + } else + if (cso->max_anisotropy >= 12.0) { + ps->en |= NV40TCL_TEX_ENABLE_ANISO_12X; + } else + if (cso->max_anisotropy >= 10.0) { + ps->en |= NV40TCL_TEX_ENABLE_ANISO_10X; + } else + if (cso->max_anisotropy >= 8.0) { + ps->en |= NV40TCL_TEX_ENABLE_ANISO_8X; + } else + if (cso->max_anisotropy >= 6.0) { + ps->en |= NV40TCL_TEX_ENABLE_ANISO_6X; + } else + if (cso->max_anisotropy >= 4.0) { + ps->en |= NV40TCL_TEX_ENABLE_ANISO_4X; + } else { + ps->en |= NV40TCL_TEX_ENABLE_ANISO_2X; + } + } + + switch (cso->mag_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + filter |= NV40TCL_TEX_FILTER_MAG_LINEAR; + break; + case PIPE_TEX_FILTER_NEAREST: + default: + filter |= NV40TCL_TEX_FILTER_MAG_NEAREST; + break; + } + + switch (cso->min_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV40TCL_TEX_FILTER_MIN_LINEAR_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV40TCL_TEX_FILTER_MIN_LINEAR; + break; + } + break; + case PIPE_TEX_FILTER_NEAREST: + default: + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_NEAREST: + filter |= NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_NEAREST; + break; + case PIPE_TEX_MIPFILTER_LINEAR: + filter |= NV40TCL_TEX_FILTER_MIN_NEAREST_MIPMAP_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + filter |= NV40TCL_TEX_FILTER_MIN_NEAREST; + break; + } + break; + } + + ps->filt = filter; + + { + float limit; + + limit = CLAMP(cso->lod_bias, -16.0, 15.0); + ps->filt |= (int)(cso->lod_bias * 256.0) & 0x1fff; + + limit = CLAMP(cso->max_lod, 0.0, 15.0); + ps->en |= (int)(limit * 256.0) << 7; + + limit = CLAMP(cso->min_lod, 0.0, 15.0); + ps->en |= (int)(limit * 256.0) << 19; + } + + + if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + switch (cso->compare_func) { + case PIPE_FUNC_NEVER: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_NEVER; + break; + case PIPE_FUNC_GREATER: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_GREATER; + break; + case PIPE_FUNC_EQUAL: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_EQUAL; + break; + case PIPE_FUNC_GEQUAL: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_GEQUAL; + break; + case PIPE_FUNC_LESS: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_LESS; + break; + case PIPE_FUNC_NOTEQUAL: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_NOTEQUAL; + break; + case PIPE_FUNC_LEQUAL: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_LEQUAL; + break; + case PIPE_FUNC_ALWAYS: + ps->wrap |= NV40TCL_TEX_WRAP_RCOMP_ALWAYS; + break; + default: + break; + } + } + + ps->bcol = ((float_to_ubyte(cso->border_color[3]) << 24) | + (float_to_ubyte(cso->border_color[0]) << 16) | + (float_to_ubyte(cso->border_color[1]) << 8) | + (float_to_ubyte(cso->border_color[2]) << 0)); + + return (void *)ps; +} + +static void +nv40_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ + struct nv40_context *nv40 = nv40_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + nv40->tex_sampler[unit] = sampler[unit]; + nv40->dirty_samplers |= (1 << unit); + } + + for (unit = nr; unit < nv40->nr_samplers; unit++) { + nv40->tex_sampler[unit] = NULL; + nv40->dirty_samplers |= (1 << unit); + } + + nv40->nr_samplers = nr; + nv40->dirty |= NV40_NEW_SAMPLER; +} + +static void +nv40_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void 
+nv40_set_sampler_texture(struct pipe_context *pipe, unsigned nr, + struct pipe_texture **miptree) +{ + struct nv40_context *nv40 = nv40_context(pipe); + unsigned unit; + + for (unit = 0; unit < nr; unit++) { + pipe_texture_reference((struct pipe_texture **) + &nv40->tex_miptree[unit], miptree[unit]); + nv40->dirty_samplers |= (1 << unit); + } + + for (unit = nr; unit < nv40->nr_textures; unit++) { + pipe_texture_reference((struct pipe_texture **) + &nv40->tex_miptree[unit], NULL); + nv40->dirty_samplers |= (1 << unit); + } + + nv40->nr_textures = nr; + nv40->dirty |= NV40_NEW_SAMPLER; +} + +static void * +nv40_rasterizer_state_create(struct pipe_context *pipe, + const struct pipe_rasterizer_state *cso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_rasterizer_state *rsso = CALLOC(1, sizeof(*rsso)); + struct nouveau_stateobj *so = so_new(32, 0); + struct nouveau_grobj *curie = nv40->screen->curie; + + /*XXX: ignored: + * light_twoside + * point_smooth -nohw + * multisample + */ + + so_method(so, curie, NV40TCL_SHADE_MODEL, 1); + so_data (so, cso->flatshade ? NV40TCL_SHADE_MODEL_FLAT : + NV40TCL_SHADE_MODEL_SMOOTH); + + so_method(so, curie, NV40TCL_LINE_WIDTH, 2); + so_data (so, (unsigned char)(cso->line_width * 8.0) & 0xff); + so_data (so, cso->line_smooth ? 1 : 0); + so_method(so, curie, NV40TCL_LINE_STIPPLE_ENABLE, 2); + so_data (so, cso->line_stipple_enable ? 1 : 0); + so_data (so, (cso->line_stipple_pattern << 16) | + cso->line_stipple_factor); + + so_method(so, curie, NV40TCL_POINT_SIZE, 1); + so_data (so, fui(cso->point_size)); + + so_method(so, curie, NV40TCL_POLYGON_MODE_FRONT, 6); + if (cso->front_winding == PIPE_WINDING_CCW) { + so_data(so, nvgl_polygon_mode(cso->fill_ccw)); + so_data(so, nvgl_polygon_mode(cso->fill_cw)); + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + so_data(so, NV40TCL_CULL_FACE_FRONT); + break; + case PIPE_WINDING_CW: + so_data(so, NV40TCL_CULL_FACE_BACK); + break; + case PIPE_WINDING_BOTH: + so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK); + break; + default: + so_data(so, NV40TCL_CULL_FACE_BACK); + break; + } + so_data(so, NV40TCL_FRONT_FACE_CCW); + } else { + so_data(so, nvgl_polygon_mode(cso->fill_cw)); + so_data(so, nvgl_polygon_mode(cso->fill_ccw)); + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + so_data(so, NV40TCL_CULL_FACE_BACK); + break; + case PIPE_WINDING_CW: + so_data(so, NV40TCL_CULL_FACE_FRONT); + break; + case PIPE_WINDING_BOTH: + so_data(so, NV40TCL_CULL_FACE_FRONT_AND_BACK); + break; + default: + so_data(so, NV40TCL_CULL_FACE_BACK); + break; + } + so_data(so, NV40TCL_FRONT_FACE_CW); + } + so_data(so, cso->poly_smooth ? 1 : 0); + so_data(so, (cso->cull_mode != PIPE_WINDING_NONE) ? 1 : 0); + + so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1); + so_data (so, cso->poly_stipple_enable ? 
1 : 0); + + so_method(so, curie, NV40TCL_POLYGON_OFFSET_POINT_ENABLE, 3); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_POINT) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_POINT)) + so_data(so, 1); + else + so_data(so, 0); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_LINE) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_LINE)) + so_data(so, 1); + else + so_data(so, 0); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_FILL) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_FILL)) + so_data(so, 1); + else + so_data(so, 0); + if (cso->offset_cw || cso->offset_ccw) { + so_method(so, curie, NV40TCL_POLYGON_OFFSET_FACTOR, 2); + so_data (so, fui(cso->offset_scale)); + so_data (so, fui(cso->offset_units * 2)); + } + + so_method(so, curie, NV40TCL_POINT_SPRITE, 1); + if (cso->point_sprite) { + unsigned psctl = (1 << 0), i; + + for (i = 0; i < 8; i++) { + if (cso->sprite_coord_mode[i] != PIPE_SPRITE_COORD_NONE) + psctl |= (1 << (8 + i)); + } + + so_data(so, psctl); + } else { + so_data(so, 0); + } + + so_ref(so, &rsso->so); + rsso->pipe = *cso; + return (void *)rsso; +} + +static void +nv40_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->rasterizer = hwcso; + nv40->dirty |= NV40_NEW_RAST; + nv40->draw_dirty |= NV40_NEW_RAST; +} + +static void +nv40_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_rasterizer_state *rsso = hwcso; + + so_ref(NULL, &rsso->so); + FREE(rsso); +} + +static void * +nv40_depth_stencil_alpha_state_create(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *cso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_zsa_state *zsaso = CALLOC(1, sizeof(*zsaso)); + struct nouveau_stateobj *so = so_new(32, 0); + struct nouveau_grobj *curie = nv40->screen->curie; + + so_method(so, curie, NV40TCL_DEPTH_FUNC, 3); + so_data (so, nvgl_comparison_op(cso->depth.func)); + so_data (so, cso->depth.writemask ? 1 : 0); + so_data (so, cso->depth.enabled ? 1 : 0); + + so_method(so, curie, NV40TCL_ALPHA_TEST_ENABLE, 3); + so_data (so, cso->alpha.enabled ? 1 : 0); + so_data (so, nvgl_comparison_op(cso->alpha.func)); + so_data (so, float_to_ubyte(cso->alpha.ref)); + + if (cso->stencil[0].enabled) { + so_method(so, curie, NV40TCL_STENCIL_FRONT_ENABLE, 8); + so_data (so, cso->stencil[0].enabled ? 1 : 0); + so_data (so, cso->stencil[0].write_mask); + so_data (so, nvgl_comparison_op(cso->stencil[0].func)); + so_data (so, cso->stencil[0].ref_value); + so_data (so, cso->stencil[0].value_mask); + so_data (so, nvgl_stencil_op(cso->stencil[0].fail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[0].zfail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[0].zpass_op)); + } else { + so_method(so, curie, NV40TCL_STENCIL_FRONT_ENABLE, 1); + so_data (so, 0); + } + + if (cso->stencil[1].enabled) { + so_method(so, curie, NV40TCL_STENCIL_BACK_ENABLE, 8); + so_data (so, cso->stencil[1].enabled ? 
1 : 0); + so_data (so, cso->stencil[1].write_mask); + so_data (so, nvgl_comparison_op(cso->stencil[1].func)); + so_data (so, cso->stencil[1].ref_value); + so_data (so, cso->stencil[1].value_mask); + so_data (so, nvgl_stencil_op(cso->stencil[1].fail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[1].zfail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[1].zpass_op)); + } else { + so_method(so, curie, NV40TCL_STENCIL_BACK_ENABLE, 1); + so_data (so, 0); + } + + so_ref(so, &zsaso->so); + zsaso->pipe = *cso; + return (void *)zsaso; +} + +static void +nv40_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->zsa = hwcso; + nv40->dirty |= NV40_NEW_ZSA; +} + +static void +nv40_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_zsa_state *zsaso = hwcso; + + so_ref(NULL, &zsaso->so); + FREE(zsaso); +} + +static void * +nv40_vp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_vertex_program *vp; + + vp = CALLOC(1, sizeof(struct nv40_vertex_program)); + vp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + vp->draw = draw_create_vertex_shader(nv40->draw, &vp->pipe); + + return (void *)vp; +} + +static void +nv40_vp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->vertprog = hwcso; + nv40->dirty |= NV40_NEW_VERTPROG; + nv40->draw_dirty |= NV40_NEW_VERTPROG; +} + +static void +nv40_vp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_vertex_program *vp = hwcso; + + draw_delete_vertex_shader(nv40->draw, vp->draw); + nv40_vertprog_destroy(nv40, vp); + FREE((void*)vp->pipe.tokens); + FREE(vp); +} + +static void * +nv40_fp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv40_fragment_program *fp; + + fp = CALLOC(1, sizeof(struct nv40_fragment_program)); + fp->pipe.tokens = tgsi_dup_tokens(cso->tokens); + + tgsi_scan_shader(fp->pipe.tokens, &fp->info); + + return (void *)fp; +} + +static void +nv40_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->fragprog = hwcso; + nv40->dirty |= NV40_NEW_FRAGPROG; +} + +static void +nv40_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nv40_fragment_program *fp = hwcso; + + nv40_fragprog_destroy(nv40, fp); + FREE((void*)fp->pipe.tokens); + FREE(fp); +} + +static void +nv40_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *bcol) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->blend_colour = *bcol; + nv40->dirty |= NV40_NEW_BCOL; +} + +static void +nv40_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->clip = *clip; + nv40->dirty |= NV40_NEW_UCP; + nv40->draw_dirty |= NV40_NEW_UCP; +} + +static void +nv40_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, + const struct pipe_constant_buffer *buf ) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->constbuf[shader] = buf->buffer; + nv40->constbuf_nr[shader] = buf->size / (4 * sizeof(float)); + + if (shader == PIPE_SHADER_VERTEX) { + nv40->dirty |= NV40_NEW_VERTPROG; + } else + if (shader == PIPE_SHADER_FRAGMENT) { + nv40->dirty |= 
NV40_NEW_FRAGPROG; + } +} + +static void +nv40_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->framebuffer = *fb; + nv40->dirty |= NV40_NEW_FB; +} + +static void +nv40_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + memcpy(nv40->stipple, stipple->stipple, 4 * 32); + nv40->dirty |= NV40_NEW_STIPPLE; +} + +static void +nv40_set_scissor_state(struct pipe_context *pipe, + const struct pipe_scissor_state *s) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->scissor = *s; + nv40->dirty |= NV40_NEW_SCISSOR; +} + +static void +nv40_set_viewport_state(struct pipe_context *pipe, + const struct pipe_viewport_state *vpt) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->viewport = *vpt; + nv40->dirty |= NV40_NEW_VIEWPORT; + nv40->draw_dirty |= NV40_NEW_VIEWPORT; +} + +static void +nv40_set_vertex_buffers(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_buffer *vb) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + memcpy(nv40->vtxbuf, vb, sizeof(*vb) * count); + nv40->vtxbuf_nr = count; + + nv40->dirty |= NV40_NEW_ARRAYS; + nv40->draw_dirty |= NV40_NEW_ARRAYS; +} + +static void +nv40_set_vertex_elements(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_element *ve) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + memcpy(nv40->vtxelt, ve, sizeof(*ve) * count); + nv40->vtxelt_nr = count; + + nv40->dirty |= NV40_NEW_ARRAYS; + nv40->draw_dirty |= NV40_NEW_ARRAYS; +} + +static void +nv40_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ + struct nv40_context *nv40 = nv40_context(pipe); + + nv40->edgeflags = bitfield; + nv40->dirty |= NV40_NEW_ARRAYS; + nv40->draw_dirty |= NV40_NEW_ARRAYS; +} + +void +nv40_init_state_functions(struct nv40_context *nv40) +{ + nv40->pipe.create_blend_state = nv40_blend_state_create; + nv40->pipe.bind_blend_state = nv40_blend_state_bind; + nv40->pipe.delete_blend_state = nv40_blend_state_delete; + + nv40->pipe.create_sampler_state = nv40_sampler_state_create; + nv40->pipe.bind_sampler_states = nv40_sampler_state_bind; + nv40->pipe.delete_sampler_state = nv40_sampler_state_delete; + nv40->pipe.set_sampler_textures = nv40_set_sampler_texture; + + nv40->pipe.create_rasterizer_state = nv40_rasterizer_state_create; + nv40->pipe.bind_rasterizer_state = nv40_rasterizer_state_bind; + nv40->pipe.delete_rasterizer_state = nv40_rasterizer_state_delete; + + nv40->pipe.create_depth_stencil_alpha_state = + nv40_depth_stencil_alpha_state_create; + nv40->pipe.bind_depth_stencil_alpha_state = + nv40_depth_stencil_alpha_state_bind; + nv40->pipe.delete_depth_stencil_alpha_state = + nv40_depth_stencil_alpha_state_delete; + + nv40->pipe.create_vs_state = nv40_vp_state_create; + nv40->pipe.bind_vs_state = nv40_vp_state_bind; + nv40->pipe.delete_vs_state = nv40_vp_state_delete; + + nv40->pipe.create_fs_state = nv40_fp_state_create; + nv40->pipe.bind_fs_state = nv40_fp_state_bind; + nv40->pipe.delete_fs_state = nv40_fp_state_delete; + + nv40->pipe.set_blend_color = nv40_set_blend_color; + nv40->pipe.set_clip_state = nv40_set_clip_state; + nv40->pipe.set_constant_buffer = nv40_set_constant_buffer; + nv40->pipe.set_framebuffer_state = nv40_set_framebuffer_state; + nv40->pipe.set_polygon_stipple = nv40_set_polygon_stipple; + nv40->pipe.set_scissor_state = nv40_set_scissor_state; + 
nv40->pipe.set_viewport_state = nv40_set_viewport_state; + + nv40->pipe.set_edgeflags = nv40_set_edgeflags; + nv40->pipe.set_vertex_buffers = nv40_set_vertex_buffers; + nv40->pipe.set_vertex_elements = nv40_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv40/nv40_state.h b/src/gallium/drivers/nv40/nv40_state.h new file mode 100644 index 0000000000..9c55903ae3 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state.h @@ -0,0 +1,91 @@ +#ifndef __NV40_STATE_H__ +#define __NV40_STATE_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv40_sampler_state { + uint32_t fmt; + uint32_t wrap; + uint32_t en; + uint32_t filt; + uint32_t bcol; +}; + +struct nv40_vertex_program_exec { + uint32_t data[4]; + boolean has_branch_offset; + int const_index; +}; + +struct nv40_vertex_program_data { + int index; /* immediates == -1 */ + float value[4]; +}; + +struct nv40_vertex_program { + struct pipe_shader_state pipe; + + struct draw_vertex_shader *draw; + + boolean translated; + + struct pipe_clip_state ucp; + + struct nv40_vertex_program_exec *insns; + unsigned nr_insns; + struct nv40_vertex_program_data *consts; + unsigned nr_consts; + + struct nouveau_resource *exec; + unsigned exec_start; + struct nouveau_resource *data; + unsigned data_start; + unsigned data_start_min; + + uint32_t ir; + uint32_t or; + uint32_t clip_ctrl; + struct nouveau_stateobj *so; +}; + +struct nv40_fragment_program_data { + unsigned offset; + unsigned index; +}; + +struct nv40_fragment_program { + struct pipe_shader_state pipe; + struct tgsi_shader_info info; + + boolean translated; + unsigned samplers; + + uint32_t *insn; + int insn_len; + + struct nv40_fragment_program_data *consts; + unsigned nr_consts; + + struct pipe_buffer *buffer; + + uint32_t fp_control; + struct nouveau_stateobj *so; +}; + +struct nv40_miptree { + struct pipe_texture base; + + struct pipe_buffer *buffer; + uint total_size; + + struct pipe_texture *shadow_tex; + struct pipe_surface *shadow_surface; + + struct { + uint pitch; + uint *image_offset; + } level[PIPE_MAX_TEXTURE_LEVELS]; +}; + +#endif diff --git a/src/gallium/drivers/nv40/nv40_state_blend.c b/src/gallium/drivers/nv40/nv40_state_blend.c new file mode 100644 index 0000000000..95e6d7394f --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_blend.c @@ -0,0 +1,40 @@ +#include "nv40_context.h" + +static boolean +nv40_state_blend_validate(struct nv40_context *nv40) +{ + so_ref(nv40->blend->so, &nv40->state.hw[NV40_STATE_BLEND]); + return TRUE; +} + +struct nv40_state_entry nv40_state_blend = { + .validate = nv40_state_blend_validate, + .dirty = { + .pipe = NV40_NEW_BLEND, + .hw = NV40_STATE_BLEND + } +}; + +static boolean +nv40_state_blend_colour_validate(struct nv40_context *nv40) +{ + struct nouveau_stateobj *so = so_new(2, 0); + struct pipe_blend_color *bcol = &nv40->blend_colour; + + so_method(so, nv40->screen->curie, NV40TCL_BLEND_COLOR, 1); + so_data (so, ((float_to_ubyte(bcol->color[3]) << 24) | + (float_to_ubyte(bcol->color[0]) << 16) | + (float_to_ubyte(bcol->color[1]) << 8) | + (float_to_ubyte(bcol->color[2]) << 0))); + + so_ref(so, &nv40->state.hw[NV40_STATE_BCOL]); + return TRUE; +} + +struct nv40_state_entry nv40_state_blend_colour = { + .validate = nv40_state_blend_colour_validate, + .dirty = { + .pipe = NV40_NEW_BCOL, + .hw = NV40_STATE_BCOL + } +}; diff --git a/src/gallium/drivers/nv40/nv40_state_emit.c b/src/gallium/drivers/nv40/nv40_state_emit.c new file mode 100644 index 0000000000..ab88dc416e --- /dev/null +++ 
b/src/gallium/drivers/nv40/nv40_state_emit.c @@ -0,0 +1,184 @@ +#include "nv40_context.h" +#include "nv40_state.h" +#include "draw/draw_context.h" + +static struct nv40_state_entry *render_states[] = { + &nv40_state_framebuffer, + &nv40_state_rasterizer, + &nv40_state_scissor, + &nv40_state_stipple, + &nv40_state_fragprog, + &nv40_state_fragtex, + &nv40_state_vertprog, + &nv40_state_blend, + &nv40_state_blend_colour, + &nv40_state_zsa, + &nv40_state_viewport, + &nv40_state_vbo, + NULL +}; + +static struct nv40_state_entry *swtnl_states[] = { + &nv40_state_framebuffer, + &nv40_state_rasterizer, + &nv40_state_scissor, + &nv40_state_stipple, + &nv40_state_fragprog, + &nv40_state_fragtex, + &nv40_state_vertprog, + &nv40_state_blend, + &nv40_state_blend_colour, + &nv40_state_zsa, + &nv40_state_viewport, + &nv40_state_vtxfmt, + NULL +}; + +static void +nv40_state_do_validate(struct nv40_context *nv40, + struct nv40_state_entry **states) +{ + const struct pipe_framebuffer_state *fb = &nv40->framebuffer; + unsigned i; + + for (i = 0; i < fb->num_cbufs; i++) + fb->cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED; + if (fb->zsbuf) + fb->zsbuf->status = PIPE_SURFACE_STATUS_DEFINED; + + while (*states) { + struct nv40_state_entry *e = *states; + + if (nv40->dirty & e->dirty.pipe) { + if (e->validate(nv40)) + nv40->state.dirty |= (1ULL << e->dirty.hw); + } + + states++; + } + nv40->dirty = 0; +} + +void +nv40_state_emit(struct nv40_context *nv40) +{ + struct nv40_state *state = &nv40->state; + struct nv40_screen *screen = nv40->screen; + unsigned i, samplers; + uint64 states; + + if (nv40->pctx_id != screen->cur_pctx) { + for (i = 0; i < NV40_STATE_MAX; i++) { + if (state->hw[i] && screen->state[i] != state->hw[i]) + state->dirty |= (1ULL << i); + } + + screen->cur_pctx = nv40->pctx_id; + } + + for (i = 0, states = state->dirty; states; i++) { + if (!(states & (1ULL << i))) + continue; + so_ref (state->hw[i], &nv40->screen->state[i]); + if (state->hw[i]) + so_emit(nv40->nvws, nv40->screen->state[i]); + states &= ~(1ULL << i); + } + + if (state->dirty & ((1ULL << NV40_STATE_FRAGPROG) | + (1ULL << NV40_STATE_FRAGTEX0))) { + BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1); + OUT_RING (2); + BEGIN_RING(curie, NV40TCL_TEX_CACHE_CTL, 1); + OUT_RING (1); + } + + state->dirty = 0; + + so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FB]); + for (i = 0, samplers = state->fp_samplers; i < 16 && samplers; i++) { + if (!(samplers & (1 << i))) + continue; + so_emit_reloc_markers(nv40->nvws, + state->hw[NV40_STATE_FRAGTEX0+i]); + samplers &= ~(1ULL << i); + } + so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_FRAGPROG]); + if (state->hw[NV40_STATE_VTXBUF] && nv40->render_mode == HW) + so_emit_reloc_markers(nv40->nvws, state->hw[NV40_STATE_VTXBUF]); +} + +boolean +nv40_state_validate(struct nv40_context *nv40) +{ + boolean was_sw = nv40->fallback_swtnl ? TRUE : FALSE; + + if (nv40->render_mode != HW) { + /* Don't even bother trying to go back to hw if none + * of the states that caused swtnl previously have changed. 
+ */ + if ((nv40->fallback_swtnl & nv40->dirty) + != nv40->fallback_swtnl) + return FALSE; + + /* Attempt to go to hwtnl again */ + nv40->pipe.flush(&nv40->pipe, 0, NULL); + nv40->dirty |= (NV40_NEW_VIEWPORT | + NV40_NEW_VERTPROG | + NV40_NEW_ARRAYS); + nv40->render_mode = HW; + } + + nv40_state_do_validate(nv40, render_states); + if (nv40->fallback_swtnl || nv40->fallback_swrast) + return FALSE; + + if (was_sw) + NOUVEAU_ERR("swtnl->hw\n"); + + return TRUE; +} + +boolean +nv40_state_validate_swtnl(struct nv40_context *nv40) +{ + struct draw_context *draw = nv40->draw; + + /* Setup for swtnl */ + if (nv40->render_mode == HW) { + NOUVEAU_ERR("hw->swtnl 0x%08x\n", nv40->fallback_swtnl); + nv40->pipe.flush(&nv40->pipe, 0, NULL); + nv40->dirty |= (NV40_NEW_VIEWPORT | + NV40_NEW_VERTPROG | + NV40_NEW_ARRAYS); + nv40->render_mode = SWTNL; + } + + if (nv40->draw_dirty & NV40_NEW_VERTPROG) + draw_bind_vertex_shader(draw, nv40->vertprog->draw); + + if (nv40->draw_dirty & NV40_NEW_RAST) + draw_set_rasterizer_state(draw, &nv40->rasterizer->pipe); + + if (nv40->draw_dirty & NV40_NEW_UCP) + draw_set_clip_state(draw, &nv40->clip); + + if (nv40->draw_dirty & NV40_NEW_VIEWPORT) + draw_set_viewport_state(draw, &nv40->viewport); + + if (nv40->draw_dirty & NV40_NEW_ARRAYS) { + draw_set_edgeflags(draw, nv40->edgeflags); + draw_set_vertex_buffers(draw, nv40->vtxbuf_nr, nv40->vtxbuf); + draw_set_vertex_elements(draw, nv40->vtxelt_nr, nv40->vtxelt); + } + + nv40_state_do_validate(nv40, swtnl_states); + if (nv40->fallback_swrast) { + NOUVEAU_ERR("swtnl->swrast 0x%08x\n", nv40->fallback_swrast); + return FALSE; + } + + nv40->draw_dirty = 0; + return TRUE; +} + diff --git a/src/gallium/drivers/nv40/nv40_state_fb.c b/src/gallium/drivers/nv40/nv40_state_fb.c new file mode 100644 index 0000000000..28592d71c3 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_fb.c @@ -0,0 +1,155 @@ +#include "nv40_context.h" +#include "nouveau/nouveau_util.h" + +static boolean +nv40_state_framebuffer_validate(struct nv40_context *nv40) +{ + struct pipe_framebuffer_state *fb = &nv40->framebuffer; + struct pipe_surface *rt[4], *zeta; + uint32_t rt_enable, rt_format; + int i, colour_format = 0, zeta_format = 0; + struct nouveau_stateobj *so = so_new(64, 10); + unsigned rt_flags = NOUVEAU_BO_RDWR | NOUVEAU_BO_VRAM; + unsigned w = fb->width; + unsigned h = fb->height; + + rt_enable = 0; + for (i = 0; i < fb->num_cbufs; i++) { + if (colour_format) { + assert(colour_format == fb->cbufs[i]->format); + } else { + colour_format = fb->cbufs[i]->format; + rt_enable |= (NV40TCL_RT_ENABLE_COLOR0 << i); + rt[i] = fb->cbufs[i]; + } + } + + if (rt_enable & (NV40TCL_RT_ENABLE_COLOR1 | NV40TCL_RT_ENABLE_COLOR2 | + NV40TCL_RT_ENABLE_COLOR3)) + rt_enable |= NV40TCL_RT_ENABLE_MRT; + + if (fb->zsbuf) { + zeta_format = fb->zsbuf->format; + zeta = fb->zsbuf; + } + + if (!(rt[0]->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)) { + assert(!(fb->width & (fb->width - 1)) && !(fb->height & (fb->height - 1))); + for (i = 1; i < fb->num_cbufs; i++) + assert(!(rt[i]->texture->tex_usage & NOUVEAU_TEXTURE_USAGE_LINEAR)); + + rt_format = NV40TCL_RT_FORMAT_TYPE_SWIZZLED | + log2i(fb->width) << NV40TCL_RT_FORMAT_LOG2_WIDTH_SHIFT | + log2i(fb->height) << NV40TCL_RT_FORMAT_LOG2_HEIGHT_SHIFT; + } + else + rt_format = NV40TCL_RT_FORMAT_TYPE_LINEAR; + + switch (colour_format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case 0: + rt_format |= NV40TCL_RT_FORMAT_COLOR_A8R8G8B8; + break; + case PIPE_FORMAT_R5G6B5_UNORM: + rt_format |= NV40TCL_RT_FORMAT_COLOR_R5G6B5; + break; + 
default: + assert(0); + } + + switch (zeta_format) { + case PIPE_FORMAT_Z16_UNORM: + rt_format |= NV40TCL_RT_FORMAT_ZETA_Z16; + break; + case PIPE_FORMAT_Z24S8_UNORM: + case 0: + rt_format |= NV40TCL_RT_FORMAT_ZETA_Z24S8; + break; + default: + assert(0); + } + + if (rt_enable & NV40TCL_RT_ENABLE_COLOR0) { + so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR0, 1); + so_reloc (so, rt[0]->buffer, 0, rt_flags | NOUVEAU_BO_OR, + nv40->nvws->channel->vram->handle, + nv40->nvws->channel->gart->handle); + so_method(so, nv40->screen->curie, NV40TCL_COLOR0_PITCH, 2); + so_data (so, rt[0]->stride); + so_reloc (so, rt[0]->buffer, rt[0]->offset, rt_flags | + NOUVEAU_BO_LOW, 0, 0); + } + + if (rt_enable & NV40TCL_RT_ENABLE_COLOR1) { + so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR1, 1); + so_reloc (so, rt[1]->buffer, 0, rt_flags | NOUVEAU_BO_OR, + nv40->nvws->channel->vram->handle, + nv40->nvws->channel->gart->handle); + so_method(so, nv40->screen->curie, NV40TCL_COLOR1_OFFSET, 2); + so_reloc (so, rt[1]->buffer, rt[1]->offset, rt_flags | + NOUVEAU_BO_LOW, 0, 0); + so_data (so, rt[1]->stride); + } + + if (rt_enable & NV40TCL_RT_ENABLE_COLOR2) { + so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR2, 1); + so_reloc (so, rt[2]->buffer, 0, rt_flags | NOUVEAU_BO_OR, + nv40->nvws->channel->vram->handle, + nv40->nvws->channel->gart->handle); + so_method(so, nv40->screen->curie, NV40TCL_COLOR2_OFFSET, 1); + so_reloc (so, rt[2]->buffer, rt[2]->offset, rt_flags | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, nv40->screen->curie, NV40TCL_COLOR2_PITCH, 1); + so_data (so, rt[2]->stride); + } + + if (rt_enable & NV40TCL_RT_ENABLE_COLOR3) { + so_method(so, nv40->screen->curie, NV40TCL_DMA_COLOR3, 1); + so_reloc (so, rt[3]->buffer, 0, rt_flags | NOUVEAU_BO_OR, + nv40->nvws->channel->vram->handle, + nv40->nvws->channel->gart->handle); + so_method(so, nv40->screen->curie, NV40TCL_COLOR3_OFFSET, 1); + so_reloc (so, rt[3]->buffer, rt[3]->offset, rt_flags | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, nv40->screen->curie, NV40TCL_COLOR3_PITCH, 1); + so_data (so, rt[3]->stride); + } + + if (zeta_format) { + so_method(so, nv40->screen->curie, NV40TCL_DMA_ZETA, 1); + so_reloc (so, zeta->buffer, 0, rt_flags | NOUVEAU_BO_OR, + nv40->nvws->channel->vram->handle, + nv40->nvws->channel->gart->handle); + so_method(so, nv40->screen->curie, NV40TCL_ZETA_OFFSET, 1); + so_reloc (so, zeta->buffer, zeta->offset, rt_flags | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, nv40->screen->curie, NV40TCL_ZETA_PITCH, 1); + so_data (so, zeta->stride); + } + + so_method(so, nv40->screen->curie, NV40TCL_RT_ENABLE, 1); + so_data (so, rt_enable); + so_method(so, nv40->screen->curie, NV40TCL_RT_HORIZ, 3); + so_data (so, (w << 16) | 0); + so_data (so, (h << 16) | 0); + so_data (so, rt_format); + so_method(so, nv40->screen->curie, NV40TCL_VIEWPORT_HORIZ, 2); + so_data (so, (w << 16) | 0); + so_data (so, (h << 16) | 0); + so_method(so, nv40->screen->curie, NV40TCL_VIEWPORT_CLIP_HORIZ(0), 2); + so_data (so, ((w - 1) << 16) | 0); + so_data (so, ((h - 1) << 16) | 0); + so_method(so, nv40->screen->curie, 0x1d88, 1); + so_data (so, (1 << 12) | h); + + so_ref(so, &nv40->state.hw[NV40_STATE_FB]); + return TRUE; +} + +struct nv40_state_entry nv40_state_framebuffer = { + .validate = nv40_state_framebuffer_validate, + .dirty = { + .pipe = NV40_NEW_FB, + .hw = NV40_STATE_FB + } +}; diff --git a/src/gallium/drivers/nv40/nv40_state_rasterizer.c b/src/gallium/drivers/nv40/nv40_state_rasterizer.c new file mode 100644 index 0000000000..9ecda5990f --- /dev/null +++ 
b/src/gallium/drivers/nv40/nv40_state_rasterizer.c @@ -0,0 +1,17 @@ +#include "nv40_context.h" + +static boolean +nv40_state_rasterizer_validate(struct nv40_context *nv40) +{ + so_ref(nv40->rasterizer->so, + &nv40->state.hw[NV40_STATE_RAST]); + return TRUE; +} + +struct nv40_state_entry nv40_state_rasterizer = { + .validate = nv40_state_rasterizer_validate, + .dirty = { + .pipe = NV40_NEW_RAST, + .hw = NV40_STATE_RAST + } +}; diff --git a/src/gallium/drivers/nv40/nv40_state_scissor.c b/src/gallium/drivers/nv40/nv40_state_scissor.c new file mode 100644 index 0000000000..285239ef41 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_scissor.c @@ -0,0 +1,35 @@ +#include "nv40_context.h" + +static boolean +nv40_state_scissor_validate(struct nv40_context *nv40) +{ + struct pipe_rasterizer_state *rast = &nv40->rasterizer->pipe; + struct pipe_scissor_state *s = &nv40->scissor; + struct nouveau_stateobj *so; + + if (nv40->state.hw[NV40_STATE_SCISSOR] && + (rast->scissor == 0 && nv40->state.scissor_enabled == 0)) + return FALSE; + nv40->state.scissor_enabled = rast->scissor; + + so = so_new(3, 0); + so_method(so, nv40->screen->curie, NV40TCL_SCISSOR_HORIZ, 2); + if (nv40->state.scissor_enabled) { + so_data (so, ((s->maxx - s->minx) << 16) | s->minx); + so_data (so, ((s->maxy - s->miny) << 16) | s->miny); + } else { + so_data (so, 4096 << 16); + so_data (so, 4096 << 16); + } + + so_ref(so, &nv40->state.hw[NV40_STATE_SCISSOR]); + return TRUE; +} + +struct nv40_state_entry nv40_state_scissor = { + .validate = nv40_state_scissor_validate, + .dirty = { + .pipe = NV40_NEW_SCISSOR | NV40_NEW_RAST, + .hw = NV40_STATE_SCISSOR + } +}; diff --git a/src/gallium/drivers/nv40/nv40_state_stipple.c b/src/gallium/drivers/nv40/nv40_state_stipple.c new file mode 100644 index 0000000000..b51024ad9b --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_stipple.c @@ -0,0 +1,39 @@ +#include "nv40_context.h" + +static boolean +nv40_state_stipple_validate(struct nv40_context *nv40) +{ + struct pipe_rasterizer_state *rast = &nv40->rasterizer->pipe; + struct nouveau_grobj *curie = nv40->screen->curie; + struct nouveau_stateobj *so; + + if (nv40->state.hw[NV40_STATE_STIPPLE] && + (rast->poly_stipple_enable == 0 && nv40->state.stipple_enabled == 0)) + return FALSE; + + if (rast->poly_stipple_enable) { + unsigned i; + + so = so_new(35, 0); + so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1); + so_data (so, 1); + so_method(so, curie, NV40TCL_POLYGON_STIPPLE_PATTERN(0), 32); + for (i = 0; i < 32; i++) + so_data(so, nv40->stipple[i]); + } else { + so = so_new(2, 0); + so_method(so, curie, NV40TCL_POLYGON_STIPPLE_ENABLE, 1); + so_data (so, 0); + } + + so_ref(so, &nv40->state.hw[NV40_STATE_STIPPLE]); + return TRUE; +} + +struct nv40_state_entry nv40_state_stipple = { + .validate = nv40_state_stipple_validate, + .dirty = { + .pipe = NV40_NEW_STIPPLE | NV40_NEW_RAST, + .hw = NV40_STATE_STIPPLE, + } +}; diff --git a/src/gallium/drivers/nv40/nv40_state_viewport.c b/src/gallium/drivers/nv40/nv40_state_viewport.c new file mode 100644 index 0000000000..869a55b405 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_viewport.c @@ -0,0 +1,67 @@ +#include "nv40_context.h" + +static boolean +nv40_state_viewport_validate(struct nv40_context *nv40) +{ + struct pipe_viewport_state *vpt = &nv40->viewport; + struct nouveau_stateobj *so; + unsigned bypass; + + if (nv40->render_mode == HW && !nv40->rasterizer->pipe.bypass_clipping) + bypass = 0; + else + bypass = 1; + + if (nv40->state.hw[NV40_STATE_VIEWPORT] && + (bypass || 
!(nv40->dirty & NV40_NEW_VIEWPORT)) && + nv40->state.viewport_bypass == bypass) + return FALSE; + nv40->state.viewport_bypass = bypass; + + so = so_new(11, 0); + if (!bypass) { + so_method(so, nv40->screen->curie, + NV40TCL_VIEWPORT_TRANSLATE_X, 8); + so_data (so, fui(vpt->translate[0])); + so_data (so, fui(vpt->translate[1])); + so_data (so, fui(vpt->translate[2])); + so_data (so, fui(vpt->translate[3])); + so_data (so, fui(vpt->scale[0])); + so_data (so, fui(vpt->scale[1])); + so_data (so, fui(vpt->scale[2])); + so_data (so, fui(vpt->scale[3])); + so_method(so, nv40->screen->curie, 0x1d78, 1); + so_data (so, 1); + } else { + so_method(so, nv40->screen->curie, + NV40TCL_VIEWPORT_TRANSLATE_X, 8); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(0.0)); + so_data (so, fui(1.0)); + so_data (so, fui(1.0)); + so_data (so, fui(1.0)); + so_data (so, fui(0.0)); + /* Not entirely certain what this is yet. The DDX also uses this + * value, as it fixes rendering when you pass pre-transformed + * vertices to the GPU. My best guess is that this bypasses some + * culling/clipping stage. Worth noting that points/lines are + * unaffected by whatever this value fixes; only filled polygons + * are affected. + */ + so_method(so, nv40->screen->curie, 0x1d78, 1); + so_data (so, 0x110); + } + + so_ref(so, &nv40->state.hw[NV40_STATE_VIEWPORT]); + return TRUE; +} + +struct nv40_state_entry nv40_state_viewport = { + .validate = nv40_state_viewport_validate, + .dirty = { + .pipe = NV40_NEW_VIEWPORT | NV40_NEW_RAST, + .hw = NV40_STATE_VIEWPORT + } +}; diff --git a/src/gallium/drivers/nv40/nv40_state_zsa.c b/src/gallium/drivers/nv40/nv40_state_zsa.c new file mode 100644 index 0000000000..fb760677c8 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_state_zsa.c @@ -0,0 +1,17 @@ +#include "nv40_context.h" + +static boolean +nv40_state_zsa_validate(struct nv40_context *nv40) +{ + so_ref(nv40->zsa->so, + &nv40->state.hw[NV40_STATE_ZSA]); + return TRUE; +} + +struct nv40_state_entry nv40_state_zsa = { + .validate = nv40_state_zsa_validate, + .dirty = { + .pipe = NV40_NEW_ZSA, + .hw = NV40_STATE_ZSA + } +}; diff --git a/src/gallium/drivers/nv40/nv40_surface.c b/src/gallium/drivers/nv40/nv40_surface.c new file mode 100644 index 0000000000..576af7c59e --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_surface.c @@ -0,0 +1,77 @@ + +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "nv40_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_winsys.h" +#include "pipe/p_inlines.h" + +#include "util/u_tile.h" + +static void +nv40_surface_copy(struct pipe_context *pipe, boolean do_flip, + struct pipe_surface *dest, unsigned destx, unsigned desty, + struct pipe_surface *src, unsigned srcx, unsigned srcy, + unsigned width, unsigned height) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nouveau_winsys *nvws = nv40->nvws; + + if (do_flip) { + /*XXX: This dodgyness will do for now for correctness. But, + * need to investigate whether the 2D engine is able to + * manage a flip (perhaps SIFM?), if not, use the 3D engine + */ + desty += height; + while (height--) { + nvws->surface_copy(nvws, dest, destx, desty--, src, + srcx, srcy++, width, 1); + } + } else { + nvws->surface_copy(nvws, dest, destx, desty, src, srcx, srcy, + width, height); + } +} + +static void +nv40_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, + unsigned destx, unsigned desty, unsigned width, + unsigned height, unsigned value) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nouveau_winsys *nvws = nv40->nvws; + + nvws->surface_fill(nvws, dest, destx, desty, width, height, value); +} + +void +nv40_init_surface_functions(struct nv40_context *nv40) +{ + nv40->pipe.surface_copy = nv40_surface_copy; + nv40->pipe.surface_fill = nv40_surface_fill; +} diff --git a/src/gallium/drivers/nv40/nv40_vbo.c b/src/gallium/drivers/nv40/nv40_vbo.c new file mode 100644 index 0000000000..09f6e79d32 --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_vbo.c @@ -0,0 +1,555 @@ +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv40_context.h" +#include "nv40_state.h" + +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_pushbuf.h" +#include "nouveau/nouveau_util.h" + +#define FORCE_SWTNL 0 + +static INLINE int +nv40_vbo_format_to_hw(enum pipe_format pipe, unsigned *fmt, unsigned *ncomp) +{ + switch (pipe) { + case PIPE_FORMAT_R32_FLOAT: + case PIPE_FORMAT_R32G32_FLOAT: + case PIPE_FORMAT_R32G32B32_FLOAT: + case PIPE_FORMAT_R32G32B32A32_FLOAT: + *fmt = NV40TCL_VTXFMT_TYPE_FLOAT; + break; + case PIPE_FORMAT_R8_UNORM: + case PIPE_FORMAT_R8G8_UNORM: + case PIPE_FORMAT_R8G8B8_UNORM: + case PIPE_FORMAT_R8G8B8A8_UNORM: + *fmt = NV40TCL_VTXFMT_TYPE_UBYTE; + break; + case PIPE_FORMAT_R16_SSCALED: + case PIPE_FORMAT_R16G16_SSCALED: + case PIPE_FORMAT_R16G16B16_SSCALED: + case PIPE_FORMAT_R16G16B16A16_SSCALED: + *fmt = NV40TCL_VTXFMT_TYPE_USHORT; + break; + default: + NOUVEAU_ERR("Unknown format %s\n", pf_name(pipe)); + return 1; + } + + switch (pipe) { + case PIPE_FORMAT_R8_UNORM: + case PIPE_FORMAT_R32_FLOAT: + case PIPE_FORMAT_R16_SSCALED: + *ncomp = 1; + break; + case PIPE_FORMAT_R8G8_UNORM: + case PIPE_FORMAT_R32G32_FLOAT: + case PIPE_FORMAT_R16G16_SSCALED: + *ncomp = 2; + break; + case PIPE_FORMAT_R8G8B8_UNORM: + case PIPE_FORMAT_R32G32B32_FLOAT: + case PIPE_FORMAT_R16G16B16_SSCALED: + *ncomp = 3; + break; + case PIPE_FORMAT_R8G8B8A8_UNORM: + case PIPE_FORMAT_R32G32B32A32_FLOAT: + case PIPE_FORMAT_R16G16B16A16_SSCALED: + *ncomp = 4; + break; + default: + NOUVEAU_ERR("Unknown format %s\n", 
pf_name(pipe)); + return 1; + } + + return 0; +} + +static boolean +nv40_vbo_set_idxbuf(struct nv40_context *nv40, struct pipe_buffer *ib, + unsigned ib_size) +{ + struct pipe_screen *pscreen = &nv40->screen->pipe; + unsigned type; + + if (!ib) { + nv40->idxbuf = NULL; + nv40->idxbuf_format = 0xdeadbeef; + return FALSE; + } + + if (!pscreen->get_param(pscreen, NOUVEAU_CAP_HW_IDXBUF) || ib_size == 1) + return FALSE; + + switch (ib_size) { + case 2: + type = NV40TCL_IDXBUF_FORMAT_TYPE_U16; + break; + case 4: + type = NV40TCL_IDXBUF_FORMAT_TYPE_U32; + break; + default: + return FALSE; + } + + if (ib != nv40->idxbuf || + type != nv40->idxbuf_format) { + nv40->dirty |= NV40_NEW_ARRAYS; + nv40->idxbuf = ib; + nv40->idxbuf_format = type; + } + + return TRUE; +} + +static boolean +nv40_vbo_static_attrib(struct nv40_context *nv40, struct nouveau_stateobj *so, + int attrib, struct pipe_vertex_element *ve, + struct pipe_vertex_buffer *vb) +{ + struct pipe_winsys *ws = nv40->pipe.winsys; + struct nouveau_grobj *curie = nv40->screen->curie; + unsigned type, ncomp; + void *map; + + if (nv40_vbo_format_to_hw(ve->src_format, &type, &ncomp)) + return FALSE; + + map = ws->buffer_map(ws, vb->buffer, PIPE_BUFFER_USAGE_CPU_READ); + map += vb->buffer_offset + ve->src_offset; + + switch (type) { + case NV40TCL_VTXFMT_TYPE_FLOAT: + { + float *v = map; + + switch (ncomp) { + case 4: + so_method(so, curie, NV40TCL_VTX_ATTR_4F_X(attrib), 4); + so_data (so, fui(v[0])); + so_data (so, fui(v[1])); + so_data (so, fui(v[2])); + so_data (so, fui(v[3])); + break; + case 3: + so_method(so, curie, NV40TCL_VTX_ATTR_3F_X(attrib), 3); + so_data (so, fui(v[0])); + so_data (so, fui(v[1])); + so_data (so, fui(v[2])); + break; + case 2: + so_method(so, curie, NV40TCL_VTX_ATTR_2F_X(attrib), 2); + so_data (so, fui(v[0])); + so_data (so, fui(v[1])); + break; + case 1: + so_method(so, curie, NV40TCL_VTX_ATTR_1F(attrib), 1); + so_data (so, fui(v[0])); + break; + default: + ws->buffer_unmap(ws, vb->buffer); + return FALSE; + } + } + break; + default: + ws->buffer_unmap(ws, vb->buffer); + return FALSE; + } + + ws->buffer_unmap(ws, vb->buffer); + + return TRUE; +} + +boolean +nv40_draw_arrays(struct pipe_context *pipe, + unsigned mode, unsigned start, unsigned count) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nouveau_channel *chan = nv40->nvws->channel; + unsigned restart; + + nv40_vbo_set_idxbuf(nv40, NULL, 0); + if (FORCE_SWTNL || !nv40_state_validate(nv40)) { + return nv40_draw_elements_swtnl(pipe, NULL, 0, + mode, start, count); + } + + while (count) { + unsigned vc, nr; + + nv40_state_emit(nv40); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256, + mode, start, count, &restart); + if (!vc) { + FIRE_RING(NULL); + continue; + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + nr = (vc & 0xff); + if (nr) { + BEGIN_RING(curie, NV40TCL_VB_VERTEX_BATCH, 1); + OUT_RING (((nr - 1) << 24) | start); + start += nr; + } + + nr = vc >> 8; + while (nr) { + unsigned push = nr > 2047 ? 
2047 : nr; + + nr -= push; + + BEGIN_RING_NI(curie, NV40TCL_VB_VERTEX_BATCH, push); + while (push--) { + OUT_RING(((0x100 - 1) << 24) | start); + start += 0x100; + } + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (0); + + count -= vc; + start = restart; + } + + pipe->flush(pipe, 0, NULL); + return TRUE; +} + +static INLINE void +nv40_draw_elements_u08(struct nv40_context *nv40, void *ib, + unsigned mode, unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv40->nvws->channel; + + while (count) { + uint8_t *elts = (uint8_t *)ib + start; + unsigned vc, push, restart; + + nv40_state_emit(nv40); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2, + mode, start, count, &restart); + if (vc == 0) { + FIRE_RING(NULL); + continue; + } + count -= vc; + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + if (vc & 1) { + BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1); + OUT_RING (elts[0]); + elts++; vc--; + } + + while (vc) { + unsigned i; + + push = MIN2(vc, 2047 * 2); + + BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1); + for (i = 0; i < push; i+=2) + OUT_RING((elts[i+1] << 16) | elts[i]); + + vc -= push; + elts += push; + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (0); + + start = restart; + } +} + +static INLINE void +nv40_draw_elements_u16(struct nv40_context *nv40, void *ib, + unsigned mode, unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv40->nvws->channel; + + while (count) { + uint16_t *elts = (uint16_t *)ib + start; + unsigned vc, push, restart; + + nv40_state_emit(nv40); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 2, + mode, start, count, &restart); + if (vc == 0) { + FIRE_RING(NULL); + continue; + } + count -= vc; + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + if (vc & 1) { + BEGIN_RING(curie, NV40TCL_VB_ELEMENT_U32, 1); + OUT_RING (elts[0]); + elts++; vc--; + } + + while (vc) { + unsigned i; + + push = MIN2(vc, 2047 * 2); + + BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U16, push >> 1); + for (i = 0; i < push; i+=2) + OUT_RING((elts[i+1] << 16) | elts[i]); + + vc -= push; + elts += push; + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (0); + + start = restart; + } +} + +static INLINE void +nv40_draw_elements_u32(struct nv40_context *nv40, void *ib, + unsigned mode, unsigned start, unsigned count) +{ + struct nouveau_channel *chan = nv40->nvws->channel; + + while (count) { + uint32_t *elts = (uint32_t *)ib + start; + unsigned vc, push, restart; + + nv40_state_emit(nv40); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 5, 1, + mode, start, count, &restart); + if (vc == 0) { + FIRE_RING(NULL); + continue; + } + count -= vc; + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + while (vc) { + push = MIN2(vc, 2047); + + BEGIN_RING_NI(curie, NV40TCL_VB_ELEMENT_U32, push); + OUT_RINGp (elts, push); + + vc -= push; + elts += push; + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (0); + + start = restart; + } +} + +static boolean +nv40_draw_elements_inline(struct pipe_context *pipe, + struct pipe_buffer *ib, unsigned ib_size, + unsigned mode, unsigned start, unsigned count) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + void *map; + + map = ws->buffer_map(ws, ib, PIPE_BUFFER_USAGE_CPU_READ); + if (!ib) { + NOUVEAU_ERR("failed mapping ib\n"); + return FALSE; + } + + switch (ib_size) { + case 1: + nv40_draw_elements_u08(nv40, map, mode, 
start, count); + break; + case 2: + nv40_draw_elements_u16(nv40, map, mode, start, count); + break; + case 4: + nv40_draw_elements_u32(nv40, map, mode, start, count); + break; + default: + NOUVEAU_ERR("invalid idxbuf fmt %d\n", ib_size); + break; + } + + ws->buffer_unmap(ws, ib); + return TRUE; +} + +static boolean +nv40_draw_elements_vbo(struct pipe_context *pipe, + unsigned mode, unsigned start, unsigned count) +{ + struct nv40_context *nv40 = nv40_context(pipe); + struct nouveau_channel *chan = nv40->nvws->channel; + unsigned restart; + + while (count) { + unsigned nr, vc; + + nv40_state_emit(nv40); + + vc = nouveau_vbuf_split(chan->pushbuf->remaining, 6, 256, + mode, start, count, &restart); + if (!vc) { + FIRE_RING(NULL); + continue; + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (nvgl_primitive(mode)); + + nr = (vc & 0xff); + if (nr) { + BEGIN_RING(curie, NV40TCL_VB_INDEX_BATCH, 1); + OUT_RING (((nr - 1) << 24) | start); + start += nr; + } + + nr = vc >> 8; + while (nr) { + unsigned push = nr > 2047 ? 2047 : nr; + + nr -= push; + + BEGIN_RING_NI(curie, NV40TCL_VB_INDEX_BATCH, push); + while (push--) { + OUT_RING(((0x100 - 1) << 24) | start); + start += 0x100; + } + } + + BEGIN_RING(curie, NV40TCL_BEGIN_END, 1); + OUT_RING (0); + + count -= vc; + start = restart; + } + + return TRUE; +} + +boolean +nv40_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, unsigned indexSize, + unsigned mode, unsigned start, unsigned count) +{ + struct nv40_context *nv40 = nv40_context(pipe); + boolean idxbuf; + + idxbuf = nv40_vbo_set_idxbuf(nv40, indexBuffer, indexSize); + if (FORCE_SWTNL || !nv40_state_validate(nv40)) { + return nv40_draw_elements_swtnl(pipe, NULL, 0, + mode, start, count); + } + + if (idxbuf) { + nv40_draw_elements_vbo(pipe, mode, start, count); + } else { + nv40_draw_elements_inline(pipe, indexBuffer, indexSize, + mode, start, count); + } + + pipe->flush(pipe, 0, NULL); + return TRUE; +} + +static boolean +nv40_vbo_validate(struct nv40_context *nv40) +{ + struct nouveau_stateobj *vtxbuf, *vtxfmt, *sattr = NULL; + struct nouveau_grobj *curie = nv40->screen->curie; + struct pipe_buffer *ib = nv40->idxbuf; + unsigned ib_format = nv40->idxbuf_format; + unsigned vb_flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD; + int hw; + + if (nv40->edgeflags) { + nv40->fallback_swtnl |= NV40_NEW_ARRAYS; + return FALSE; + } + + vtxbuf = so_new(20, 18); + so_method(vtxbuf, curie, NV40TCL_VTXBUF_ADDRESS(0), nv40->vtxelt_nr); + vtxfmt = so_new(17, 0); + so_method(vtxfmt, curie, NV40TCL_VTXFMT(0), nv40->vtxelt_nr); + + for (hw = 0; hw < nv40->vtxelt_nr; hw++) { + struct pipe_vertex_element *ve; + struct pipe_vertex_buffer *vb; + unsigned type, ncomp; + + ve = &nv40->vtxelt[hw]; + vb = &nv40->vtxbuf[ve->vertex_buffer_index]; + + if (!vb->pitch) { + if (!sattr) + sattr = so_new(16 * 5, 0); + + if (nv40_vbo_static_attrib(nv40, sattr, hw, ve, vb)) { + so_data(vtxbuf, 0); + so_data(vtxfmt, NV40TCL_VTXFMT_TYPE_FLOAT); + continue; + } + } + + if (nv40_vbo_format_to_hw(ve->src_format, &type, &ncomp)) { + nv40->fallback_swtnl |= NV40_NEW_ARRAYS; + so_ref(NULL, &vtxbuf); + so_ref(NULL, &vtxfmt); + return FALSE; + } + + so_reloc(vtxbuf, vb->buffer, vb->buffer_offset + ve->src_offset, + vb_flags | NOUVEAU_BO_LOW | NOUVEAU_BO_OR, + 0, NV40TCL_VTXBUF_ADDRESS_DMA1); + so_data (vtxfmt, ((vb->pitch << NV40TCL_VTXFMT_STRIDE_SHIFT) | + (ncomp << NV40TCL_VTXFMT_SIZE_SHIFT) | type)); + } + + if (ib) { + so_method(vtxbuf, curie, NV40TCL_IDXBUF_ADDRESS, 2); + so_reloc (vtxbuf, 
ib, 0, vb_flags | NOUVEAU_BO_LOW, 0, 0); + so_reloc (vtxbuf, ib, ib_format, vb_flags | NOUVEAU_BO_OR, + 0, NV40TCL_IDXBUF_FORMAT_DMA1); + } + + so_method(vtxbuf, curie, 0x1710, 1); + so_data (vtxbuf, 0); + + so_ref(vtxbuf, &nv40->state.hw[NV40_STATE_VTXBUF]); + nv40->state.dirty |= (1ULL << NV40_STATE_VTXBUF); + so_ref(vtxfmt, &nv40->state.hw[NV40_STATE_VTXFMT]); + nv40->state.dirty |= (1ULL << NV40_STATE_VTXFMT); + so_ref(sattr, &nv40->state.hw[NV40_STATE_VTXATTR]); + nv40->state.dirty |= (1ULL << NV40_STATE_VTXATTR); + return FALSE; +} + +struct nv40_state_entry nv40_state_vbo = { + .validate = nv40_vbo_validate, + .dirty = { + .pipe = NV40_NEW_ARRAYS, + .hw = 0, + } +}; + diff --git a/src/gallium/drivers/nv40/nv40_vertprog.c b/src/gallium/drivers/nv40/nv40_vertprog.c new file mode 100644 index 0000000000..ff988e6a5f --- /dev/null +++ b/src/gallium/drivers/nv40/nv40_vertprog.c @@ -0,0 +1,1070 @@ +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv40_context.h" +#include "nv40_state.h" + +/* TODO (at least...): + * 1. Indexed consts + ARL + * 3. NV_vp11, NV_vp2, NV_vp3 features + * - extra arith opcodes + * - branching + * - texture sampling + * - indexed attribs + * - indexed results + * 4. bugs + */ + +#define SWZ_X 0 +#define SWZ_Y 1 +#define SWZ_Z 2 +#define SWZ_W 3 +#define MASK_X 8 +#define MASK_Y 4 +#define MASK_Z 2 +#define MASK_W 1 +#define MASK_ALL (MASK_X|MASK_Y|MASK_Z|MASK_W) +#define DEF_SCALE 0 +#define DEF_CTEST 0 +#include "nv40_shader.h" + +#define swz(s,x,y,z,w) nv40_sr_swz((s), SWZ_##x, SWZ_##y, SWZ_##z, SWZ_##w) +#define neg(s) nv40_sr_neg((s)) +#define abs(s) nv40_sr_abs((s)) + +#define NV40_VP_INST_DEST_CLIP(n) ((~0 - 6) + (n)) + +struct nv40_vpc { + struct nv40_vertex_program *vp; + + struct nv40_vertex_program_exec *vpi; + + unsigned r_temps; + unsigned r_temps_discard; + struct nv40_sreg r_result[PIPE_MAX_SHADER_OUTPUTS]; + struct nv40_sreg *r_address; + struct nv40_sreg *r_temp; + + struct nv40_sreg *imm; + unsigned nr_imm; + + unsigned hpos_idx; +}; + +static struct nv40_sreg +temp(struct nv40_vpc *vpc) +{ + int idx = ffs(~vpc->r_temps) - 1; + + if (idx < 0) { + NOUVEAU_ERR("out of temps!!\n"); + assert(0); + return nv40_sr(NV40SR_TEMP, 0); + } + + vpc->r_temps |= (1 << idx); + vpc->r_temps_discard |= (1 << idx); + return nv40_sr(NV40SR_TEMP, idx); +} + +static INLINE void +release_temps(struct nv40_vpc *vpc) +{ + vpc->r_temps &= ~vpc->r_temps_discard; + vpc->r_temps_discard = 0; +} + +static struct nv40_sreg +constant(struct nv40_vpc *vpc, int pipe, float x, float y, float z, float w) +{ + struct nv40_vertex_program *vp = vpc->vp; + struct nv40_vertex_program_data *vpd; + int idx; + + if (pipe >= 0) { + for (idx = 0; idx < vp->nr_consts; idx++) { + if (vp->consts[idx].index == pipe) + return nv40_sr(NV40SR_CONST, idx); + } + } + + idx = vp->nr_consts++; + vp->consts = realloc(vp->consts, sizeof(*vpd) * vp->nr_consts); + vpd = &vp->consts[idx]; + + vpd->index = pipe; + vpd->value[0] = x; + vpd->value[1] = y; + vpd->value[2] = z; + vpd->value[3] = w; + return nv40_sr(NV40SR_CONST, idx); +} + +#define arith(cc,s,o,d,m,s0,s1,s2) \ + nv40_vp_arith((cc), (s), NV40_VP_INST_##o, (d), (m), (s0), (s1), (s2)) + +static void +emit_src(struct nv40_vpc *vpc, uint32_t *hw, int pos, struct nv40_sreg src) +{ + struct nv40_vertex_program *vp = vpc->vp; + uint32_t sr = 0; + + switch (src.type) { + case NV40SR_TEMP: + sr |= 
(NV40_VP_SRC_REG_TYPE_TEMP << NV40_VP_SRC_REG_TYPE_SHIFT); + sr |= (src.index << NV40_VP_SRC_TEMP_SRC_SHIFT); + break; + case NV40SR_INPUT: + sr |= (NV40_VP_SRC_REG_TYPE_INPUT << + NV40_VP_SRC_REG_TYPE_SHIFT); + vp->ir |= (1 << src.index); + hw[1] |= (src.index << NV40_VP_INST_INPUT_SRC_SHIFT); + break; + case NV40SR_CONST: + sr |= (NV40_VP_SRC_REG_TYPE_CONST << + NV40_VP_SRC_REG_TYPE_SHIFT); + assert(vpc->vpi->const_index == -1 || + vpc->vpi->const_index == src.index); + vpc->vpi->const_index = src.index; + break; + case NV40SR_NONE: + sr |= (NV40_VP_SRC_REG_TYPE_INPUT << + NV40_VP_SRC_REG_TYPE_SHIFT); + break; + default: + assert(0); + } + + if (src.negate) + sr |= NV40_VP_SRC_NEGATE; + + if (src.abs) + hw[0] |= (1 << (21 + pos)); + + sr |= ((src.swz[0] << NV40_VP_SRC_SWZ_X_SHIFT) | + (src.swz[1] << NV40_VP_SRC_SWZ_Y_SHIFT) | + (src.swz[2] << NV40_VP_SRC_SWZ_Z_SHIFT) | + (src.swz[3] << NV40_VP_SRC_SWZ_W_SHIFT)); + + switch (pos) { + case 0: + hw[1] |= ((sr & NV40_VP_SRC0_HIGH_MASK) >> + NV40_VP_SRC0_HIGH_SHIFT) << NV40_VP_INST_SRC0H_SHIFT; + hw[2] |= (sr & NV40_VP_SRC0_LOW_MASK) << + NV40_VP_INST_SRC0L_SHIFT; + break; + case 1: + hw[2] |= sr << NV40_VP_INST_SRC1_SHIFT; + break; + case 2: + hw[2] |= ((sr & NV40_VP_SRC2_HIGH_MASK) >> + NV40_VP_SRC2_HIGH_SHIFT) << NV40_VP_INST_SRC2H_SHIFT; + hw[3] |= (sr & NV40_VP_SRC2_LOW_MASK) << + NV40_VP_INST_SRC2L_SHIFT; + break; + default: + assert(0); + } +} + +static void +emit_dst(struct nv40_vpc *vpc, uint32_t *hw, int slot, struct nv40_sreg dst) +{ + struct nv40_vertex_program *vp = vpc->vp; + + switch (dst.type) { + case NV40SR_TEMP: + hw[3] |= NV40_VP_INST_DEST_MASK; + if (slot == 0) { + hw[0] |= (dst.index << + NV40_VP_INST_VEC_DEST_TEMP_SHIFT); + } else { + hw[3] |= (dst.index << + NV40_VP_INST_SCA_DEST_TEMP_SHIFT); + } + break; + case NV40SR_OUTPUT: + switch (dst.index) { + case NV40_VP_INST_DEST_COL0 : vp->or |= (1 << 0); break; + case NV40_VP_INST_DEST_COL1 : vp->or |= (1 << 1); break; + case NV40_VP_INST_DEST_BFC0 : vp->or |= (1 << 2); break; + case NV40_VP_INST_DEST_BFC1 : vp->or |= (1 << 3); break; + case NV40_VP_INST_DEST_FOGC : vp->or |= (1 << 4); break; + case NV40_VP_INST_DEST_PSZ : vp->or |= (1 << 5); break; + case NV40_VP_INST_DEST_TC(0): vp->or |= (1 << 14); break; + case NV40_VP_INST_DEST_TC(1): vp->or |= (1 << 15); break; + case NV40_VP_INST_DEST_TC(2): vp->or |= (1 << 16); break; + case NV40_VP_INST_DEST_TC(3): vp->or |= (1 << 17); break; + case NV40_VP_INST_DEST_TC(4): vp->or |= (1 << 18); break; + case NV40_VP_INST_DEST_TC(5): vp->or |= (1 << 19); break; + case NV40_VP_INST_DEST_TC(6): vp->or |= (1 << 20); break; + case NV40_VP_INST_DEST_TC(7): vp->or |= (1 << 21); break; + case NV40_VP_INST_DEST_CLIP(0): + vp->or |= (1 << 6); + vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE0; + dst.index = NV40_VP_INST_DEST_FOGC; + break; + case NV40_VP_INST_DEST_CLIP(1): + vp->or |= (1 << 7); + vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE1; + dst.index = NV40_VP_INST_DEST_FOGC; + break; + case NV40_VP_INST_DEST_CLIP(2): + vp->or |= (1 << 8); + vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE2; + dst.index = NV40_VP_INST_DEST_FOGC; + break; + case NV40_VP_INST_DEST_CLIP(3): + vp->or |= (1 << 9); + vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE3; + dst.index = NV40_VP_INST_DEST_PSZ; + break; + case NV40_VP_INST_DEST_CLIP(4): + vp->or |= (1 << 10); + vp->clip_ctrl |= NV40TCL_CLIP_PLANE_ENABLE_PLANE4; + dst.index = NV40_VP_INST_DEST_PSZ; + break; + case NV40_VP_INST_DEST_CLIP(5): + vp->or |= (1 << 11); + vp->clip_ctrl |= 
NV40TCL_CLIP_PLANE_ENABLE_PLANE5; + dst.index = NV40_VP_INST_DEST_PSZ; + break; + default: + break; + } + + hw[3] |= (dst.index << NV40_VP_INST_DEST_SHIFT); + if (slot == 0) { + hw[0] |= NV40_VP_INST_VEC_RESULT; + hw[0] |= NV40_VP_INST_VEC_DEST_TEMP_MASK | (1<<20); + } else { + hw[3] |= NV40_VP_INST_SCA_RESULT; + hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK; + } + break; + default: + assert(0); + } +} + +static void +nv40_vp_arith(struct nv40_vpc *vpc, int slot, int op, + struct nv40_sreg dst, int mask, + struct nv40_sreg s0, struct nv40_sreg s1, + struct nv40_sreg s2) +{ + struct nv40_vertex_program *vp = vpc->vp; + uint32_t *hw; + + vp->insns = realloc(vp->insns, ++vp->nr_insns * sizeof(*vpc->vpi)); + vpc->vpi = &vp->insns[vp->nr_insns - 1]; + memset(vpc->vpi, 0, sizeof(*vpc->vpi)); + vpc->vpi->const_index = -1; + + hw = vpc->vpi->data; + + hw[0] |= (NV40_VP_INST_COND_TR << NV40_VP_INST_COND_SHIFT); + hw[0] |= ((0 << NV40_VP_INST_COND_SWZ_X_SHIFT) | + (1 << NV40_VP_INST_COND_SWZ_Y_SHIFT) | + (2 << NV40_VP_INST_COND_SWZ_Z_SHIFT) | + (3 << NV40_VP_INST_COND_SWZ_W_SHIFT)); + + if (slot == 0) { + hw[1] |= (op << NV40_VP_INST_VEC_OPCODE_SHIFT); + hw[3] |= NV40_VP_INST_SCA_DEST_TEMP_MASK; + hw[3] |= (mask << NV40_VP_INST_VEC_WRITEMASK_SHIFT); + } else { + hw[1] |= (op << NV40_VP_INST_SCA_OPCODE_SHIFT); + hw[0] |= (NV40_VP_INST_VEC_DEST_TEMP_MASK | (1 << 20)); + hw[3] |= (mask << NV40_VP_INST_SCA_WRITEMASK_SHIFT); + } + + emit_dst(vpc, hw, slot, dst); + emit_src(vpc, hw, 0, s0); + emit_src(vpc, hw, 1, s1); + emit_src(vpc, hw, 2, s2); +} + +static INLINE struct nv40_sreg +tgsi_src(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc) { + struct nv40_sreg src; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + src = nv40_sr(NV40SR_INPUT, fsrc->SrcRegister.Index); + break; + case TGSI_FILE_CONSTANT: + src = constant(vpc, fsrc->SrcRegister.Index, 0, 0, 0, 0); + break; + case TGSI_FILE_IMMEDIATE: + src = vpc->imm[fsrc->SrcRegister.Index]; + break; + case TGSI_FILE_TEMPORARY: + src = vpc->r_temp[fsrc->SrcRegister.Index]; + break; + default: + NOUVEAU_ERR("bad src file\n"); + break; + } + + src.abs = fsrc->SrcRegisterExtMod.Absolute; + src.negate = fsrc->SrcRegister.Negate; + src.swz[0] = fsrc->SrcRegister.SwizzleX; + src.swz[1] = fsrc->SrcRegister.SwizzleY; + src.swz[2] = fsrc->SrcRegister.SwizzleZ; + src.swz[3] = fsrc->SrcRegister.SwizzleW; + return src; +} + +static INLINE struct nv40_sreg +tgsi_dst(struct nv40_vpc *vpc, const struct tgsi_full_dst_register *fdst) { + struct nv40_sreg dst; + + switch (fdst->DstRegister.File) { + case TGSI_FILE_OUTPUT: + dst = vpc->r_result[fdst->DstRegister.Index]; + break; + case TGSI_FILE_TEMPORARY: + dst = vpc->r_temp[fdst->DstRegister.Index]; + break; + case TGSI_FILE_ADDRESS: + dst = vpc->r_address[fdst->DstRegister.Index]; + break; + default: + NOUVEAU_ERR("bad dst file\n"); + break; + } + + return dst; +} + +static INLINE int +tgsi_mask(uint tgsi) +{ + int mask = 0; + + if (tgsi & TGSI_WRITEMASK_X) mask |= MASK_X; + if (tgsi & TGSI_WRITEMASK_Y) mask |= MASK_Y; + if (tgsi & TGSI_WRITEMASK_Z) mask |= MASK_Z; + if (tgsi & TGSI_WRITEMASK_W) mask |= MASK_W; + return mask; +} + +static boolean +src_native_swz(struct nv40_vpc *vpc, const struct tgsi_full_src_register *fsrc, + struct nv40_sreg *src) +{ + const struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0); + struct nv40_sreg tgsi = tgsi_src(vpc, fsrc); + uint mask = 0, zero_mask = 0, one_mask = 0, neg_mask = 0; + uint neg[4] = { fsrc->SrcRegisterExtSwz.NegateX, + fsrc->SrcRegisterExtSwz.NegateY, 
+ fsrc->SrcRegisterExtSwz.NegateZ, + fsrc->SrcRegisterExtSwz.NegateW }; + uint c; + + for (c = 0; c < 4; c++) { + switch (tgsi_util_get_full_src_register_extswizzle(fsrc, c)) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + mask |= tgsi_mask(1 << c); + break; + case TGSI_EXTSWIZZLE_ZERO: + zero_mask |= tgsi_mask(1 << c); + tgsi.swz[c] = SWZ_X; + break; + case TGSI_EXTSWIZZLE_ONE: + one_mask |= tgsi_mask(1 << c); + tgsi.swz[c] = SWZ_X; + break; + default: + assert(0); + } + + if (!tgsi.negate && neg[c]) + neg_mask |= tgsi_mask(1 << c); + } + + if (mask == MASK_ALL && !neg_mask) + return TRUE; + + *src = temp(vpc); + + if (mask) + arith(vpc, 0, OP_MOV, *src, mask, tgsi, none, none); + + if (zero_mask) + arith(vpc, 0, OP_SFL, *src, zero_mask, *src, none, none); + + if (one_mask) + arith(vpc, 0, OP_STR, *src, one_mask, *src, none, none); + + if (neg_mask) { + struct nv40_sreg one = temp(vpc); + arith(vpc, 0, OP_STR, one, neg_mask, one, none, none); + arith(vpc, 0, OP_MUL, *src, neg_mask, *src, neg(one), none); + } + + return FALSE; +} + +static boolean +nv40_vertprog_parse_instruction(struct nv40_vpc *vpc, + const struct tgsi_full_instruction *finst) +{ + struct nv40_sreg src[3], dst, tmp; + struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0); + int mask; + int ai = -1, ci = -1, ii = -1; + int i; + + if (finst->Instruction.Opcode == TGSI_OPCODE_END) + return TRUE; + + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + if (fsrc->SrcRegister.File == TGSI_FILE_TEMPORARY) { + src[i] = tgsi_src(vpc, fsrc); + } + } + + for (i = 0; i < finst->Instruction.NumSrcRegs; i++) { + const struct tgsi_full_src_register *fsrc; + + fsrc = &finst->FullSrcRegisters[i]; + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + case TGSI_FILE_CONSTANT: + case TGSI_FILE_TEMPORARY: + if (!src_native_swz(vpc, fsrc, &src[i])) + continue; + break; + default: + break; + } + + switch (fsrc->SrcRegister.File) { + case TGSI_FILE_INPUT: + if (ai == -1 || ai == fsrc->SrcRegister.Index) { + ai = fsrc->SrcRegister.Index; + src[i] = tgsi_src(vpc, fsrc); + } else { + src[i] = temp(vpc); + arith(vpc, 0, OP_MOV, src[i], MASK_ALL, + tgsi_src(vpc, fsrc), none, none); + } + break; + case TGSI_FILE_CONSTANT: + if ((ci == -1 && ii == -1) || + ci == fsrc->SrcRegister.Index) { + ci = fsrc->SrcRegister.Index; + src[i] = tgsi_src(vpc, fsrc); + } else { + src[i] = temp(vpc); + arith(vpc, 0, OP_MOV, src[i], MASK_ALL, + tgsi_src(vpc, fsrc), none, none); + } + break; + case TGSI_FILE_IMMEDIATE: + if ((ci == -1 && ii == -1) || + ii == fsrc->SrcRegister.Index) { + ii = fsrc->SrcRegister.Index; + src[i] = tgsi_src(vpc, fsrc); + } else { + src[i] = temp(vpc); + arith(vpc, 0, OP_MOV, src[i], MASK_ALL, + tgsi_src(vpc, fsrc), none, none); + } + break; + case TGSI_FILE_TEMPORARY: + /* handled above */ + break; + default: + NOUVEAU_ERR("bad src file\n"); + return FALSE; + } + } + + dst = tgsi_dst(vpc, &finst->FullDstRegisters[0]); + mask = tgsi_mask(finst->FullDstRegisters[0].DstRegister.WriteMask); + + switch (finst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + arith(vpc, 0, OP_MOV, dst, mask, abs(src[0]), none, none); + break; + case TGSI_OPCODE_ADD: + arith(vpc, 0, OP_ADD, dst, mask, src[0], none, src[1]); + break; + case TGSI_OPCODE_ARL: + arith(vpc, 0, OP_ARL, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_DP3: + arith(vpc, 0, OP_DP3, dst, mask, src[0], src[1], none); + break; + case 
TGSI_OPCODE_DP4: + arith(vpc, 0, OP_DP4, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DPH: + arith(vpc, 0, OP_DPH, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_DST: + arith(vpc, 0, OP_DST, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_EX2: + arith(vpc, 1, OP_EX2, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_EXP: + arith(vpc, 1, OP_EXP, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_FLR: + arith(vpc, 0, OP_FLR, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_FRC: + arith(vpc, 0, OP_FRC, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_LG2: + arith(vpc, 1, OP_LG2, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_LIT: + arith(vpc, 1, OP_LIT, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_LOG: + arith(vpc, 1, OP_LOG, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_MAD: + arith(vpc, 0, OP_MAD, dst, mask, src[0], src[1], src[2]); + break; + case TGSI_OPCODE_MAX: + arith(vpc, 0, OP_MAX, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MIN: + arith(vpc, 0, OP_MIN, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_MOV: + arith(vpc, 0, OP_MOV, dst, mask, src[0], none, none); + break; + case TGSI_OPCODE_MUL: + arith(vpc, 0, OP_MUL, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_POW: + tmp = temp(vpc); + arith(vpc, 1, OP_LG2, tmp, MASK_X, none, none, + swz(src[0], X, X, X, X)); + arith(vpc, 0, OP_MUL, tmp, MASK_X, swz(tmp, X, X, X, X), + swz(src[1], X, X, X, X), none); + arith(vpc, 1, OP_EX2, dst, mask, none, none, + swz(tmp, X, X, X, X)); + break; + case TGSI_OPCODE_RCP: + arith(vpc, 1, OP_RCP, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_RET: + break; + case TGSI_OPCODE_RSQ: + arith(vpc, 1, OP_RSQ, dst, mask, none, none, src[0]); + break; + case TGSI_OPCODE_SGE: + arith(vpc, 0, OP_SGE, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SLT: + arith(vpc, 0, OP_SLT, dst, mask, src[0], src[1], none); + break; + case TGSI_OPCODE_SUB: + arith(vpc, 0, OP_ADD, dst, mask, src[0], none, neg(src[1])); + break; + case TGSI_OPCODE_XPD: + tmp = temp(vpc); + arith(vpc, 0, OP_MUL, tmp, mask, + swz(src[0], Z, X, Y, Y), swz(src[1], Y, Z, X, X), none); + arith(vpc, 0, OP_MAD, dst, (mask & ~MASK_W), + swz(src[0], Y, Z, X, X), swz(src[1], Z, X, Y, Y), + neg(tmp)); + break; + default: + NOUVEAU_ERR("invalid opcode %d\n", finst->Instruction.Opcode); + return FALSE; + } + + release_temps(vpc); + return TRUE; +} + +static boolean +nv40_vertprog_parse_decl_output(struct nv40_vpc *vpc, + const struct tgsi_full_declaration *fdec) +{ + unsigned idx = fdec->DeclarationRange.First; + int hw; + + switch (fdec->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + hw = NV40_VP_INST_DEST_POS; + vpc->hpos_idx = idx; + break; + case TGSI_SEMANTIC_COLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV40_VP_INST_DEST_COL0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV40_VP_INST_DEST_COL1; + } else { + NOUVEAU_ERR("bad colour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_BCOLOR: + if (fdec->Semantic.SemanticIndex == 0) { + hw = NV40_VP_INST_DEST_BFC0; + } else + if (fdec->Semantic.SemanticIndex == 1) { + hw = NV40_VP_INST_DEST_BFC1; + } else { + NOUVEAU_ERR("bad bcolour semantic index\n"); + return FALSE; + } + break; + case TGSI_SEMANTIC_FOG: + hw = NV40_VP_INST_DEST_FOGC; + break; + case TGSI_SEMANTIC_PSIZE: + hw = NV40_VP_INST_DEST_PSZ; + break; + case TGSI_SEMANTIC_GENERIC: + if 
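+		/* only eight generic/texcoord result slots (TC0..TC7) exist
+		 * on this hw, per the NV40_VP_INST_DEST_TC() mapping below */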
(fdec->Semantic.SemanticIndex <= 7) { + hw = NV40_VP_INST_DEST_TC(fdec->Semantic.SemanticIndex); + } else { + NOUVEAU_ERR("bad generic semantic index\n"); + return FALSE; + } + break; + default: + NOUVEAU_ERR("bad output semantic\n"); + return FALSE; + } + + vpc->r_result[idx] = nv40_sr(NV40SR_OUTPUT, hw); + return TRUE; +} + +static boolean +nv40_vertprog_prepare(struct nv40_vpc *vpc) +{ + struct tgsi_parse_context p; + int high_temp = -1, high_addr = -1, nr_imm = 0, i; + + tgsi_parse_init(&p, vpc->vp->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&p)) { + const union tgsi_full_token *tok = &p.FullToken; + + tgsi_parse_token(&p); + switch(tok->Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + nr_imm++; + break; + case TGSI_TOKEN_TYPE_DECLARATION: + { + const struct tgsi_full_declaration *fdec; + + fdec = &p.FullToken.FullDeclaration; + switch (fdec->Declaration.File) { + case TGSI_FILE_TEMPORARY: + if (fdec->DeclarationRange.Last > high_temp) { + high_temp = + fdec->DeclarationRange.Last; + } + break; +#if 0 /* this would be nice.. except gallium doesn't track it */ + case TGSI_FILE_ADDRESS: + if (fdec->DeclarationRange.Last > high_addr) { + high_addr = + fdec->DeclarationRange.Last; + } + break; +#endif + case TGSI_FILE_OUTPUT: + if (!nv40_vertprog_parse_decl_output(vpc, fdec)) + return FALSE; + break; + default: + break; + } + } + break; +#if 1 /* yay, parse instructions looking for address regs instead */ + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + const struct tgsi_full_instruction *finst; + const struct tgsi_full_dst_register *fdst; + + finst = &p.FullToken.FullInstruction; + fdst = &finst->FullDstRegisters[0]; + + if (fdst->DstRegister.File == TGSI_FILE_ADDRESS) { + if (fdst->DstRegister.Index > high_addr) + high_addr = fdst->DstRegister.Index; + } + + } + break; +#endif + default: + break; + } + } + tgsi_parse_free(&p); + + if (nr_imm) { + vpc->imm = CALLOC(nr_imm, sizeof(struct nv40_sreg)); + assert(vpc->imm); + } + + if (++high_temp) { + vpc->r_temp = CALLOC(high_temp, sizeof(struct nv40_sreg)); + for (i = 0; i < high_temp; i++) + vpc->r_temp[i] = temp(vpc); + } + + if (++high_addr) { + vpc->r_address = CALLOC(high_addr, sizeof(struct nv40_sreg)); + for (i = 0; i < high_addr; i++) + vpc->r_address[i] = temp(vpc); + } + + vpc->r_temps_discard = 0; + return TRUE; +} + +static void +nv40_vertprog_translate(struct nv40_context *nv40, + struct nv40_vertex_program *vp) +{ + struct tgsi_parse_context parse; + struct nv40_vpc *vpc = NULL; + struct nv40_sreg none = nv40_sr(NV40SR_NONE, 0); + int i; + + vpc = CALLOC(1, sizeof(struct nv40_vpc)); + if (!vpc) + return; + vpc->vp = vp; + + if (!nv40_vertprog_prepare(vpc)) { + FREE(vpc); + return; + } + + /* Redirect post-transform vertex position to a temp if user clip + * planes are enabled. We need to append code the the vtxprog + * to handle clip planes later. 
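+	 * (Each enabled user clip plane then becomes a DP4 of the saved
+	 * homogeneous position with the plane equation, written to one of
+	 * the hardware clip distance outputs; see the loop further below.)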
+ */ + if (vp->ucp.nr) { + vpc->r_result[vpc->hpos_idx] = temp(vpc); + vpc->r_temps_discard = 0; + } + + tgsi_parse_init(&parse, vp->pipe.tokens); + + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + const struct tgsi_full_immediate *imm; + + imm = &parse.FullToken.FullImmediate; + assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32); +// assert(imm->Immediate.Size == 4); + vpc->imm[vpc->nr_imm++] = + constant(vpc, -1, + imm->u.ImmediateFloat32[0].Float, + imm->u.ImmediateFloat32[1].Float, + imm->u.ImmediateFloat32[2].Float, + imm->u.ImmediateFloat32[3].Float); + } + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: + { + const struct tgsi_full_instruction *finst; + finst = &parse.FullToken.FullInstruction; + if (!nv40_vertprog_parse_instruction(vpc, finst)) + goto out_err; + } + break; + default: + break; + } + } + + /* Write out HPOS if it was redirected to a temp earlier */ + if (vpc->r_result[vpc->hpos_idx].type != NV40SR_OUTPUT) { + struct nv40_sreg hpos = nv40_sr(NV40SR_OUTPUT, + NV40_VP_INST_DEST_POS); + struct nv40_sreg htmp = vpc->r_result[vpc->hpos_idx]; + + arith(vpc, 0, OP_MOV, hpos, MASK_ALL, htmp, none, none); + } + + /* Insert code to handle user clip planes */ + for (i = 0; i < vp->ucp.nr; i++) { + struct nv40_sreg cdst = nv40_sr(NV40SR_OUTPUT, + NV40_VP_INST_DEST_CLIP(i)); + struct nv40_sreg ceqn = constant(vpc, -1, + nv40->clip.ucp[i][0], + nv40->clip.ucp[i][1], + nv40->clip.ucp[i][2], + nv40->clip.ucp[i][3]); + struct nv40_sreg htmp = vpc->r_result[vpc->hpos_idx]; + unsigned mask; + + switch (i) { + case 0: case 3: mask = MASK_Y; break; + case 1: case 4: mask = MASK_Z; break; + case 2: case 5: mask = MASK_W; break; + default: + NOUVEAU_ERR("invalid clip dist #%d\n", i); + goto out_err; + } + + arith(vpc, 0, OP_DP4, cdst, mask, htmp, ceqn, none); + } + + vp->insns[vp->nr_insns - 1].data[3] |= NV40_VP_INST_LAST; + vp->translated = TRUE; +out_err: + tgsi_parse_free(&parse); + if (vpc->r_temp) + FREE(vpc->r_temp); + if (vpc->r_address) + FREE(vpc->r_address); + if (vpc->imm) + FREE(vpc->imm); + FREE(vpc); +} + +static boolean +nv40_vertprog_validate(struct nv40_context *nv40) +{ + struct nouveau_winsys *nvws = nv40->nvws; + struct pipe_winsys *ws = nv40->pipe.winsys; + struct nouveau_grobj *curie = nv40->screen->curie; + struct nv40_vertex_program *vp; + struct pipe_buffer *constbuf; + boolean upload_code = FALSE, upload_data = FALSE; + int i; + + if (nv40->render_mode == HW) { + vp = nv40->vertprog; + constbuf = nv40->constbuf[PIPE_SHADER_VERTEX]; + + if ((nv40->dirty & NV40_NEW_UCP) || + memcmp(&nv40->clip, &vp->ucp, sizeof(vp->ucp))) { + nv40_vertprog_destroy(nv40, vp); + memcpy(&vp->ucp, &nv40->clip, sizeof(vp->ucp)); + } + } else { + vp = nv40->swtnl.vertprog; + constbuf = NULL; + } + + /* Translate TGSI shader into hw bytecode */ + if (vp->translated) + goto check_gpu_resources; + + nv40->fallback_swtnl &= ~NV40_NEW_VERTPROG; + nv40_vertprog_translate(nv40, vp); + if (!vp->translated) { + nv40->fallback_swtnl |= NV40_NEW_VERTPROG; + return FALSE; + } + +check_gpu_resources: + /* Allocate hw vtxprog exec slots */ + if (!vp->exec) { + struct nouveau_resource *heap = nv40->screen->vp_exec_heap; + struct nouveau_stateobj *so; + uint vplen = vp->nr_insns; + + if (nvws->res_alloc(heap, vplen, vp, &vp->exec)) { + while (heap->next && heap->size < vplen) { + struct nv40_vertex_program *evict; + + evict = heap->next->priv; + nvws->res_free(&evict->exec); + } + + if 
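+		/* retry once room has been made by evicting older programs
+		 * from the exec heap; there is no further fallback, hence
+		 * the assert below */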
(nvws->res_alloc(heap, vplen, vp, &vp->exec)) + assert(0); + } + + so = so_new(7, 0); + so_method(so, curie, NV40TCL_VP_START_FROM_ID, 1); + so_data (so, vp->exec->start); + so_method(so, curie, NV40TCL_VP_ATTRIB_EN, 2); + so_data (so, vp->ir); + so_data (so, vp->or); + so_method(so, curie, NV40TCL_CLIP_PLANE_ENABLE, 1); + so_data (so, vp->clip_ctrl); + so_ref(so, &vp->so); + + upload_code = TRUE; + } + + /* Allocate hw vtxprog const slots */ + if (vp->nr_consts && !vp->data) { + struct nouveau_resource *heap = nv40->screen->vp_data_heap; + + if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) { + while (heap->next && heap->size < vp->nr_consts) { + struct nv40_vertex_program *evict; + + evict = heap->next->priv; + nvws->res_free(&evict->data); + } + + if (nvws->res_alloc(heap, vp->nr_consts, vp, &vp->data)) + assert(0); + } + + /*XXX: handle this some day */ + assert(vp->data->start >= vp->data_start_min); + + upload_data = TRUE; + if (vp->data_start != vp->data->start) + upload_code = TRUE; + } + + /* If exec or data segments moved we need to patch the program to + * fixup offsets and register IDs. + */ + if (vp->exec_start != vp->exec->start) { + for (i = 0; i < vp->nr_insns; i++) { + struct nv40_vertex_program_exec *vpi = &vp->insns[i]; + + if (vpi->has_branch_offset) { + assert(0); + } + } + + vp->exec_start = vp->exec->start; + } + + if (vp->nr_consts && vp->data_start != vp->data->start) { + for (i = 0; i < vp->nr_insns; i++) { + struct nv40_vertex_program_exec *vpi = &vp->insns[i]; + + if (vpi->const_index >= 0) { + vpi->data[1] &= ~NV40_VP_INST_CONST_SRC_MASK; + vpi->data[1] |= + (vpi->const_index + vp->data->start) << + NV40_VP_INST_CONST_SRC_SHIFT; + + } + } + + vp->data_start = vp->data->start; + } + + /* Update + Upload constant values */ + if (vp->nr_consts) { + float *map = NULL; + + if (constbuf) { + map = ws->buffer_map(ws, constbuf, + PIPE_BUFFER_USAGE_CPU_READ); + } + + for (i = 0; i < vp->nr_consts; i++) { + struct nv40_vertex_program_data *vpd = &vp->consts[i]; + + if (vpd->index >= 0) { + if (!upload_data && + !memcmp(vpd->value, &map[vpd->index * 4], + 4 * sizeof(float))) + continue; + memcpy(vpd->value, &map[vpd->index * 4], + 4 * sizeof(float)); + } + + BEGIN_RING(curie, NV40TCL_VP_UPLOAD_CONST_ID, 5); + OUT_RING (i + vp->data->start); + OUT_RINGp ((uint32_t *)vpd->value, 4); + } + + if (constbuf) + ws->buffer_unmap(ws, constbuf); + } + + /* Upload vtxprog */ + if (upload_code) { +#if 0 + for (i = 0; i < vp->nr_insns; i++) { + NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[0]); + NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[1]); + NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[2]); + NOUVEAU_MSG("VP %d: 0x%08x\n", i, vp->insns[i].data[3]); + } +#endif + BEGIN_RING(curie, NV40TCL_VP_UPLOAD_FROM_ID, 1); + OUT_RING (vp->exec->start); + for (i = 0; i < vp->nr_insns; i++) { + BEGIN_RING(curie, NV40TCL_VP_UPLOAD_INST(0), 4); + OUT_RINGp (vp->insns[i].data, 4); + } + } + + if (vp->so != nv40->state.hw[NV40_STATE_VERTPROG]) { + so_ref(vp->so, &nv40->state.hw[NV40_STATE_VERTPROG]); + return TRUE; + } + + return FALSE; +} + +void +nv40_vertprog_destroy(struct nv40_context *nv40, struct nv40_vertex_program *vp) +{ + struct nouveau_winsys *nvws = nv40->screen->nvws; + + vp->translated = FALSE; + + if (vp->nr_insns) { + FREE(vp->insns); + vp->insns = NULL; + vp->nr_insns = 0; + } + + if (vp->nr_consts) { + FREE(vp->consts); + vp->consts = NULL; + vp->nr_consts = 0; + } + + nvws->res_free(&vp->exec); + vp->exec_start = 0; + nvws->res_free(&vp->data); + 
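+	/* cached exec/data heap offsets are stale once the resources are
+	 * freed; the next validate retranslates and re-patches them */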
vp->data_start = 0; + vp->data_start_min = 0; + + vp->ir = vp->or = vp->clip_ctrl = 0; + so_ref(NULL, &vp->so); +} + +struct nv40_state_entry nv40_state_vertprog = { + .validate = nv40_vertprog_validate, + .dirty = { + .pipe = NV40_NEW_VERTPROG | NV40_NEW_UCP, + .hw = NV40_STATE_VERTPROG, + } +}; + diff --git a/src/gallium/drivers/nv50/Makefile b/src/gallium/drivers/nv50/Makefile new file mode 100644 index 0000000000..be30400c03 --- /dev/null +++ b/src/gallium/drivers/nv50/Makefile @@ -0,0 +1,29 @@ +TOP = ../../../.. +include $(TOP)/configs/current + +LIBNAME = nv50 + +DRIVER_SOURCES = \ + nv50_clear.c \ + nv50_context.c \ + nv50_draw.c \ + nv50_miptree.c \ + nv50_query.c \ + nv50_program.c \ + nv50_screen.c \ + nv50_state.c \ + nv50_state_validate.c \ + nv50_surface.c \ + nv50_tex.c \ + nv50_vbo.c + +C_SOURCES = \ + $(COMMON_SOURCES) \ + $(DRIVER_SOURCES) + +ASM_SOURCES = + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/nv50/nv50_clear.c b/src/gallium/drivers/nv50/nv50_clear.c new file mode 100644 index 0000000000..a31a42d6b5 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_clear.c @@ -0,0 +1,90 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" + +#include "nv50_context.h" + +void +nv50_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + struct nv50_context *nv50 = nv50_context(pipe); + struct pipe_framebuffer_state fb, s_fb = nv50->framebuffer; + struct pipe_scissor_state sc, s_sc = nv50->scissor; + unsigned dirty = nv50->dirty; + + nv50->dirty = 0; + + if (ps->format == PIPE_FORMAT_Z24S8_UNORM || + ps->format == PIPE_FORMAT_Z16_UNORM) { + fb.num_cbufs = 0; + fb.zsbuf = ps; + } else { + fb.num_cbufs = 1; + fb.cbufs[0] = ps; + fb.zsbuf = NULL; + } + fb.width = ps->width; + fb.height = ps->height; + pipe->set_framebuffer_state(pipe, &fb); + + sc.minx = sc.miny = 0; + sc.maxx = fb.width; + sc.maxy = fb.height; + pipe->set_scissor_state(pipe, &sc); + + nv50_state_validate(nv50); + + switch (ps->format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + BEGIN_RING(tesla, 0x0d80, 4); + OUT_RINGf (ubyte_to_float((clearValue >> 16) & 0xff)); + OUT_RINGf (ubyte_to_float((clearValue >> 8) & 0xff)); + OUT_RINGf (ubyte_to_float((clearValue >> 0) & 0xff)); + OUT_RINGf (ubyte_to_float((clearValue >> 24) & 0xff)); + BEGIN_RING(tesla, 0x19d0, 1); + OUT_RING (0x3c); + break; + case PIPE_FORMAT_Z24S8_UNORM: + BEGIN_RING(tesla, 0x0d90, 1); + OUT_RINGf ((float)(clearValue >> 8) * (1.0 / 16777215.0)); + BEGIN_RING(tesla, 0x0da0, 1); + OUT_RING (clearValue & 0xff); + BEGIN_RING(tesla, 0x19d0, 1); + OUT_RING (0x03); + break; + default: + pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, + clearValue); + break; + } + + pipe->set_framebuffer_state(pipe, &s_fb); + pipe->set_scissor_state(pipe, &s_sc); + nv50->dirty |= dirty; + + ps->status = PIPE_SURFACE_STATUS_CLEAR; +} + diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c new file mode 100644 index 0000000000..b02c53f209 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_context.c @@ -0,0 +1,90 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_winsys.h" + +#include "nv50_context.h" +#include "nv50_screen.h" + +static void +nv50_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence) +{ + struct nv50_context *nv50 = (struct nv50_context *)pipe; + + FIRE_RING(fence); +} + +static void +nv50_destroy(struct pipe_context *pipe) +{ + struct nv50_context *nv50 = (struct nv50_context *)pipe; + + draw_destroy(nv50->draw); + FREE(nv50); +} + + +static void +nv50_set_edgeflags(struct pipe_context *pipe, const unsigned *bitfield) +{ +} + +struct pipe_context * +nv50_create(struct pipe_screen *pscreen, unsigned pctx_id) +{ + struct pipe_winsys *pipe_winsys = pscreen->winsys; + struct nv50_screen *screen = nv50_screen(pscreen); + struct nv50_context *nv50; + + nv50 = CALLOC_STRUCT(nv50_context); + if (!nv50) + return NULL; + nv50->screen = screen; + nv50->pctx_id = pctx_id; + + nv50->pipe.winsys = pipe_winsys; + nv50->pipe.screen = pscreen; + + nv50->pipe.destroy = nv50_destroy; + + nv50->pipe.set_edgeflags = nv50_set_edgeflags; + nv50->pipe.draw_arrays = nv50_draw_arrays; + nv50->pipe.draw_elements = nv50_draw_elements; + nv50->pipe.clear = nv50_clear; + + nv50->pipe.flush = nv50_flush; + + nv50_init_surface_functions(nv50); + nv50_init_state_functions(nv50); + nv50_init_query_functions(nv50); + + nv50->draw = draw_create(); + assert(nv50->draw); + draw_set_rasterize_stage(nv50->draw, nv50_draw_render_stage(nv50)); + + return &nv50->pipe; +} + + diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h new file mode 100644 index 0000000000..5d377f2d06 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_context.h @@ -0,0 +1,185 @@ +#ifndef __NV50_CONTEXT_H__ +#define __NV50_CONTEXT_H__ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_compiler.h" + +#include "util/u_memory.h" +#include "util/u_math.h" + +#include "draw/draw_vertex.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau/nouveau_gldefs.h" +#include "nouveau/nouveau_stateobj.h" + +#define NOUVEAU_PUSH_CONTEXT(ctx) \ + struct nv50_screen *ctx = nv50->screen +#include "nouveau/nouveau_push.h" + +#include "nv50_screen.h" +#include "nv50_program.h" + +#define NOUVEAU_ERR(fmt, args...) \ + fprintf(stderr, "%s:%d - "fmt, __func__, __LINE__, ##args); +#define NOUVEAU_MSG(fmt, args...) 
\ + fprintf(stderr, "nouveau: "fmt, ##args); + +/* Constant buffer assignment */ +#define NV50_CB_PMISC 0 +#define NV50_CB_PVP 1 +#define NV50_CB_PFP 2 +#define NV50_CB_PGP 3 +#define NV50_CB_TIC 4 +#define NV50_CB_TSC 5 +#define NV50_CB_PUPLOAD 6 + +#define NV50_NEW_BLEND (1 << 0) +#define NV50_NEW_ZSA (1 << 1) +#define NV50_NEW_BLEND_COLOUR (1 << 2) +#define NV50_NEW_STIPPLE (1 << 3) +#define NV50_NEW_SCISSOR (1 << 4) +#define NV50_NEW_VIEWPORT (1 << 5) +#define NV50_NEW_RASTERIZER (1 << 6) +#define NV50_NEW_FRAMEBUFFER (1 << 7) +#define NV50_NEW_VERTPROG (1 << 8) +#define NV50_NEW_VERTPROG_CB (1 << 9) +#define NV50_NEW_FRAGPROG (1 << 10) +#define NV50_NEW_FRAGPROG_CB (1 << 11) +#define NV50_NEW_ARRAYS (1 << 12) +#define NV50_NEW_SAMPLER (1 << 13) +#define NV50_NEW_TEXTURE (1 << 14) + +struct nv50_blend_stateobj { + struct pipe_blend_state pipe; + struct nouveau_stateobj *so; +}; + +struct nv50_zsa_stateobj { + struct pipe_depth_stencil_alpha_state pipe; + struct nouveau_stateobj *so; +}; + +struct nv50_rasterizer_stateobj { + struct pipe_rasterizer_state pipe; + struct nouveau_stateobj *so; +}; + +struct nv50_miptree { + struct pipe_texture base; + struct pipe_buffer *buffer; +}; + +static INLINE struct nv50_miptree * +nv50_miptree(struct pipe_texture *pt) +{ + return (struct nv50_miptree *)pt; +} + +struct nv50_surface { + struct pipe_surface base; + struct pipe_buffer *untiled; +}; + +static INLINE struct nv50_surface * +nv50_surface(struct pipe_surface *pt) +{ + return (struct nv50_surface *)pt; +} + +struct nv50_state { + unsigned dirty; + + struct nouveau_stateobj *fb; + struct nouveau_stateobj *blend; + struct nouveau_stateobj *blend_colour; + struct nouveau_stateobj *zsa; + struct nouveau_stateobj *rast; + struct nouveau_stateobj *stipple; + struct nouveau_stateobj *scissor; + unsigned scissor_enabled; + struct nouveau_stateobj *viewport; + unsigned viewport_bypass; + struct nouveau_stateobj *tsc_upload; + struct nouveau_stateobj *tic_upload; + struct nouveau_stateobj *vertprog; + struct nouveau_stateobj *fragprog; + struct nouveau_stateobj *vtxfmt; + struct nouveau_stateobj *vtxbuf; +}; + +struct nv50_context { + struct pipe_context pipe; + + struct nv50_screen *screen; + unsigned pctx_id; + + struct draw_context *draw; + + struct nv50_state state; + + unsigned dirty; + struct nv50_blend_stateobj *blend; + struct nv50_zsa_stateobj *zsa; + struct nv50_rasterizer_stateobj *rasterizer; + struct pipe_blend_color blend_colour; + struct pipe_poly_stipple stipple; + struct pipe_scissor_state scissor; + struct pipe_viewport_state viewport; + struct pipe_framebuffer_state framebuffer; + struct nv50_program *vertprog; + struct nv50_program *fragprog; + struct pipe_buffer *constbuf[PIPE_SHADER_TYPES]; + struct pipe_vertex_buffer vtxbuf[PIPE_MAX_ATTRIBS]; + unsigned vtxbuf_nr; + struct pipe_vertex_element vtxelt[PIPE_MAX_ATTRIBS]; + unsigned vtxelt_nr; + unsigned *sampler[PIPE_MAX_SAMPLERS]; + unsigned sampler_nr; + struct nv50_miptree *miptree[PIPE_MAX_SAMPLERS]; + unsigned miptree_nr; +}; + +static INLINE struct nv50_context * +nv50_context(struct pipe_context *pipe) +{ + return (struct nv50_context *)pipe; +} + +extern void nv50_init_surface_functions(struct nv50_context *nv50); +extern void nv50_init_state_functions(struct nv50_context *nv50); +extern void nv50_init_query_functions(struct nv50_context *nv50); + +extern void nv50_screen_init_miptree_functions(struct pipe_screen *pscreen); + +/* nv50_draw.c */ +extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *nv50); + 
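+/* (nv50_draw_render_stage() provides the swtnl rasterize stage hooked into
+ * the draw module by nv50_create(); its point/line/tri hooks are currently
+ * just stubs.) */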
+/* nv50_vbo.c */ +extern boolean nv50_draw_arrays(struct pipe_context *, unsigned mode, + unsigned start, unsigned count); +extern boolean nv50_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned mode, unsigned start, + unsigned count); +extern void nv50_vbo_validate(struct nv50_context *nv50); + +/* nv50_clear.c */ +extern void nv50_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + +/* nv50_program.c */ +extern void nv50_vertprog_validate(struct nv50_context *nv50); +extern void nv50_fragprog_validate(struct nv50_context *nv50); +extern void nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p); + +/* nv50_state_validate.c */ +extern boolean nv50_state_validate(struct nv50_context *nv50); + +/* nv50_tex.c */ +extern void nv50_tex_validate(struct nv50_context *); + +#endif diff --git a/src/gallium/drivers/nv50/nv50_draw.c b/src/gallium/drivers/nv50/nv50_draw.c new file mode 100644 index 0000000000..2f6f607261 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_draw.c @@ -0,0 +1,89 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "draw/draw_pipe.h" + +#include "nv50_context.h" + +struct nv50_render_stage { + struct draw_stage stage; + struct nv50_context *nv50; +}; + +static INLINE struct nv50_render_stage * +nv50_render_stage(struct draw_stage *stage) +{ + return (struct nv50_render_stage *)stage; +} + +static void +nv50_render_point(struct draw_stage *stage, struct prim_header *prim) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv50_render_line(struct draw_stage *stage, struct prim_header *prim) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv50_render_tri(struct draw_stage *stage, struct prim_header *prim) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv50_render_flush(struct draw_stage *stage, unsigned flags) +{ +} + +static void +nv50_render_reset_stipple_counter(struct draw_stage *stage) +{ + NOUVEAU_ERR("\n"); +} + +static void +nv50_render_destroy(struct draw_stage *stage) +{ + FREE(stage); +} + +struct draw_stage * +nv50_draw_render_stage(struct nv50_context *nv50) +{ + struct nv50_render_stage *rs = CALLOC_STRUCT(nv50_render_stage); + + rs->nv50 = nv50; + rs->stage.draw = nv50->draw; + rs->stage.destroy = nv50_render_destroy; + rs->stage.point = nv50_render_point; + rs->stage.line = nv50_render_line; + rs->stage.tri = nv50_render_tri; + rs->stage.flush = nv50_render_flush; + rs->stage.reset_stipple_counter = nv50_render_reset_stipple_counter; + + return &rs->stage; +} + diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c new file mode 100644 index 0000000000..28a8bdc0fa --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_miptree.c @@ -0,0 +1,135 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "nv50_context.h" + +static struct pipe_texture * +nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt) +{ + struct pipe_winsys *ws = pscreen->winsys; + struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree); + unsigned usage, pitch; + + mt->base = *pt; + mt->base.refcount = 1; + mt->base.screen = pscreen; + + usage = PIPE_BUFFER_USAGE_PIXEL; + switch (pt->format) { + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + usage |= NOUVEAU_BUFFER_USAGE_ZETA; + break; + default: + break; + } + + pitch = ((pt->width[0] + 63) & ~63) * pt->block.size; + /*XXX*/ + pitch *= 2; + + mt->buffer = ws->buffer_create(ws, 256, usage, pitch * pt->height[0]); + if (!mt->buffer) { + FREE(mt); + return NULL; + } + + return &mt->base; +} + +static void +nv50_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt) +{ + struct pipe_texture *pt = *ppt; + + *ppt = NULL; + + if (--pt->refcount <= 0) { + struct nv50_miptree *mt = nv50_miptree(pt); + + pipe_buffer_reference(pscreen, &mt->buffer, NULL); + FREE(mt); + } +} + +static struct pipe_surface * +nv50_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned flags) +{ + struct nv50_miptree *mt = nv50_miptree(pt); + struct nv50_surface *s; + struct pipe_surface *ps; + + s = CALLOC_STRUCT(nv50_surface); + if (!s) + return NULL; + ps = &s->base; + + ps->refcount = 1; + ps->winsys = pscreen->winsys; + ps->format = pt->format; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->block = pt->block; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = ps->width * ps->block.size; + ps->offset = 0; + ps->usage = flags; + ps->status = PIPE_SURFACE_STATUS_DEFINED; + + pipe_texture_reference(&ps->texture, pt); + pipe_buffer_reference(pscreen, &ps->buffer, mt->buffer); + + return ps; +} + +static void +nv50_miptree_surface_del(struct pipe_screen *pscreen, + struct pipe_surface **psurface) +{ + struct pipe_surface *ps = *psurface; + struct nv50_surface *s = nv50_surface(ps); + + *psurface = NULL; + + if (--ps->refcount <= 0) { + pipe_texture_reference(&ps->texture, NULL); + pipe_buffer_reference(pscreen, &ps->buffer, NULL); + FREE(s); + } +} + +void +nv50_screen_init_miptree_functions(struct pipe_screen *pscreen) +{ + pscreen->texture_create = nv50_miptree_create; + pscreen->texture_release = nv50_miptree_release; + pscreen->get_tex_surface = nv50_miptree_surface_new; + pscreen->tex_surface_release = nv50_miptree_surface_del; +} + diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c new file mode 100644 index 0000000000..d6fbdd1824 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -0,0 +1,1708 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_inlines.h" + +#include "pipe/p_shader_tokens.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_util.h" + +#include "nv50_context.h" + +#define NV50_SU_MAX_TEMP 64 +#define NV50_PROGRAM_DUMP + +/* ARL - gallium craps itself on progs/vp/arl.txt + * + * MSB - Like MAD, but MUL+SUB + * - Fuck it off, introduce a way to negate args for ops that + * support it. + * + * Look into inlining IMMD for ops other than MOV (make it general?) + * - Maybe even relax restrictions a bit, can't do P_RESULT + P_IMMD, + * but can emit to P_TEMP first - then MOV later. NVIDIA does this + * + * In ops such as ADD it's possible to construct a bad opcode in the !is_long() + * case, if the emit_src() causes the inst to suddenly become long. + * + * Verify half-insns work where expected - and force disable them where they + * don't work - MUL has it forcibly disabled atm as it fixes POW.. + * + * FUCK! watch dst==src vectors, can overwrite components that are needed. + * ie. SUB R0, R0.yzxw, R0 + * + * Things to check with renouveau: + * FP attr/result assignment - how? + * attrib + * - 0x16bc maps vp output onto fp hpos + * - 0x16c0 maps vp output onto fp col0 + * result + * - colr always 0-3 + * - depr always 4 + * 0x16bc->0x16e8 --> some binding between vp/fp regs + * 0x16b8 --> VP output count + * + * 0x1298 --> "MOV rcol.x, fcol.y" "MOV depr, fcol.y" = 0x00000005 + * "MOV rcol.x, fcol.y" = 0x00000004 + * 0x19a8 --> as above but 0x00000100 and 0x00000000 + * - 0x00100000 used when KIL used + * 0x196c --> as above but 0x00000011 and 0x00000000 + * + * 0x1988 --> 0xXXNNNNNN + * - XX == FP high something + */ +struct nv50_reg { + enum { + P_TEMP, + P_ATTR, + P_RESULT, + P_CONST, + P_IMMD + } type; + int index; + + int hw; + int neg; +}; + +struct nv50_pc { + struct nv50_program *p; + + /* hw resources */ + struct nv50_reg *r_temp[NV50_SU_MAX_TEMP]; + + /* tgsi resources */ + struct nv50_reg *temp; + int temp_nr; + struct nv50_reg *attr; + int attr_nr; + struct nv50_reg *result; + int result_nr; + struct nv50_reg *param; + int param_nr; + struct nv50_reg *immd; + float *immd_buf; + int immd_nr; + + struct nv50_reg *temp_temp[16]; + unsigned temp_temp_nr; +}; + +static void +alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) +{ + int i; + + if (reg->type == P_RESULT) { + if (pc->p->cfg.high_result < (reg->hw + 1)) + pc->p->cfg.high_result = reg->hw + 1; + } + + if (reg->type != P_TEMP) + return; + + if (reg->hw >= 0) { + /*XXX: do this here too to catch FP temp-as-attr usage.. 
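+		 * (fragment programs interpolate their inputs straight into
+		 * temps, so those regs already carry a hw index when they
+		 * reach this point)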
+ * not clean, but works */ + if (pc->p->cfg.high_temp < (reg->hw + 1)) + pc->p->cfg.high_temp = reg->hw + 1; + return; + } + + for (i = 0; i < NV50_SU_MAX_TEMP; i++) { + if (!(pc->r_temp[i])) { + pc->r_temp[i] = reg; + reg->hw = i; + if (pc->p->cfg.high_temp < (i + 1)) + pc->p->cfg.high_temp = i + 1; + return; + } + } + + assert(0); +} + +static struct nv50_reg * +alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) +{ + struct nv50_reg *r; + int i; + + if (dst && dst->type == P_TEMP && dst->hw == -1) + return dst; + + for (i = 0; i < NV50_SU_MAX_TEMP; i++) { + if (!pc->r_temp[i]) { + r = CALLOC_STRUCT(nv50_reg); + r->type = P_TEMP; + r->index = -1; + r->hw = i; + pc->r_temp[i] = r; + return r; + } + } + + assert(0); + return NULL; +} + +static void +free_temp(struct nv50_pc *pc, struct nv50_reg *r) +{ + if (r->index == -1) { + unsigned hw = r->hw; + + FREE(pc->r_temp[hw]); + pc->r_temp[hw] = NULL; + } +} + +static struct nv50_reg * +temp_temp(struct nv50_pc *pc) +{ + if (pc->temp_temp_nr >= 16) + assert(0); + + pc->temp_temp[pc->temp_temp_nr] = alloc_temp(pc, NULL); + return pc->temp_temp[pc->temp_temp_nr++]; +} + +static void +kill_temp_temp(struct nv50_pc *pc) +{ + int i; + + for (i = 0; i < pc->temp_temp_nr; i++) + free_temp(pc, pc->temp_temp[i]); + pc->temp_temp_nr = 0; +} + +static int +ctor_immd(struct nv50_pc *pc, float x, float y, float z, float w) +{ + pc->immd_buf = REALLOC(pc->immd_buf, (pc->immd_nr * r * sizeof(float)), + (pc->immd_nr + 1) * 4 * sizeof(float)); + pc->immd_buf[(pc->immd_nr * 4) + 0] = x; + pc->immd_buf[(pc->immd_nr * 4) + 1] = y; + pc->immd_buf[(pc->immd_nr * 4) + 2] = z; + pc->immd_buf[(pc->immd_nr * 4) + 3] = w; + + return pc->immd_nr++; +} + +static struct nv50_reg * +alloc_immd(struct nv50_pc *pc, float f) +{ + struct nv50_reg *r = CALLOC_STRUCT(nv50_reg); + unsigned hw; + + hw = ctor_immd(pc, f, 0, 0, 0) * 4; + r->type = P_IMMD; + r->hw = hw; + r->index = -1; + return r; +} + +static struct nv50_program_exec * +exec(struct nv50_pc *pc) +{ + struct nv50_program_exec *e = CALLOC_STRUCT(nv50_program_exec); + + e->param.index = -1; + return e; +} + +static void +emit(struct nv50_pc *pc, struct nv50_program_exec *e) +{ + struct nv50_program *p = pc->p; + + if (p->exec_tail) + p->exec_tail->next = e; + if (!p->exec_head) + p->exec_head = e; + p->exec_tail = e; + p->exec_size += (e->inst[0] & 1) ? 
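+		/* long opcodes occupy two 32-bit words, short ones a single word */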
2 : 1; +} + +static INLINE void set_long(struct nv50_pc *, struct nv50_program_exec *); + +static boolean +is_long(struct nv50_program_exec *e) +{ + if (e->inst[0] & 1) + return TRUE; + return FALSE; +} + +static boolean +is_immd(struct nv50_program_exec *e) +{ + if (is_long(e) && (e->inst[1] & 3) == 3) + return TRUE; + return FALSE; +} + +static INLINE void +set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx, + struct nv50_program_exec *e) +{ + set_long(pc, e); + e->inst[1] &= ~((0x1f << 7) | (0x3 << 12)); + e->inst[1] |= (pred << 7) | (idx << 12); +} + +static INLINE void +set_pred_wr(struct nv50_pc *pc, unsigned on, unsigned idx, + struct nv50_program_exec *e) +{ + set_long(pc, e); + e->inst[1] &= ~((0x3 << 4) | (1 << 6)); + e->inst[1] |= (idx << 4) | (on << 6); +} + +static INLINE void +set_long(struct nv50_pc *pc, struct nv50_program_exec *e) +{ + if (is_long(e)) + return; + + e->inst[0] |= 1; + set_pred(pc, 0xf, 0, e); + set_pred_wr(pc, 0, 0, e); +} + +static INLINE void +set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e) +{ + if (dst->type == P_RESULT) { + set_long(pc, e); + e->inst[1] |= 0x00000008; + } + + alloc_reg(pc, dst); + e->inst[0] |= (dst->hw << 2); +} + +static INLINE void +set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) +{ + unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */ + + set_long(pc, e); + /*XXX: can't be predicated - bits overlap.. catch cases where both + * are required and avoid them. */ + set_pred(pc, 0, 0, e); + set_pred_wr(pc, 0, 0, e); + + e->inst[1] |= 0x00000002 | 0x00000001; + e->inst[0] |= (val & 0x3f) << 16; + e->inst[1] |= (val >> 6) << 2; +} + +static void +emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, + struct nv50_reg *src, struct nv50_reg *iv) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0x80000000; + set_dst(pc, dst, e); + alloc_reg(pc, src); + e->inst[0] |= (src->hw << 16); + if (iv) { + e->inst[0] |= (1 << 25); + alloc_reg(pc, iv); + e->inst[0] |= (iv->hw << 9); + } + + emit(pc, e); +} + +static void +set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s, + struct nv50_program_exec *e) +{ + set_long(pc, e); +#if 1 + e->inst[1] |= (1 << 22); +#else + if (src->type == P_IMMD) { + e->inst[1] |= (NV50_CB_PMISC << 22); + } else { + if (pc->p->type == PIPE_SHADER_VERTEX) + e->inst[1] |= (NV50_CB_PVP << 22); + else + e->inst[1] |= (NV50_CB_PFP << 22); + } +#endif + + e->param.index = src->hw; + e->param.shift = s; + e->param.mask = m << (s % 32); +} + +static void +emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0x10000000; + + set_dst(pc, dst, e); + + if (0 && dst->type != P_RESULT && src->type == P_IMMD) { + set_immd(pc, src, e); + /*XXX: 32-bit, but steals part of "half" reg space - need to + * catch and handle this case if/when we do half-regs + */ + e->inst[0] |= 0x00008000; + } else + if (src->type == P_IMMD || src->type == P_CONST) { + set_long(pc, e); + set_data(pc, src, 0x7f, 9, e); + e->inst[1] |= 0x20000000; /* src0 const? */ + } else { + if (src->type == P_ATTR) { + set_long(pc, e); + e->inst[1] |= 0x00200000; + } + + alloc_reg(pc, src); + e->inst[0] |= (src->hw << 9); + } + + /* We really should support "half" instructions here at some point, + * but I don't feel confident enough about them yet. 
+ */ + set_long(pc, e); + if (is_long(e) && !is_immd(e)) { + e->inst[1] |= 0x04000000; /* 32-bit */ + e->inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */ + } + + emit(pc, e); +} + +static boolean +check_swap_src_0_1(struct nv50_pc *pc, + struct nv50_reg **s0, struct nv50_reg **s1) +{ + struct nv50_reg *src0 = *s0, *src1 = *s1; + + if (src0->type == P_CONST) { + if (src1->type != P_CONST) { + *s0 = src1; + *s1 = src0; + return TRUE; + } + } else + if (src1->type == P_ATTR) { + if (src0->type != P_ATTR) { + *s0 = src1; + *s1 = src0; + return TRUE; + } + } + + return FALSE; +} + +static void +set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) +{ + if (src->type == P_ATTR) { + set_long(pc, e); + e->inst[1] |= 0x00200000; + } else + if (src->type == P_CONST || src->type == P_IMMD) { + struct nv50_reg *temp = temp_temp(pc); + + emit_mov(pc, temp, src); + src = temp; + } + + alloc_reg(pc, src); + e->inst[0] |= (src->hw << 9); +} + +static void +set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) +{ + if (src->type == P_ATTR) { + struct nv50_reg *temp = temp_temp(pc); + + emit_mov(pc, temp, src); + src = temp; + } else + if (src->type == P_CONST || src->type == P_IMMD) { + assert(!(e->inst[0] & 0x00800000)); + if (e->inst[0] & 0x01000000) { + struct nv50_reg *temp = temp_temp(pc); + + emit_mov(pc, temp, src); + src = temp; + } else { + set_data(pc, src, 0x7f, 16, e); + e->inst[0] |= 0x00800000; + } + } + + alloc_reg(pc, src); + e->inst[0] |= (src->hw << 16); +} + +static void +set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) +{ + set_long(pc, e); + + if (src->type == P_ATTR) { + struct nv50_reg *temp = temp_temp(pc); + + emit_mov(pc, temp, src); + src = temp; + } else + if (src->type == P_CONST || src->type == P_IMMD) { + assert(!(e->inst[0] & 0x01000000)); + if (e->inst[0] & 0x00800000) { + struct nv50_reg *temp = temp_temp(pc); + + emit_mov(pc, temp, src); + src = temp; + } else { + set_data(pc, src, 0x7f, 32+14, e); + e->inst[0] |= 0x01000000; + } + } + + alloc_reg(pc, src); + e->inst[1] |= (src->hw << 14); +} + +static void +emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, + struct nv50_reg *src1) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0xc0000000; + set_long(pc, e); + + check_swap_src_0_1(pc, &src0, &src1); + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + set_src_1(pc, src1, e); + + emit(pc, e); +} + +static void +emit_add(struct nv50_pc *pc, struct nv50_reg *dst, + struct nv50_reg *src0, struct nv50_reg *src1) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0xb0000000; + + check_swap_src_0_1(pc, &src0, &src1); + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + if (is_long(e)) + set_src_2(pc, src1, e); + else + set_src_1(pc, src1, e); + + emit(pc, e); +} + +static void +emit_minmax(struct nv50_pc *pc, unsigned sub, struct nv50_reg *dst, + struct nv50_reg *src0, struct nv50_reg *src1) +{ + struct nv50_program_exec *e = exec(pc); + + set_long(pc, e); + e->inst[0] |= 0xb0000000; + e->inst[1] |= (sub << 29); + + check_swap_src_0_1(pc, &src0, &src1); + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + set_src_1(pc, src1, e); + + emit(pc, e); +} + +static void +emit_sub(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, + struct nv50_reg *src1) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0xb0000000; + + set_long(pc, e); + if (check_swap_src_0_1(pc, &src0, &src1)) + e->inst[1] |= 0x04000000; + else + e->inst[1] 
|= 0x08000000; + + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + set_src_2(pc, src1, e); + + emit(pc, e); +} + +static void +emit_mad(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, + struct nv50_reg *src1, struct nv50_reg *src2) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0xe0000000; + + check_swap_src_0_1(pc, &src0, &src1); + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + set_src_1(pc, src1, e); + set_src_2(pc, src2, e); + + emit(pc, e); +} + +static void +emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, + struct nv50_reg *src1, struct nv50_reg *src2) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0xe0000000; + set_long(pc, e); + e->inst[1] |= 0x08000000; /* src0 * src1 - src2 */ + + check_swap_src_0_1(pc, &src0, &src1); + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + set_src_1(pc, src1, e); + set_src_2(pc, src2, e); + + emit(pc, e); +} + +static void +emit_flop(struct nv50_pc *pc, unsigned sub, + struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0x90000000; + if (sub) { + set_long(pc, e); + e->inst[1] |= (sub << 29); + } + + set_dst(pc, dst, e); + set_src_0(pc, src, e); + + emit(pc, e); +} + +static void +emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0xb0000000; + + set_dst(pc, dst, e); + set_src_0(pc, src, e); + set_long(pc, e); + e->inst[1] |= (6 << 29) | 0x00004000; + + emit(pc, e); +} + +static void +emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] |= 0xb0000000; + + set_dst(pc, dst, e); + set_src_0(pc, src, e); + set_long(pc, e); + e->inst[1] |= (6 << 29); + + emit(pc, e); +} + +static void +emit_set(struct nv50_pc *pc, unsigned c_op, struct nv50_reg *dst, + struct nv50_reg *src0, struct nv50_reg *src1) +{ + struct nv50_program_exec *e = exec(pc); + unsigned inv_cop[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; + struct nv50_reg *rdst; + + assert(c_op <= 7); + if (check_swap_src_0_1(pc, &src0, &src1)) + c_op = inv_cop[c_op]; + + rdst = dst; + if (dst->type != P_TEMP) + dst = alloc_temp(pc, NULL); + + /* set.u32 */ + set_long(pc, e); + e->inst[0] |= 0xb0000000; + e->inst[1] |= (3 << 29); + e->inst[1] |= (c_op << 14); + /*XXX: breaks things, .u32 by default? + * decuda will disasm as .u16 and use .lo/.hi regs, but this + * doesn't seem to match what the hw actually does. + inst[1] |= 0x04000000; << breaks things.. .u32 by default? 
+ */ + set_dst(pc, dst, e); + set_src_0(pc, src0, e); + set_src_1(pc, src1, e); + emit(pc, e); + + /* cvt.f32.u32 */ + e = exec(pc); + e->inst[0] = 0xa0000001; + e->inst[1] = 0x64014780; + set_dst(pc, rdst, e); + set_src_0(pc, dst, e); + emit(pc, e); + + if (dst != rdst) + free_temp(pc, dst); +} + +static void +emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] = 0xa0000000; /* cvt */ + set_long(pc, e); + e->inst[1] |= (6 << 29); /* cvt */ + e->inst[1] |= 0x08000000; /* integer mode */ + e->inst[1] |= 0x04000000; /* 32 bit */ + e->inst[1] |= ((0x1 << 3)) << 14; /* .rn */ + e->inst[1] |= (1 << 14); /* src .f32 */ + set_dst(pc, dst, e); + set_src_0(pc, src, e); + + emit(pc, e); +} + +static void +emit_pow(struct nv50_pc *pc, struct nv50_reg *dst, + struct nv50_reg *v, struct nv50_reg *e) +{ + struct nv50_reg *temp = alloc_temp(pc, NULL); + + emit_flop(pc, 3, temp, v); + emit_mul(pc, temp, temp, e); + emit_preex2(pc, temp, temp); + emit_flop(pc, 6, dst, temp); + + free_temp(pc, temp); +} + +static void +emit_abs(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] = 0xa0000000; /* cvt */ + set_long(pc, e); + e->inst[1] |= (6 << 29); /* cvt */ + e->inst[1] |= 0x04000000; /* 32 bit */ + e->inst[1] |= (1 << 14); /* src .f32 */ + e->inst[1] |= ((1 << 6) << 14); /* .abs */ + set_dst(pc, dst, e); + set_src_0(pc, src, e); + + emit(pc, e); +} + +static void +emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask, + struct nv50_reg **src) +{ + struct nv50_reg *one = alloc_immd(pc, 1.0); + struct nv50_reg *zero = alloc_immd(pc, 0.0); + struct nv50_reg *neg128 = alloc_immd(pc, -127.999999); + struct nv50_reg *pos128 = alloc_immd(pc, 127.999999); + struct nv50_reg *tmp[4]; + + if (mask & (1 << 0)) + emit_mov(pc, dst[0], one); + + if (mask & (1 << 3)) + emit_mov(pc, dst[3], one); + + if (mask & (3 << 1)) { + if (mask & (1 << 1)) + tmp[0] = dst[1]; + else + tmp[0] = temp_temp(pc); + emit_minmax(pc, 4, tmp[0], src[0], zero); + } + + if (mask & (1 << 2)) { + set_pred_wr(pc, 1, 0, pc->p->exec_tail); + + tmp[1] = temp_temp(pc); + emit_minmax(pc, 4, tmp[1], src[1], zero); + + tmp[3] = temp_temp(pc); + emit_minmax(pc, 4, tmp[3], src[3], neg128); + emit_minmax(pc, 5, tmp[3], tmp[3], pos128); + + emit_pow(pc, dst[2], tmp[1], tmp[3]); + emit_mov(pc, dst[2], zero); + set_pred(pc, 3, 0, pc->p->exec_tail); + } +} + +static void +emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + struct nv50_program_exec *e = exec(pc); + + set_long(pc, e); + e->inst[0] |= 0xa0000000; /* delta */ + e->inst[1] |= (7 << 29); /* delta */ + e->inst[1] |= 0x04000000; /* negate arg0? 
probably not */ + e->inst[1] |= (1 << 14); /* src .f32 */ + set_dst(pc, dst, e); + set_src_0(pc, src, e); + + emit(pc, e); +} + +static struct nv50_reg * +tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) +{ + switch (dst->DstRegister.File) { + case TGSI_FILE_TEMPORARY: + return &pc->temp[dst->DstRegister.Index * 4 + c]; + case TGSI_FILE_OUTPUT: + return &pc->result[dst->DstRegister.Index * 4 + c]; + case TGSI_FILE_NULL: + return NULL; + default: + break; + } + + return NULL; +} + +static struct nv50_reg * +tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src) +{ + struct nv50_reg *r = NULL; + struct nv50_reg *temp; + unsigned c; + + c = tgsi_util_get_full_src_register_extswizzle(src, chan); + switch (c) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + switch (src->SrcRegister.File) { + case TGSI_FILE_INPUT: + r = &pc->attr[src->SrcRegister.Index * 4 + c]; + break; + case TGSI_FILE_TEMPORARY: + r = &pc->temp[src->SrcRegister.Index * 4 + c]; + break; + case TGSI_FILE_CONSTANT: + r = &pc->param[src->SrcRegister.Index * 4 + c]; + break; + case TGSI_FILE_IMMEDIATE: + r = &pc->immd[src->SrcRegister.Index * 4 + c]; + break; + case TGSI_FILE_SAMPLER: + break; + default: + assert(0); + break; + } + break; + case TGSI_EXTSWIZZLE_ZERO: + r = alloc_immd(pc, 0.0); + break; + case TGSI_EXTSWIZZLE_ONE: + r = alloc_immd(pc, 1.0); + break; + default: + assert(0); + break; + } + + switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) { + case TGSI_UTIL_SIGN_KEEP: + break; + case TGSI_UTIL_SIGN_CLEAR: + temp = temp_temp(pc); + emit_abs(pc, temp, r); + r = temp; + break; + case TGSI_UTIL_SIGN_TOGGLE: + temp = temp_temp(pc); + emit_neg(pc, temp, r); + r = temp; + break; + case TGSI_UTIL_SIGN_SET: + temp = temp_temp(pc); + emit_abs(pc, temp, r); + emit_neg(pc, temp, r); + r = temp; + break; + default: + assert(0); + break; + } + + return r; +} + +static boolean +nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) +{ + const struct tgsi_full_instruction *inst = &tok->FullInstruction; + struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp; + unsigned mask, sat; + int i, c; + + mask = inst->FullDstRegisters[0].DstRegister.WriteMask; + sat = inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE; + + for (c = 0; c < 4; c++) { + if (mask & (1 << c)) + dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]); + else + dst[c] = NULL; + } + + for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { + for (c = 0; c < 4; c++) + src[i][c] = tgsi_src(pc, c, &inst->FullSrcRegisters[i]); + } + + if (sat) { + for (c = 0; c < 4; c++) { + rdst[c] = dst[c]; + dst[c] = temp_temp(pc); + } + } + + switch (inst->Instruction.Opcode) { + case TGSI_OPCODE_ABS: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_abs(pc, dst[c], src[0][c]); + } + break; + case TGSI_OPCODE_ADD: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_add(pc, dst[c], src[0][c], src[1][c]); + } + break; + case TGSI_OPCODE_COS: + temp = alloc_temp(pc, NULL); + emit_precossin(pc, temp, src[0][0]); + emit_flop(pc, 5, temp, temp); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], temp); + } + break; + case TGSI_OPCODE_DP3: + temp = alloc_temp(pc, NULL); + emit_mul(pc, temp, src[0][0], src[1][0]); + emit_mad(pc, temp, src[0][1], src[1][1], temp); + emit_mad(pc, temp, src[0][2], src[1][2], temp); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + 
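+				/* skip channels outside the write mask; the
+				 * scalar dot product is broadcast to the rest */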
continue; + emit_mov(pc, dst[c], temp); + } + free_temp(pc, temp); + break; + case TGSI_OPCODE_DP4: + temp = alloc_temp(pc, NULL); + emit_mul(pc, temp, src[0][0], src[1][0]); + emit_mad(pc, temp, src[0][1], src[1][1], temp); + emit_mad(pc, temp, src[0][2], src[1][2], temp); + emit_mad(pc, temp, src[0][3], src[1][3], temp); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], temp); + } + free_temp(pc, temp); + break; + case TGSI_OPCODE_DPH: + temp = alloc_temp(pc, NULL); + emit_mul(pc, temp, src[0][0], src[1][0]); + emit_mad(pc, temp, src[0][1], src[1][1], temp); + emit_mad(pc, temp, src[0][2], src[1][2], temp); + emit_add(pc, temp, src[1][3], temp); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], temp); + } + free_temp(pc, temp); + break; + case TGSI_OPCODE_DST: + { + struct nv50_reg *one = alloc_immd(pc, 1.0); + if (mask & (1 << 0)) + emit_mov(pc, dst[0], one); + if (mask & (1 << 1)) + emit_mul(pc, dst[1], src[0][1], src[1][1]); + if (mask & (1 << 2)) + emit_mov(pc, dst[2], src[0][2]); + if (mask & (1 << 3)) + emit_mov(pc, dst[3], src[1][3]); + FREE(one); + } + break; + case TGSI_OPCODE_EX2: + temp = alloc_temp(pc, NULL); + emit_preex2(pc, temp, src[0][0]); + emit_flop(pc, 6, temp, temp); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], temp); + } + free_temp(pc, temp); + break; + case TGSI_OPCODE_FLR: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_flr(pc, dst[c], src[0][c]); + } + break; + case TGSI_OPCODE_FRC: + temp = alloc_temp(pc, NULL); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_flr(pc, temp, src[0][c]); + emit_sub(pc, dst[c], src[0][c], temp); + } + free_temp(pc, temp); + break; + case TGSI_OPCODE_LIT: + emit_lit(pc, &dst[0], mask, &src[0][0]); + break; + case TGSI_OPCODE_LG2: + temp = alloc_temp(pc, NULL); + emit_flop(pc, 3, temp, src[0][0]); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], temp); + } + break; + case TGSI_OPCODE_LRP: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + /*XXX: we can do better than this */ + temp = alloc_temp(pc, NULL); + emit_neg(pc, temp, src[0][c]); + emit_mad(pc, temp, temp, src[2][c], src[2][c]); + emit_mad(pc, dst[c], src[0][c], src[1][c], temp); + free_temp(pc, temp); + } + break; + case TGSI_OPCODE_MAD: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mad(pc, dst[c], src[0][c], src[1][c], src[2][c]); + } + break; + case TGSI_OPCODE_MAX: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_minmax(pc, 4, dst[c], src[0][c], src[1][c]); + } + break; + case TGSI_OPCODE_MIN: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_minmax(pc, 5, dst[c], src[0][c], src[1][c]); + } + break; + case TGSI_OPCODE_MOV: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], src[0][c]); + } + break; + case TGSI_OPCODE_MUL: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mul(pc, dst[c], src[0][c], src[1][c]); + } + break; + case TGSI_OPCODE_POW: + temp = alloc_temp(pc, NULL); + emit_pow(pc, temp, src[0][0], src[1][0]); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], temp); + } + free_temp(pc, temp); + break; + case TGSI_OPCODE_RCP: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_flop(pc, 0, dst[c], src[0][0]); + } + break; + case TGSI_OPCODE_RSQ: + 
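+		/* only src.x is used; the (presumably rsqrt) flop result is
+		 * replicated to every channel in the write mask */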
for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_flop(pc, 2, dst[c], src[0][0]); + } + break; + case TGSI_OPCODE_SCS: + temp = alloc_temp(pc, NULL); + emit_precossin(pc, temp, src[0][0]); + if (mask & (1 << 0)) + emit_flop(pc, 5, dst[0], temp); + if (mask & (1 << 1)) + emit_flop(pc, 4, dst[1], temp); + break; + case TGSI_OPCODE_SGE: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_set(pc, 6, dst[c], src[0][c], src[1][c]); + } + break; + case TGSI_OPCODE_SIN: + temp = alloc_temp(pc, NULL); + emit_precossin(pc, temp, src[0][0]); + emit_flop(pc, 4, temp, temp); + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_mov(pc, dst[c], temp); + } + break; + case TGSI_OPCODE_SLT: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_set(pc, 1, dst[c], src[0][c], src[1][c]); + } + break; + case TGSI_OPCODE_SUB: + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + emit_sub(pc, dst[c], src[0][c], src[1][c]); + } + break; + case TGSI_OPCODE_TEX: + { + struct nv50_reg *t0, *t1, *t2, *t3; + struct nv50_program_exec *e; + + t0 = alloc_temp(pc, NULL); + t0 = alloc_temp(pc, NULL); + t1 = alloc_temp(pc, NULL); + t2 = alloc_temp(pc, NULL); + t3 = alloc_temp(pc, NULL); + emit_mov(pc, t0, src[0][0]); + emit_mov(pc, t1, src[0][1]); + + e = exec(pc); + e->inst[0] = 0xf6400000; + set_long(pc, e); + e->inst[1] |= 0x0000c004; + set_dst(pc, t0, e); + emit(pc, e); + + if (mask & (1 << 0)) emit_mov(pc, dst[0], t0); + if (mask & (1 << 1)) emit_mov(pc, dst[1], t1); + if (mask & (1 << 2)) emit_mov(pc, dst[2], t2); + if (mask & (1 << 3)) emit_mov(pc, dst[3], t3); + + free_temp(pc, t0); + free_temp(pc, t1); + free_temp(pc, t2); + free_temp(pc, t3); + } + break; + case TGSI_OPCODE_XPD: + temp = alloc_temp(pc, NULL); + if (mask & (1 << 0)) { + emit_mul(pc, temp, src[0][2], src[1][1]); + emit_msb(pc, dst[0], src[0][1], src[1][2], temp); + } + if (mask & (1 << 1)) { + emit_mul(pc, temp, src[0][0], src[1][2]); + emit_msb(pc, dst[1], src[0][2], src[1][0], temp); + } + if (mask & (1 << 2)) { + emit_mul(pc, temp, src[0][1], src[1][0]); + emit_msb(pc, dst[2], src[0][0], src[1][1], temp); + } + free_temp(pc, temp); + break; + case TGSI_OPCODE_END: + break; + default: + NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode); + return FALSE; + } + + if (sat) { + for (c = 0; c < 4; c++) { + struct nv50_program_exec *e; + + if (!(mask & (1 << c))) + continue; + e = exec(pc); + + e->inst[0] = 0xa0000000; /* cvt */ + set_long(pc, e); + e->inst[1] |= (6 << 29); /* cvt */ + e->inst[1] |= 0x04000000; /* 32 bit */ + e->inst[1] |= (1 << 14); /* src .f32 */ + e->inst[1] |= ((1 << 5) << 14); /* .sat */ + set_dst(pc, rdst[c], e); + set_src_0(pc, dst[c], e); + emit(pc, e); + } + } + + kill_temp_temp(pc); + return TRUE; +} + +static boolean +nv50_program_tx_prep(struct nv50_pc *pc) +{ + struct tgsi_parse_context p; + boolean ret = FALSE; + unsigned i, c; + + tgsi_parse_init(&p, pc->p->pipe.tokens); + while (!tgsi_parse_end_of_tokens(&p)) { + const union tgsi_full_token *tok = &p.FullToken; + + tgsi_parse_token(&p); + switch (tok->Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + { + const struct tgsi_full_immediate *imm = + &p.FullToken.FullImmediate; + + ctor_immd(pc, imm->u.ImmediateFloat32[0].Float, + imm->u.ImmediateFloat32[1].Float, + imm->u.ImmediateFloat32[2].Float, + imm->u.ImmediateFloat32[3].Float); + } + break; + case TGSI_TOKEN_TYPE_DECLARATION: + { + const struct tgsi_full_declaration *d; + unsigned last; + + d = 
&p.FullToken.FullDeclaration; + last = d->DeclarationRange.Last; + + switch (d->Declaration.File) { + case TGSI_FILE_TEMPORARY: + if (pc->temp_nr < (last + 1)) + pc->temp_nr = last + 1; + break; + case TGSI_FILE_OUTPUT: + if (pc->result_nr < (last + 1)) + pc->result_nr = last + 1; + break; + case TGSI_FILE_INPUT: + if (pc->attr_nr < (last + 1)) + pc->attr_nr = last + 1; + break; + case TGSI_FILE_CONSTANT: + if (pc->param_nr < (last + 1)) + pc->param_nr = last + 1; + break; + case TGSI_FILE_SAMPLER: + break; + default: + NOUVEAU_ERR("bad decl file %d\n", + d->Declaration.File); + goto out_err; + } + } + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: + break; + default: + break; + } + } + + if (pc->temp_nr) { + pc->temp = CALLOC(pc->temp_nr * 4, sizeof(struct nv50_reg)); + if (!pc->temp) + goto out_err; + + for (i = 0; i < pc->temp_nr; i++) { + for (c = 0; c < 4; c++) { + pc->temp[i*4+c].type = P_TEMP; + pc->temp[i*4+c].hw = -1; + pc->temp[i*4+c].index = i; + } + } + } + + if (pc->attr_nr) { + struct nv50_reg *iv = NULL; + int aid = 0; + + pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg)); + if (!pc->attr) + goto out_err; + + if (pc->p->type == PIPE_SHADER_FRAGMENT) { + iv = alloc_temp(pc, NULL); + emit_interp(pc, iv, iv, NULL); + emit_flop(pc, 0, iv, iv); + aid++; + } + + for (i = 0; i < pc->attr_nr; i++) { + struct nv50_reg *a = &pc->attr[i*4]; + + for (c = 0; c < 4; c++) { + if (pc->p->type == PIPE_SHADER_FRAGMENT) { + struct nv50_reg *at = + alloc_temp(pc, NULL); + pc->attr[i*4+c].type = at->type; + pc->attr[i*4+c].hw = at->hw; + pc->attr[i*4+c].index = at->index; + } else { + pc->p->cfg.vp.attr[aid/32] |= + (1 << (aid % 32)); + pc->attr[i*4+c].type = P_ATTR; + pc->attr[i*4+c].hw = aid++; + pc->attr[i*4+c].index = i; + } + } + + if (pc->p->type != PIPE_SHADER_FRAGMENT) + continue; + + emit_interp(pc, &a[0], &a[0], iv); + emit_interp(pc, &a[1], &a[1], iv); + emit_interp(pc, &a[2], &a[2], iv); + emit_interp(pc, &a[3], &a[3], iv); + } + + if (iv) + free_temp(pc, iv); + } + + if (pc->result_nr) { + int rid = 0; + + pc->result = CALLOC(pc->result_nr * 4, sizeof(struct nv50_reg)); + if (!pc->result) + goto out_err; + + for (i = 0; i < pc->result_nr; i++) { + for (c = 0; c < 4; c++) { + if (pc->p->type == PIPE_SHADER_FRAGMENT) { + pc->result[i*4+c].type = P_TEMP; + pc->result[i*4+c].hw = -1; + } else { + pc->result[i*4+c].type = P_RESULT; + pc->result[i*4+c].hw = rid++; + } + pc->result[i*4+c].index = i; + } + } + } + + if (pc->param_nr) { + int rid = 0; + + pc->param = CALLOC(pc->param_nr * 4, sizeof(struct nv50_reg)); + if (!pc->param) + goto out_err; + + for (i = 0; i < pc->param_nr; i++) { + for (c = 0; c < 4; c++) { + pc->param[i*4+c].type = P_CONST; + pc->param[i*4+c].hw = rid++; + pc->param[i*4+c].index = i; + } + } + } + + if (pc->immd_nr) { + int rid = pc->param_nr * 4; + + pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg)); + if (!pc->immd) + goto out_err; + + for (i = 0; i < pc->immd_nr; i++) { + for (c = 0; c < 4; c++) { + pc->immd[i*4+c].type = P_IMMD; + pc->immd[i*4+c].hw = rid++; + pc->immd[i*4+c].index = i; + } + } + } + + ret = TRUE; +out_err: + tgsi_parse_free(&p); + return ret; +} + +static boolean +nv50_program_tx(struct nv50_program *p) +{ + struct tgsi_parse_context parse; + struct nv50_pc *pc; + boolean ret; + + pc = CALLOC_STRUCT(nv50_pc); + if (!pc) + return FALSE; + pc->p = p; + pc->p->cfg.high_temp = 4; + + ret = nv50_program_tx_prep(pc); + if (ret == FALSE) + goto out_cleanup; + + tgsi_parse_init(&parse, pc->p->pipe.tokens); + while 
(!tgsi_parse_end_of_tokens(&parse)) { + const union tgsi_full_token *tok = &parse.FullToken; + + tgsi_parse_token(&parse); + + switch (tok->Token.Type) { + case TGSI_TOKEN_TYPE_INSTRUCTION: + ret = nv50_program_tx_insn(pc, tok); + if (ret == FALSE) + goto out_err; + break; + default: + break; + } + } + + if (p->type == PIPE_SHADER_FRAGMENT) { + struct nv50_reg out; + + out.type = P_TEMP; + for (out.hw = 0; out.hw < pc->result_nr * 4; out.hw++) + emit_mov(pc, &out, &pc->result[out.hw]); + } + + assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head)); + pc->p->exec_tail->inst[1] |= 0x00000001; + + p->param_nr = pc->param_nr * 4; + p->immd_nr = pc->immd_nr * 4; + p->immd = pc->immd_buf; + +out_err: + tgsi_parse_free(&parse); + +out_cleanup: + return ret; +} + +static void +nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p) +{ + if (nv50_program_tx(p) == FALSE) + assert(0); + p->translated = TRUE; +} + +static void +nv50_program_upload_data(struct nv50_context *nv50, float *map, + unsigned start, unsigned count) +{ + while (count) { + unsigned nr = count > 2047 ? 2047 : count; + + BEGIN_RING(tesla, 0x00000f00, 1); + OUT_RING ((NV50_CB_PMISC << 0) | (start << 8)); + BEGIN_RING(tesla, 0x40000f04, nr); + OUT_RINGp (map, nr); + + map += nr; + start += nr; + count -= nr; + } +} + +static void +nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p) +{ + struct nouveau_winsys *nvws = nv50->screen->nvws; + struct pipe_winsys *ws = nv50->pipe.winsys; + unsigned nr = p->param_nr + p->immd_nr; + + if (!p->data && nr) { + struct nouveau_resource *heap = nv50->screen->vp_data_heap; + + if (nvws->res_alloc(heap, nr, p, &p->data)) { + while (heap->next && heap->size < nr) { + struct nv50_program *evict = heap->next->priv; + nvws->res_free(&evict->data); + } + + if (nvws->res_alloc(heap, nr, p, &p->data)) + assert(0); + } + } + + if (p->param_nr) { + float *map = ws->buffer_map(ws, nv50->constbuf[p->type], + PIPE_BUFFER_USAGE_CPU_READ); + nv50_program_upload_data(nv50, map, p->data->start, + p->param_nr); + ws->buffer_unmap(ws, nv50->constbuf[p->type]); + } + + if (p->immd_nr) { + nv50_program_upload_data(nv50, p->immd, + p->data->start + p->param_nr, + p->immd_nr); + } +} + +static void +nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) +{ + struct pipe_winsys *ws = nv50->pipe.winsys; + struct nv50_program_exec *e; + struct nouveau_stateobj *so; + const unsigned flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_WR; + unsigned start, count, *up, *ptr; + boolean upload = FALSE; + + if (!p->buffer) { + p->buffer = ws->buffer_create(ws, 0x100, 0, p->exec_size * 4); + upload = TRUE; + } + + if (p->data && p->data->start != p->data_start) { + for (e = p->exec_head; e; e = e->next) { + unsigned ei, ci; + + if (e->param.index < 0) + continue; + ei = e->param.shift >> 5; + ci = e->param.index + p->data->start; + + e->inst[ei] &= ~e->param.mask; + e->inst[ei] |= (ci << e->param.shift); + } + + p->data_start = p->data->start; + upload = TRUE; + } + + if (!upload) + return; + + up = ptr = MALLOC(p->exec_size * 4); + for (e = p->exec_head; e; e = e->next) { + *(ptr++) = e->inst[0]; + if (is_long(e)) + *(ptr++) = e->inst[1]; + } + + so = so_new(4,2); + so_method(so, nv50->screen->tesla, 0x1280, 3); + so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, p->buffer, 0, flags | NOUVEAU_BO_LOW, 0, 0); + so_data (so, (NV50_CB_PUPLOAD << 16) | 0x0800); //(p->exec_size * 4)); + + start = 0; count = p->exec_size; + while (count) { + struct 
nouveau_winsys *nvws = nv50->screen->nvws; + unsigned nr; + + so_emit(nvws, so); + + nr = MIN2(count, 2047); + nr = MIN2(nvws->channel->pushbuf->remaining, nr); + if (nvws->channel->pushbuf->remaining < (nr + 3)) { + FIRE_RING(NULL); + continue; + } + + BEGIN_RING(tesla, 0x0f00, 1); + OUT_RING ((start << 8) | NV50_CB_PUPLOAD); + BEGIN_RING(tesla, 0x40000f04, nr); + OUT_RINGp (up + start, nr); + + start += nr; + count -= nr; + } + + FREE(up); + so_ref(NULL, &so); +} + +void +nv50_vertprog_validate(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_program *p = nv50->vertprog; + struct nouveau_stateobj *so; + + if (!p->translated) { + nv50_program_validate(nv50, p); + if (!p->translated) + assert(0); + } + + nv50_program_validate_data(nv50, p); + nv50_program_validate_code(nv50, p); + + so = so_new(13, 2); + so_method(so, tesla, NV50TCL_VP_ADDRESS_HIGH, 2); + so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, tesla, 0x1650, 2); + so_data (so, p->cfg.vp.attr[0]); + so_data (so, p->cfg.vp.attr[1]); + so_method(so, tesla, 0x16b8, 1); + so_data (so, p->cfg.high_result); + so_method(so, tesla, 0x16ac, 2); + so_data (so, p->cfg.high_result); //8); + so_data (so, p->cfg.high_temp); + so_method(so, tesla, 0x140c, 1); + so_data (so, 0); /* program start offset */ + so_ref(so, &nv50->state.vertprog); +} + +void +nv50_fragprog_validate(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nv50_program *p = nv50->fragprog; + struct nouveau_stateobj *so; + + if (!p->translated) { + nv50_program_validate(nv50, p); + if (!p->translated) + assert(0); + } + + nv50_program_validate_data(nv50, p); + nv50_program_validate_code(nv50, p); + + so = so_new(64, 2); + so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); + so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | + NOUVEAU_BO_LOW, 0, 0); + so_method(so, tesla, 0x1904, 4); + so_data (so, 0x01040404); /* p: 0x01000404 */ + so_data (so, 0x00000004); + so_data (so, 0x00000000); + so_data (so, 0x00000000); + so_method(so, tesla, 0x16bc, 3); /*XXX: fixme */ + so_data (so, 0x03020100); + so_data (so, 0x07060504); + so_data (so, 0x0b0a0908); + so_method(so, tesla, 0x1988, 2); + so_data (so, 0x08080408); //0x08040404); /* p: 0x0f000401 */ + so_data (so, p->cfg.high_temp); + so_method(so, tesla, 0x1414, 1); + so_data (so, 0); /* program start offset */ + so_ref(so, &nv50->state.fragprog); +} + +void +nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p) +{ + struct pipe_winsys *ws = nv50->pipe.winsys; + + while (p->exec_head) { + struct nv50_program_exec *e = p->exec_head; + + p->exec_head = e->next; + FREE(e); + } + p->exec_tail = NULL; + p->exec_size = 0; + + if (p->buffer) + pipe_buffer_reference(ws, &p->buffer, NULL); + + nv50->screen->nvws->res_free(&p->data); + + p->translated = 0; +} + diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h new file mode 100644 index 0000000000..78deed6a38 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -0,0 +1,45 @@ +#ifndef __NV50_PROGRAM_H__ +#define __NV50_PROGRAM_H__ + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + +struct nv50_program_exec { + struct nv50_program_exec *next; + + unsigned inst[2]; + struct { + int index; + 
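+		/* relocation info: index is the slot in the program's constant/immediate data block (negative means nothing to relocate); mask and shift locate the encoded offset inside inst[] so nv50_program_validate_code() can re-patch it when the data heap allocation moves */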
unsigned mask; + unsigned shift; + } param; +}; + +struct nv50_program { + struct pipe_shader_state pipe; + struct tgsi_shader_info info; + boolean translated; + + unsigned type; + struct nv50_program_exec *exec_head; + struct nv50_program_exec *exec_tail; + unsigned exec_size; + struct nouveau_resource *data; + unsigned data_start; + + struct pipe_buffer *buffer; + + float *immd; + unsigned immd_nr; + unsigned param_nr; + + struct { + unsigned high_temp; + unsigned high_result; + struct { + unsigned attr[2]; + } vp; + } cfg; +}; + +#endif diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c new file mode 100644 index 0000000000..26bd90ccc5 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_query.c @@ -0,0 +1,69 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "pipe/p_context.h" + +#include "nv50_context.h" + +static struct pipe_query * +nv50_query_create(struct pipe_context *pipe, unsigned type) +{ + NOUVEAU_ERR("unimplemented\n"); + return NULL; +} + +static void +nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *q) +{ + NOUVEAU_ERR("unimplemented\n"); +} + +static void +nv50_query_begin(struct pipe_context *pipe, struct pipe_query *q) +{ + NOUVEAU_ERR("unimplemented\n"); +} + +static void +nv50_query_end(struct pipe_context *pipe, struct pipe_query *q) +{ + NOUVEAU_ERR("unimplemented\n"); +} + +static boolean +nv50_query_result(struct pipe_context *pipe, struct pipe_query *q, + boolean wait, uint64 *result) +{ + NOUVEAU_ERR("unimplemented\n"); + *result = 0xdeadcafe; + return TRUE; +} + +void +nv50_init_query_functions(struct nv50_context *nv50) +{ + nv50->pipe.create_query = nv50_query_create; + nv50->pipe.destroy_query = nv50_query_destroy; + nv50->pipe.begin_query = nv50_query_begin; + nv50->pipe.end_query = nv50_query_end; + nv50->pipe.get_query_result = nv50_query_result; +} diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c new file mode 100644 index 0000000000..b5aef7dadd --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_screen.c @@ -0,0 +1,323 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "pipe/p_screen.h" + +#include "nv50_context.h" +#include "nv50_screen.h" + +#include "nouveau/nouveau_stateobj.h" + +#define NV5X_GRCLASS5097_CHIPSETS 0x00000001 +#define NV8X_GRCLASS8297_CHIPSETS 0x00000050 +#define NV9X_GRCLASS8297_CHIPSETS 0x00000014 + +static boolean +nv50_screen_is_format_supported(struct pipe_screen *pscreen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, unsigned geom_flags) +{ + if (tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET) { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + case PIPE_FORMAT_Z16_UNORM: + return TRUE; + default: + break; + } + } else { + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_A1R5G5B5_UNORM: + case PIPE_FORMAT_A4R4G4B4_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_L8_UNORM: + case PIPE_FORMAT_A8_UNORM: + case PIPE_FORMAT_I8_UNORM: + case PIPE_FORMAT_A8L8_UNORM: + return TRUE; + default: + break; + } + } + + return FALSE; +} + +static const char * +nv50_screen_get_name(struct pipe_screen *pscreen) +{ + struct nv50_screen *screen = nv50_screen(pscreen); + struct nouveau_device *dev = screen->nvws->channel->device; + static char buffer[128]; + + snprintf(buffer, sizeof(buffer), "NV%02X", dev->chipset); + return buffer; +} + +static const char * +nv50_screen_get_vendor(struct pipe_screen *pscreen) +{ + return "nouveau"; +} + +static int +nv50_screen_get_param(struct pipe_screen *pscreen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return 32; + case PIPE_CAP_NPOT_TEXTURES: + return 0; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 1; + case PIPE_CAP_GLSL: + return 0; + case PIPE_CAP_S3TC: + return 0; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 0; + case PIPE_CAP_POINT_SPRITE: + return 0; + case PIPE_CAP_MAX_RENDER_TARGETS: + return 8; + case PIPE_CAP_OCCLUSION_QUERY: + return 0; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 0; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return 13; + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 10; + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 13; + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + case PIPE_CAP_TEXTURE_MIRROR_REPEAT: + return 1; + case NOUVEAU_CAP_HW_VTXBUF: + return 1; + case NOUVEAU_CAP_HW_IDXBUF: + return 0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0; + } +} + +static float +nv50_screen_get_paramf(struct pipe_screen *pscreen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 10.0; + case PIPE_CAP_MAX_POINT_WIDTH: + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 64.0; + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 16.0; + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 4.0; + default: + NOUVEAU_ERR("Unknown PIPE_CAP %d\n", param); + return 0.0; + } +} + +static void +nv50_screen_destroy(struct pipe_screen *pscreen) +{ + FREE(pscreen); +} + +struct pipe_screen * +nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws) +{ + struct nv50_screen *screen = CALLOC_STRUCT(nv50_screen); + struct nouveau_stateobj *so; + unsigned tesla_class = 0, ret; + unsigned chipset = nvws->channel->device->chipset; + int i; + + if (!screen) + return NULL; + screen->nvws = nvws; + + /* 3D object */ + if ((chipset & 0xf0) != 0x50 && (chipset & 0xf0) != 0x80) { + NOUVEAU_ERR("Not a G8x chipset\n"); + nv50_screen_destroy(&screen->pipe); + return NULL; + } + + switch (chipset & 0xf0) { + case 0x50: + if (NV5X_GRCLASS5097_CHIPSETS & (1 << 
(chipset & 0x0f))) + tesla_class = 0x5097; + break; + case 0x80: + if (NV8X_GRCLASS8297_CHIPSETS & (1 << (chipset & 0x0f))) + tesla_class = 0x8297; + break; + case 0x90: + if (NV9X_GRCLASS8297_CHIPSETS & (1 << (chipset & 0x0f))) + tesla_class = 0x8297; + break; + default: + break; + } + + if (tesla_class == 0) { + NOUVEAU_ERR("Unknown G8x chipset: NV%02x\n", chipset); + nv50_screen_destroy(&screen->pipe); + return NULL; + } + + ret = nvws->grobj_alloc(nvws, tesla_class, &screen->tesla); + if (ret) { + NOUVEAU_ERR("Error creating 3D object: %d\n", ret); + nv50_screen_destroy(&screen->pipe); + return NULL; + } + + /* Sync notifier */ + ret = nvws->notifier_alloc(nvws, 1, &screen->sync); + if (ret) { + NOUVEAU_ERR("Error creating notifier object: %d\n", ret); + nv50_screen_destroy(&screen->pipe); + return NULL; + } + + /* Static tesla init */ + so = so_new(256, 20); + + so_method(so, screen->tesla, 0x1558, 1); + so_data (so, 1); + so_method(so, screen->tesla, NV50TCL_DMA_NOTIFY, 1); + so_data (so, screen->sync->handle); + so_method(so, screen->tesla, NV50TCL_DMA_UNK0(0), + NV50TCL_DMA_UNK0__SIZE); + for (i = 0; i < NV50TCL_DMA_UNK0__SIZE; i++) + so_data(so, nvws->channel->vram->handle); + so_method(so, screen->tesla, NV50TCL_DMA_UNK1(0), + NV50TCL_DMA_UNK1__SIZE); + for (i = 0; i < NV50TCL_DMA_UNK1__SIZE; i++) + so_data(so, nvws->channel->vram->handle); + so_method(so, screen->tesla, 0x121c, 1); + so_data (so, 1); + + so_method(so, screen->tesla, 0x13bc, 1); + so_data (so, 0x54); + so_method(so, screen->tesla, 0x13ac, 1); + so_data (so, 1); + so_method(so, screen->tesla, 0x16b8, 1); + so_data (so, 8); + + /* Shared constant buffer */ + screen->constbuf = ws->buffer_create(ws, 0, 0, 128 * 4 * 4); + if (nvws->res_init(&screen->vp_data_heap, 0, 128)) { + NOUVEAU_ERR("Error initialising constant buffer\n"); + nv50_screen_destroy(&screen->pipe); + return NULL; + } + + so_method(so, screen->tesla, 0x1280, 3); + so_reloc (so, screen->constbuf, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, screen->constbuf, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); + so_data (so, (NV50_CB_PMISC << 16) | 0x00001000); + + /* Texture sampler/image unit setup - we abuse the constant buffer + * upload mechanism for the moment to upload data to the tex config + * blocks. At some point we *may* want to go the NVIDIA way of doing + * things? 
+ */ + screen->tic = ws->buffer_create(ws, 0, 0, 32 * 8 * 4); + so_method(so, screen->tesla, 0x1280, 3); + so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); + so_data (so, (NV50_CB_TIC << 16) | 0x0800); + so_method(so, screen->tesla, 0x1574, 3); + so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, screen->tic, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); + so_data (so, 0x00000800); + + screen->tsc = ws->buffer_create(ws, 0, 0, 32 * 8 * 4); + so_method(so, screen->tesla, 0x1280, 3); + so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); + so_data (so, (NV50_CB_TSC << 16) | 0x0800); + so_method(so, screen->tesla, 0x155c, 3); + so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, screen->tsc, 0, NOUVEAU_BO_VRAM | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); + so_data (so, 0x00000800); + + + /* Vertex array limits - max them out */ + for (i = 0; i < 16; i++) { + so_method(so, screen->tesla, 0x1080 + (i * 8), 2); + so_data (so, 0x000000ff); + so_data (so, 0xffffffff); + } + + so_method(so, screen->tesla, NV50TCL_DEPTH_RANGE_NEAR, 2); + so_data (so, fui(0.0)); + so_data (so, fui(1.0)); + + so_method(so, screen->tesla, 0x1234, 1); + so_data (so, 1); + so_method(so, screen->tesla, 0x1458, 1); + so_data (so, 1); + + so_emit(nvws, so); + so_ref(so, &screen->static_init); + nvws->push_flush(nvws, 0, NULL); + + screen->pipe.winsys = ws; + + screen->pipe.destroy = nv50_screen_destroy; + + screen->pipe.get_name = nv50_screen_get_name; + screen->pipe.get_vendor = nv50_screen_get_vendor; + screen->pipe.get_param = nv50_screen_get_param; + screen->pipe.get_paramf = nv50_screen_get_paramf; + + screen->pipe.is_format_supported = nv50_screen_is_format_supported; + + nv50_screen_init_miptree_functions(&screen->pipe); + nv50_surface_init_screen_functions(&screen->pipe); + + return &screen->pipe; +} + diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h new file mode 100644 index 0000000000..400ddcef06 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_screen.h @@ -0,0 +1,33 @@ +#ifndef __NV50_SCREEN_H__ +#define __NV50_SCREEN_H__ + +#include "pipe/p_screen.h" + +struct nv50_screen { + struct pipe_screen pipe; + + struct nouveau_winsys *nvws; + + unsigned cur_pctx; + + struct nouveau_grobj *tesla; + struct nouveau_notifier *sync; + + struct pipe_buffer *constbuf; + struct nouveau_resource *vp_data_heap; + + struct pipe_buffer *tic; + struct pipe_buffer *tsc; + + struct nouveau_stateobj *static_init; +}; + +static INLINE struct nv50_screen * +nv50_screen(struct pipe_screen *screen) +{ + return (struct nv50_screen *)screen; +} + +void nv50_surface_init_screen_functions(struct pipe_screen *); + +#endif diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c new file mode 100644 index 0000000000..95f9d408b5 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_state.c @@ -0,0 +1,638 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, 
copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "tgsi/tgsi_parse.h" + +#include "nv50_context.h" +#include "nv50_texture.h" + +#include "nouveau/nouveau_stateobj.h" + +static void * +nv50_blend_state_create(struct pipe_context *pipe, + const struct pipe_blend_state *cso) +{ + struct nouveau_stateobj *so = so_new(64, 0); + struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla; + struct nv50_blend_stateobj *bso = CALLOC_STRUCT(nv50_blend_stateobj); + unsigned cmask = 0, i; + + /*XXX ignored: + * - dither + */ + + if (cso->blend_enable == 0) { + so_method(so, tesla, NV50TCL_BLEND_ENABLE(0), 8); + for (i = 0; i < 8; i++) + so_data(so, 0); + } else { + so_method(so, tesla, NV50TCL_BLEND_ENABLE(0), 8); + for (i = 0; i < 8; i++) + so_data(so, 1); + so_method(so, tesla, NV50TCL_BLEND_EQUATION_RGB, 5); + so_data (so, nvgl_blend_eqn(cso->rgb_func)); + so_data (so, 0x4000 | nvgl_blend_func(cso->rgb_src_factor)); + so_data (so, 0x4000 | nvgl_blend_func(cso->rgb_dst_factor)); + so_data (so, nvgl_blend_eqn(cso->alpha_func)); + so_data (so, 0x4000 | nvgl_blend_func(cso->alpha_src_factor)); + so_method(so, tesla, NV50TCL_BLEND_FUNC_DST_ALPHA, 1); + so_data (so, 0x4000 | nvgl_blend_func(cso->alpha_dst_factor)); + } + + if (cso->logicop_enable == 0 ) { + so_method(so, tesla, NV50TCL_LOGIC_OP_ENABLE, 1); + so_data (so, 0); + } else { + so_method(so, tesla, NV50TCL_LOGIC_OP_ENABLE, 2); + so_data (so, 1); + so_data (so, nvgl_logicop_func(cso->logicop_func)); + } + + if (cso->colormask & PIPE_MASK_R) + cmask |= (1 << 0); + if (cso->colormask & PIPE_MASK_G) + cmask |= (1 << 4); + if (cso->colormask & PIPE_MASK_B) + cmask |= (1 << 8); + if (cso->colormask & PIPE_MASK_A) + cmask |= (1 << 12); + so_method(so, tesla, NV50TCL_COLOR_MASK(0), 8); + for (i = 0; i < 8; i++) + so_data(so, cmask); + + bso->pipe = *cso; + so_ref(so, &bso->so); + return (void *)bso; +} + +static void +nv50_blend_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->blend = hwcso; + nv50->dirty |= NV50_NEW_BLEND; +} + +static void +nv50_blend_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_blend_stateobj *bso = hwcso; + + so_ref(NULL, &bso->so); + FREE(bso); +} + +static INLINE unsigned +wrap_mode(unsigned wrap) +{ + switch (wrap) { + case PIPE_TEX_WRAP_REPEAT: + return NV50TSC_1_0_WRAPS_REPEAT; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + return NV50TSC_1_0_WRAPS_MIRROR_REPEAT; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + return NV50TSC_1_0_WRAPS_CLAMP_TO_EDGE; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + return NV50TSC_1_0_WRAPS_CLAMP_TO_BORDER; + case PIPE_TEX_WRAP_CLAMP: + return NV50TSC_1_0_WRAPS_CLAMP; + case 
PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + return NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_EDGE; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + return NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_BORDER; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + return NV50TSC_1_0_WRAPS_MIRROR_CLAMP; + default: + NOUVEAU_ERR("unknown wrap mode: %d\n", wrap); + return NV50TSC_1_0_WRAPS_REPEAT; + } +} +static void * +nv50_sampler_state_create(struct pipe_context *pipe, + const struct pipe_sampler_state *cso) +{ + unsigned *tsc = CALLOC(8, sizeof(unsigned)); + + tsc[0] = (0x00024000 | + (wrap_mode(cso->wrap_s) << 0) | + (wrap_mode(cso->wrap_t) << 3) | + (wrap_mode(cso->wrap_r) << 6)); + + switch (cso->mag_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + tsc[1] |= NV50TSC_1_1_MAGF_LINEAR; + break; + case PIPE_TEX_FILTER_NEAREST: + default: + tsc[1] |= NV50TSC_1_1_MAGF_NEAREST; + break; + } + + switch (cso->min_img_filter) { + case PIPE_TEX_FILTER_LINEAR: + tsc[1] |= NV50TSC_1_1_MINF_LINEAR; + break; + case PIPE_TEX_FILTER_NEAREST: + default: + tsc[1] |= NV50TSC_1_1_MINF_NEAREST; + break; + } + + switch (cso->min_mip_filter) { + case PIPE_TEX_MIPFILTER_LINEAR: + tsc[1] |= NV50TSC_1_1_MIPF_LINEAR; + break; + case PIPE_TEX_MIPFILTER_NEAREST: + tsc[1] |= NV50TSC_1_1_MIPF_NEAREST; + break; + case PIPE_TEX_MIPFILTER_NONE: + default: + tsc[1] |= NV50TSC_1_1_MIPF_NONE; + break; + } + + return (void *)tsc; +} + +static void +nv50_sampler_state_bind(struct pipe_context *pipe, unsigned nr, void **sampler) +{ + struct nv50_context *nv50 = nv50_context(pipe); + int i; + + nv50->sampler_nr = nr; + for (i = 0; i < nv50->sampler_nr; i++) + nv50->sampler[i] = sampler[i]; + + nv50->dirty |= NV50_NEW_SAMPLER; +} + +static void +nv50_sampler_state_delete(struct pipe_context *pipe, void *hwcso) +{ + FREE(hwcso); +} + +static void +nv50_set_sampler_texture(struct pipe_context *pipe, unsigned nr, + struct pipe_texture **pt) +{ + struct nv50_context *nv50 = nv50_context(pipe); + int i; + + for (i = 0; i < nr; i++) + pipe_texture_reference((void *)&nv50->miptree[i], pt[i]); + for (i = nr; i < nv50->miptree_nr; i++) + pipe_texture_reference((void *)&nv50->miptree[i], NULL); + + nv50->miptree_nr = nr; + nv50->dirty |= NV50_NEW_TEXTURE; +} + +static void * +nv50_rasterizer_state_create(struct pipe_context *pipe, + const struct pipe_rasterizer_state *cso) +{ + struct nouveau_stateobj *so = so_new(64, 0); + struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla; + struct nv50_rasterizer_stateobj *rso = + CALLOC_STRUCT(nv50_rasterizer_stateobj); + + /*XXX: ignored + * - light_twosize + * - point_smooth + * - multisample + * - point_sprite / sprite_coord_mode + */ + + so_method(so, tesla, NV50TCL_SHADE_MODEL, 1); + so_data (so, cso->flatshade ? NV50TCL_SHADE_MODEL_FLAT : + NV50TCL_SHADE_MODEL_SMOOTH); + + so_method(so, tesla, NV50TCL_LINE_WIDTH, 1); + so_data (so, fui(cso->line_width)); + so_method(so, tesla, NV50TCL_LINE_SMOOTH_ENABLE, 1); + so_data (so, cso->line_smooth ? 
1 : 0); + if (cso->line_stipple_enable) { + so_method(so, tesla, NV50TCL_LINE_STIPPLE_ENABLE, 1); + so_data (so, 1); + so_method(so, tesla, NV50TCL_LINE_STIPPLE_PATTERN, 1); + so_data (so, (cso->line_stipple_pattern << 8) | + cso->line_stipple_factor); + } else { + so_method(so, tesla, NV50TCL_LINE_STIPPLE_ENABLE, 1); + so_data (so, 0); + } + + so_method(so, tesla, NV50TCL_POINT_SIZE, 1); + so_data (so, fui(cso->point_size)); + + so_method(so, tesla, NV50TCL_POLYGON_MODE_FRONT, 3); + if (cso->front_winding == PIPE_WINDING_CCW) { + so_data(so, nvgl_polygon_mode(cso->fill_ccw)); + so_data(so, nvgl_polygon_mode(cso->fill_cw)); + } else { + so_data(so, nvgl_polygon_mode(cso->fill_cw)); + so_data(so, nvgl_polygon_mode(cso->fill_ccw)); + } + so_data(so, cso->poly_smooth ? 1 : 0); + + so_method(so, tesla, NV50TCL_CULL_FACE_ENABLE, 3); + so_data (so, cso->cull_mode != PIPE_WINDING_NONE); + if (cso->front_winding == PIPE_WINDING_CCW) { + so_data(so, NV50TCL_FRONT_FACE_CCW); + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + so_data(so, NV50TCL_CULL_FACE_FRONT); + break; + case PIPE_WINDING_CW: + so_data(so, NV50TCL_CULL_FACE_BACK); + break; + case PIPE_WINDING_BOTH: + so_data(so, NV50TCL_CULL_FACE_FRONT_AND_BACK); + break; + default: + so_data(so, NV50TCL_CULL_FACE_BACK); + break; + } + } else { + so_data(so, NV50TCL_FRONT_FACE_CW); + switch (cso->cull_mode) { + case PIPE_WINDING_CCW: + so_data(so, NV50TCL_CULL_FACE_BACK); + break; + case PIPE_WINDING_CW: + so_data(so, NV50TCL_CULL_FACE_FRONT); + break; + case PIPE_WINDING_BOTH: + so_data(so, NV50TCL_CULL_FACE_FRONT_AND_BACK); + break; + default: + so_data(so, NV50TCL_CULL_FACE_BACK); + break; + } + } + + so_method(so, tesla, NV50TCL_POLYGON_STIPPLE_ENABLE, 1); + so_data (so, cso->poly_stipple_enable ? 1 : 0); + + so_method(so, tesla, NV50TCL_POLYGON_OFFSET_POINT_ENABLE, 3); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_POINT) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_POINT)) + so_data(so, 1); + else + so_data(so, 0); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_LINE) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_LINE)) + so_data(so, 1); + else + so_data(so, 0); + if ((cso->offset_cw && cso->fill_cw == PIPE_POLYGON_MODE_FILL) || + (cso->offset_ccw && cso->fill_ccw == PIPE_POLYGON_MODE_FILL)) + so_data(so, 1); + else + so_data(so, 0); + + if (cso->offset_cw || cso->offset_ccw) { + so_method(so, tesla, NV50TCL_POLYGON_OFFSET_FACTOR, 1); + so_data (so, fui(cso->offset_scale)); + so_method(so, tesla, NV50TCL_POLYGON_OFFSET_UNITS, 1); + so_data (so, fui(cso->offset_units)); + } + + rso->pipe = *cso; + so_ref(so, &rso->so); + return (void *)rso; +} + +static void +nv50_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->rasterizer = hwcso; + nv50->dirty |= NV50_NEW_RASTERIZER; +} + +static void +nv50_rasterizer_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_rasterizer_stateobj *rso = hwcso; + + so_ref(NULL, &rso->so); + FREE(rso); +} + +static void * +nv50_depth_stencil_alpha_state_create(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *cso) +{ + struct nouveau_grobj *tesla = nv50_context(pipe)->screen->tesla; + struct nv50_zsa_stateobj *zsa = CALLOC_STRUCT(nv50_zsa_stateobj); + struct nouveau_stateobj *so = so_new(64, 0); + + so_method(so, tesla, NV50TCL_DEPTH_WRITE_ENABLE, 1); + so_data (so, cso->depth.writemask ? 
1 : 0); + if (cso->depth.enabled) { + so_method(so, tesla, NV50TCL_DEPTH_TEST_ENABLE, 1); + so_data (so, 1); + so_method(so, tesla, NV50TCL_DEPTH_TEST_FUNC, 1); + so_data (so, nvgl_comparison_op(cso->depth.func)); + } else { + so_method(so, tesla, NV50TCL_DEPTH_TEST_ENABLE, 1); + so_data (so, 0); + } + + /*XXX: yes, I know they're backwards.. header needs fixing */ + if (cso->stencil[0].enabled) { + so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 5); + so_data (so, 1); + so_data (so, nvgl_stencil_op(cso->stencil[0].fail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[0].zfail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[0].zpass_op)); + so_data (so, nvgl_comparison_op(cso->stencil[0].func)); + so_method(so, tesla, NV50TCL_STENCIL_BACK_FUNC_REF, 3); + so_data (so, cso->stencil[0].ref_value); + so_data (so, cso->stencil[0].write_mask); + so_data (so, cso->stencil[0].value_mask); + } else { + so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 1); + so_data (so, 0); + } + + if (cso->stencil[1].enabled) { + so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 8); + so_data (so, 1); + so_data (so, nvgl_stencil_op(cso->stencil[1].fail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[1].zfail_op)); + so_data (so, nvgl_stencil_op(cso->stencil[1].zpass_op)); + so_data (so, nvgl_comparison_op(cso->stencil[1].func)); + so_data (so, cso->stencil[1].ref_value); + so_data (so, cso->stencil[1].write_mask); + so_data (so, cso->stencil[1].value_mask); + } else { + so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 1); + so_data (so, 0); + } + + if (cso->alpha.enabled) { + so_method(so, tesla, NV50TCL_ALPHA_TEST_ENABLE, 1); + so_data (so, 1); + so_method(so, tesla, NV50TCL_ALPHA_TEST_REF, 2); + so_data (so, fui(cso->alpha.ref)); + so_data (so, nvgl_comparison_op(cso->alpha.func)); + } else { + so_method(so, tesla, NV50TCL_ALPHA_TEST_ENABLE, 1); + so_data (so, 0); + } + + zsa->pipe = *cso; + so_ref(so, &zsa->so); + return (void *)zsa; +} + +static void +nv50_depth_stencil_alpha_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->zsa = hwcso; + nv50->dirty |= NV50_NEW_ZSA; +} + +static void +nv50_depth_stencil_alpha_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_zsa_stateobj *zsa = hwcso; + + so_ref(NULL, &zsa->so); + FREE(zsa); +} + +static void * +nv50_vp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv50_program *p = CALLOC_STRUCT(nv50_program); + + p->pipe.tokens = tgsi_dup_tokens(cso->tokens); + p->type = PIPE_SHADER_VERTEX; + tgsi_scan_shader(p->pipe.tokens, &p->info); + return (void *)p; +} + +static void +nv50_vp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->vertprog = hwcso; + nv50->dirty |= NV50_NEW_VERTPROG; +} + +static void +nv50_vp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_context *nv50 = nv50_context(pipe); + struct nv50_program *p = hwcso; + + nv50_program_destroy(nv50, p); + FREE((void*)p->pipe.tokens); + FREE(p); +} + +static void * +nv50_fp_state_create(struct pipe_context *pipe, + const struct pipe_shader_state *cso) +{ + struct nv50_program *p = CALLOC_STRUCT(nv50_program); + + p->pipe.tokens = tgsi_dup_tokens(cso->tokens); + p->type = PIPE_SHADER_FRAGMENT; + tgsi_scan_shader(p->pipe.tokens, &p->info); + return (void *)p; +} + +static void +nv50_fp_state_bind(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + 
nv50->fragprog = hwcso; + nv50->dirty |= NV50_NEW_FRAGPROG; +} + +static void +nv50_fp_state_delete(struct pipe_context *pipe, void *hwcso) +{ + struct nv50_context *nv50 = nv50_context(pipe); + struct nv50_program *p = hwcso; + + nv50_program_destroy(nv50, p); + FREE((void*)p->pipe.tokens); + FREE(p); +} + +static void +nv50_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *bcol) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->blend_colour = *bcol; + nv50->dirty |= NV50_NEW_BLEND_COLOUR; +} + +static void +nv50_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ +} + +static void +nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, + const struct pipe_constant_buffer *buf ) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + if (shader == PIPE_SHADER_VERTEX) { + nv50->constbuf[PIPE_SHADER_VERTEX] = buf->buffer; + nv50->dirty |= NV50_NEW_VERTPROG_CB; + } else + if (shader == PIPE_SHADER_FRAGMENT) { + nv50->constbuf[PIPE_SHADER_FRAGMENT] = buf->buffer; + nv50->dirty |= NV50_NEW_FRAGPROG_CB; + } +} + +static void +nv50_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->framebuffer = *fb; + nv50->dirty |= NV50_NEW_FRAMEBUFFER; +} + +static void +nv50_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->stipple = *stipple; + nv50->dirty |= NV50_NEW_STIPPLE; +} + +static void +nv50_set_scissor_state(struct pipe_context *pipe, + const struct pipe_scissor_state *s) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->scissor = *s; + nv50->dirty |= NV50_NEW_SCISSOR; +} + +static void +nv50_set_viewport_state(struct pipe_context *pipe, + const struct pipe_viewport_state *vpt) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50->viewport = *vpt; + nv50->dirty |= NV50_NEW_VIEWPORT; +} + +static void +nv50_set_vertex_buffers(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_buffer *vb) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + memcpy(nv50->vtxbuf, vb, sizeof(*vb) * count); + nv50->vtxbuf_nr = count; + + nv50->dirty |= NV50_NEW_ARRAYS; +} + +static void +nv50_set_vertex_elements(struct pipe_context *pipe, unsigned count, + const struct pipe_vertex_element *ve) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + memcpy(nv50->vtxelt, ve, sizeof(*ve) * count); + nv50->vtxelt_nr = count; + + nv50->dirty |= NV50_NEW_ARRAYS; +} + +void +nv50_init_state_functions(struct nv50_context *nv50) +{ + nv50->pipe.create_blend_state = nv50_blend_state_create; + nv50->pipe.bind_blend_state = nv50_blend_state_bind; + nv50->pipe.delete_blend_state = nv50_blend_state_delete; + + nv50->pipe.create_sampler_state = nv50_sampler_state_create; + nv50->pipe.bind_sampler_states = nv50_sampler_state_bind; + nv50->pipe.delete_sampler_state = nv50_sampler_state_delete; + nv50->pipe.set_sampler_textures = nv50_set_sampler_texture; + + nv50->pipe.create_rasterizer_state = nv50_rasterizer_state_create; + nv50->pipe.bind_rasterizer_state = nv50_rasterizer_state_bind; + nv50->pipe.delete_rasterizer_state = nv50_rasterizer_state_delete; + + nv50->pipe.create_depth_stencil_alpha_state = + nv50_depth_stencil_alpha_state_create; + nv50->pipe.bind_depth_stencil_alpha_state = + nv50_depth_stencil_alpha_state_bind; + nv50->pipe.delete_depth_stencil_alpha_state = + 
nv50_depth_stencil_alpha_state_delete; + + nv50->pipe.create_vs_state = nv50_vp_state_create; + nv50->pipe.bind_vs_state = nv50_vp_state_bind; + nv50->pipe.delete_vs_state = nv50_vp_state_delete; + + nv50->pipe.create_fs_state = nv50_fp_state_create; + nv50->pipe.bind_fs_state = nv50_fp_state_bind; + nv50->pipe.delete_fs_state = nv50_fp_state_delete; + + nv50->pipe.set_blend_color = nv50_set_blend_color; + nv50->pipe.set_clip_state = nv50_set_clip_state; + nv50->pipe.set_constant_buffer = nv50_set_constant_buffer; + nv50->pipe.set_framebuffer_state = nv50_set_framebuffer_state; + nv50->pipe.set_polygon_stipple = nv50_set_polygon_stipple; + nv50->pipe.set_scissor_state = nv50_set_scissor_state; + nv50->pipe.set_viewport_state = nv50_set_viewport_state; + + nv50->pipe.set_vertex_buffers = nv50_set_vertex_buffers; + nv50->pipe.set_vertex_elements = nv50_set_vertex_elements; +} + diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c new file mode 100644 index 0000000000..198e25f448 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_state_validate.c @@ -0,0 +1,309 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "nv50_context.h" +#include "nouveau/nouveau_stateobj.h" + +static void +nv50_state_validate_fb(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so = so_new(128, 18); + struct pipe_framebuffer_state *fb = &nv50->framebuffer; + unsigned i, w, h, gw = 0; + + for (i = 0; i < fb->num_cbufs; i++) { + if (!gw) { + w = fb->cbufs[i]->width; + h = fb->cbufs[i]->height; + gw = 1; + } else { + assert(w == fb->cbufs[i]->width); + assert(h == fb->cbufs[i]->height); + } + + so_method(so, tesla, NV50TCL_RT_HORIZ(i), 2); + so_data (so, fb->cbufs[i]->width); + so_data (so, fb->cbufs[i]->height); + + so_method(so, tesla, NV50TCL_RT_ADDRESS_HIGH(i), 5); + so_reloc (so, fb->cbufs[i]->buffer, fb->cbufs[i]->offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, fb->cbufs[i]->buffer, fb->cbufs[i]->offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW, 0, 0); + switch (fb->cbufs[i]->format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + so_data(so, 0xcf); + break; + case PIPE_FORMAT_R5G6B5_UNORM: + so_data(so, 0xe8); + break; + default: + NOUVEAU_ERR("AIIII unknown format %s\n", + pf_name(fb->cbufs[i]->format)); + so_data(so, 0xe6); + break; + } + so_data(so, 0x00000000); + so_data(so, 0x00000000); + + so_method(so, tesla, 0x1224, 1); + so_data (so, 1); + } + + if (fb->zsbuf) { + if (!gw) { + w = fb->zsbuf->width; + h = fb->zsbuf->height; + gw = 1; + } else { + assert(w == fb->zsbuf->width); + assert(h == fb->zsbuf->height); + } + + so_method(so, tesla, NV50TCL_ZETA_ADDRESS_HIGH, 5); + so_reloc (so, fb->zsbuf->buffer, fb->zsbuf->offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (so, fb->zsbuf->buffer, fb->zsbuf->offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW, 0, 0); + switch (fb->zsbuf->format) { + case PIPE_FORMAT_Z24S8_UNORM: + so_data(so, 0x16); + break; + case PIPE_FORMAT_Z16_UNORM: + so_data(so, 0x15); + break; + default: + NOUVEAU_ERR("AIIII unknown format %s\n", + pf_name(fb->zsbuf->format)); + so_data(so, 0x16); + break; + } + so_data(so, 0x00000000); + so_data(so, 0x00000000); + + so_method(so, tesla, 0x1538, 1); + so_data (so, 1); + so_method(so, tesla, 0x1228, 3); + so_data (so, fb->zsbuf->width); + so_data (so, fb->zsbuf->height); + so_data (so, 0x00010001); + } + + so_method(so, tesla, NV50TCL_VIEWPORT_HORIZ, 2); + so_data (so, w << 16); + so_data (so, h << 16); + so_method(so, tesla, 0x0e04, 2); + so_data (so, w << 16); + so_data (so, h << 16); + so_method(so, tesla, 0xdf8, 2); + so_data (so, 0); + so_data (so, h); + + so_ref(so, &nv50->state.fb); +} + +static void +nv50_state_emit(struct nv50_context *nv50) +{ + struct nv50_screen *screen = nv50->screen; + struct nouveau_winsys *nvws = screen->nvws; + + if (nv50->pctx_id != screen->cur_pctx) { + nv50->state.dirty |= 0xffffffff; + screen->cur_pctx = nv50->pctx_id; + } + + if (nv50->state.dirty & NV50_NEW_FRAMEBUFFER) + so_emit(nvws, nv50->state.fb); + if (nv50->state.dirty & NV50_NEW_BLEND) + so_emit(nvws, nv50->state.blend); + if (nv50->state.dirty & NV50_NEW_ZSA) + so_emit(nvws, nv50->state.zsa); + if (nv50->state.dirty & NV50_NEW_VERTPROG) + so_emit(nvws, nv50->state.vertprog); + if (nv50->state.dirty & NV50_NEW_FRAGPROG) + so_emit(nvws, nv50->state.fragprog); + if (nv50->state.dirty & NV50_NEW_RASTERIZER) + so_emit(nvws, nv50->state.rast); + if (nv50->state.dirty & NV50_NEW_BLEND_COLOUR) + so_emit(nvws, nv50->state.blend_colour); + if (nv50->state.dirty & NV50_NEW_STIPPLE) + so_emit(nvws, nv50->state.stipple); + if (nv50->state.dirty & NV50_NEW_SCISSOR) + 
so_emit(nvws, nv50->state.scissor); + if (nv50->state.dirty & NV50_NEW_VIEWPORT) + so_emit(nvws, nv50->state.viewport); + if (nv50->state.dirty & NV50_NEW_SAMPLER) + so_emit(nvws, nv50->state.tsc_upload); + if (nv50->state.dirty & NV50_NEW_TEXTURE) + so_emit(nvws, nv50->state.tic_upload); + if (nv50->state.dirty & NV50_NEW_ARRAYS) { + so_emit(nvws, nv50->state.vtxfmt); + so_emit(nvws, nv50->state.vtxbuf); + } + nv50->state.dirty = 0; + + so_emit_reloc_markers(nvws, nv50->state.fb); + so_emit_reloc_markers(nvws, nv50->state.vertprog); + so_emit_reloc_markers(nvws, nv50->state.fragprog); + so_emit_reloc_markers(nvws, nv50->state.vtxbuf); + so_emit_reloc_markers(nvws, nv50->screen->static_init); +} + +boolean +nv50_state_validate(struct nv50_context *nv50) +{ + const struct pipe_framebuffer_state *fb = &nv50->framebuffer; + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so; + unsigned i; + + for (i = 0; i < fb->num_cbufs; i++) + fb->cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED; + + if (fb->zsbuf) + fb->zsbuf->status = PIPE_SURFACE_STATUS_DEFINED; + + if (nv50->dirty & NV50_NEW_FRAMEBUFFER) + nv50_state_validate_fb(nv50); + + if (nv50->dirty & NV50_NEW_BLEND) + so_ref(nv50->blend->so, &nv50->state.blend); + + if (nv50->dirty & NV50_NEW_ZSA) + so_ref(nv50->zsa->so, &nv50->state.zsa); + + if (nv50->dirty & (NV50_NEW_VERTPROG | NV50_NEW_VERTPROG_CB)) + nv50_vertprog_validate(nv50); + + if (nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_FRAGPROG_CB)) + nv50_fragprog_validate(nv50); + + if (nv50->dirty & NV50_NEW_RASTERIZER) + so_ref(nv50->rasterizer->so, &nv50->state.rast); + + if (nv50->dirty & NV50_NEW_BLEND_COLOUR) { + so = so_new(5, 0); + so_method(so, tesla, NV50TCL_BLEND_COLOR(0), 4); + so_data (so, fui(nv50->blend_colour.color[0])); + so_data (so, fui(nv50->blend_colour.color[1])); + so_data (so, fui(nv50->blend_colour.color[2])); + so_data (so, fui(nv50->blend_colour.color[3])); + so_ref(so, &nv50->state.blend_colour); + } + + if (nv50->dirty & NV50_NEW_STIPPLE) { + so = so_new(33, 0); + so_method(so, tesla, NV50TCL_POLYGON_STIPPLE_PATTERN(0), 32); + for (i = 0; i < 32; i++) + so_data(so, nv50->stipple.stipple[i]); + so_ref(so, &nv50->state.stipple); + } + + if (nv50->dirty & (NV50_NEW_SCISSOR | NV50_NEW_RASTERIZER)) { + struct pipe_rasterizer_state *rast = &nv50->rasterizer->pipe; + struct pipe_scissor_state *s = &nv50->scissor; + + if (nv50->state.scissor && + (rast->scissor == 0 && nv50->state.scissor_enabled == 0)) + goto scissor_uptodate; + nv50->state.scissor_enabled = rast->scissor; + + so = so_new(3, 0); + so_method(so, tesla, 0x0ff4, 2); + if (nv50->state.scissor_enabled) { + so_data(so, ((s->maxx - s->minx) << 16) | s->minx); + so_data(so, ((s->maxy - s->miny) << 16) | s->miny); + } else { + so_data(so, (8192 << 16)); + so_data(so, (8192 << 16)); + } + so_ref(so, &nv50->state.scissor); + nv50->state.dirty |= NV50_NEW_SCISSOR; + } +scissor_uptodate: + + if (nv50->dirty & NV50_NEW_VIEWPORT) { + unsigned bypass; + + if (!nv50->rasterizer->pipe.bypass_clipping) + bypass = 0; + else + bypass = 1; + + if (nv50->state.viewport && + (bypass || !(nv50->dirty & NV50_NEW_VIEWPORT)) && + nv50->state.viewport_bypass == bypass) + goto viewport_uptodate; + nv50->state.viewport_bypass = bypass; + + so = so_new(12, 0); + if (!bypass) { + so_method(so, tesla, NV50TCL_VIEWPORT_UNK1(0), 3); + so_data (so, fui(nv50->viewport.translate[0])); + so_data (so, fui(nv50->viewport.translate[1])); + so_data (so, fui(nv50->viewport.translate[2])); + so_method(so, tesla, 
NV50TCL_VIEWPORT_UNK0(0), 3); + so_data (so, fui(nv50->viewport.scale[0])); + so_data (so, fui(-nv50->viewport.scale[1])); + so_data (so, fui(nv50->viewport.scale[2])); + so_method(so, tesla, 0x192c, 1); + so_data (so, 1); + so_method(so, tesla, 0x0f90, 1); + so_data (so, 0); + } else { + so_method(so, tesla, 0x192c, 1); + so_data (so, 0); + so_method(so, tesla, 0x0f90, 1); + so_data (so, 1); + } + + so_ref(so, &nv50->state.viewport); + } +viewport_uptodate: + + if (nv50->dirty & NV50_NEW_SAMPLER) { + int i; + + so = so_new(nv50->sampler_nr * 8 + 3, 0); + so_method(so, tesla, 0x0f00, 1); + so_data (so, NV50_CB_TSC); + so_method(so, tesla, 0x40000f04, nv50->sampler_nr * 8); + for (i = 0; i < nv50->sampler_nr; i++) + so_datap (so, nv50->sampler[i], 8); + so_ref(so, &nv50->state.tsc_upload); + } + + if (nv50->dirty & NV50_NEW_TEXTURE) + nv50_tex_validate(nv50); + + if (nv50->dirty & NV50_NEW_ARRAYS) + nv50_vbo_validate(nv50); + + nv50->state.dirty |= nv50->dirty; + nv50->dirty = 0; + nv50_state_emit(nv50); + + return TRUE; +} + diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c new file mode 100644 index 0000000000..5bf97d3a6b --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_surface.c @@ -0,0 +1,123 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "nv50_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_winsys.h" +#include "pipe/p_inlines.h" + +#include "util/u_tile.h" + +static void +nv50_surface_copy(struct pipe_context *pipe, boolean flip, + struct pipe_surface *dest, unsigned destx, unsigned desty, + struct pipe_surface *src, unsigned srcx, unsigned srcy, + unsigned width, unsigned height) +{ + struct nv50_context *nv50 = (struct nv50_context *)pipe; + struct nouveau_winsys *nvws = nv50->screen->nvws; + + if (flip) { + desty += height; + while (height--) { + nvws->surface_copy(nvws, dest, destx, desty--, src, + srcx, srcy++, width, 1); + } + } else { + nvws->surface_copy(nvws, dest, destx, desty, src, srcx, srcy, + width, height); + } +} + +static void +nv50_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest, + unsigned destx, unsigned desty, unsigned width, + unsigned height, unsigned value) +{ + struct nv50_context *nv50 = (struct nv50_context *)pipe; + struct nouveau_winsys *nvws = nv50->screen->nvws; + + nvws->surface_fill(nvws, dest, destx, desty, width, height, value); +} + +static void * +nv50_surface_map(struct pipe_screen *screen, struct pipe_surface *ps, + unsigned flags ) +{ + struct nouveau_winsys *nvws = nv50_screen(screen)->nvws; + struct pipe_winsys *ws = screen->winsys; + struct nv50_surface *s = nv50_surface(ps); + struct nv50_surface m = *s; + void *map; + + if (!s->untiled) { + s->untiled = ws->buffer_create(ws, 0, 0, ps->buffer->size); + + m.base.buffer = s->untiled; + nvws->surface_copy(nvws, &m.base, 0, 0, &s->base, 0, 0, + ps->width, ps->height); + } + + /* Map original tiled surface to disallow it being validated while + * untiled mirror is mapped. + */ + ws->buffer_map(ws, ps->buffer, flags); + + map = ws->buffer_map(ws, s->untiled, flags); + if (!map) + return NULL; + + return map; +} + +static void +nv50_surface_unmap(struct pipe_screen *pscreen, struct pipe_surface *ps) +{ + struct nouveau_winsys *nvws = nv50_screen(pscreen)->nvws; + struct pipe_winsys *ws = pscreen->winsys; + struct nv50_surface *s = nv50_surface(ps); + struct nv50_surface m = *s; + + ws->buffer_unmap(ws, s->untiled); + ws->buffer_unmap(ws, ps->buffer); + + m.base.buffer = s->untiled; + nvws->surface_copy(nvws, &s->base, 0, 0, &m.base, 0, 0, + ps->width, ps->height); + + pipe_buffer_reference(pscreen, &s->untiled, NULL); +} + +void +nv50_init_surface_functions(struct nv50_context *nv50) +{ + nv50->pipe.surface_copy = nv50_surface_copy; + nv50->pipe.surface_fill = nv50_surface_fill; +} + +void +nv50_surface_init_screen_functions(struct pipe_screen *pscreen) +{ + pscreen->surface_map = nv50_surface_map; + pscreen->surface_unmap = nv50_surface_unmap; +} + diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c new file mode 100644 index 0000000000..fde3c97c05 --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_tex.c @@ -0,0 +1,124 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "nv50_context.h" +#include "nv50_texture.h" + +#include "nouveau/nouveau_stateobj.h" + +static int +nv50_tex_construct(struct nouveau_stateobj *so, struct nv50_miptree *mt) +{ + switch (mt->base.format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_8_8_8_8); + break; + case PIPE_FORMAT_A1R5G5B5_UNORM: + so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_1_5_5_5); + break; + case PIPE_FORMAT_A4R4G4B4_UNORM: + so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_4_4_4_4); + break; + case PIPE_FORMAT_R5G6B5_UNORM: + so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_5_6_5); + break; + case PIPE_FORMAT_L8_UNORM: + so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_8); + break; + case PIPE_FORMAT_A8_UNORM: + so_data(so, NV50TIC_0_0_MAPA_C0 | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_ZERO | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_ZERO | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_ZERO | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_8); + break; + case PIPE_FORMAT_I8_UNORM: + so_data(so, NV50TIC_0_0_MAPA_C0 | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_8); + break; + case PIPE_FORMAT_A8L8_UNORM: + so_data(so, NV50TIC_0_0_MAPA_C1 | NV50TIC_0_0_TYPEA_UNORM | + NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM | + NV50TIC_0_0_MAPG_C0 | NV50TIC_0_0_TYPEG_UNORM | + NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM | + NV50TIC_0_0_FMT_8_8); + break; + default: + return 1; + } + + so_reloc(so, mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_LOW, 0, 0); + so_data (so, 0xd0005000); + so_data (so, 0x00300000); + so_data (so, mt->base.width[0]); + so_data (so, (mt->base.depth[0] << 16) | mt->base.height[0]); + so_data (so, 0x03000000); + so_reloc(so, mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_HIGH, 0, 0); + + return 0; +} + +void +nv50_tex_validate(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *so; + int i; + + so = so_new(nv50->miptree_nr * 8 + 3, nv50->miptree_nr * 2); + so_method(so, tesla, 0x0f00, 1); + 
so_data (so, NV50_CB_TIC); + so_method(so, tesla, 0x40000f04, nv50->miptree_nr * 8); + for (i = 0; i < nv50->miptree_nr; i++) { + if (nv50_tex_construct(so, nv50->miptree[i])) { + NOUVEAU_ERR("failed tex validate\n"); + so_ref(NULL, &so); + return; + } + } + + so_ref(so, &nv50->state.tic_upload); +} + diff --git a/src/gallium/drivers/nv50/nv50_texture.h b/src/gallium/drivers/nv50/nv50_texture.h new file mode 100644 index 0000000000..6861d67b4d --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_texture.h @@ -0,0 +1,126 @@ +#ifndef __NV50_TEXTURE_H__ +#define __NV50_TEXTURE_H__ + +/* It'd be really nice to have these in nouveau_class.h generated by + * renouveau like the rest of the object header - but not sure it can + * handle non-object stuff nicely - need to look into it. + */ + +/* Texture image control block */ +#define NV50TIC_0_0_MAPA_MASK 0x38000000 +#define NV50TIC_0_0_MAPA_ZERO 0x00000000 +#define NV50TIC_0_0_MAPA_C0 0x10000000 +#define NV50TIC_0_0_MAPA_C1 0x18000000 +#define NV50TIC_0_0_MAPA_C2 0x20000000 +#define NV50TIC_0_0_MAPA_C3 0x28000000 +#define NV50TIC_0_0_MAPA_ONE 0x38000000 +#define NV50TIC_0_0_MAPR_MASK 0x07000000 +#define NV50TIC_0_0_MAPR_ZERO 0x00000000 +#define NV50TIC_0_0_MAPR_C0 0x02000000 +#define NV50TIC_0_0_MAPR_C1 0x03000000 +#define NV50TIC_0_0_MAPR_C2 0x04000000 +#define NV50TIC_0_0_MAPR_C3 0x05000000 +#define NV50TIC_0_0_MAPR_ONE 0x07000000 +#define NV50TIC_0_0_MAPG_MASK 0x00e00000 +#define NV50TIC_0_0_MAPG_ZERO 0x00000000 +#define NV50TIC_0_0_MAPG_C0 0x00400000 +#define NV50TIC_0_0_MAPG_C1 0x00600000 +#define NV50TIC_0_0_MAPG_C2 0x00800000 +#define NV50TIC_0_0_MAPG_C3 0x00a00000 +#define NV50TIC_0_0_MAPG_ONE 0x00e00000 +#define NV50TIC_0_0_MAPB_MASK 0x001c0000 +#define NV50TIC_0_0_MAPB_ZERO 0x00000000 +#define NV50TIC_0_0_MAPB_C0 0x00080000 +#define NV50TIC_0_0_MAPB_C1 0x000c0000 +#define NV50TIC_0_0_MAPB_C2 0x00100000 +#define NV50TIC_0_0_MAPB_C3 0x00140000 +#define NV50TIC_0_0_MAPB_ONE 0x001c0000 +#define NV50TIC_0_0_TYPEA_MASK 0x00038000 +#define NV50TIC_0_0_TYPEA_UNORM 0x00010000 +#define NV50TIC_0_0_TYPER_MASK 0x00007000 +#define NV50TIC_0_0_TYPER_UNORM 0x00002000 +#define NV50TIC_0_0_TYPEG_MASK 0x00000e00 +#define NV50TIC_0_0_TYPEG_UNORM 0x00000400 +#define NV50TIC_0_0_TYPEB_MASK 0x000001c0 +#define NV50TIC_0_0_TYPEB_UNORM 0x00000080 +#define NV50TIC_0_0_FMT_MASK 0x0000003c +#define NV50TIC_0_0_FMT_8_8_8_8 0x00000008 +#define NV50TIC_0_0_FMT_4_4_4_4 0x00000012 +#define NV50TIC_0_0_FMT_1_5_5_5 0x00000013 +#define NV50TIC_0_0_FMT_5_6_5 0x00000015 +#define NV50TIC_0_0_FMT_8_8 0x00000018 +#define NV50TIC_0_0_FMT_8 0x0000001d + +#define NV50TIC_0_1_OFFSET_LOW_MASK 0xffffffff +#define NV50TIC_0_1_OFFSET_LOW_SHIFT 0 + +#define NV50TIC_0_2_UNKNOWN_MASK 0xffffffff + +#define NV50TIC_0_3_UNKNOWN_MASK 0xffffffff + +#define NV50TIC_0_4_WIDTH_MASK 0x0000ffff +#define NV50TIC_0_4_WIDTH_SHIFT 0 + +#define NV50TIC_0_5_DEPTH_MASK 0xffff0000 +#define NV50TIC_0_5_DEPTH_SHIFT 16 +#define NV50TIC_0_5_HEIGHT_MASK 0x0000ffff +#define NV50TIC_0_5_HEIGHT_SHIFT 0 + +#define NV50TIC_0_6_UNKNOWN_MASK 0xffffffff + +#define NV50TIC_0_7_OFFSET_HIGH_MASK 0xffffffff +#define NV50TIC_0_7_OFFSET_HIGH_SHIFT 0 + +/* Texture sampler control block */ +#define NV50TSC_1_0_WRAPS_MASK 0x00000007 +#define NV50TSC_1_0_WRAPS_REPEAT 0x00000000 +#define NV50TSC_1_0_WRAPS_MIRROR_REPEAT 0x00000001 +#define NV50TSC_1_0_WRAPS_CLAMP_TO_EDGE 0x00000002 +#define NV50TSC_1_0_WRAPS_CLAMP_TO_BORDER 0x00000003 +#define NV50TSC_1_0_WRAPS_CLAMP 0x00000004 +#define NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_EDGE 
0x00000005 +#define NV50TSC_1_0_WRAPS_MIRROR_CLAMP_TO_BORDER 0x00000006 +#define NV50TSC_1_0_WRAPS_MIRROR_CLAMP 0x00000007 +#define NV50TSC_1_0_WRAPT_MASK 0x00000038 +#define NV50TSC_1_0_WRAPT_REPEAT 0x00000000 +#define NV50TSC_1_0_WRAPT_MIRROR_REPEAT 0x00000008 +#define NV50TSC_1_0_WRAPT_CLAMP_TO_EDGE 0x00000010 +#define NV50TSC_1_0_WRAPT_CLAMP_TO_BORDER 0x00000018 +#define NV50TSC_1_0_WRAPT_CLAMP 0x00000020 +#define NV50TSC_1_0_WRAPT_MIRROR_CLAMP_TO_EDGE 0x00000028 +#define NV50TSC_1_0_WRAPT_MIRROR_CLAMP_TO_BORDER 0x00000030 +#define NV50TSC_1_0_WRAPT_MIRROR_CLAMP 0x00000038 +#define NV50TSC_1_0_WRAPR_MASK 0x000001c0 +#define NV50TSC_1_0_WRAPR_REPEAT 0x00000000 +#define NV50TSC_1_0_WRAPR_MIRROR_REPEAT 0x00000040 +#define NV50TSC_1_0_WRAPR_CLAMP_TO_EDGE 0x00000080 +#define NV50TSC_1_0_WRAPR_CLAMP_TO_BORDER 0x000000c0 +#define NV50TSC_1_0_WRAPR_CLAMP 0x00000100 +#define NV50TSC_1_0_WRAPR_MIRROR_CLAMP_TO_EDGE 0x00000140 +#define NV50TSC_1_0_WRAPR_MIRROR_CLAMP_TO_BORDER 0x00000180 +#define NV50TSC_1_0_WRAPR_MIRROR_CLAMP 0x000001c0 + +#define NV50TSC_1_1_MAGF_MASK 0x00000003 +#define NV50TSC_1_1_MAGF_NEAREST 0x00000001 +#define NV50TSC_1_1_MAGF_LINEAR 0x00000002 +#define NV50TSC_1_1_MINF_MASK 0x00000030 +#define NV50TSC_1_1_MINF_NEAREST 0x00000010 +#define NV50TSC_1_1_MINF_LINEAR 0x00000020 +#define NV50TSC_1_1_MIPF_MASK 0x000000c0 +#define NV50TSC_1_1_MIPF_NONE 0x00000040 +#define NV50TSC_1_1_MIPF_NEAREST 0x00000080 +#define NV50TSC_1_1_MIPF_LINEAR 0x000000c0 + +#define NV50TSC_1_2_UNKNOWN_MASK 0xffffffff + +#define NV50TSC_1_3_UNKNOWN_MASK 0xffffffff + +#define NV50TSC_1_4_UNKNOWN_MASK 0xffffffff + +#define NV50TSC_1_5_UNKNOWN_MASK 0xffffffff + +#define NV50TSC_1_6_UNKNOWN_MASK 0xffffffff + +#define NV50TSC_1_7_UNKNOWN_MASK 0xffffffff + +#endif diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c new file mode 100644 index 0000000000..584336682e --- /dev/null +++ b/src/gallium/drivers/nv50/nv50_vbo.c @@ -0,0 +1,237 @@ +/* + * Copyright 2008 Ben Skeggs + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "pipe/p_context.h" +#include "pipe/p_state.h" + +#include "nv50_context.h" + +static INLINE unsigned +nv50_prim(unsigned mode) +{ + switch (mode) { + case PIPE_PRIM_POINTS: return NV50TCL_VERTEX_BEGIN_POINTS; + case PIPE_PRIM_LINES: return NV50TCL_VERTEX_BEGIN_LINES; + case PIPE_PRIM_LINE_LOOP: return NV50TCL_VERTEX_BEGIN_LINE_LOOP; + case PIPE_PRIM_LINE_STRIP: return NV50TCL_VERTEX_BEGIN_LINE_STRIP; + case PIPE_PRIM_TRIANGLES: return NV50TCL_VERTEX_BEGIN_TRIANGLES; + case PIPE_PRIM_TRIANGLE_STRIP: + return NV50TCL_VERTEX_BEGIN_TRIANGLE_STRIP; + case PIPE_PRIM_TRIANGLE_FAN: return NV50TCL_VERTEX_BEGIN_TRIANGLE_FAN; + case PIPE_PRIM_QUADS: return NV50TCL_VERTEX_BEGIN_QUADS; + case PIPE_PRIM_QUAD_STRIP: return NV50TCL_VERTEX_BEGIN_QUAD_STRIP; + case PIPE_PRIM_POLYGON: return NV50TCL_VERTEX_BEGIN_POLYGON; + default: + break; + } + + NOUVEAU_ERR("invalid primitive type %d\n", mode); + return NV50TCL_VERTEX_BEGIN_POINTS; +} + +boolean +nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start, + unsigned count) +{ + struct nv50_context *nv50 = nv50_context(pipe); + + nv50_state_validate(nv50); + + BEGIN_RING(tesla, 0x142c, 1); + OUT_RING (0); + BEGIN_RING(tesla, 0x142c, 1); + OUT_RING (0); + + BEGIN_RING(tesla, NV50TCL_VERTEX_BEGIN, 1); + OUT_RING (nv50_prim(mode)); + BEGIN_RING(tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2); + OUT_RING (start); + OUT_RING (count); + BEGIN_RING(tesla, NV50TCL_VERTEX_END, 1); + OUT_RING (0); + + pipe->flush(pipe, 0, NULL); + return TRUE; +} + +static INLINE void +nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map, + unsigned start, unsigned count) +{ + map += start; + + if (count & 1) { + BEGIN_RING(tesla, 0x15e8, 1); + OUT_RING (map[0]); + map++; + count--; + } + + while (count) { + unsigned nr = count > 2046 ? 2046 : count; + int i; + + BEGIN_RING(tesla, 0x400015f0, nr >> 1); + for (i = 0; i < nr; i += 2) + OUT_RING ((map[1] << 16) | map[0]); + + count -= nr; + map += nr; + } +} + +static INLINE void +nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map, + unsigned start, unsigned count) +{ + map += start; + + if (count & 1) { + BEGIN_RING(tesla, 0x15e8, 1); + OUT_RING (map[0]); + map++; + count--; + } + + while (count) { + unsigned nr = count > 2046 ? 2046 : count; + int i; + + BEGIN_RING(tesla, 0x400015f0, nr >> 1); + for (i = 0; i < nr; i += 2) + OUT_RING ((map[1] << 16) | map[0]); + + count -= nr; + map += nr; + } +} + +static INLINE void +nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint8_t *map, + unsigned start, unsigned count) +{ + map += start; + + while (count) { + unsigned nr = count > 2047 ? 
2047 : count; + + BEGIN_RING(tesla, 0x400015e8, nr); + OUT_RINGp (map, nr); + + count -= nr; + map += nr; + } +} + +boolean +nv50_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, unsigned indexSize, + unsigned mode, unsigned start, unsigned count) +{ + struct nv50_context *nv50 = nv50_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + void *map = ws->buffer_map(ws, indexBuffer, PIPE_BUFFER_USAGE_CPU_READ); + + nv50_state_validate(nv50); + + BEGIN_RING(tesla, 0x142c, 1); + OUT_RING (0); + BEGIN_RING(tesla, 0x142c, 1); + OUT_RING (0); + + BEGIN_RING(tesla, NV50TCL_VERTEX_BEGIN, 1); + OUT_RING (nv50_prim(mode)); + switch (indexSize) { + case 1: + nv50_draw_elements_inline_u08(nv50, map, start, count); + break; + case 2: + nv50_draw_elements_inline_u16(nv50, map, start, count); + break; + case 4: + nv50_draw_elements_inline_u32(nv50, map, start, count); + break; + default: + assert(0); + } + BEGIN_RING(tesla, NV50TCL_VERTEX_END, 1); + OUT_RING (0); + + pipe->flush(pipe, 0, NULL); + return TRUE; +} + +void +nv50_vbo_validate(struct nv50_context *nv50) +{ + struct nouveau_grobj *tesla = nv50->screen->tesla; + struct nouveau_stateobj *vtxbuf, *vtxfmt; + int i, vpi = 0; + + vtxbuf = so_new(nv50->vtxelt_nr * 4, nv50->vtxelt_nr * 2); + vtxfmt = so_new(nv50->vtxelt_nr + 1, 0); + so_method(vtxfmt, tesla, 0x1ac0, nv50->vtxelt_nr); + + for (i = 0; i < nv50->vtxelt_nr; i++) { + struct pipe_vertex_element *ve = &nv50->vtxelt[i]; + struct pipe_vertex_buffer *vb = + &nv50->vtxbuf[ve->vertex_buffer_index]; + + switch (ve->src_format) { + case PIPE_FORMAT_R32G32B32A32_FLOAT: + so_data(vtxfmt, 0x7e080000 | i); + break; + case PIPE_FORMAT_R32G32B32_FLOAT: + so_data(vtxfmt, 0x7e100000 | i); + break; + case PIPE_FORMAT_R32G32_FLOAT: + so_data(vtxfmt, 0x7e200000 | i); + break; + case PIPE_FORMAT_R32_FLOAT: + so_data(vtxfmt, 0x7e900000 | i); + break; + case PIPE_FORMAT_R8G8B8A8_UNORM: + so_data(vtxfmt, 0x24500000 | i); + break; + default: + { + NOUVEAU_ERR("invalid vbo format %s\n", + pf_name(ve->src_format)); + assert(0); + return; + } + } + + so_method(vtxbuf, tesla, 0x900 + (i * 16), 3); + so_data (vtxbuf, 0x20000000 | vb->pitch); + so_reloc (vtxbuf, vb->buffer, vb->buffer_offset + + ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | + NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); + so_reloc (vtxbuf, vb->buffer, vb->buffer_offset + + ve->src_offset, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | + NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); + } + + so_ref (vtxfmt, &nv50->state.vtxfmt); + so_ref (vtxbuf, &nv50->state.vtxbuf); +} + diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile new file mode 100644 index 0000000000..120bdfd9dd --- /dev/null +++ b/src/gallium/drivers/softpipe/Makefile @@ -0,0 +1,47 @@ +TOP = ../../../.. 
+include $(TOP)/configs/current + +LIBNAME = softpipe + +C_SOURCES = \ + sp_fs_exec.c \ + sp_fs_sse.c \ + sp_fs_llvm.c \ + sp_clear.c \ + sp_flush.c \ + sp_query.c \ + sp_context.c \ + sp_draw_arrays.c \ + sp_prim_setup.c \ + sp_prim_vbuf.c \ + sp_quad.c \ + sp_quad_alpha_test.c \ + sp_quad_blend.c \ + sp_quad_colormask.c \ + sp_quad_coverage.c \ + sp_quad_depth_test.c \ + sp_quad_earlyz.c \ + sp_quad_fs.c \ + sp_quad_occlusion.c \ + sp_quad_output.c \ + sp_quad_stencil.c \ + sp_quad_stipple.c \ + sp_screen.c \ + sp_setup.c \ + sp_state_blend.c \ + sp_state_clip.c \ + sp_state_derived.c \ + sp_state_fs.c \ + sp_state_sampler.c \ + sp_state_rasterizer.c \ + sp_state_surface.c \ + sp_state_vertex.c \ + sp_texture.c \ + sp_tex_sample.c \ + sp_tile_cache.c \ + sp_surface.c + +include ../../Makefile.template + +symlinks: + diff --git a/src/gallium/drivers/softpipe/SConscript b/src/gallium/drivers/softpipe/SConscript new file mode 100644 index 0000000000..c1f7daa8ab --- /dev/null +++ b/src/gallium/drivers/softpipe/SConscript @@ -0,0 +1,46 @@ +Import('*') + +env = env.Clone() + +softpipe = env.ConvenienceLibrary( + target = 'softpipe', + source = [ + 'sp_fs_exec.c', + 'sp_fs_sse.c', + 'sp_fs_llvm.c', + 'sp_clear.c', + 'sp_context.c', + 'sp_draw_arrays.c', + 'sp_flush.c', + 'sp_prim_setup.c', + 'sp_prim_vbuf.c', + 'sp_setup.c', + 'sp_quad_alpha_test.c', + 'sp_quad_blend.c', + 'sp_quad.c', + 'sp_quad_colormask.c', + 'sp_quad_coverage.c', + 'sp_quad_depth_test.c', + 'sp_quad_earlyz.c', + 'sp_quad_fs.c', + 'sp_quad_occlusion.c', + 'sp_quad_output.c', + 'sp_quad_stencil.c', + 'sp_quad_stipple.c', + 'sp_query.c', + 'sp_screen.c', + 'sp_state_blend.c', + 'sp_state_clip.c', + 'sp_state_derived.c', + 'sp_state_fs.c', + 'sp_state_rasterizer.c', + 'sp_state_sampler.c', + 'sp_state_surface.c', + 'sp_state_vertex.c', + 'sp_surface.c', + 'sp_tex_sample.c', + 'sp_texture.c', + 'sp_tile_cache.c', + ]) + +Export('softpipe')
\ No newline at end of file diff --git a/src/gallium/drivers/softpipe/sp_clear.c b/src/gallium/drivers/softpipe/sp_clear.c new file mode 100644 index 0000000000..dfa46c9fb7 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_clear.c @@ -0,0 +1,107 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Author: + * Brian Paul + */ + + +#include "pipe/p_defines.h" +#include "util/u_pack_color.h" +#include "sp_clear.h" +#include "sp_context.h" +#include "sp_surface.h" +#include "sp_state.h" +#include "sp_tile_cache.h" + + +/** + * Convert packed pixel from one format to another. + */ +static unsigned +convert_color(enum pipe_format srcFormat, unsigned srcColor, + enum pipe_format dstFormat) +{ + ubyte r, g, b, a; + unsigned dstColor; + + util_unpack_color_ub(srcFormat, &srcColor, &r, &g, &b, &a); + util_pack_color_ub(r, g, b, a, dstFormat, &dstColor); + + return dstColor; +} + + + +/** + * Clear the given surface to the specified value. + * No masking, no scissor (clear entire buffer). + * Note: when clearing a color buffer, the clearValue is always + * encoded as PIPE_FORMAT_A8R8G8B8_UNORM. + */ +void +softpipe_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + uint i; + + if (softpipe->no_rast) + return; + +#if 0 + softpipe_update_derived(softpipe); /* not needed?? 
*/ +#endif + + if (ps == sp_tile_cache_get_surface(softpipe->zsbuf_cache)) { + sp_tile_cache_clear(softpipe->zsbuf_cache, clearValue); + softpipe->framebuffer.zsbuf->status = PIPE_SURFACE_STATUS_CLEAR; +#if TILE_CLEAR_OPTIMIZATION + return; +#endif + } + + for (i = 0; i < softpipe->framebuffer.num_cbufs; i++) { + if (ps == sp_tile_cache_get_surface(softpipe->cbuf_cache[i])) { + unsigned cv; + if (ps->format != PIPE_FORMAT_A8R8G8B8_UNORM) { + cv = convert_color(PIPE_FORMAT_A8R8G8B8_UNORM, clearValue, + ps->format); + } + else { + cv = clearValue; + } + sp_tile_cache_clear(softpipe->cbuf_cache[i], cv); + softpipe->framebuffer.cbufs[i]->status = PIPE_SURFACE_STATUS_CLEAR; + } + } + +#if !TILE_CLEAR_OPTIMIZATION + /* non-cached surface */ + pipe->surface_fill(pipe, ps, 0, 0, ps->width, ps->height, clearValue); +#endif +} diff --git a/src/gallium/drivers/softpipe/sp_clear.h b/src/gallium/drivers/softpipe/sp_clear.h new file mode 100644 index 0000000000..a8ed1c4ecc --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_clear.h @@ -0,0 +1,43 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Author: + * Brian Paul + */ + +#ifndef SP_CLEAR_H +#define SP_CLEAR_H + +#include "pipe/p_state.h" +struct pipe_context; + +extern void +softpipe_clear(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned clearValue); + + +#endif /* SP_CLEAR_H */ diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c new file mode 100644 index 0000000000..99b5274857 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_context.c @@ -0,0 +1,275 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Author: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "sp_clear.h" +#include "sp_context.h" +#include "sp_flush.h" +#include "sp_prim_setup.h" +#include "sp_prim_vbuf.h" +#include "sp_state.h" +#include "sp_surface.h" +#include "sp_tile_cache.h" +#include "sp_texture.h" +#include "sp_winsys.h" +#include "sp_query.h" + + + +/** + * Map any drawing surfaces which aren't already mapped + */ +void +softpipe_map_surfaces(struct softpipe_context *sp) +{ + unsigned i; + + for (i = 0; i < sp->framebuffer.num_cbufs; i++) { + sp_tile_cache_map_surfaces(sp->cbuf_cache[i]); + } + + sp_tile_cache_map_surfaces(sp->zsbuf_cache); +} + + +/** + * Unmap any mapped drawing surfaces + */ +void +softpipe_unmap_surfaces(struct softpipe_context *sp) +{ + uint i; + + for (i = 0; i < sp->framebuffer.num_cbufs; i++) + sp_flush_tile_cache(sp, sp->cbuf_cache[i]); + sp_flush_tile_cache(sp, sp->zsbuf_cache); + + for (i = 0; i < sp->framebuffer.num_cbufs; i++) { + sp_tile_cache_unmap_surfaces(sp->cbuf_cache[i]); + } + sp_tile_cache_unmap_surfaces(sp->zsbuf_cache); +} + + +static void softpipe_destroy( struct pipe_context *pipe ) +{ + struct softpipe_context *softpipe = softpipe_context( pipe ); + struct pipe_winsys *ws = pipe->winsys; + uint i; + + if (softpipe->draw) + draw_destroy( softpipe->draw ); + + for (i = 0; i < SP_NUM_QUAD_THREADS; i++) { + softpipe->quad[i].polygon_stipple->destroy( softpipe->quad[i].polygon_stipple ); + softpipe->quad[i].earlyz->destroy( softpipe->quad[i].earlyz ); + softpipe->quad[i].shade->destroy( softpipe->quad[i].shade ); + softpipe->quad[i].alpha_test->destroy( softpipe->quad[i].alpha_test ); + softpipe->quad[i].depth_test->destroy( softpipe->quad[i].depth_test ); + softpipe->quad[i].stencil_test->destroy( softpipe->quad[i].stencil_test ); + softpipe->quad[i].occlusion->destroy( softpipe->quad[i].occlusion ); + softpipe->quad[i].coverage->destroy( softpipe->quad[i].coverage ); + softpipe->quad[i].blend->destroy( softpipe->quad[i].blend ); + softpipe->quad[i].colormask->destroy( softpipe->quad[i].colormask ); + softpipe->quad[i].output->destroy( softpipe->quad[i].output ); + } + + for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) 
+ sp_destroy_tile_cache(softpipe->cbuf_cache[i]); + sp_destroy_tile_cache(softpipe->zsbuf_cache); + + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) + sp_destroy_tile_cache(softpipe->tex_cache[i]); + + for (i = 0; i < Elements(softpipe->constants); i++) { + if (softpipe->constants[i].buffer) { + winsys_buffer_reference(ws, &softpipe->constants[i].buffer, NULL); + } + } + + FREE( softpipe ); +} + + +struct pipe_context * +softpipe_create( struct pipe_screen *screen, + struct pipe_winsys *pipe_winsys, + void *unused ) +{ + struct softpipe_context *softpipe = CALLOC_STRUCT(softpipe_context); + uint i; + + util_init_math(); + +#ifdef PIPE_ARCH_X86 + softpipe->use_sse = !debug_get_bool_option( "GALLIUM_NOSSE", FALSE ); +#else + softpipe->use_sse = FALSE; +#endif + + softpipe->dump_fs = debug_get_bool_option( "GALLIUM_DUMP_FS", FALSE ); + + softpipe->pipe.winsys = pipe_winsys; + softpipe->pipe.screen = screen; + softpipe->pipe.destroy = softpipe_destroy; + + /* state setters */ + softpipe->pipe.create_blend_state = softpipe_create_blend_state; + softpipe->pipe.bind_blend_state = softpipe_bind_blend_state; + softpipe->pipe.delete_blend_state = softpipe_delete_blend_state; + + softpipe->pipe.create_sampler_state = softpipe_create_sampler_state; + softpipe->pipe.bind_sampler_states = softpipe_bind_sampler_states; + softpipe->pipe.delete_sampler_state = softpipe_delete_sampler_state; + + softpipe->pipe.create_depth_stencil_alpha_state = softpipe_create_depth_stencil_state; + softpipe->pipe.bind_depth_stencil_alpha_state = softpipe_bind_depth_stencil_state; + softpipe->pipe.delete_depth_stencil_alpha_state = softpipe_delete_depth_stencil_state; + + softpipe->pipe.create_rasterizer_state = softpipe_create_rasterizer_state; + softpipe->pipe.bind_rasterizer_state = softpipe_bind_rasterizer_state; + softpipe->pipe.delete_rasterizer_state = softpipe_delete_rasterizer_state; + + softpipe->pipe.create_fs_state = softpipe_create_fs_state; + softpipe->pipe.bind_fs_state = softpipe_bind_fs_state; + softpipe->pipe.delete_fs_state = softpipe_delete_fs_state; + + softpipe->pipe.create_vs_state = softpipe_create_vs_state; + softpipe->pipe.bind_vs_state = softpipe_bind_vs_state; + softpipe->pipe.delete_vs_state = softpipe_delete_vs_state; + + softpipe->pipe.set_blend_color = softpipe_set_blend_color; + softpipe->pipe.set_clip_state = softpipe_set_clip_state; + softpipe->pipe.set_constant_buffer = softpipe_set_constant_buffer; + softpipe->pipe.set_framebuffer_state = softpipe_set_framebuffer_state; + softpipe->pipe.set_polygon_stipple = softpipe_set_polygon_stipple; + softpipe->pipe.set_scissor_state = softpipe_set_scissor_state; + softpipe->pipe.set_sampler_textures = softpipe_set_sampler_textures; + softpipe->pipe.set_viewport_state = softpipe_set_viewport_state; + + softpipe->pipe.set_vertex_buffers = softpipe_set_vertex_buffers; + softpipe->pipe.set_vertex_elements = softpipe_set_vertex_elements; + + softpipe->pipe.draw_arrays = softpipe_draw_arrays; + softpipe->pipe.draw_elements = softpipe_draw_elements; + softpipe->pipe.draw_range_elements = softpipe_draw_range_elements; + softpipe->pipe.set_edgeflags = softpipe_set_edgeflags; + + + softpipe->pipe.clear = softpipe_clear; + softpipe->pipe.flush = softpipe_flush; + + softpipe_init_query_funcs( softpipe ); + softpipe_init_texture_funcs( softpipe ); + + /* + * Alloc caches for accessing drawing surfaces and textures. + * Must be before quad stage setup! 
+ */ + for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) + softpipe->cbuf_cache[i] = sp_create_tile_cache( screen ); + softpipe->zsbuf_cache = sp_create_tile_cache( screen ); + + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) + softpipe->tex_cache[i] = sp_create_tile_cache( screen ); + + + /* setup quad rendering stages */ + for (i = 0; i < SP_NUM_QUAD_THREADS; i++) { + softpipe->quad[i].polygon_stipple = sp_quad_polygon_stipple_stage(softpipe); + softpipe->quad[i].earlyz = sp_quad_earlyz_stage(softpipe); + softpipe->quad[i].shade = sp_quad_shade_stage(softpipe); + softpipe->quad[i].alpha_test = sp_quad_alpha_test_stage(softpipe); + softpipe->quad[i].depth_test = sp_quad_depth_test_stage(softpipe); + softpipe->quad[i].stencil_test = sp_quad_stencil_test_stage(softpipe); + softpipe->quad[i].occlusion = sp_quad_occlusion_stage(softpipe); + softpipe->quad[i].coverage = sp_quad_coverage_stage(softpipe); + softpipe->quad[i].blend = sp_quad_blend_stage(softpipe); + softpipe->quad[i].colormask = sp_quad_colormask_stage(softpipe); + softpipe->quad[i].output = sp_quad_output_stage(softpipe); + } + + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { + softpipe->tgsi.samplers[i].base.get_samples = sp_get_samples; + softpipe->tgsi.samplers[i].unit = i; + softpipe->tgsi.samplers[i].sp = softpipe; + softpipe->tgsi.samplers[i].cache = softpipe->tex_cache[i]; + softpipe->tgsi.samplers_list[i] = &softpipe->tgsi.samplers[i]; + } + + /* + * Create drawing context and plug our rendering stage into it. + */ + softpipe->draw = draw_create(); + if (!softpipe->draw) + goto fail; + + draw_texture_samplers(softpipe->draw, + PIPE_MAX_SAMPLERS, softpipe->tgsi.samplers_list); + + softpipe->setup = sp_draw_render_stage(softpipe); + if (!softpipe->setup) + goto fail; + + if (debug_get_bool_option( "SP_NO_RAST", FALSE )) + softpipe->no_rast = TRUE; + + if (debug_get_bool_option( "SP_NO_VBUF", FALSE )) { + /* Deprecated path -- vbuf is the intended interface to the draw module: + */ + draw_set_rasterize_stage(softpipe->draw, softpipe->setup); + } + else { + sp_init_vbuf(softpipe); + } + + /* plug in AA line/point stages */ + draw_install_aaline_stage(softpipe->draw, &softpipe->pipe); + draw_install_aapoint_stage(softpipe->draw, &softpipe->pipe); + +#if USE_DRAW_STAGE_PSTIPPLE + /* Do polygon stipple w/ texture map + frag prog? */ + draw_install_pstipple_stage(softpipe->draw, &softpipe->pipe); +#endif + + sp_init_surface_functions(softpipe); + + return &softpipe->pipe; + + fail: + softpipe_destroy(&softpipe->pipe); + return NULL; +} + diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h new file mode 100644 index 0000000000..790143aecc --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_context.h @@ -0,0 +1,172 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef SP_CONTEXT_H +#define SP_CONTEXT_H + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" + +#include "draw/draw_vertex.h" + +#include "sp_quad.h" +#include "sp_tex_sample.h" + + +/** + * This is a temporary variable for testing draw-stage polygon stipple. + * If zero, do stipple in sp_quad_stipple.c + */ +#define USE_DRAW_STAGE_PSTIPPLE 1 + +/* Number of threads working on individual quads. + * Setting to 1 disables this feature. + */ +#define SP_NUM_QUAD_THREADS 1 + +struct softpipe_winsys; +struct softpipe_vbuf_render; +struct draw_context; +struct draw_stage; +struct softpipe_tile_cache; +struct sp_fragment_shader; +struct sp_vertex_shader; + + +struct softpipe_context { + struct pipe_context pipe; /**< base class */ + + /* The most recent drawing state as set by the driver: + */ + const struct pipe_blend_state *blend; + const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS]; + const struct pipe_depth_stencil_alpha_state *depth_stencil; + const struct pipe_rasterizer_state *rasterizer; + const struct sp_fragment_shader *fs; + const struct sp_vertex_shader *vs; + + struct pipe_blend_color blend_color; + struct pipe_clip_state clip; + struct pipe_constant_buffer constants[PIPE_SHADER_TYPES]; + struct pipe_framebuffer_state framebuffer; + struct pipe_poly_stipple poly_stipple; + struct pipe_scissor_state scissor; + struct pipe_texture *texture[PIPE_MAX_SAMPLERS]; + struct pipe_viewport_state viewport; + struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; + struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS]; + unsigned dirty; + + unsigned num_samplers; + unsigned num_textures; + unsigned num_vertex_elements; + unsigned num_vertex_buffers; + + boolean no_rast; + + /* Counter for occlusion queries. Note this supports overlapping + * queries. 
+ */ + uint64 occlusion_count; + + /* + * Mapped vertex buffers + */ + ubyte *mapped_vbuffer[PIPE_MAX_ATTRIBS]; + + /** Mapped constant buffers */ + void *mapped_constants[PIPE_SHADER_TYPES]; + + /** Vertex format */ + struct vertex_info vertex_info; + struct vertex_info vertex_info_vbuf; + + int psize_slot; + + unsigned reduced_api_prim; /**< PIPE_PRIM_POINTS, _LINES or _TRIANGLES */ + +#if 0 + /* Stipple derived state: + */ + ubyte stipple_masks[16][16]; +#endif + + /** Derived from scissor and surface bounds: */ + struct pipe_scissor_state cliprect; + + unsigned line_stipple_counter; + + /** Software quad rendering pipeline */ + struct { + struct quad_stage *polygon_stipple; + struct quad_stage *earlyz; + struct quad_stage *shade; + struct quad_stage *alpha_test; + struct quad_stage *stencil_test; + struct quad_stage *depth_test; + struct quad_stage *occlusion; + struct quad_stage *coverage; + struct quad_stage *blend; + struct quad_stage *colormask; + struct quad_stage *output; + + struct quad_stage *first; /**< points to one of the above stages */ + } quad[SP_NUM_QUAD_THREADS]; + + /** TGSI exec things */ + struct { + struct sp_shader_sampler samplers[PIPE_MAX_SAMPLERS]; + struct sp_shader_sampler *samplers_list[PIPE_MAX_SAMPLERS]; + } tgsi; + + /** The primitive drawing context */ + struct draw_context *draw; + struct draw_stage *setup; + struct draw_stage *vbuf; + struct softpipe_vbuf_render *vbuf_render; + + struct softpipe_tile_cache *cbuf_cache[PIPE_MAX_COLOR_BUFS]; + struct softpipe_tile_cache *zsbuf_cache; + + struct softpipe_tile_cache *tex_cache[PIPE_MAX_SAMPLERS]; + + int use_sse : 1; + int dump_fs : 1; +}; + + +static INLINE struct softpipe_context * +softpipe_context( struct pipe_context *pipe ) +{ + return (struct softpipe_context *)pipe; +} + +#endif /* SP_CONTEXT_H */ + diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c new file mode 100644 index 0000000000..424bd56846 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c @@ -0,0 +1,202 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Author: + * Brian Paul + * Keith Whitwell + */ + + +#include "pipe/p_defines.h" +#include "pipe/p_context.h" +#include "pipe/p_winsys.h" +#include "pipe/p_inlines.h" + +#include "sp_context.h" +#include "sp_state.h" + +#include "draw/draw_context.h" + + + +static void +softpipe_map_constant_buffers(struct softpipe_context *sp) +{ + struct pipe_winsys *ws = sp->pipe.winsys; + uint i; + for (i = 0; i < PIPE_SHADER_TYPES; i++) { + if (sp->constants[i].size) + sp->mapped_constants[i] = ws->buffer_map(ws, sp->constants[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + } + + draw_set_mapped_constant_buffer(sp->draw, + sp->mapped_constants[PIPE_SHADER_VERTEX], + sp->constants[PIPE_SHADER_VERTEX].size); +} + +static void +softpipe_unmap_constant_buffers(struct softpipe_context *sp) +{ + struct pipe_winsys *ws = sp->pipe.winsys; + uint i; + + /* really need to flush all prims since the vert/frag shaders const buffers + * are going away now. + */ + draw_flush(sp->draw); + + draw_set_mapped_constant_buffer(sp->draw, NULL, 0); + + for (i = 0; i < 2; i++) { + if (sp->constants[i].size) + ws->buffer_unmap(ws, sp->constants[i].buffer); + sp->mapped_constants[i] = NULL; + } +} + + +static unsigned reduced_prim[PIPE_PRIM_POLYGON + 1] = { + PIPE_PRIM_POINTS, + PIPE_PRIM_LINES, + PIPE_PRIM_LINES, + PIPE_PRIM_LINES, + PIPE_PRIM_TRIANGLES, + PIPE_PRIM_TRIANGLES, + PIPE_PRIM_TRIANGLES, + PIPE_PRIM_TRIANGLES, + PIPE_PRIM_TRIANGLES, + PIPE_PRIM_TRIANGLES +}; + + +boolean +softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode, + unsigned start, unsigned count) +{ + return softpipe_draw_elements(pipe, NULL, 0, mode, start, count); +} + + + +/** + * Draw vertex arrays, with optional indexing. + * Basically, map the vertex buffers (and drawing surfaces), then hand off + * the drawing to the 'draw' module. + * + * XXX should the element buffer be specified/bound with a separate function? + */ + +boolean +softpipe_draw_range_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned min_index, + unsigned max_index, + unsigned mode, unsigned start, unsigned count) +{ + struct softpipe_context *sp = softpipe_context(pipe); + struct draw_context *draw = sp->draw; + unsigned i; + + sp->reduced_api_prim = reduced_prim[mode]; + + if (sp->dirty) + softpipe_update_derived( sp ); + + softpipe_map_surfaces(sp); + softpipe_map_constant_buffers(sp); + + /* + * Map vertex buffers + */ + for (i = 0; i < sp->num_vertex_buffers; i++) { + void *buf + = pipe_buffer_map(pipe->screen, + sp->vertex_buffer[i].buffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_vertex_buffer(draw, i, buf); + } + /* Map index buffer, if present */ + if (indexBuffer) { + void *mapped_indexes + = pipe_buffer_map(pipe->screen, indexBuffer, + PIPE_BUFFER_USAGE_CPU_READ); + draw_set_mapped_element_buffer_range(draw, indexSize, + min_index, + max_index, + mapped_indexes); + } + else { + /* no index/element buffer */ + draw_set_mapped_element_buffer_range(draw, 0, start, start + count - 1, NULL); + } + + + /* draw! 
*/ + draw_arrays(draw, mode, start, count); + + /* + * unmap vertex/index buffers - will cause draw module to flush + */ + for (i = 0; i < sp->num_vertex_buffers; i++) { + draw_set_mapped_vertex_buffer(draw, i, NULL); + pipe_buffer_unmap(pipe->screen, sp->vertex_buffer[i].buffer); + } + if (indexBuffer) { + draw_set_mapped_element_buffer(draw, 0, NULL); + pipe_buffer_unmap(pipe->screen, indexBuffer); + } + + + /* Note: leave drawing surfaces mapped */ + softpipe_unmap_constant_buffers(sp); + + return TRUE; +} + +boolean +softpipe_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned mode, unsigned start, unsigned count) +{ + return softpipe_draw_range_elements( pipe, indexBuffer, + indexSize, + 0, 0xffffffff, + mode, start, count ); +} + + + +void +softpipe_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags) +{ + struct softpipe_context *sp = softpipe_context(pipe); + draw_set_edgeflags(sp->draw, edgeflags); +} + diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c new file mode 100644 index 0000000000..401764bb43 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_flush.c @@ -0,0 +1,92 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Author: + * Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "pipe/p_defines.h" +#include "draw/draw_context.h" +#include "sp_flush.h" +#include "sp_context.h" +#include "sp_surface.h" +#include "sp_state.h" +#include "sp_tile_cache.h" +#include "sp_winsys.h" + + +void +softpipe_flush( struct pipe_context *pipe, + unsigned flags, + struct pipe_fence_handle **fence ) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + uint i; + + draw_flush(softpipe->draw); + + if (flags & PIPE_FLUSH_TEXTURE_CACHE) { + for (i = 0; i < softpipe->num_textures; i++) { + sp_flush_tile_cache(softpipe, softpipe->tex_cache[i]); + } + } + + if (flags & PIPE_FLUSH_RENDER_CACHE) { + for (i = 0; i < softpipe->framebuffer.num_cbufs; i++) + if (softpipe->cbuf_cache[i]) + sp_flush_tile_cache(softpipe, softpipe->cbuf_cache[i]); + + if (softpipe->zsbuf_cache) + sp_flush_tile_cache(softpipe, softpipe->zsbuf_cache); + + /* Need this call for hardware buffers before swapbuffers. + * + * there should probably be another/different flush-type function + * that's called before swapbuffers because we don't always want + * to unmap surfaces when flushing. + */ + softpipe_unmap_surfaces(softpipe); + } + + /* Enable to dump BMPs of the color/depth buffers each frame */ +#if 0 + if(flags & PIPE_FLUSH_FRAME) { + static unsigned frame_no = 1; + static char filename[256]; + util_snprintf(filename, sizeof(filename), "cbuf_%u.bmp", frame_no); + debug_dump_surface_bmp(filename, softpipe->framebuffer.cbufs[0]); + util_snprintf(filename, sizeof(filename), "zsbuf_%u.bmp", frame_no); + debug_dump_surface_bmp(filename, softpipe->framebuffer.zsbuf); + ++frame_no; + } +#endif + + if (fence) + *fence = NULL; +} + diff --git a/src/gallium/drivers/softpipe/sp_flush.h b/src/gallium/drivers/softpipe/sp_flush.h new file mode 100644 index 0000000000..68d9b5fa83 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_flush.h @@ -0,0 +1,37 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef SP_FLUSH_H +#define SP_FLUSH_H + +struct pipe_context; +struct pipe_fence_handle; + +void softpipe_flush(struct pipe_context *pipe, unsigned flags, + struct pipe_fence_handle **fence); + +#endif diff --git a/src/gallium/drivers/softpipe/sp_fs.h b/src/gallium/drivers/softpipe/sp_fs.h new file mode 100644 index 0000000000..4792ace3a3 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_fs.h @@ -0,0 +1,54 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef SP_FS_H +#define SP_FS_H + +struct sp_fragment_shader * +softpipe_create_fs_exec(struct softpipe_context *softpipe, + const struct pipe_shader_state *templ); + +struct sp_fragment_shader * +softpipe_create_fs_sse(struct softpipe_context *softpipe, + const struct pipe_shader_state *templ); + +struct sp_fragment_shader * +softpipe_create_fs_llvm(struct softpipe_context *softpipe, + const struct pipe_shader_state *templ); + +struct tgsi_interp_coef; +struct tgsi_exec_vector; + +void sp_setup_pos_vector(const struct tgsi_interp_coef *coef, + float x, float y, + struct tgsi_exec_vector *quadpos); + + +#endif diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c new file mode 100644 index 0000000000..453b0373f0 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_fs_exec.c @@ -0,0 +1,164 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_fs.h" +#include "sp_headers.h" + + +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_parse.h" + +struct sp_exec_fragment_shader +{ + struct sp_fragment_shader base; +}; + + +/** cast wrapper */ +static INLINE struct sp_exec_fragment_shader * +sp_exec_fragment_shader(const struct sp_fragment_shader *base) +{ + return (struct sp_exec_fragment_shader *) base; +} + + +/** + * Compute quad X,Y,Z,W for the four fragments in a quad. + * + * This should really be part of the compiled shader. + */ +void +sp_setup_pos_vector(const struct tgsi_interp_coef *coef, + float x, float y, + struct tgsi_exec_vector *quadpos) +{ + uint chan; + /* do X */ + quadpos->xyzw[0].f[0] = x; + quadpos->xyzw[0].f[1] = x + 1; + quadpos->xyzw[0].f[2] = x; + quadpos->xyzw[0].f[3] = x + 1; + + /* do Y */ + quadpos->xyzw[1].f[0] = y; + quadpos->xyzw[1].f[1] = y; + quadpos->xyzw[1].f[2] = y + 1; + quadpos->xyzw[1].f[3] = y + 1; + + /* do Z and W for all fragments in the quad */ + for (chan = 2; chan < 4; chan++) { + const float dadx = coef->dadx[chan]; + const float dady = coef->dady[chan]; + const float a0 = coef->a0[chan] + dadx * x + dady * y; + quadpos->xyzw[chan].f[0] = a0; + quadpos->xyzw[chan].f[1] = a0 + dadx; + quadpos->xyzw[chan].f[2] = a0 + dady; + quadpos->xyzw[chan].f[3] = a0 + dadx + dady; + } +} + + +static void +exec_prepare( const struct sp_fragment_shader *base, + struct tgsi_exec_machine *machine, + struct tgsi_sampler **samplers ) +{ + /* + * Bind tokens/shader to the interpreter's machine state. + * Avoid redundant binding. 
+ */ + if (machine->Tokens != base->shader.tokens) { + tgsi_exec_machine_bind_shader( machine, + base->shader.tokens, + PIPE_MAX_SAMPLERS, + samplers ); + } +} + + + + +/* TODO: hide the machine struct in here somewhere, remove from this + * interface: + */ +static unsigned +exec_run( const struct sp_fragment_shader *base, + struct tgsi_exec_machine *machine, + struct quad_header *quad ) +{ + + /* Compute X, Y, Z, W vals for this quad */ + sp_setup_pos_vector(quad->posCoef, + (float)quad->input.x0, (float)quad->input.y0, + &machine->QuadPos); + + return tgsi_exec_machine_run( machine ); +} + + + +static void +exec_delete( struct sp_fragment_shader *base ) +{ + FREE((void *) base->shader.tokens); + FREE(base); +} + + + + + +struct sp_fragment_shader * +softpipe_create_fs_exec(struct softpipe_context *softpipe, + const struct pipe_shader_state *templ) +{ + struct sp_exec_fragment_shader *shader; + + /* Decide whether we'll be codegenerating this shader and if so do + * that now. + */ + + shader = CALLOC_STRUCT(sp_exec_fragment_shader); + if (!shader) + return NULL; + + /* we need to keep a local copy of the tokens */ + shader->base.shader.tokens = tgsi_dup_tokens(templ->tokens); + shader->base.prepare = exec_prepare; + shader->base.run = exec_run; + shader->base.delete = exec_delete; + + return &shader->base; +} + diff --git a/src/gallium/drivers/softpipe/sp_fs_llvm.c b/src/gallium/drivers/softpipe/sp_fs_llvm.c new file mode 100644 index 0000000000..34adac5226 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_fs_llvm.c @@ -0,0 +1,200 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *
+ **************************************************************************/
+
+/* Authors:
+ *   Zack Rusin
+ */
+
+#include "sp_context.h"
+#include "sp_state.h"
+#include "sp_fs.h"
+
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "pipe/p_inlines.h"
+#include "tgsi/tgsi_sse2.h"
+
+#if 0
+
+struct sp_llvm_fragment_shader {
+   struct sp_fragment_shader base;
+   struct gallivm_prog *llvm_prog;
+};
+
+static void
+shade_quad_llvm(struct quad_stage *qs,
+                struct quad_header *quad)
+{
+   struct quad_shade_stage *qss = quad_shade_stage(qs);
+   struct softpipe_context *softpipe = qs->softpipe;
+   float dests[4][16][4] ALIGN16_ATTRIB;
+   float inputs[4][16][4] ALIGN16_ATTRIB;
+   const float fx = (float) quad->x0;
+   const float fy = (float) quad->y0;
+   struct gallivm_prog *llvm = qss->llvm_prog;
+
+   inputs[0][0][0] = fx;
+   inputs[1][0][0] = fx + 1.0f;
+   inputs[2][0][0] = fx;
+   inputs[3][0][0] = fx + 1.0f;
+
+   inputs[0][0][1] = fy;
+   inputs[1][0][1] = fy;
+   inputs[2][0][1] = fy + 1.0f;
+   inputs[3][0][1] = fy + 1.0f;
+
+
+   gallivm_prog_inputs_interpolate(llvm, inputs, quad->coef);
+
+#if DLLVM
+   debug_printf("MASK = %d\n", quad->mask);
+   for (int i = 0; i < 4; ++i) {
+      for (int j = 0; j < 2; ++j) {
+         debug_printf("IN(%d,%d) [%f %f %f %f]\n", i, j,
+                      inputs[i][j][0], inputs[i][j][1], inputs[i][j][2], inputs[i][j][3]);
+      }
+   }
+#endif
+
+   quad->mask &=
+      gallivm_fragment_shader_exec(llvm, fx, fy, dests, inputs,
+                                   softpipe->mapped_constants[PIPE_SHADER_FRAGMENT],
+                                   qss->samplers);
+#if DLLVM
+   debug_printf("OUT LLVM = 1[%f %f %f %f], 2[%f %f %f %f]\n",
+                dests[0][0][0], dests[0][0][1], dests[0][0][2], dests[0][0][3],
+                dests[0][1][0], dests[0][1][1], dests[0][1][2], dests[0][1][3]);
+#endif
+
+   /* store result color */
+   if (qss->colorOutSlot >= 0) {
+      unsigned i;
+      /* XXX need to handle multiple color outputs someday */
+      assert(qss->stage.softpipe->fs->info.output_semantic_name[qss->colorOutSlot]
+             == TGSI_SEMANTIC_COLOR);
+      for (i = 0; i < QUAD_SIZE; ++i) {
+         quad->outputs.color[0][0][i] = dests[i][qss->colorOutSlot][0];
+         quad->outputs.color[0][1][i] = dests[i][qss->colorOutSlot][1];
+         quad->outputs.color[0][2][i] = dests[i][qss->colorOutSlot][2];
+         quad->outputs.color[0][3][i] = dests[i][qss->colorOutSlot][3];
+      }
+   }
+#if DLLVM
+   for (int i = 0; i < QUAD_SIZE; ++i) {
+      debug_printf("QLLVM%d(%d) [%f, %f, %f, %f]\n", i, qss->colorOutSlot,
+                   quad->outputs.color[0][0][i],
+                   quad->outputs.color[0][1][i],
+                   quad->outputs.color[0][2][i],
+                   quad->outputs.color[0][3][i]);
+   }
+#endif
+
+   /* store result Z */
+   if (qss->depthOutSlot >= 0) {
+      /* output[slot] is new Z */
+      uint i;
+      for (i = 0; i < 4; i++) {
+         quad->outputs.depth[i] = dests[i][0][2];
+      }
+   }
+   else {
+      /* copy input Z (which was interpolated by the executor) to output Z */
+      uint i;
+      for (i = 0; i < 4; i++) {
+         quad->outputs.depth[i] = inputs[i][0][2];
+      }
+   }
+#if DLLVM
+   debug_printf("D [%f, %f, %f, %f] mask = %d\n",
+                quad->outputs.depth[0],
+                quad->outputs.depth[1],
+                quad->outputs.depth[2],
+                quad->outputs.depth[3], quad->mask);
+#endif
+
+   /* shader may cull fragments */
+   if( quad->mask ) {
+      qs->next->run( qs->next, quad );
+   }
+}
+
+
+unsigned
+run_llvm_fs( const struct sp_fragment_shader *base,
+             struct foo *machine )
+{
+}
+
+
+void
+delete_llvm_fs( struct sp_fragment_shader *base )
+{
+   FREE(base);
+}
+
+
+struct sp_fragment_shader *
+softpipe_create_fs_llvm(struct softpipe_context *softpipe,
+                        const struct pipe_shader_state *templ)
+{
+   struct sp_llvm_fragment_shader *shader =
NULL; + + /* LLVM fragment shaders currently disabled: + */ + state = CALLOC_STRUCT(sp_llvm_shader_state); + if (!state) + return NULL; + + state->llvm_prog = 0; + + if (!gallivm_global_cpu_engine()) { + gallivm_cpu_engine_create(state->llvm_prog); + } + else + gallivm_cpu_jit_compile(gallivm_global_cpu_engine(), state->llvm_prog); + + if (shader) { + shader->base.run = run_llvm_fs; + shader->base.delete = delete_llvm_fs; + } + + return shader; +} + + +#else + +struct sp_fragment_shader * +softpipe_create_fs_llvm(struct softpipe_context *softpipe, + const struct pipe_shader_state *templ) +{ + return NULL; +} + +#endif diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c new file mode 100644 index 0000000000..9a273c8764 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_fs_sse.c @@ -0,0 +1,169 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_fs.h" +#include "sp_headers.h" + + +#include "pipe/p_state.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "tgsi/tgsi_exec.h" +#include "tgsi/tgsi_sse2.h" + + +#if defined(PIPE_ARCH_X86) + +#include "rtasm/rtasm_x86sse.h" + +/* Surely this should be defined somewhere in a tgsi header: + */ +typedef void (PIPE_CDECL *codegen_function)( + const struct tgsi_exec_vector *input, + struct tgsi_exec_vector *output, + const float (*constant)[4], + struct tgsi_exec_vector *temporary, + const struct tgsi_interp_coef *coef, + float (*immediates)[4] + //, const struct tgsi_exec_vector *quadPos + ); + + +struct sp_sse_fragment_shader { + struct sp_fragment_shader base; + struct x86_function sse2_program; + codegen_function func; + float immediates[TGSI_EXEC_NUM_IMMEDIATES][4]; +}; + + + +static void +fs_sse_prepare( const struct sp_fragment_shader *base, + struct tgsi_exec_machine *machine, + struct tgsi_sampler **samplers ) +{ +} + + +/* TODO: codegenerate the whole run function, skip this wrapper. 
+ * TODO: break dependency on tgsi_exec_machine struct + * TODO: push Position calculation into the generated shader + * TODO: process >1 quad at a time + */ +static unsigned +fs_sse_run( const struct sp_fragment_shader *base, + struct tgsi_exec_machine *machine, + struct quad_header *quad ) +{ + struct sp_sse_fragment_shader *shader = (struct sp_sse_fragment_shader *) base; + + /* Compute X, Y, Z, W vals for this quad -- place in temp[0] for now */ + sp_setup_pos_vector(quad->posCoef, + (float)quad->input.x0, (float)quad->input.y0, + machine->Temps); + + /* init kill mask */ + tgsi_set_kill_mask(machine, 0x0); + tgsi_set_exec_mask(machine, 1, 1, 1, 1); + + shader->func( machine->Inputs, + machine->Outputs, + machine->Consts, + machine->Temps, + machine->InterpCoefs, + shader->immediates + // , &machine->QuadPos + ); + + return ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]); +} + + +static void +fs_sse_delete( struct sp_fragment_shader *base ) +{ + struct sp_sse_fragment_shader *shader = (struct sp_sse_fragment_shader *) base; + + x86_release_func( &shader->sse2_program ); + FREE(shader); +} + + +struct sp_fragment_shader * +softpipe_create_fs_sse(struct softpipe_context *softpipe, + const struct pipe_shader_state *templ) +{ + struct sp_sse_fragment_shader *shader; + + if (!softpipe->use_sse) + return NULL; + + shader = CALLOC_STRUCT(sp_sse_fragment_shader); + if (!shader) + return NULL; + + x86_init_func( &shader->sse2_program ); + + if (!tgsi_emit_sse2( templ->tokens, &shader->sse2_program, + shader->immediates, FALSE )) { + FREE(shader); + return NULL; + } + + shader->func = (codegen_function) x86_get_func( &shader->sse2_program ); + if (!shader->func) { + x86_release_func( &shader->sse2_program ); + FREE(shader); + return NULL; + } + + shader->base.shader = *templ; + shader->base.prepare = fs_sse_prepare; + shader->base.run = fs_sse_run; + shader->base.delete = fs_sse_delete; + + return &shader->base; +} + + +#else + +/* Maybe put this varient in the header file. + */ +struct sp_fragment_shader * +softpipe_create_fs_sse(struct softpipe_context *softpipe, + const struct pipe_shader_state *templ) +{ + return NULL; +} + +#endif diff --git a/src/gallium/drivers/softpipe/sp_headers.h b/src/gallium/drivers/softpipe/sp_headers.h new file mode 100644 index 0000000000..4a42cb3c19 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_headers.h @@ -0,0 +1,95 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
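Note that softpipe_create_fs_sse() above (like softpipe_create_fs_llvm() before it) returns NULL whenever the backend is compiled out, disabled, or code generation fails, so a caller can always fall back to the interpreted variant. A sketch of the assumed call-site pattern; the create_fs_any() wrapper is hypothetical and not part of this change:

   /* hypothetical caller-side fallback chain (sketch, not from this commit) */
   static struct sp_fragment_shader *
   create_fs_any(struct softpipe_context *softpipe,
                 const struct pipe_shader_state *templ)
   {
      struct sp_fragment_shader *fs = softpipe_create_fs_llvm(softpipe, templ);
      if (!fs)
         fs = softpipe_create_fs_sse(softpipe, templ);   /* needs PIPE_ARCH_X86 and use_sse */
      if (!fs)
         fs = softpipe_create_fs_exec(softpipe, templ);  /* always-available interpreter */
      return fs;
   }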
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef SP_HEADERS_H +#define SP_HEADERS_H + +#include "pipe/p_state.h" +#include "tgsi/tgsi_exec.h" + +#define PRIM_POINT 1 +#define PRIM_LINE 2 +#define PRIM_TRI 3 + + +/* The rasterizer generates 2x2 quads of fragment and feeds them to + * the current fp_machine (see below). + * Remember that Y=0=top with Y increasing down the window. + */ +#define QUAD_TOP_LEFT 0 +#define QUAD_TOP_RIGHT 1 +#define QUAD_BOTTOM_LEFT 2 +#define QUAD_BOTTOM_RIGHT 3 + +#define MASK_TOP_LEFT (1 << QUAD_TOP_LEFT) +#define MASK_TOP_RIGHT (1 << QUAD_TOP_RIGHT) +#define MASK_BOTTOM_LEFT (1 << QUAD_BOTTOM_LEFT) +#define MASK_BOTTOM_RIGHT (1 << QUAD_BOTTOM_RIGHT) +#define MASK_ALL 0xf + + +/** + * Encodes everything we need to know about a 2x2 pixel block. Uses + * "Channel-Serial" or "SoA" layout. + */ +struct quad_header_input +{ + int x0; + int y0; + float coverage[QUAD_SIZE]; /** fragment coverage for antialiasing */ + unsigned facing:1; /**< Front (0) or back (1) facing? */ + unsigned prim:2; /**< PRIM_POINT, LINE, TRI */ +}; + +struct quad_header_inout +{ + unsigned mask:4; +}; + +struct quad_header_output +{ + /** colors in SOA format (rrrr, gggg, bbbb, aaaa) */ + float color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE]; + float depth[QUAD_SIZE]; +}; + +struct quad_header { + struct quad_header_input input; + struct quad_header_inout inout; + struct quad_header_output output; + + const struct tgsi_interp_coef *coef; + const struct tgsi_interp_coef *posCoef; + + unsigned nr_attrs; +}; + +#endif /* SP_HEADERS_H */ + diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.c b/src/gallium/drivers/softpipe/sp_prim_setup.c new file mode 100644 index 0000000000..038ff04d4f --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_prim_setup.c @@ -0,0 +1,190 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
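The quad layout defined in sp_headers.h above is easiest to see with concrete numbers: fragment i of a quad sits at (x0 + (i & 1), y0 + (i >> 1)), bit i of inout.mask says whether that fragment is covered, and colors are stored channel-major as color[buffer][channel][fragment]. A standalone sketch using those conventions (QUAD_SIZE is assumed to be 4, as the 2x2 layout implies):

   #include <stdio.h>

   #define QUAD_SIZE 4   /* one 2x2 block, matching the layout described above */

   int main(void)
   {
      int x0 = 16, y0 = 8;          /* example quad origin */
      unsigned mask = 0xb;          /* bits 0, 1, 3 covered; bit 2 (bottom-left) not */
      float red[QUAD_SIZE] = { 1.0f, 0.5f, 0.25f, 0.0f };  /* SoA: one channel, four fragments */
      int i;

      for (i = 0; i < QUAD_SIZE; i++) {
         if (mask & (1u << i)) {
            int x = x0 + (i & 1);   /* left/right within the quad */
            int y = y0 + (i >> 1);  /* top/bottom within the quad */
            printf("fragment %d -> pixel (%d,%d), red = %f\n", i, x, y, red[i]);
         }
      }
      return 0;
   }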
+ * + **************************************************************************/ + +/** + * \brief A draw stage that drives our triangle setup routines from + * within the draw pipeline. One of two ways to drive setup, the + * other being in sp_prim_vbuf.c. + * + * \author Keith Whitwell <keith@tungstengraphics.com> + * \author Brian Paul + */ + + +#include "sp_context.h" +#include "sp_setup.h" +#include "sp_state.h" +#include "sp_prim_setup.h" +#include "draw/draw_pipe.h" +#include "draw/draw_vertex.h" +#include "util/u_memory.h" + +/** + * Triangle setup info (derived from draw_stage). + * Also used for line drawing (taking some liberties). + */ +struct setup_stage { + struct draw_stage stage; /**< This must be first (base class) */ + + struct setup_context *setup; +}; + + + +/** + * Basically a cast wrapper. + */ +static INLINE struct setup_stage *setup_stage( struct draw_stage *stage ) +{ + return (struct setup_stage *)stage; +} + + +typedef const float (*cptrf4)[4]; + +static void +do_tri(struct draw_stage *stage, struct prim_header *prim) +{ + struct setup_stage *setup = setup_stage( stage ); + + setup_tri( setup->setup, + (cptrf4)prim->v[0]->data, + (cptrf4)prim->v[1]->data, + (cptrf4)prim->v[2]->data ); +} + +static void +do_line(struct draw_stage *stage, struct prim_header *prim) +{ + struct setup_stage *setup = setup_stage( stage ); + + setup_line( setup->setup, + (cptrf4)prim->v[0]->data, + (cptrf4)prim->v[1]->data ); +} + +static void +do_point(struct draw_stage *stage, struct prim_header *prim) +{ + struct setup_stage *setup = setup_stage( stage ); + + setup_point( setup->setup, + (cptrf4)prim->v[0]->data ); +} + + + + +static void setup_begin( struct draw_stage *stage ) +{ + struct setup_stage *setup = setup_stage(stage); + + setup_prepare( setup->setup ); + + stage->point = do_point; + stage->line = do_line; + stage->tri = do_tri; +} + + +static void setup_first_point( struct draw_stage *stage, + struct prim_header *header ) +{ + setup_begin(stage); + stage->point( stage, header ); +} + +static void setup_first_line( struct draw_stage *stage, + struct prim_header *header ) +{ + setup_begin(stage); + stage->line( stage, header ); +} + + +static void setup_first_tri( struct draw_stage *stage, + struct prim_header *header ) +{ + setup_begin(stage); + stage->tri( stage, header ); +} + + + +static void setup_flush( struct draw_stage *stage, + unsigned flags ) +{ + stage->point = setup_first_point; + stage->line = setup_first_line; + stage->tri = setup_first_tri; +} + + +static void reset_stipple_counter( struct draw_stage *stage ) +{ +} + + +static void render_destroy( struct draw_stage *stage ) +{ + struct setup_stage *ssetup = setup_stage(stage); + setup_destroy_context(ssetup->setup); + FREE( stage ); +} + + +/** + * Create a new primitive setup/render stage. 
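The setup_first_point/line/tri hooks above are a small lazy-initialization trick: the stage starts out pointing at the *_first variants, the first primitive after a flush runs setup_begin() which swaps in the real handlers, and setup_flush() swaps the *_first variants back so the next batch re-runs setup_prepare(). A generic sketch of the same trick (toy types, not the draw_stage interface):

   #include <stdio.h>

   struct stage {
      void (*tri)(struct stage *s);   /* current triangle handler */
      int prepared;
   };

   static void do_tri(struct stage *s)
   {
      printf("rasterize tri (prepared=%d)\n", s->prepared);
   }

   static void first_tri(struct stage *s)
   {
      s->prepared = 1;     /* one-time per-batch preparation */
      s->tri = do_tri;     /* swap in the prepared handler */
      s->tri(s);           /* handle the primitive that triggered us */
   }

   static void flush(struct stage *s)
   {
      s->tri = first_tri;  /* next primitive re-prepares */
      s->prepared = 0;
   }

   int main(void)
   {
      struct stage s = { first_tri, 0 };
      s.tri(&s);   /* prepares, then draws */
      s.tri(&s);   /* draws directly */
      flush(&s);
      s.tri(&s);   /* prepares again */
      return 0;
   }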
+ */ +struct draw_stage *sp_draw_render_stage( struct softpipe_context *softpipe ) +{ + struct setup_stage *sstage = CALLOC_STRUCT(setup_stage); + + sstage->setup = setup_create_context(softpipe); + sstage->stage.draw = softpipe->draw; + sstage->stage.point = setup_first_point; + sstage->stage.line = setup_first_line; + sstage->stage.tri = setup_first_tri; + sstage->stage.flush = setup_flush; + sstage->stage.reset_stipple_counter = reset_stipple_counter; + sstage->stage.destroy = render_destroy; + + return (struct draw_stage *)sstage; +} + +struct setup_context * +sp_draw_setup_context( struct draw_stage *stage ) +{ + struct setup_stage *ssetup = setup_stage(stage); + return ssetup->setup; +} + +void +sp_draw_flush( struct draw_stage *stage ) +{ + stage->flush( stage, 0 ); +} diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.h b/src/gallium/drivers/softpipe/sp_prim_setup.h new file mode 100644 index 0000000000..49bdd98ed8 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_prim_setup.h @@ -0,0 +1,85 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef SP_PRIM_SETUP_H +#define SP_PRIM_SETUP_H + + +/** + * vbuf is a special stage to gather the stream of triangles, lines, points + * together and reconstruct vertex buffers for hardware upload. + * + * First attempt, work in progress. + * + * TODO: + * - separate out vertex buffer building and primitive emit, ie >1 draw per vb. + * - tell vbuf stage how to build hw vertices directly + * - pass vbuf stage a buffer pointer for direct emit to agp/vram. + * + * + * + * Vertices are just an array of floats, with all the attributes + * packed. We currently assume a layout like: + * + * attr[0][0..3] - window position + * attr[1..n][0..3] - remaining attributes. + * + * Attributes are assumed to be 4 floats wide but are packed so that + * all the enabled attributes run contiguously. 
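Put differently: with a vertex that is N floats long (N a multiple of 4 here), attribute k of vertex i starts at float index i*N + 4*k. A small standalone sketch of that addressing, in the same cptrf4 style the setup and vbuf code in this change uses:

   #include <stdio.h>

   typedef const float (*cptrf4)[4];   /* same shorthand the setup code uses */

   /* attribute k of vertex i in a packed buffer of vertex_size floats per vertex */
   static cptrf4 get_attr(const float *verts, int vertex_size, int i, int k)
   {
      return (cptrf4) (verts + i * vertex_size + 4 * k);
   }

   int main(void)
   {
      /* two vertices, two attributes each: position + color, 8 floats per vertex */
      float verts[2 * 8] = {
         /* v0 */ 10, 20, 0, 1,   1, 0, 0, 1,
         /* v1 */ 30, 40, 0, 1,   0, 1, 0, 1,
      };
      cptrf4 pos1   = get_attr(verts, 8, 1, 0);
      cptrf4 color1 = get_attr(verts, 8, 1, 1);
      printf("v1 pos = (%g, %g), red = %g\n", (*pos1)[0], (*pos1)[1], (*color1)[0]);
      return 0;
   }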
+ */ + + +struct draw_stage; +struct softpipe_context; + + +typedef void (*vbuf_draw_func)( struct pipe_context *pipe, + unsigned prim, + const ushort *elements, + unsigned nr_elements, + const void *vertex_buffer, + unsigned nr_vertices ); + + +extern struct draw_stage * +sp_draw_render_stage( struct softpipe_context *softpipe ); + +extern struct setup_context * +sp_draw_setup_context( struct draw_stage * ); + +extern void +sp_draw_flush( struct draw_stage * ); + + +extern struct draw_stage * +sp_draw_vbuf_stage( struct draw_context *draw_context, + struct pipe_context *pipe, + vbuf_draw_func draw ); + + +#endif /* SP_PRIM_SETUP_H */ diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c new file mode 100644 index 0000000000..9cd5784e5b --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c @@ -0,0 +1,410 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Interface between 'draw' module's output and the softpipe rasterizer/setup + * code. When the 'draw' module has finished filling a vertex buffer, the + * draw_arrays() functions below will be called. Loop over the vertices and + * call the point/line/tri setup functions. + * + * Authors + * Brian Paul + */ + + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_prim_vbuf.h" +#include "sp_prim_setup.h" +#include "sp_setup.h" +#include "draw/draw_context.h" +#include "draw/draw_vbuf.h" +#include "util/u_memory.h" + + +#define SP_MAX_VBUF_INDEXES 1024 +#define SP_MAX_VBUF_SIZE 4096 + +typedef const float (*cptrf4)[4]; + +/** + * Subclass of vbuf_render. 
+ */ +struct softpipe_vbuf_render +{ + struct vbuf_render base; + struct softpipe_context *softpipe; + uint prim; + uint vertex_size; + void *vertex_buffer; +}; + + +/** cast wrapper */ +static struct softpipe_vbuf_render * +softpipe_vbuf_render(struct vbuf_render *vbr) +{ + return (struct softpipe_vbuf_render *) vbr; +} + + +static const struct vertex_info * +sp_vbuf_get_vertex_info(struct vbuf_render *vbr) +{ + struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); + return softpipe_get_vbuf_vertex_info(cvbr->softpipe); +} + + +static void * +sp_vbuf_allocate_vertices(struct vbuf_render *vbr, + ushort vertex_size, ushort nr_vertices) +{ + struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); + assert(!cvbr->vertex_buffer); + cvbr->vertex_buffer = align_malloc(vertex_size * nr_vertices, 16); + cvbr->vertex_size = vertex_size; + return cvbr->vertex_buffer; +} + + +static void +sp_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices, + unsigned vertex_size, unsigned vertices_used) +{ + struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); + align_free(vertices); + assert(vertices == cvbr->vertex_buffer); + cvbr->vertex_buffer = NULL; +} + + +static boolean +sp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim) +{ + struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); + + /* XXX: break this dependency - make setup_context live under + * softpipe, rename the old "setup" draw stage to something else. + */ + struct setup_context *setup_ctx = sp_draw_setup_context(cvbr->softpipe->setup); + + setup_prepare( setup_ctx ); + + + + cvbr->prim = prim; + return TRUE; + +} + + +static INLINE cptrf4 get_vert( const void *vertex_buffer, + int index, + int stride ) +{ + return (cptrf4)((char *)vertex_buffer + index * stride); +} + + +/** + * draw elements / indexed primitives + */ +static void +sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr) +{ + struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); + struct softpipe_context *softpipe = cvbr->softpipe; + const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float); + const void *vertex_buffer = cvbr->vertex_buffer; + unsigned i; + + /* XXX: break this dependency - make setup_context live under + * softpipe, rename the old "setup" draw stage to something else. 
+ */ + struct draw_stage *setup = softpipe->setup; + struct setup_context *setup_ctx = sp_draw_setup_context(setup); + + switch (cvbr->prim) { + case PIPE_PRIM_POINTS: + for (i = 0; i < nr; i++) { + setup_point( setup_ctx, + get_vert(vertex_buffer, indices[i-0], stride) ); + } + break; + + case PIPE_PRIM_LINES: + for (i = 1; i < nr; i += 2) { + setup_line( setup_ctx, + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-0], stride) ); + } + break; + + case PIPE_PRIM_LINE_STRIP: + for (i = 1; i < nr; i ++) { + setup_line( setup_ctx, + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-0], stride) ); + } + break; + + case PIPE_PRIM_LINE_LOOP: + for (i = 1; i < nr; i ++) { + setup_line( setup_ctx, + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-0], stride) ); + } + if (nr) { + setup_line( setup_ctx, + get_vert(vertex_buffer, indices[nr-1], stride), + get_vert(vertex_buffer, indices[0], stride) ); + } + break; + + + case PIPE_PRIM_TRIANGLES: + for (i = 2; i < nr; i += 3) { + setup_tri( setup_ctx, + get_vert(vertex_buffer, indices[i-2], stride), + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-0], stride)); + } + break; + + case PIPE_PRIM_TRIANGLE_STRIP: + for (i = 2; i < nr; i += 1) { + setup_tri( setup_ctx, + get_vert(vertex_buffer, indices[i+(i&1)-2], stride), + get_vert(vertex_buffer, indices[i-(i&1)-1], stride), + get_vert(vertex_buffer, indices[i-0], stride)); + } + break; + + case PIPE_PRIM_TRIANGLE_FAN: + case PIPE_PRIM_POLYGON: + for (i = 2; i < nr; i += 1) { + setup_tri( setup_ctx, + get_vert(vertex_buffer, indices[0], stride), + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-0], stride)); + } + break; + case PIPE_PRIM_QUADS: + for (i = 3; i < nr; i += 4) { + setup_tri( setup_ctx, + get_vert(vertex_buffer, indices[i-3], stride), + get_vert(vertex_buffer, indices[i-2], stride), + get_vert(vertex_buffer, indices[i-0], stride)); + + setup_tri( setup_ctx, + get_vert(vertex_buffer, indices[i-2], stride), + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-0], stride)); + } + break; + case PIPE_PRIM_QUAD_STRIP: + for (i = 3; i < nr; i += 2) { + setup_tri( setup_ctx, + get_vert(vertex_buffer, indices[i-3], stride), + get_vert(vertex_buffer, indices[i-2], stride), + get_vert(vertex_buffer, indices[i-0], stride)); + + setup_tri( setup_ctx, + get_vert(vertex_buffer, indices[i-1], stride), + get_vert(vertex_buffer, indices[i-3], stride), + get_vert(vertex_buffer, indices[i-0], stride)); + } + break; + default: + assert(0); + } + + /* XXX: why are we calling this??? If we had to call something, it + * would be a function in sp_setup.c: + */ + sp_draw_flush( setup ); +} + + +/** + * This function is hit when the draw module is working in pass-through mode. + * It's up to us to convert the vertex array into point/line/tri prims. + */ +static void +sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr) +{ + struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); + struct softpipe_context *softpipe = cvbr->softpipe; + const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float); + const void *vertex_buffer = + (void *) get_vert(cvbr->vertex_buffer, start, stride); + unsigned i; + + /* XXX: break this dependency - make setup_context live under + * softpipe, rename the old "setup" draw stage to something else. 
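The index arithmetic in sp_vbuf_draw() above (and repeated for the array path below) is the standard strip/fan decomposition; the (i & 1) terms in the TRIANGLE_STRIP case swap two vertices on every other triangle so that all emitted triangles keep the same winding. A standalone check of which vertices each strip triangle ends up using:

   #include <stdio.h>

   int main(void)
   {
      unsigned nr = 6;   /* a 6-vertex triangle strip -> 4 triangles */
      unsigned i;

      for (i = 2; i < nr; i++) {
         unsigned v0 = i + (i & 1) - 2;
         unsigned v1 = i - (i & 1) - 1;
         unsigned v2 = i;
         printf("tri %u: %u %u %u\n", i - 2, v0, v1, v2);
      }
      /* prints: 0 1 2, 2 1 3, 2 3 4, 4 3 5 -- consistent winding throughout */
      return 0;
   }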
+ */ + struct draw_stage *setup = softpipe->setup; + struct setup_context *setup_ctx = sp_draw_setup_context(setup); + + switch (cvbr->prim) { + case PIPE_PRIM_POINTS: + for (i = 0; i < nr; i++) { + setup_point( setup_ctx, + get_vert(vertex_buffer, i-0, stride) ); + } + break; + + case PIPE_PRIM_LINES: + for (i = 1; i < nr; i += 2) { + setup_line( setup_ctx, + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-0, stride) ); + } + break; + + case PIPE_PRIM_LINE_STRIP: + for (i = 1; i < nr; i ++) { + setup_line( setup_ctx, + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-0, stride) ); + } + break; + + case PIPE_PRIM_LINE_LOOP: + for (i = 1; i < nr; i ++) { + setup_line( setup_ctx, + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-0, stride) ); + } + if (nr) { + setup_line( setup_ctx, + get_vert(vertex_buffer, nr-1, stride), + get_vert(vertex_buffer, 0, stride) ); + } + break; + + + case PIPE_PRIM_TRIANGLES: + for (i = 2; i < nr; i += 3) { + setup_tri( setup_ctx, + get_vert(vertex_buffer, i-2, stride), + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-0, stride)); + } + break; + + case PIPE_PRIM_TRIANGLE_STRIP: + for (i = 2; i < nr; i += 1) { + setup_tri( setup_ctx, + get_vert(vertex_buffer, i+(i&1)-2, stride), + get_vert(vertex_buffer, i-(i&1)-1, stride), + get_vert(vertex_buffer, i-0, stride)); + } + break; + + case PIPE_PRIM_TRIANGLE_FAN: + case PIPE_PRIM_POLYGON: + for (i = 2; i < nr; i += 1) { + setup_tri( setup_ctx, + get_vert(vertex_buffer, 0, stride), + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-0, stride)); + } + break; + case PIPE_PRIM_QUADS: + for (i = 3; i < nr; i += 4) { + setup_tri( setup_ctx, + get_vert(vertex_buffer, i-3, stride), + get_vert(vertex_buffer, i-2, stride), + get_vert(vertex_buffer, i-0, stride)); + + setup_tri( setup_ctx, + get_vert(vertex_buffer, i-2, stride), + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-0, stride)); + } + break; + case PIPE_PRIM_QUAD_STRIP: + for (i = 3; i < nr; i += 2) { + setup_tri( setup_ctx, + get_vert(vertex_buffer, i-3, stride), + get_vert(vertex_buffer, i-2, stride), + get_vert(vertex_buffer, i-0, stride)); + + setup_tri( setup_ctx, + get_vert(vertex_buffer, i-1, stride), + get_vert(vertex_buffer, i-3, stride), + get_vert(vertex_buffer, i-0, stride)); + } + break; + default: + assert(0); + } +} + + + +static void +sp_vbuf_destroy(struct vbuf_render *vbr) +{ + struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr); + cvbr->softpipe->vbuf_render = NULL; + FREE(cvbr); +} + + +/** + * Initialize the post-transform vertex buffer information for the given + * context. 
+ */ +void +sp_init_vbuf(struct softpipe_context *sp) +{ + assert(sp->draw); + + sp->vbuf_render = CALLOC_STRUCT(softpipe_vbuf_render); + + sp->vbuf_render->base.max_indices = SP_MAX_VBUF_INDEXES; + sp->vbuf_render->base.max_vertex_buffer_bytes = SP_MAX_VBUF_SIZE; + + sp->vbuf_render->base.get_vertex_info = sp_vbuf_get_vertex_info; + sp->vbuf_render->base.allocate_vertices = sp_vbuf_allocate_vertices; + sp->vbuf_render->base.set_primitive = sp_vbuf_set_primitive; + sp->vbuf_render->base.draw = sp_vbuf_draw; + sp->vbuf_render->base.draw_arrays = sp_vbuf_draw_arrays; + sp->vbuf_render->base.release_vertices = sp_vbuf_release_vertices; + sp->vbuf_render->base.destroy = sp_vbuf_destroy; + + sp->vbuf_render->softpipe = sp; + + sp->vbuf = draw_vbuf_stage(sp->draw, &sp->vbuf_render->base); + + draw_set_rasterize_stage(sp->draw, sp->vbuf); + + draw_set_render(sp->draw, &sp->vbuf_render->base); +} diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.h b/src/gallium/drivers/softpipe/sp_prim_vbuf.h new file mode 100644 index 0000000000..1de9cc2a89 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.h @@ -0,0 +1,38 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef SP_VBUF_H +#define SP_VBUF_H + + +struct softpipe_context; + +extern void +sp_init_vbuf(struct softpipe_context *softpipe); + + +#endif /* SP_VBUF_H */ diff --git a/src/gallium/drivers/softpipe/sp_quad.c b/src/gallium/drivers/softpipe/sp_quad.c new file mode 100644 index 0000000000..892ef87ee9 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad.c @@ -0,0 +1,118 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
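sp_init_vbuf() above is the entire contract with the draw module's vbuf stage: fill in the callbacks and the two size limits on a vbuf_render, then register it via draw_vbuf_stage(), draw_set_rasterize_stage() and draw_set_render(). The sequence in which the draw module is presumed to drive those callbacks (an editor's sketch, not something stated in this change):

   /* assumed per-batch sequence driven by the draw module's vbuf code:
    *   get_vertex_info()                - query the post-transform vertex layout
    *   allocate_vertices(size, count)   - obtain the 16-byte aligned scratch buffer
    *   set_primitive(prim)              - softpipe uses this to call setup_prepare()
    *   draw() / draw_arrays()           - decomposed into setup_point/line/tri calls
    *   release_vertices()               - return the scratch buffer
    */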
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "sp_context.h" +#include "sp_state.h" +#include "pipe/p_shader_tokens.h" + +static void +sp_push_quad_first( + struct softpipe_context *sp, + struct quad_stage *quad, + uint i ) +{ + quad->next = sp->quad[i].first; + sp->quad[i].first = quad; +} + +static void +sp_build_depth_stencil( + struct softpipe_context *sp, + uint i ) +{ + if (sp->depth_stencil->stencil[0].enabled || + sp->depth_stencil->stencil[1].enabled) { + sp_push_quad_first( sp, sp->quad[i].stencil_test, i ); + } + else if (sp->depth_stencil->depth.enabled && + sp->framebuffer.zsbuf) { + sp_push_quad_first( sp, sp->quad[i].depth_test, i ); + } +} + +void +sp_build_quad_pipeline(struct softpipe_context *sp) +{ + uint i; + + boolean early_depth_test = + sp->depth_stencil->depth.enabled && + sp->framebuffer.zsbuf && + !sp->depth_stencil->alpha.enabled && + !sp->fs->info.uses_kill && + !sp->fs->info.writes_z; + + /* build up the pipeline in reverse order... */ + for (i = 0; i < SP_NUM_QUAD_THREADS; i++) { + sp->quad[i].first = sp->quad[i].output; + + if (sp->blend->colormask != 0xf) { + sp_push_quad_first( sp, sp->quad[i].colormask, i ); + } + + if (sp->blend->blend_enable || + sp->blend->logicop_enable) { + sp_push_quad_first( sp, sp->quad[i].blend, i ); + } + + if (sp->depth_stencil->depth.occlusion_count) { + sp_push_quad_first( sp, sp->quad[i].occlusion, i ); + } + + if (sp->rasterizer->poly_smooth || + sp->rasterizer->line_smooth || + sp->rasterizer->point_smooth) { + sp_push_quad_first( sp, sp->quad[i].coverage, i ); + } + + if (!early_depth_test) { + sp_build_depth_stencil( sp, i ); + } + + if (sp->depth_stencil->alpha.enabled) { + sp_push_quad_first( sp, sp->quad[i].alpha_test, i ); + } + + /* XXX always enable shader? 
*/ + if (1) { + sp_push_quad_first( sp, sp->quad[i].shade, i ); + } + + if (early_depth_test) { + sp_build_depth_stencil( sp, i ); + sp_push_quad_first( sp, sp->quad[i].earlyz, i ); + } + +#if !USE_DRAW_STAGE_PSTIPPLE + if (sp->rasterizer->poly_stipple_enable) { + sp_push_quad_first( sp, sp->quad[i].polygon_stipple, i ); + } +#endif + } +} + diff --git a/src/gallium/drivers/softpipe/sp_quad.h b/src/gallium/drivers/softpipe/sp_quad.h new file mode 100644 index 0000000000..08513cb95f --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad.h @@ -0,0 +1,69 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
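Because sp_build_quad_pipeline() above assembles the chain with sp_push_quad_first(), the stage pushed last runs first. For example, with only blending and the depth test enabled (and early Z not applicable), starting from the output stage and pushing blend, then depth, then shade yields the run order shade -> depth -> blend -> output. A toy illustration of that push-front construction:

   #include <stdio.h>

   struct stage { const char *name; struct stage *next; };

   static struct stage *push_first(struct stage *first, struct stage *s)
   {
      s->next = first;          /* same trick as sp_push_quad_first() */
      return s;
   }

   int main(void)
   {
      struct stage output = { "output", NULL };
      struct stage blend  = { "blend",  NULL };
      struct stage depth  = { "depth",  NULL };
      struct stage shade  = { "shade",  NULL };
      struct stage *first = &output, *s;

      /* push in the same order the builder above uses... */
      first = push_first(first, &blend);
      first = push_first(first, &depth);
      first = push_first(first, &shade);

      /* ...and the run order comes out reversed: shade depth blend output */
      for (s = first; s; s = s->next)
         printf("%s\n", s->name);
      return 0;
   }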
+ * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef SP_QUAD_H +#define SP_QUAD_H + + +struct softpipe_context; +struct quad_header; + + +struct quad_stage { + struct softpipe_context *softpipe; + + struct quad_stage *next; + + void (*begin)(struct quad_stage *qs); + + /** the stage action */ + void (*run)(struct quad_stage *qs, struct quad_header *quad); + + void (*destroy)(struct quad_stage *qs); +}; + + +struct quad_stage *sp_quad_polygon_stipple_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_earlyz_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_alpha_test_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_stencil_test_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_depth_test_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_occlusion_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_coverage_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_blend_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_colormask_stage( struct softpipe_context *softpipe ); +struct quad_stage *sp_quad_output_stage( struct softpipe_context *softpipe ); + +void sp_build_quad_pipeline(struct softpipe_context *sp); + +void sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad); + +#endif /* SP_QUAD_H */ diff --git a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c new file mode 100644 index 0000000000..5bebd141e9 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c @@ -0,0 +1,108 @@ + +/** + * quad alpha test + */ + +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_quad.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" + + +static void +alpha_test_quad(struct quad_stage *qs, struct quad_header *quad) +{ + struct softpipe_context *softpipe = qs->softpipe; + const float ref = softpipe->depth_stencil->alpha.ref; + unsigned passMask = 0x0, j; + const uint cbuf = 0; /* only output[0].alpha is tested */ + const float *aaaa = quad->output.color[cbuf][3]; + + switch (softpipe->depth_stencil->alpha.func) { + case PIPE_FUNC_NEVER: + break; + case PIPE_FUNC_LESS: + /* + * If mask were an array [4] we could do this SIMD-style: + * passMask = (quad->outputs.color[0][3] <= vec4(ref)); + */ + for (j = 0; j < QUAD_SIZE; j++) { + if (aaaa[j] < ref) { + passMask |= (1 << j); + } + } + break; + case PIPE_FUNC_EQUAL: + for (j = 0; j < QUAD_SIZE; j++) { + if (aaaa[j] == ref) { + passMask |= (1 << j); + } + } + break; + case PIPE_FUNC_LEQUAL: + for (j = 0; j < QUAD_SIZE; j++) { + if (aaaa[j] <= ref) { + passMask |= (1 << j); + } + } + break; + case PIPE_FUNC_GREATER: + for (j = 0; j < QUAD_SIZE; j++) { + if (aaaa[j] > ref) { + passMask |= (1 << j); + } + } + break; + case PIPE_FUNC_NOTEQUAL: + for (j = 0; j < QUAD_SIZE; j++) { + if (aaaa[j] != ref) { + passMask |= (1 << j); + } + } + break; + case PIPE_FUNC_GEQUAL: + for (j = 0; j < QUAD_SIZE; j++) { + if (aaaa[j] >= ref) { + passMask |= (1 << j); + } + } + break; + case PIPE_FUNC_ALWAYS: + passMask = MASK_ALL; + break; + default: + assert(0); + } + + quad->inout.mask &= passMask; + + if (quad->inout.mask) + qs->next->run(qs->next, quad); +} + + +static void alpha_test_begin(struct quad_stage *qs) +{ + 
qs->next->begin(qs->next); +} + + +static void alpha_test_destroy(struct quad_stage *qs) +{ + FREE( qs ); +} + + +struct quad_stage * +sp_quad_alpha_test_stage( struct softpipe_context *softpipe ) +{ + struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + + stage->softpipe = softpipe; + stage->begin = alpha_test_begin; + stage->run = alpha_test_quad; + stage->destroy = alpha_test_destroy; + + return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c new file mode 100644 index 0000000000..6f64c6e584 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_blend.c @@ -0,0 +1,759 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * quad blending + * \author Brian Paul + */ + +#include "pipe/p_defines.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_tile_cache.h" +#include "sp_quad.h" + + +#define VEC4_COPY(DST, SRC) \ +do { \ + DST[0] = SRC[0]; \ + DST[1] = SRC[1]; \ + DST[2] = SRC[2]; \ + DST[3] = SRC[3]; \ +} while(0) + +#define VEC4_SCALAR(DST, SRC) \ +do { \ + DST[0] = SRC; \ + DST[1] = SRC; \ + DST[2] = SRC; \ + DST[3] = SRC; \ +} while(0) + +#define VEC4_ADD(R, A, B) \ +do { \ + R[0] = A[0] + B[0]; \ + R[1] = A[1] + B[1]; \ + R[2] = A[2] + B[2]; \ + R[3] = A[3] + B[3]; \ +} while (0) + +#define VEC4_SUB(R, A, B) \ +do { \ + R[0] = A[0] - B[0]; \ + R[1] = A[1] - B[1]; \ + R[2] = A[2] - B[2]; \ + R[3] = A[3] - B[3]; \ +} while (0) + +#define VEC4_MUL(R, A, B) \ +do { \ + R[0] = A[0] * B[0]; \ + R[1] = A[1] * B[1]; \ + R[2] = A[2] * B[2]; \ + R[3] = A[3] * B[3]; \ +} while (0) + +#define VEC4_MIN(R, A, B) \ +do { \ + R[0] = (A[0] < B[0]) ? A[0] : B[0]; \ + R[1] = (A[1] < B[1]) ? A[1] : B[1]; \ + R[2] = (A[2] < B[2]) ? A[2] : B[2]; \ + R[3] = (A[3] < B[3]) ? A[3] : B[3]; \ +} while (0) + +#define VEC4_MAX(R, A, B) \ +do { \ + R[0] = (A[0] > B[0]) ? A[0] : B[0]; \ + R[1] = (A[1] > B[1]) ? A[1] : B[1]; \ + R[2] = (A[2] > B[2]) ? A[2] : B[2]; \ + R[3] = (A[3] > B[3]) ? 
A[3] : B[3]; \ +} while (0) + + + +static void +logicop_quad(struct quad_stage *qs, struct quad_header *quad) +{ + struct softpipe_context *softpipe = qs->softpipe; + uint cbuf; + + /* loop over colorbuffer outputs */ + for (cbuf = 0; cbuf < softpipe->framebuffer.num_cbufs; cbuf++) { + float dest[4][QUAD_SIZE]; + ubyte src[4][4], dst[4][4], res[4][4]; + uint *src4 = (uint *) src; + uint *dst4 = (uint *) dst; + uint *res4 = (uint *) res; + struct softpipe_cached_tile * + tile = sp_get_cached_tile(softpipe, + softpipe->cbuf_cache[cbuf], + quad->input.x0, quad->input.y0); + float (*quadColor)[4] = quad->output.color[cbuf]; + uint i, j; + + /* get/swizzle dest colors */ + for (j = 0; j < QUAD_SIZE; j++) { + int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1); + int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1); + for (i = 0; i < 4; i++) { + dest[i][j] = tile->data.color[y][x][i]; + } + } + + /* convert to ubyte */ + for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */ + dst[j][0] = float_to_ubyte(dest[j][0]); /* P0 */ + dst[j][1] = float_to_ubyte(dest[j][1]); /* P1 */ + dst[j][2] = float_to_ubyte(dest[j][2]); /* P2 */ + dst[j][3] = float_to_ubyte(dest[j][3]); /* P3 */ + + src[j][0] = float_to_ubyte(quadColor[j][0]); /* P0 */ + src[j][1] = float_to_ubyte(quadColor[j][1]); /* P1 */ + src[j][2] = float_to_ubyte(quadColor[j][2]); /* P2 */ + src[j][3] = float_to_ubyte(quadColor[j][3]); /* P3 */ + } + + switch (softpipe->blend->logicop_func) { + case PIPE_LOGICOP_CLEAR: + for (j = 0; j < 4; j++) + res4[j] = 0; + break; + case PIPE_LOGICOP_NOR: + for (j = 0; j < 4; j++) + res4[j] = ~(src4[j] | dst4[j]); + break; + case PIPE_LOGICOP_AND_INVERTED: + for (j = 0; j < 4; j++) + res4[j] = ~src4[j] & dst4[j]; + break; + case PIPE_LOGICOP_COPY_INVERTED: + for (j = 0; j < 4; j++) + res4[j] = ~src4[j]; + break; + case PIPE_LOGICOP_AND_REVERSE: + for (j = 0; j < 4; j++) + res4[j] = src4[j] & ~dst4[j]; + break; + case PIPE_LOGICOP_INVERT: + for (j = 0; j < 4; j++) + res4[j] = ~dst4[j]; + break; + case PIPE_LOGICOP_XOR: + for (j = 0; j < 4; j++) + res4[j] = dst4[j] ^ src4[j]; + break; + case PIPE_LOGICOP_NAND: + for (j = 0; j < 4; j++) + res4[j] = ~(src4[j] & dst4[j]); + break; + case PIPE_LOGICOP_AND: + for (j = 0; j < 4; j++) + res4[j] = src4[j] & dst4[j]; + break; + case PIPE_LOGICOP_EQUIV: + for (j = 0; j < 4; j++) + res4[j] = ~(src4[j] ^ dst4[j]); + break; + case PIPE_LOGICOP_NOOP: + for (j = 0; j < 4; j++) + res4[j] = dst4[j]; + break; + case PIPE_LOGICOP_OR_INVERTED: + for (j = 0; j < 4; j++) + res4[j] = ~src4[j] | dst4[j]; + break; + case PIPE_LOGICOP_COPY: + for (j = 0; j < 4; j++) + res4[j] = src4[j]; + break; + case PIPE_LOGICOP_OR_REVERSE: + for (j = 0; j < 4; j++) + res4[j] = src4[j] | ~dst4[j]; + break; + case PIPE_LOGICOP_OR: + for (j = 0; j < 4; j++) + res4[j] = src4[j] | dst4[j]; + break; + case PIPE_LOGICOP_SET: + for (j = 0; j < 4; j++) + res4[j] = ~0; + break; + default: + assert(0); + } + + for (j = 0; j < 4; j++) { + quadColor[j][0] = ubyte_to_float(res[j][0]); + quadColor[j][1] = ubyte_to_float(res[j][1]); + quadColor[j][2] = ubyte_to_float(res[j][2]); + quadColor[j][3] = ubyte_to_float(res[j][3]); + } + } + + /* pass quad to next stage */ + qs->next->run(qs->next, quad); +} + + + + +static void +blend_quad(struct quad_stage *qs, struct quad_header *quad) +{ + static const float zero[4] = { 0, 0, 0, 0 }; + static const float one[4] = { 1, 1, 1, 1 }; + + struct softpipe_context *softpipe = qs->softpipe; + uint cbuf; + + if (softpipe->blend->logicop_enable) { + logicop_quad(qs, quad); 
+ return; + } + + /* loop over colorbuffer outputs */ + for (cbuf = 0; cbuf < softpipe->framebuffer.num_cbufs; cbuf++) { + float source[4][QUAD_SIZE], dest[4][QUAD_SIZE]; + struct softpipe_cached_tile *tile + = sp_get_cached_tile(softpipe, + softpipe->cbuf_cache[cbuf], + quad->input.x0, quad->input.y0); + float (*quadColor)[4] = quad->output.color[cbuf]; + uint i, j; + + /* get/swizzle dest colors */ + for (j = 0; j < QUAD_SIZE; j++) { + int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1); + int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1); + for (i = 0; i < 4; i++) { + dest[i][j] = tile->data.color[y][x][i]; + } + } + + /* + * Compute src/first term RGB + */ + switch (softpipe->blend->rgb_src_factor) { + case PIPE_BLENDFACTOR_ONE: + VEC4_COPY(source[0], quadColor[0]); /* R */ + VEC4_COPY(source[1], quadColor[1]); /* G */ + VEC4_COPY(source[2], quadColor[2]); /* B */ + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + VEC4_MUL(source[0], quadColor[0], quadColor[0]); /* R */ + VEC4_MUL(source[1], quadColor[1], quadColor[1]); /* G */ + VEC4_MUL(source[2], quadColor[2], quadColor[2]); /* B */ + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + { + const float *alpha = quadColor[3]; + VEC4_MUL(source[0], quadColor[0], alpha); /* R */ + VEC4_MUL(source[1], quadColor[1], alpha); /* G */ + VEC4_MUL(source[2], quadColor[2], alpha); /* B */ + } + break; + case PIPE_BLENDFACTOR_DST_COLOR: + VEC4_MUL(source[0], quadColor[0], dest[0]); /* R */ + VEC4_MUL(source[1], quadColor[1], dest[1]); /* G */ + VEC4_MUL(source[2], quadColor[2], dest[2]); /* B */ + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + { + const float *alpha = dest[3]; + VEC4_MUL(source[0], quadColor[0], alpha); /* R */ + VEC4_MUL(source[1], quadColor[1], alpha); /* G */ + VEC4_MUL(source[2], quadColor[2], alpha); /* B */ + } + break; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + { + const float *alpha = quadColor[3]; + float diff[4], temp[4]; + VEC4_SUB(diff, one, dest[3]); + VEC4_MIN(temp, alpha, diff); + VEC4_MUL(source[0], quadColor[0], temp); /* R */ + VEC4_MUL(source[1], quadColor[1], temp); /* G */ + VEC4_MUL(source[2], quadColor[2], temp); /* B */ + } + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + { + float comp[4]; + VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */ + VEC4_MUL(source[0], quadColor[0], comp); /* R */ + VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */ + VEC4_MUL(source[1], quadColor[1], comp); /* G */ + VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */ + VEC4_MUL(source[2], quadColor[2], comp); /* B */ + } + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + { + float alpha[4]; + VEC4_SCALAR(alpha, softpipe->blend_color.color[3]); + VEC4_MUL(source[0], quadColor[0], alpha); /* R */ + VEC4_MUL(source[1], quadColor[1], alpha); /* G */ + VEC4_MUL(source[2], quadColor[2], alpha); /* B */ + } + break; + case PIPE_BLENDFACTOR_SRC1_COLOR: + assert(0); /* to do */ + break; + case PIPE_BLENDFACTOR_SRC1_ALPHA: + assert(0); /* to do */ + break; + case PIPE_BLENDFACTOR_ZERO: + VEC4_COPY(source[0], zero); /* R */ + VEC4_COPY(source[1], zero); /* G */ + VEC4_COPY(source[2], zero); /* B */ + break; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + { + float inv_comp[4]; + VEC4_SUB(inv_comp, one, quadColor[0]); /* R */ + VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */ + VEC4_SUB(inv_comp, one, quadColor[1]); /* G */ + VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */ + VEC4_SUB(inv_comp, one, quadColor[2]); /* B */ + VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */ + } + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + 
{ + float inv_alpha[4]; + VEC4_SUB(inv_alpha, one, quadColor[3]); + VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */ + VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */ + VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */ + } + break; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + { + float inv_alpha[4]; + VEC4_SUB(inv_alpha, one, dest[3]); + VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */ + VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */ + VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */ + } + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + { + float inv_comp[4]; + VEC4_SUB(inv_comp, one, dest[0]); /* R */ + VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */ + VEC4_SUB(inv_comp, one, dest[1]); /* G */ + VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */ + VEC4_SUB(inv_comp, one, dest[2]); /* B */ + VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */ + } + break; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + { + float inv_comp[4]; + /* R */ + VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]); + VEC4_MUL(source[0], quadColor[0], inv_comp); + /* G */ + VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]); + VEC4_MUL(source[1], quadColor[1], inv_comp); + /* B */ + VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]); + VEC4_MUL(source[2], quadColor[2], inv_comp); + } + break; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + { + float inv_alpha[4]; + VEC4_SCALAR(inv_alpha, 1.0f - softpipe->blend_color.color[3]); + VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */ + VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */ + VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */ + } + break; + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + assert(0); /* to do */ + break; + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + assert(0); /* to do */ + break; + default: + assert(0); + } + + /* + * Compute src/first term A + */ + switch (softpipe->blend->alpha_src_factor) { + case PIPE_BLENDFACTOR_ONE: + VEC4_COPY(source[3], quadColor[3]); /* A */ + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_SRC_ALPHA: + { + const float *alpha = quadColor[3]; + VEC4_MUL(source[3], quadColor[3], alpha); /* A */ + } + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_DST_ALPHA: + VEC4_MUL(source[3], quadColor[3], dest[3]); /* A */ + break; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + /* multiply alpha by 1.0 */ + VEC4_COPY(source[3], quadColor[3]); /* A */ + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_CONST_ALPHA: + { + float comp[4]; + VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */ + VEC4_MUL(source[3], quadColor[3], comp); /* A */ + } + break; + case PIPE_BLENDFACTOR_ZERO: + VEC4_COPY(source[3], zero); /* A */ + break; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + { + float inv_alpha[4]; + VEC4_SUB(inv_alpha, one, quadColor[3]); + VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */ + } + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + { + float inv_alpha[4]; + VEC4_SUB(inv_alpha, one, dest[3]); + VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */ + } + break; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + { + float inv_comp[4]; + /* A */ + VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]); + VEC4_MUL(source[3], quadColor[3], inv_comp); + } + break; + default: + 
assert(0); + } + + + /* + * Compute dest/second term RGB + */ + switch (softpipe->blend->rgb_dst_factor) { + case PIPE_BLENDFACTOR_ONE: + /* dest = dest * 1 NO-OP, leave dest as-is */ + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + VEC4_MUL(dest[0], dest[0], quadColor[0]); /* R */ + VEC4_MUL(dest[1], dest[1], quadColor[1]); /* G */ + VEC4_MUL(dest[2], dest[2], quadColor[2]); /* B */ + break; + case PIPE_BLENDFACTOR_SRC_ALPHA: + VEC4_MUL(dest[0], dest[0], quadColor[3]); /* R * A */ + VEC4_MUL(dest[1], dest[1], quadColor[3]); /* G * A */ + VEC4_MUL(dest[2], dest[2], quadColor[3]); /* B * A */ + break; + case PIPE_BLENDFACTOR_DST_ALPHA: + VEC4_MUL(dest[0], dest[0], dest[3]); /* R * A */ + VEC4_MUL(dest[1], dest[1], dest[3]); /* G * A */ + VEC4_MUL(dest[2], dest[2], dest[3]); /* B * A */ + break; + case PIPE_BLENDFACTOR_DST_COLOR: + VEC4_MUL(dest[0], dest[0], dest[0]); /* R */ + VEC4_MUL(dest[1], dest[1], dest[1]); /* G */ + VEC4_MUL(dest[2], dest[2], dest[2]); /* B */ + break; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + assert(0); /* illegal */ + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + { + float comp[4]; + VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */ + VEC4_MUL(dest[0], dest[0], comp); /* R */ + VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */ + VEC4_MUL(dest[1], dest[1], comp); /* G */ + VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */ + VEC4_MUL(dest[2], dest[2], comp); /* B */ + } + break; + case PIPE_BLENDFACTOR_CONST_ALPHA: + { + float comp[4]; + VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */ + VEC4_MUL(dest[0], dest[0], comp); /* R */ + VEC4_MUL(dest[1], dest[1], comp); /* G */ + VEC4_MUL(dest[2], dest[2], comp); /* B */ + } + break; + case PIPE_BLENDFACTOR_ZERO: + VEC4_COPY(dest[0], zero); /* R */ + VEC4_COPY(dest[1], zero); /* G */ + VEC4_COPY(dest[2], zero); /* B */ + break; + case PIPE_BLENDFACTOR_SRC1_COLOR: + case PIPE_BLENDFACTOR_SRC1_ALPHA: + /* XXX what are these? 
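(The SRC1 factors belong to dual-source blending: they would take a second color output from the fragment shader, and since that path is not handled here the cases simply assert.)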
*/ + assert(0); + break; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + { + float inv_comp[4]; + VEC4_SUB(inv_comp, one, quadColor[0]); /* R */ + VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */ + VEC4_SUB(inv_comp, one, quadColor[1]); /* G */ + VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */ + VEC4_SUB(inv_comp, one, quadColor[2]); /* B */ + VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */ + } + break; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + { + float one_minus_alpha[QUAD_SIZE]; + VEC4_SUB(one_minus_alpha, one, quadColor[3]); + VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */ + VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */ + VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */ + } + break; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + { + float inv_comp[4]; + VEC4_SUB(inv_comp, one, dest[3]); /* A */ + VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */ + VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */ + VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */ + } + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + { + float inv_comp[4]; + VEC4_SUB(inv_comp, one, dest[0]); /* R */ + VEC4_MUL(dest[0], dest[0], inv_comp); /* R */ + VEC4_SUB(inv_comp, one, dest[1]); /* G */ + VEC4_MUL(dest[1], dest[1], inv_comp); /* G */ + VEC4_SUB(inv_comp, one, dest[2]); /* B */ + VEC4_MUL(dest[2], dest[2], inv_comp); /* B */ + } + break; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + { + float inv_comp[4]; + /* R */ + VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]); + VEC4_MUL(dest[0], dest[0], inv_comp); + /* G */ + VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]); + VEC4_MUL(dest[1], dest[1], inv_comp); + /* B */ + VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]); + VEC4_MUL(dest[2], dest[2], inv_comp); + } + break; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + { + float inv_comp[4]; + VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]); + VEC4_MUL(dest[0], dest[0], inv_comp); + VEC4_MUL(dest[1], dest[1], inv_comp); + VEC4_MUL(dest[2], dest[2], inv_comp); + } + break; + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + /* XXX what are these? 
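+       * (the inverted forms of the dual-source factors noted above; likewise unimplemented.) +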
*/ + assert(0); + break; + default: + assert(0); + } + + /* + * Compute dest/second term A + */ + switch (softpipe->blend->alpha_dst_factor) { + case PIPE_BLENDFACTOR_ONE: + /* dest = dest * 1 NO-OP, leave dest as-is */ + break; + case PIPE_BLENDFACTOR_SRC_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_SRC_ALPHA: + VEC4_MUL(dest[3], dest[3], quadColor[3]); /* A * A */ + break; + case PIPE_BLENDFACTOR_DST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_DST_ALPHA: + VEC4_MUL(dest[3], dest[3], dest[3]); /* A */ + break; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + assert(0); /* illegal */ + break; + case PIPE_BLENDFACTOR_CONST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_CONST_ALPHA: + { + float comp[4]; + VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */ + VEC4_MUL(dest[3], dest[3], comp); /* A */ + } + break; + case PIPE_BLENDFACTOR_ZERO: + VEC4_COPY(dest[3], zero); /* A */ + break; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + { + float one_minus_alpha[QUAD_SIZE]; + VEC4_SUB(one_minus_alpha, one, quadColor[3]); + VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */ + } + break; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + { + float inv_comp[4]; + VEC4_SUB(inv_comp, one, dest[3]); /* A */ + VEC4_MUL(dest[3], inv_comp, dest[3]); /* A */ + } + break; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + /* fall-through */ + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + { + float inv_comp[4]; + VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]); + VEC4_MUL(dest[3], dest[3], inv_comp); + } + break; + default: + assert(0); + } + + /* + * Combine RGB terms + */ + switch (softpipe->blend->rgb_func) { + case PIPE_BLEND_ADD: + VEC4_ADD(quadColor[0], source[0], dest[0]); /* R */ + VEC4_ADD(quadColor[1], source[1], dest[1]); /* G */ + VEC4_ADD(quadColor[2], source[2], dest[2]); /* B */ + break; + case PIPE_BLEND_SUBTRACT: + VEC4_SUB(quadColor[0], source[0], dest[0]); /* R */ + VEC4_SUB(quadColor[1], source[1], dest[1]); /* G */ + VEC4_SUB(quadColor[2], source[2], dest[2]); /* B */ + break; + case PIPE_BLEND_REVERSE_SUBTRACT: + VEC4_SUB(quadColor[0], dest[0], source[0]); /* R */ + VEC4_SUB(quadColor[1], dest[1], source[1]); /* G */ + VEC4_SUB(quadColor[2], dest[2], source[2]); /* B */ + break; + case PIPE_BLEND_MIN: + VEC4_MIN(quadColor[0], source[0], dest[0]); /* R */ + VEC4_MIN(quadColor[1], source[1], dest[1]); /* G */ + VEC4_MIN(quadColor[2], source[2], dest[2]); /* B */ + break; + case PIPE_BLEND_MAX: + VEC4_MAX(quadColor[0], source[0], dest[0]); /* R */ + VEC4_MAX(quadColor[1], source[1], dest[1]); /* G */ + VEC4_MAX(quadColor[2], source[2], dest[2]); /* B */ + break; + default: + assert(0); + } + + /* + * Combine A terms + */ + switch (softpipe->blend->alpha_func) { + case PIPE_BLEND_ADD: + VEC4_ADD(quadColor[3], source[3], dest[3]); /* A */ + break; + case PIPE_BLEND_SUBTRACT: + VEC4_SUB(quadColor[3], source[3], dest[3]); /* A */ + break; + case PIPE_BLEND_REVERSE_SUBTRACT: + VEC4_SUB(quadColor[3], dest[3], source[3]); /* A */ + break; + case PIPE_BLEND_MIN: + VEC4_MIN(quadColor[3], source[3], dest[3]); /* A */ + break; + case PIPE_BLEND_MAX: + VEC4_MAX(quadColor[3], source[3], dest[3]); /* A */ + break; + default: + assert(0); + } + + } /* cbuf loop */ + + /* pass blended quad to next stage */ + qs->next->run(qs->next, quad); +} + + +static void blend_begin(struct quad_stage *qs) +{ + qs->next->begin(qs->next); +} + + +static void blend_destroy(struct quad_stage 
*qs) +{ + FREE( qs ); +} + + +struct quad_stage *sp_quad_blend_stage( struct softpipe_context *softpipe ) +{ + struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + + stage->softpipe = softpipe; + stage->begin = blend_begin; + stage->run = blend_quad; + stage->destroy = blend_destroy; + + return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_bufloop.c b/src/gallium/drivers/softpipe/sp_quad_bufloop.c new file mode 100644 index 0000000000..92e9af09c1 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_bufloop.c @@ -0,0 +1,74 @@ + +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_quad.h" + + +/** + * Loop over colorbuffers, passing quad to next stage each time. + */ +static void +cbuf_loop_quad(struct quad_stage *qs, struct quad_header *quad) +{ + struct softpipe_context *softpipe = qs->softpipe; + float tmp[PIPE_MAX_COLOR_BUFS][4][QUAD_SIZE]; + unsigned i; + + assert(sizeof(quad->outputs.color) == sizeof(tmp)); + assert(softpipe->framebuffer.num_cbufs <= PIPE_MAX_COLOR_BUFS); + + /* make copy of original colors since they can get modified + * by blending and masking. + * XXX we won't have to do this if the fragment program actually emits + * N separate colors and we're drawing to N color buffers (MRT). + * But if we emitted one color and glDrawBuffer(GL_FRONT_AND_BACK) is + * in effect, we need to save/restore colors like this. + */ + memcpy(tmp, quad->outputs.color, sizeof(tmp)); + + for (i = 0; i < softpipe->framebuffer.num_cbufs; i++) { + /* set current cbuffer */ +#if 0 /* obsolete & going away */ + softpipe->current_cbuf = i; +#endif + + /* pass blended quad to next stage */ + qs->next->run(qs->next, quad); + + /* restore quad's colors for next buffer */ + memcpy(quad->outputs.color, tmp, sizeof(tmp)); + } +} + + +static void cbuf_loop_begin(struct quad_stage *qs) +{ + qs->next->begin(qs->next); +} + + +static void cbuf_loop_destroy(struct quad_stage *qs) +{ + FREE( qs ); +} + + +/** + * Create the colorbuffer loop stage. + * This is used to implement multiple render targets and GL_FRONT_AND_BACK + * rendering. + */ +struct quad_stage *sp_quad_bufloop_stage( struct softpipe_context *softpipe ) +{ + struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + + stage->softpipe = softpipe; + stage->begin = cbuf_loop_begin; + stage->run = cbuf_loop_quad; + stage->destroy = cbuf_loop_destroy; + + return stage; +} + diff --git a/src/gallium/drivers/softpipe/sp_quad_colormask.c b/src/gallium/drivers/softpipe/sp_quad_colormask.c new file mode 100644 index 0000000000..f32bdfab78 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_colormask.c @@ -0,0 +1,116 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \brief quad colormask stage + * \author Brian Paul + */ + +#include "pipe/p_defines.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_quad.h" +#include "sp_tile_cache.h" + + + +/** + * XXX colormask could be rolled into blending... + */ +static void +colormask_quad(struct quad_stage *qs, struct quad_header *quad) +{ + struct softpipe_context *softpipe = qs->softpipe; + uint cbuf; + + /* loop over colorbuffer outputs */ + for (cbuf = 0; cbuf < softpipe->framebuffer.num_cbufs; cbuf++) { + float dest[4][QUAD_SIZE]; + struct softpipe_cached_tile *tile + = sp_get_cached_tile(softpipe, + softpipe->cbuf_cache[cbuf], + quad->input.x0, quad->input.y0); + float (*quadColor)[4] = quad->output.color[cbuf]; + uint i, j; + + /* get/swizzle dest colors */ + for (j = 0; j < QUAD_SIZE; j++) { + int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1); + int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1); + for (i = 0; i < 4; i++) { + dest[i][j] = tile->data.color[y][x][i]; + } + } + + /* R */ + if (!(softpipe->blend->colormask & PIPE_MASK_R)) + COPY_4V(quadColor[0], dest[0]); + + /* G */ + if (!(softpipe->blend->colormask & PIPE_MASK_G)) + COPY_4V(quadColor[1], dest[1]); + + /* B */ + if (!(softpipe->blend->colormask & PIPE_MASK_B)) + COPY_4V(quadColor[2], dest[2]); + + /* A */ + if (!(softpipe->blend->colormask & PIPE_MASK_A)) + COPY_4V(quadColor[3], dest[3]); + } + + /* pass quad to next stage */ + qs->next->run(qs->next, quad); +} + + +static void colormask_begin(struct quad_stage *qs) +{ + qs->next->begin(qs->next); +} + + +static void colormask_destroy(struct quad_stage *qs) +{ + FREE( qs ); +} + + +struct quad_stage *sp_quad_colormask_stage( struct softpipe_context *softpipe ) +{ + struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + + stage->softpipe = softpipe; + stage->begin = colormask_begin; + stage->run = colormask_quad; + stage->destroy = colormask_destroy; + + return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_coverage.c b/src/gallium/drivers/softpipe/sp_quad_coverage.c new file mode 100644 index 0000000000..ee29aa7dfe --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_coverage.c @@ -0,0 +1,93 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * \brief Apply AA coverage to quad alpha valus + * \author Brian Paul + */ + + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_quad.h" + + +/** + * Multiply quad's alpha values by the fragment coverage. + */ +static void +coverage_quad(struct quad_stage *qs, struct quad_header *quad) +{ + struct softpipe_context *softpipe = qs->softpipe; + + if ((softpipe->rasterizer->poly_smooth && quad->input.prim == PRIM_TRI) || + (softpipe->rasterizer->line_smooth && quad->input.prim == PRIM_LINE) || + (softpipe->rasterizer->point_smooth && quad->input.prim == PRIM_POINT)) { + uint cbuf; + + /* loop over colorbuffer outputs */ + for (cbuf = 0; cbuf < softpipe->framebuffer.num_cbufs; cbuf++) { + float (*quadColor)[4] = quad->output.color[cbuf]; + unsigned j; + for (j = 0; j < QUAD_SIZE; j++) { + assert(quad->input.coverage[j] >= 0.0); + assert(quad->input.coverage[j] <= 1.0); + quadColor[3][j] *= quad->input.coverage[j]; + } + } + } + + qs->next->run(qs->next, quad); +} + + +static void coverage_begin(struct quad_stage *qs) +{ + qs->next->begin(qs->next); +} + + +static void coverage_destroy(struct quad_stage *qs) +{ + FREE( qs ); +} + + +struct quad_stage *sp_quad_coverage_stage( struct softpipe_context *softpipe ) +{ + struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + + stage->softpipe = softpipe; + stage->begin = coverage_begin; + stage->run = coverage_quad; + stage->destroy = coverage_destroy; + + return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c new file mode 100644 index 0000000000..523bd3e080 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c @@ -0,0 +1,290 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \brief Quad depth testing + */ + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_quad.h" +#include "sp_tile_cache.h" + + +/** + * Do depth testing for a quad. + * Not static since it's used by the stencil code. + */ + +/* + * To increase efficiency, we should probably have multiple versions + * of this function that are specifically for Z16, Z32 and FP Z buffers. + * Try to effectively do that with codegen... + */ + +void +sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad) +{ + struct softpipe_context *softpipe = qs->softpipe; + struct pipe_surface *ps = softpipe->framebuffer.zsbuf; + const enum pipe_format format = ps->format; + unsigned bzzzz[QUAD_SIZE]; /**< Z values fetched from depth buffer */ + unsigned qzzzz[QUAD_SIZE]; /**< Z values from the quad */ + unsigned zmask = 0; + unsigned j; + struct softpipe_cached_tile *tile + = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->input.x0, quad->input.y0); + + assert(ps); /* shouldn't get here if there's no zbuffer */ + + /* + * Convert quad's float depth values to int depth values (qzzzz). + * If the Z buffer stores integer values, we _have_ to do the depth + * compares with integers (not floats). Otherwise, the float->int->float + * conversion of Z values (which isn't an identity function) will cause + * Z-fighting errors. + * + * Also, get the zbuffer values (bzzzz) from the cached tile. 
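+    * +    * For example (illustrative): with PIPE_FORMAT_Z16_UNORM the scale is 65535.0, so a +    * fragment depth of 0.25 becomes (unsigned) (0.25 * 65535.0) = 16383 and is compared +    * against the 16-bit values read from tile->data.depth16.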
+ */ + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + { + float scale = 65535.0; + + for (j = 0; j < QUAD_SIZE; j++) { + qzzzz[j] = (unsigned) (quad->output.depth[j] * scale); + } + + for (j = 0; j < QUAD_SIZE; j++) { + int x = quad->input.x0 % TILE_SIZE + (j & 1); + int y = quad->input.y0 % TILE_SIZE + (j >> 1); + bzzzz[j] = tile->data.depth16[y][x]; + } + } + break; + case PIPE_FORMAT_Z32_UNORM: + { + double scale = (double) (uint) ~0UL; + + for (j = 0; j < QUAD_SIZE; j++) { + qzzzz[j] = (unsigned) (quad->output.depth[j] * scale); + } + + for (j = 0; j < QUAD_SIZE; j++) { + int x = quad->input.x0 % TILE_SIZE + (j & 1); + int y = quad->input.y0 % TILE_SIZE + (j >> 1); + bzzzz[j] = tile->data.depth32[y][x]; + } + } + break; + case PIPE_FORMAT_X8Z24_UNORM: + /* fall-through */ + case PIPE_FORMAT_S8Z24_UNORM: + { + float scale = (float) ((1 << 24) - 1); + + for (j = 0; j < QUAD_SIZE; j++) { + qzzzz[j] = (unsigned) (quad->output.depth[j] * scale); + } + + for (j = 0; j < QUAD_SIZE; j++) { + int x = quad->input.x0 % TILE_SIZE + (j & 1); + int y = quad->input.y0 % TILE_SIZE + (j >> 1); + bzzzz[j] = tile->data.depth32[y][x] & 0xffffff; + } + } + break; + case PIPE_FORMAT_Z24X8_UNORM: + /* fall-through */ + case PIPE_FORMAT_Z24S8_UNORM: + { + float scale = (float) ((1 << 24) - 1); + + for (j = 0; j < QUAD_SIZE; j++) { + qzzzz[j] = (unsigned) (quad->output.depth[j] * scale); + } + + for (j = 0; j < QUAD_SIZE; j++) { + int x = quad->input.x0 % TILE_SIZE + (j & 1); + int y = quad->input.y0 % TILE_SIZE + (j >> 1); + bzzzz[j] = tile->data.depth32[y][x] >> 8; + } + } + break; + default: + assert(0); + } + + switch (softpipe->depth_stencil->depth.func) { + case PIPE_FUNC_NEVER: + /* zmask = 0 */ + break; + case PIPE_FUNC_LESS: + /* Note this is pretty much a single sse or cell instruction. 
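+       * (on SSE, roughly a packed compare followed by a move-mask.)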
+ * Like this: quad->mask &= (quad->outputs.depth < zzzz); + */ + for (j = 0; j < QUAD_SIZE; j++) { + if (qzzzz[j] < bzzzz[j]) + zmask |= 1 << j; + } + break; + case PIPE_FUNC_EQUAL: + for (j = 0; j < QUAD_SIZE; j++) { + if (qzzzz[j] == bzzzz[j]) + zmask |= 1 << j; + } + break; + case PIPE_FUNC_LEQUAL: + for (j = 0; j < QUAD_SIZE; j++) { + if (qzzzz[j] <= bzzzz[j]) + zmask |= (1 << j); + } + break; + case PIPE_FUNC_GREATER: + for (j = 0; j < QUAD_SIZE; j++) { + if (qzzzz[j] > bzzzz[j]) + zmask |= (1 << j); + } + break; + case PIPE_FUNC_NOTEQUAL: + for (j = 0; j < QUAD_SIZE; j++) { + if (qzzzz[j] != bzzzz[j]) + zmask |= (1 << j); + } + break; + case PIPE_FUNC_GEQUAL: + for (j = 0; j < QUAD_SIZE; j++) { + if (qzzzz[j] >= bzzzz[j]) + zmask |= (1 << j); + } + break; + case PIPE_FUNC_ALWAYS: + zmask = MASK_ALL; + break; + default: + assert(0); + } + + quad->inout.mask &= zmask; + + if (softpipe->depth_stencil->depth.writemask) { + + /* This is also efficient with sse / spe instructions: + */ + for (j = 0; j < QUAD_SIZE; j++) { + if (quad->inout.mask & (1 << j)) { + bzzzz[j] = qzzzz[j]; + } + } + + /* put updated Z values back into cached tile */ + switch (format) { + case PIPE_FORMAT_Z16_UNORM: + for (j = 0; j < QUAD_SIZE; j++) { + int x = quad->input.x0 % TILE_SIZE + (j & 1); + int y = quad->input.y0 % TILE_SIZE + (j >> 1); + tile->data.depth16[y][x] = (ushort) bzzzz[j]; + } + break; + case PIPE_FORMAT_X8Z24_UNORM: + /* fall-through */ + /* (yes, this falls through to a different case than above) */ + case PIPE_FORMAT_Z32_UNORM: + for (j = 0; j < QUAD_SIZE; j++) { + int x = quad->input.x0 % TILE_SIZE + (j & 1); + int y = quad->input.y0 % TILE_SIZE + (j >> 1); + tile->data.depth32[y][x] = bzzzz[j]; + } + break; + case PIPE_FORMAT_S8Z24_UNORM: + for (j = 0; j < QUAD_SIZE; j++) { + int x = quad->input.x0 % TILE_SIZE + (j & 1); + int y = quad->input.y0 % TILE_SIZE + (j >> 1); + uint s8z24 = tile->data.depth32[y][x]; + s8z24 = (s8z24 & 0xff000000) | bzzzz[j]; + tile->data.depth32[y][x] = s8z24; + } + break; + case PIPE_FORMAT_Z24S8_UNORM: + for (j = 0; j < QUAD_SIZE; j++) { + int x = quad->input.x0 % TILE_SIZE + (j & 1); + int y = quad->input.y0 % TILE_SIZE + (j >> 1); + uint z24s8 = tile->data.depth32[y][x]; + z24s8 = (z24s8 & 0xff) | (bzzzz[j] << 8); + tile->data.depth32[y][x] = z24s8; + } + break; + case PIPE_FORMAT_Z24X8_UNORM: + for (j = 0; j < QUAD_SIZE; j++) { + int x = quad->input.x0 % TILE_SIZE + (j & 1); + int y = quad->input.y0 % TILE_SIZE + (j >> 1); + tile->data.depth32[y][x] = bzzzz[j] << 8; + } + break; + default: + assert(0); + } + } +} + + +static void +depth_test_quad(struct quad_stage *qs, struct quad_header *quad) +{ + sp_depth_test_quad(qs, quad); + + if (quad->inout.mask) + qs->next->run(qs->next, quad); +} + + +static void depth_test_begin(struct quad_stage *qs) +{ + qs->next->begin(qs->next); +} + + +static void depth_test_destroy(struct quad_stage *qs) +{ + FREE( qs ); +} + + +struct quad_stage *sp_quad_depth_test_stage( struct softpipe_context *softpipe ) +{ + struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + + stage->softpipe = softpipe; + stage->begin = depth_test_begin; + stage->run = depth_test_quad; + stage->destroy = depth_test_destroy; + + return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_earlyz.c b/src/gallium/drivers/softpipe/sp_quad_earlyz.c new file mode 100644 index 0000000000..6e2dde304e --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_earlyz.c @@ -0,0 +1,88 @@ 
+/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \brief Quad early-z testing + */ + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "sp_headers.h" +#include "sp_quad.h" + + +/** + * All this stage does is compute the quad's Z values (which is normally + * done by the shading stage). + * The next stage will do the actual depth test. + */ +static void +earlyz_quad( + struct quad_stage *qs, + struct quad_header *quad ) +{ + const float fx = (float) quad->input.x0; + const float fy = (float) quad->input.y0; + const float dzdx = quad->posCoef->dadx[2]; + const float dzdy = quad->posCoef->dady[2]; + const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy; + + quad->output.depth[0] = z0; + quad->output.depth[1] = z0 + dzdx; + quad->output.depth[2] = z0 + dzdy; + quad->output.depth[3] = z0 + dzdx + dzdy; + + qs->next->run( qs->next, quad ); +} + +static void +earlyz_begin( + struct quad_stage *qs ) +{ + qs->next->begin( qs->next ); +} + +static void +earlyz_destroy( + struct quad_stage *qs ) +{ + FREE( qs ); +} + +struct quad_stage * +sp_quad_earlyz_stage( + struct softpipe_context *softpipe ) +{ + struct quad_stage *stage = CALLOC_STRUCT( quad_stage ); + + stage->softpipe = softpipe; + stage->begin = earlyz_begin; + stage->run = earlyz_quad; + stage->destroy = earlyz_destroy; + + return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c new file mode 100644 index 0000000000..963a2b44f5 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_fs.c @@ -0,0 +1,200 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Vertices are just an array of floats, with all the attributes + * packed. We currently assume a layout like: + * + * attr[0][0..3] - window position + * attr[1..n][0..3] - remaining attributes. + * + * Attributes are assumed to be 4 floats wide but are packed so that + * all the enabled attributes run contiguously. + */ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_headers.h" +#include "sp_quad.h" +#include "sp_texture.h" +#include "sp_tex_sample.h" + + +struct quad_shade_stage +{ + struct quad_stage stage; /**< base class */ + struct sp_shader_sampler samplers[PIPE_MAX_SAMPLERS]; + struct sp_shader_sampler *samplers_list[PIPE_MAX_SAMPLERS]; + struct tgsi_exec_machine machine; + struct tgsi_exec_vector *inputs, *outputs; +}; + + +/** cast wrapper */ +static INLINE struct quad_shade_stage * +quad_shade_stage(struct quad_stage *qs) +{ + return (struct quad_shade_stage *) qs; +} + + + +/** + * Execute fragment shader for the four fragments in the quad. + */ +static void +shade_quad( + struct quad_stage *qs, + struct quad_header *quad ) +{ + struct quad_shade_stage *qss = quad_shade_stage( qs ); + struct softpipe_context *softpipe = qs->softpipe; + struct tgsi_exec_machine *machine = &qss->machine; + boolean z_written; + + /* Consts do not require 16 byte alignment. 
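+    * (unlike the machine's Inputs/Outputs vectors, which sp_quad_shade_stage() aligns to +    * 16 bytes via align16().) +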
*/ + machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT]; + + machine->InterpCoefs = quad->coef; + + /* run shader */ + quad->inout.mask &= softpipe->fs->run( softpipe->fs, + &qss->machine, + quad ); + + /* store outputs */ + z_written = FALSE; + { + const ubyte *sem_name = softpipe->fs->info.output_semantic_name; + const ubyte *sem_index = softpipe->fs->info.output_semantic_index; + const uint n = qss->stage.softpipe->fs->info.num_outputs; + uint i; + for (i = 0; i < n; i++) { + switch (sem_name[i]) { + case TGSI_SEMANTIC_COLOR: + { + uint cbuf = sem_index[i]; + memcpy(quad->output.color[cbuf], + &machine->Outputs[i].xyzw[0].f[0], + sizeof(quad->output.color[0]) ); + } + break; + case TGSI_SEMANTIC_POSITION: + { + uint j; + for (j = 0; j < 4; j++) { + quad->output.depth[j] = machine->Outputs[0].xyzw[2].f[j]; + } + z_written = TRUE; + } + break; + } + } + } + + if (!z_written) { + /* compute Z values now, as in the quad earlyz stage */ + /* XXX we should really only do this if the earlyz stage is not used */ + const float fx = (float) quad->input.x0; + const float fy = (float) quad->input.y0; + const float dzdx = quad->posCoef->dadx[2]; + const float dzdy = quad->posCoef->dady[2]; + const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy; + + quad->output.depth[0] = z0; + quad->output.depth[1] = z0 + dzdx; + quad->output.depth[2] = z0 + dzdy; + quad->output.depth[3] = z0 + dzdx + dzdy; + } + + /* shader may cull fragments */ + if( quad->inout.mask ) { + qs->next->run( qs->next, quad ); + } +} + +/** + * Per-primitive (or per-begin?) setup + */ +static void shade_begin(struct quad_stage *qs) +{ + struct quad_shade_stage *qss = quad_shade_stage(qs); + struct softpipe_context *softpipe = qs->softpipe; + + softpipe->fs->prepare( softpipe->fs, + &qss->machine, + (struct tgsi_sampler **) qss->samplers_list ); + + qs->next->begin(qs->next); +} + + +static void shade_destroy(struct quad_stage *qs) +{ + struct quad_shade_stage *qss = (struct quad_shade_stage *) qs; + + tgsi_exec_machine_free_data(&qss->machine); + FREE( qss->inputs ); + FREE( qss->outputs ); + FREE( qs ); +} + + +struct quad_stage *sp_quad_shade_stage( struct softpipe_context *softpipe ) +{ + struct quad_shade_stage *qss = CALLOC_STRUCT(quad_shade_stage); + uint i; + + /* allocate storage for program inputs/outputs, aligned to 16 bytes */ + qss->inputs = MALLOC(PIPE_MAX_ATTRIBS * sizeof(*qss->inputs) + 16); + qss->outputs = MALLOC(PIPE_MAX_ATTRIBS * sizeof(*qss->outputs) + 16); + qss->machine.Inputs = align16(qss->inputs); + qss->machine.Outputs = align16(qss->outputs); + + qss->stage.softpipe = softpipe; + qss->stage.begin = shade_begin; + qss->stage.run = shade_quad; + qss->stage.destroy = shade_destroy; + + /* setup TGSI sampler state */ + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { + assert(softpipe->tex_cache[i]); + qss->samplers[i].base.get_samples = sp_get_samples; + qss->samplers[i].unit = i; + qss->samplers[i].sp = softpipe; + qss->samplers[i].cache = softpipe->tex_cache[i]; + qss->samplers_list[i] = &qss->samplers[i]; + } + + tgsi_exec_machine_init( &qss->machine ); + + return &qss->stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_occlusion.c b/src/gallium/drivers/softpipe/sp_quad_occlusion.c new file mode 100644 index 0000000000..169bd82876 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_occlusion.c @@ -0,0 +1,85 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * \brief Quad occlusion counter stage + * \author Brian Paul + */ + + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_quad.h" + +static unsigned count_bits( unsigned val ) +{ + unsigned i; + + for (i = 0; val ; val >>= 1) + i += (val & 1); + + return i; +} + +static void +occlusion_count_quad(struct quad_stage *qs, struct quad_header *quad) +{ + struct softpipe_context *softpipe = qs->softpipe; + + softpipe->occlusion_count += count_bits(quad->inout.mask); + + qs->next->run(qs->next, quad); +} + + +static void occlusion_begin(struct quad_stage *qs) +{ + qs->next->begin(qs->next); +} + + +static void occlusion_destroy(struct quad_stage *qs) +{ + FREE( qs ); +} + + +struct quad_stage *sp_quad_occlusion_stage( struct softpipe_context *softpipe ) +{ + struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + + stage->softpipe = softpipe; + stage->begin = occlusion_begin; + stage->run = occlusion_count_quad; + stage->destroy = occlusion_destroy; + + return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c new file mode 100644 index 0000000000..b7aac7f84a --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_output.c @@ -0,0 +1,103 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_quad.h" +#include "sp_tile_cache.h" + + +/** + * Last step of quad processing: write quad colors to the framebuffer, + * taking mask into account. + */ +static void +output_quad(struct quad_stage *qs, struct quad_header *quad) +{ + /* in-tile pos: */ + const int itx = quad->input.x0 % TILE_SIZE; + const int ity = quad->input.y0 % TILE_SIZE; + + struct softpipe_context *softpipe = qs->softpipe; + uint cbuf; + + /* loop over colorbuffer outputs */ + for (cbuf = 0; cbuf < softpipe->framebuffer.num_cbufs; cbuf++) { + struct softpipe_cached_tile *tile + = sp_get_cached_tile(softpipe, + softpipe->cbuf_cache[cbuf], + quad->input.x0, quad->input.y0); + float (*quadColor)[4] = quad->output.color[cbuf]; + int i, j; + + /* get/swizzle dest colors */ + for (j = 0; j < QUAD_SIZE; j++) { + if (quad->inout.mask & (1 << j)) { + int x = itx + (j & 1); + int y = ity + (j >> 1); + for (i = 0; i < 4; i++) { /* loop over color chans */ + tile->data.color[y][x][i] = quadColor[i][j]; + } + if (0) { + debug_printf("sp write pixel %d,%d: %g, %g, %g\n", + quad->input.x0 + x, + quad->input.y0 + y, + quadColor[0][j], + quadColor[1][j], + quadColor[2][j]); + } + } + } + } +} + + +static void output_begin(struct quad_stage *qs) +{ + assert(qs->next == NULL); +} + + +static void output_destroy(struct quad_stage *qs) +{ + FREE( qs ); +} + + +struct quad_stage *sp_quad_output_stage( struct softpipe_context *softpipe ) +{ + struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + + stage->softpipe = softpipe; + stage->begin = output_begin; + stage->run = output_quad; + stage->destroy = output_destroy; + + return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_stencil.c b/src/gallium/drivers/softpipe/sp_quad_stencil.c new file mode 100644 index 0000000000..abb5487748 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_stencil.c @@ -0,0 +1,352 @@ + +/** + * \brief Quad stencil testing + */ + + +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_tile_cache.h" +#include "sp_quad.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" + + +/** Only 8-bit stencil supported */ +#define STENCIL_MAX 0xff + + +/** + * Do the basic stencil test (compare stencil buffer values against the + * reference value. + * + * \param stencilVals the stencil values from the stencil buffer + * \param func the stencil func (PIPE_FUNC_x) + * \param ref the stencil reference value + * \param valMask the stencil value mask indicating which bits of the stencil + * values and ref value are to be used. 
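+ * + * Illustrative example: with func = PIPE_FUNC_LESS, ref = 0x10 and valMask = 0xff, a stored + * stencil value of 0x20 passes the test (0x10 < 0x20) while a stored value of 0x08 fails.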
+ * \return mask indicating which pixels passed the stencil test + */ +static unsigned +do_stencil_test(const ubyte stencilVals[QUAD_SIZE], unsigned func, + unsigned ref, unsigned valMask) +{ + unsigned passMask = 0x0; + unsigned j; + + ref &= valMask; + + switch (func) { + case PIPE_FUNC_NEVER: + /* passMask = 0x0 */ + break; + case PIPE_FUNC_LESS: + for (j = 0; j < QUAD_SIZE; j++) { + if (ref < (stencilVals[j] & valMask)) { + passMask |= (1 << j); + } + } + break; + case PIPE_FUNC_EQUAL: + for (j = 0; j < QUAD_SIZE; j++) { + if (ref == (stencilVals[j] & valMask)) { + passMask |= (1 << j); + } + } + break; + case PIPE_FUNC_LEQUAL: + for (j = 0; j < QUAD_SIZE; j++) { + if (ref <= (stencilVals[j] & valMask)) { + passMask |= (1 << j); + } + } + break; + case PIPE_FUNC_GREATER: + for (j = 0; j < QUAD_SIZE; j++) { + if (ref > (stencilVals[j] & valMask)) { + passMask |= (1 << j); + } + } + break; + case PIPE_FUNC_NOTEQUAL: + for (j = 0; j < QUAD_SIZE; j++) { + if (ref != (stencilVals[j] & valMask)) { + passMask |= (1 << j); + } + } + break; + case PIPE_FUNC_GEQUAL: + for (j = 0; j < QUAD_SIZE; j++) { + if (ref >= (stencilVals[j] & valMask)) { + passMask |= (1 << j); + } + } + break; + case PIPE_FUNC_ALWAYS: + passMask = MASK_ALL; + break; + default: + assert(0); + } + + return passMask; +} + + +/** + * Apply the stencil operator to stencil values. + * + * \param stencilVals the stencil buffer values (read and written) + * \param mask indicates which pixels to update + * \param op the stencil operator (PIPE_STENCIL_OP_x) + * \param ref the stencil reference value + * \param wrtMask writemask controlling which bits are changed in the + * stencil values + */ +static void +apply_stencil_op(ubyte stencilVals[QUAD_SIZE], + unsigned mask, unsigned op, ubyte ref, ubyte wrtMask) +{ + unsigned j; + ubyte newstencil[QUAD_SIZE]; + + for (j = 0; j < QUAD_SIZE; j++) { + newstencil[j] = stencilVals[j]; + } + + switch (op) { + case PIPE_STENCIL_OP_KEEP: + /* no-op */ + break; + case PIPE_STENCIL_OP_ZERO: + for (j = 0; j < QUAD_SIZE; j++) { + if (mask & (1 << j)) { + newstencil[j] = 0; + } + } + break; + case PIPE_STENCIL_OP_REPLACE: + for (j = 0; j < QUAD_SIZE; j++) { + if (mask & (1 << j)) { + newstencil[j] = ref; + } + } + break; + case PIPE_STENCIL_OP_INCR: + for (j = 0; j < QUAD_SIZE; j++) { + if (mask & (1 << j)) { + if (stencilVals[j] < STENCIL_MAX) { + newstencil[j] = stencilVals[j] + 1; + } + } + } + break; + case PIPE_STENCIL_OP_DECR: + for (j = 0; j < QUAD_SIZE; j++) { + if (mask & (1 << j)) { + if (stencilVals[j] > 0) { + newstencil[j] = stencilVals[j] - 1; + } + } + } + break; + case PIPE_STENCIL_OP_INCR_WRAP: + for (j = 0; j < QUAD_SIZE; j++) { + if (mask & (1 << j)) { + newstencil[j] = stencilVals[j] + 1; + } + } + break; + case PIPE_STENCIL_OP_DECR_WRAP: + for (j = 0; j < QUAD_SIZE; j++) { + if (mask & (1 << j)) { + newstencil[j] = stencilVals[j] - 1; + } + } + break; + case PIPE_STENCIL_OP_INVERT: + for (j = 0; j < QUAD_SIZE; j++) { + if (mask & (1 << j)) { + newstencil[j] = ~stencilVals[j]; + } + } + break; + default: + assert(0); + } + + /* + * update the stencil values + */ + if (wrtMask != STENCIL_MAX) { + /* apply bit-wise stencil buffer writemask */ + for (j = 0; j < QUAD_SIZE; j++) { + stencilVals[j] = (wrtMask & newstencil[j]) | (~wrtMask & stencilVals[j]); + } + } + else { + for (j = 0; j < QUAD_SIZE; j++) { + stencilVals[j] = newstencil[j]; + } + } +} + + +/** + * Do stencil (and depth) testing. Stenciling depends on the outcome of + * depth testing. 
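+ * The three stencil operators are applied as follows: fail_op to fragments that fail the + * stencil test, zfail_op to fragments that pass the stencil test but fail the depth test, + * and zpass_op to fragments that pass both (or that pass the stencil test when depth + * testing is disabled).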
+ */ +static void +stencil_test_quad(struct quad_stage *qs, struct quad_header *quad) +{ + struct softpipe_context *softpipe = qs->softpipe; + struct pipe_surface *ps = softpipe->framebuffer.zsbuf; + unsigned func, zFailOp, zPassOp, failOp; + ubyte ref, wrtMask, valMask; + ubyte stencilVals[QUAD_SIZE]; + struct softpipe_cached_tile *tile + = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->input.x0, quad->input.y0); + uint j; + uint face = quad->input.facing; + + if (!softpipe->depth_stencil->stencil[1].enabled) { + /* single-sided stencil test, use front (face=0) state */ + face = 0; + } + + /* choose front or back face function, operator, etc */ + /* XXX we could do these initializations once per primitive */ + func = softpipe->depth_stencil->stencil[face].func; + failOp = softpipe->depth_stencil->stencil[face].fail_op; + zFailOp = softpipe->depth_stencil->stencil[face].zfail_op; + zPassOp = softpipe->depth_stencil->stencil[face].zpass_op; + ref = softpipe->depth_stencil->stencil[face].ref_value; + wrtMask = softpipe->depth_stencil->stencil[face].write_mask; + valMask = softpipe->depth_stencil->stencil[face].value_mask; + + assert(ps); /* shouldn't get here if there's no stencil buffer */ + + /* get stencil values from cached tile */ + switch (ps->format) { + case PIPE_FORMAT_S8Z24_UNORM: + for (j = 0; j < QUAD_SIZE; j++) { + int x = quad->input.x0 % TILE_SIZE + (j & 1); + int y = quad->input.y0 % TILE_SIZE + (j >> 1); + stencilVals[j] = tile->data.depth32[y][x] >> 24; + } + break; + case PIPE_FORMAT_Z24S8_UNORM: + for (j = 0; j < QUAD_SIZE; j++) { + int x = quad->input.x0 % TILE_SIZE + (j & 1); + int y = quad->input.y0 % TILE_SIZE + (j >> 1); + stencilVals[j] = tile->data.depth32[y][x] & 0xff; + } + break; + case PIPE_FORMAT_S8_UNORM: + for (j = 0; j < QUAD_SIZE; j++) { + int x = quad->input.x0 % TILE_SIZE + (j & 1); + int y = quad->input.y0 % TILE_SIZE + (j >> 1); + stencilVals[j] = tile->data.stencil8[y][x]; + } + break; + default: + assert(0); + } + + /* do the stencil test first */ + { + unsigned passMask, failMask; + passMask = do_stencil_test(stencilVals, func, ref, valMask); + failMask = quad->inout.mask & ~passMask; + quad->inout.mask &= passMask; + + if (failOp != PIPE_STENCIL_OP_KEEP) { + apply_stencil_op(stencilVals, failMask, failOp, ref, wrtMask); + } + } + + if (quad->inout.mask) { + /* now the pixels that passed the stencil test are depth tested */ + if (softpipe->depth_stencil->depth.enabled) { + const unsigned origMask = quad->inout.mask; + + sp_depth_test_quad(qs, quad); /* quad->mask is updated */ + + /* update stencil buffer values according to z pass/fail result */ + if (zFailOp != PIPE_STENCIL_OP_KEEP) { + const unsigned failMask = origMask & ~quad->inout.mask; + apply_stencil_op(stencilVals, failMask, zFailOp, ref, wrtMask); + } + + if (zPassOp != PIPE_STENCIL_OP_KEEP) { + const unsigned passMask = origMask & quad->inout.mask; + apply_stencil_op(stencilVals, passMask, zPassOp, ref, wrtMask); + } + } + else { + /* no depth test, apply Zpass operator to stencil buffer values */ + apply_stencil_op(stencilVals, quad->inout.mask, zPassOp, ref, wrtMask); + } + + } + + /* put new stencil values into cached tile */ + switch (ps->format) { + case PIPE_FORMAT_S8Z24_UNORM: + for (j = 0; j < QUAD_SIZE; j++) { + int x = quad->input.x0 % TILE_SIZE + (j & 1); + int y = quad->input.y0 % TILE_SIZE + (j >> 1); + uint s8z24 = tile->data.depth32[y][x]; + s8z24 = (stencilVals[j] << 24) | (s8z24 & 0xffffff); + tile->data.depth32[y][x] = s8z24; + } + break; + case 
PIPE_FORMAT_Z24S8_UNORM: + for (j = 0; j < QUAD_SIZE; j++) { + int x = quad->input.x0 % TILE_SIZE + (j & 1); + int y = quad->input.y0 % TILE_SIZE + (j >> 1); + uint z24s8 = tile->data.depth32[y][x]; + z24s8 = (z24s8 & 0xffffff00) | stencilVals[j]; + tile->data.depth32[y][x] = z24s8; + } + break; + case PIPE_FORMAT_S8_UNORM: + for (j = 0; j < QUAD_SIZE; j++) { + int x = quad->input.x0 % TILE_SIZE + (j & 1); + int y = quad->input.y0 % TILE_SIZE + (j >> 1); + tile->data.stencil8[y][x] = stencilVals[j]; + } + break; + default: + assert(0); + } + + if (quad->inout.mask) + qs->next->run(qs->next, quad); +} + + +static void stencil_begin(struct quad_stage *qs) +{ + qs->next->begin(qs->next); +} + + +static void stencil_destroy(struct quad_stage *qs) +{ + FREE( qs ); +} + + +struct quad_stage *sp_quad_stencil_test_stage( struct softpipe_context *softpipe ) +{ + struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + + stage->softpipe = softpipe; + stage->begin = stencil_begin; + stage->run = stencil_test_quad; + stage->destroy = stencil_destroy; + + return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_quad_stipple.c b/src/gallium/drivers/softpipe/sp_quad_stipple.c new file mode 100644 index 0000000000..ccf37f6be5 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_quad_stipple.c @@ -0,0 +1,94 @@ + +/** + * quad polygon stipple stage + */ + +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_quad.h" +#include "pipe/p_defines.h" +#include "util/u_memory.h" + + +/** + * Apply polygon stipple to quads produced by triangle rasterization + */ +static void +stipple_quad(struct quad_stage *qs, struct quad_header *quad) +{ + static const uint bit31 = 1 << 31; + static const uint bit30 = 1 << 30; + + if (quad->input.prim == PRIM_TRI) { + struct softpipe_context *softpipe = qs->softpipe; + /* need to invert Y to index into OpenGL's stipple pattern */ + int y0, y1; + uint stipple0, stipple1; + if (softpipe->rasterizer->origin_lower_left) { + y0 = softpipe->framebuffer.height - 1 - quad->input.y0; + y1 = y0 - 1; + } + else { + y0 = quad->input.y0; + y1 = y0 + 1; + } + stipple0 = softpipe->poly_stipple.stipple[y0 % 32]; + stipple1 = softpipe->poly_stipple.stipple[y1 % 32]; + +#if 1 + { + const int col0 = quad->input.x0 % 32; + if ((stipple0 & (bit31 >> col0)) == 0) + quad->inout.mask &= ~MASK_TOP_LEFT; + + if ((stipple0 & (bit30 >> col0)) == 0) + quad->inout.mask &= ~MASK_TOP_RIGHT; + + if ((stipple1 & (bit31 >> col0)) == 0) + quad->inout.mask &= ~MASK_BOTTOM_LEFT; + + if ((stipple1 & (bit30 >> col0)) == 0) + quad->inout.mask &= ~MASK_BOTTOM_RIGHT; + } +#else + /* We'd like to use this code, but we'd need to redefine + * MASK_TOP_LEFT to be (1 << 1) and MASK_TOP_RIGHT to be (1 << 0), + * and similarly for the BOTTOM bits. But that may have undesirable + * side effects elsewhere. 
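+       * (With that layout the low two bits of (stipple0 >> col0) would line up with the +       * top-right/top-left pixels of the quad, and stipple1 likewise with the bottom row.)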
+ */ + const int col0 = 30 - (quad->input.x0 % 32); + quad->inout.mask &= (((stipple0 >> col0) & 0x3) | + (((stipple1 >> col0) & 0x3) << 2)); +#endif + if (!quad->inout.mask) + return; + } + + qs->next->run(qs->next, quad); +} + + +static void stipple_begin(struct quad_stage *qs) +{ + qs->next->begin(qs->next); +} + + +static void stipple_destroy(struct quad_stage *qs) +{ + FREE( qs ); +} + + +struct quad_stage * +sp_quad_polygon_stipple_stage( struct softpipe_context *softpipe ) +{ + struct quad_stage *stage = CALLOC_STRUCT(quad_stage); + + stage->softpipe = softpipe; + stage->begin = stipple_begin; + stage->run = stipple_quad; + stage->destroy = stipple_destroy; + + return stage; +} diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c new file mode 100644 index 0000000000..2106ee1d23 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_query.c @@ -0,0 +1,107 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Author: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "draw/draw_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_query.h" + +struct softpipe_query { + uint64 start; + uint64 end; +}; + + +static struct softpipe_query *softpipe_query( struct pipe_query *p ) +{ + return (struct softpipe_query *)p; +} + +static struct pipe_query * +softpipe_create_query(struct pipe_context *pipe, + unsigned type) +{ + assert(type == PIPE_QUERY_OCCLUSION_COUNTER); + return (struct pipe_query *)CALLOC_STRUCT( softpipe_query ); +} + + +static void +softpipe_destroy_query(struct pipe_context *pipe, struct pipe_query *q) +{ + FREE(q); +} + + +static void +softpipe_begin_query(struct pipe_context *pipe, struct pipe_query *q) +{ + struct softpipe_context *softpipe = softpipe_context( pipe ); + struct softpipe_query *sq = softpipe_query(q); + + sq->start = softpipe->occlusion_count; +} + + +static void +softpipe_end_query(struct pipe_context *pipe, struct pipe_query *q) +{ + struct softpipe_context *softpipe = softpipe_context( pipe ); + struct softpipe_query *sq = softpipe_query(q); + + sq->end = softpipe->occlusion_count; +} + + +static boolean +softpipe_get_query_result(struct pipe_context *pipe, + struct pipe_query *q, + boolean wait, + uint64 *result ) +{ + struct softpipe_query *sq = softpipe_query(q); + *result = sq->end - sq->start; + return TRUE; +} + + +void softpipe_init_query_funcs(struct softpipe_context *softpipe ) +{ + softpipe->pipe.create_query = softpipe_create_query; + softpipe->pipe.destroy_query = softpipe_destroy_query; + softpipe->pipe.begin_query = softpipe_begin_query; + softpipe->pipe.end_query = softpipe_end_query; + softpipe->pipe.get_query_result = softpipe_get_query_result; +} + + diff --git a/src/gallium/drivers/softpipe/sp_query.h b/src/gallium/drivers/softpipe/sp_query.h new file mode 100644 index 0000000000..05060a4575 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_query.h @@ -0,0 +1,39 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Author: + * Keith Whitwell + */ + +#ifndef SP_QUERY_H +#define SP_QUERY_H + +struct softpipe_context; +extern void softpipe_init_query_funcs(struct softpipe_context * ); + + +#endif /* SP_QUERY_H */ diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c new file mode 100644 index 0000000000..12f98c32f5 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -0,0 +1,179 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +#include "util/u_memory.h" +#include "pipe/p_winsys.h" +#include "pipe/p_defines.h" +#include "pipe/p_screen.h" + +#include "sp_texture.h" +#include "sp_winsys.h" +#include "sp_screen.h" + + +static const char * +softpipe_get_vendor(struct pipe_screen *screen) +{ + return "Tungsten Graphics, Inc."; +} + + +static const char * +softpipe_get_name(struct pipe_screen *screen) +{ + return "softpipe"; +} + + +static int +softpipe_get_param(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS: + return 8; + case PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS: + return 8; + case PIPE_CAP_NPOT_TEXTURES: + return 1; + case PIPE_CAP_TWO_SIDED_STENCIL: + return 1; + case PIPE_CAP_GLSL: + return 1; + case PIPE_CAP_S3TC: + return 0; + case PIPE_CAP_ANISOTROPIC_FILTER: + return 0; + case PIPE_CAP_POINT_SPRITE: + return 1; + case PIPE_CAP_MAX_RENDER_TARGETS: + return PIPE_MAX_COLOR_BUFS; + case PIPE_CAP_OCCLUSION_QUERY: + return 1; + case PIPE_CAP_TEXTURE_MIRROR_CLAMP: + return 1; + case PIPE_CAP_TEXTURE_MIRROR_REPEAT: + return 1; + case PIPE_CAP_TEXTURE_SHADOW_MAP: + return 1; + case PIPE_CAP_MAX_TEXTURE_2D_LEVELS: + return 12; /* max 2Kx2K */ + case PIPE_CAP_MAX_TEXTURE_3D_LEVELS: + return 8; /* max 128x128x128 */ + case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS: + return 12; /* max 2Kx2K */ + default: + return 0; + } +} + + +static float +softpipe_get_paramf(struct pipe_screen *screen, int param) +{ + switch (param) { + case PIPE_CAP_MAX_LINE_WIDTH: + /* fall-through */ + case PIPE_CAP_MAX_LINE_WIDTH_AA: + return 255.0; /* arbitrary */ + case PIPE_CAP_MAX_POINT_WIDTH: + /* fall-through */ + case PIPE_CAP_MAX_POINT_WIDTH_AA: + return 255.0; /* arbitrary */ + case PIPE_CAP_MAX_TEXTURE_ANISOTROPY: + return 0.0; + case PIPE_CAP_MAX_TEXTURE_LOD_BIAS: + return 16.0; /* arbitrary */ + default: + return 0; + } +} + + +/** + * Query format support for creating a texture, drawing surface, etc. + * \param format the format to test + * \param type one of PIPE_TEXTURE, PIPE_SURFACE + */ +static boolean +softpipe_is_format_supported( struct pipe_screen *screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, + unsigned geom_flags ) +{ + switch(format) { + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_RGBA: + case PIPE_FORMAT_DXT3_RGBA: + case PIPE_FORMAT_DXT5_RGBA: + return FALSE; + default: + return TRUE; + } +} + + +static void +softpipe_destroy_screen( struct pipe_screen *screen ) +{ + struct pipe_winsys *winsys = screen->winsys; + + if(winsys->destroy) + winsys->destroy(winsys); + + FREE(screen); +} + + + +/** + * Create a new pipe_screen object + * Note: we're not presently subclassing pipe_screen (no softpipe_screen). 
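+ * (The code below does allocate a small softpipe_screen wrapper, defined in sp_screen.h, + * and returns its embedded base pipe_screen.)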
+ */ +struct pipe_screen * +softpipe_create_screen(struct pipe_winsys *winsys) +{ + struct softpipe_screen *screen = CALLOC_STRUCT(softpipe_screen); + + if (!screen) + return NULL; + + screen->base.winsys = winsys; + + screen->base.destroy = softpipe_destroy_screen; + + screen->base.get_name = softpipe_get_name; + screen->base.get_vendor = softpipe_get_vendor; + screen->base.get_param = softpipe_get_param; + screen->base.get_paramf = softpipe_get_paramf; + screen->base.is_format_supported = softpipe_is_format_supported; + + softpipe_init_screen_texture_funcs(&screen->base); + + return &screen->base; +} diff --git a/src/gallium/drivers/softpipe/sp_screen.h b/src/gallium/drivers/softpipe/sp_screen.h new file mode 100644 index 0000000000..3d4bfd3e84 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_screen.h @@ -0,0 +1,58 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef SP_SCREEN_H +#define SP_SCREEN_H + +#include "pipe/p_screen.h" +#include "pipe/p_defines.h" + + + +struct softpipe_screen { + struct pipe_screen base; + + /* Increments whenever textures are modified. Contexts can track + * this. + */ + unsigned timestamp; +}; + + + + +static INLINE struct softpipe_screen * +softpipe_screen( struct pipe_screen *pipe ) +{ + return (struct softpipe_screen *)pipe; +} + + +#endif /* SP_SCREEN_H */ diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c new file mode 100644 index 0000000000..13d8017393 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_setup.c @@ -0,0 +1,1569 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \brief Primitive rasterization/rendering (points, lines, triangles) + * + * \author Keith Whitwell <keith@tungstengraphics.com> + * \author Brian Paul + */ + +#include "sp_setup.h" + +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_quad.h" +#include "sp_state.h" +#include "sp_prim_setup.h" +#include "draw/draw_context.h" +#include "draw/draw_private.h" +#include "draw/draw_vertex.h" +#include "pipe/p_shader_tokens.h" +#include "pipe/p_thread.h" +#include "util/u_math.h" +#include "util/u_memory.h" + + +#define DEBUG_VERTS 0 +#define DEBUG_FRAGS 0 + +/** + * Triangle edge info + */ +struct edge { + float dx; /**< X(v1) - X(v0), used only during setup */ + float dy; /**< Y(v1) - Y(v0), used only during setup */ + float dxdy; /**< dx/dy */ + float sx, sy; /**< first sample point coord */ + int lines; /**< number of lines on this edge */ +}; + +#if SP_NUM_QUAD_THREADS > 1 + +/* Set to 1 if you want other threads to be instantly + * notified of pending jobs. + */ +#define INSTANT_NOTEMPTY_NOTIFY 0 + +struct thread_info +{ + struct setup_context *setup; + uint id; + pipe_thread handle; +}; + +struct quad_job; + +typedef void (* quad_job_routine)( struct setup_context *setup, uint thread, struct quad_job *job ); + +struct quad_job +{ + struct quad_header_input input; + struct quad_header_inout inout; + quad_job_routine routine; +}; + +#define NUM_QUAD_JOBS 64 + +struct quad_job_que +{ + struct quad_job jobs[NUM_QUAD_JOBS]; + uint first; + uint last; + pipe_mutex que_mutex; + pipe_condvar que_notfull_condvar; + pipe_condvar que_notempty_condvar; + uint jobs_added; + uint jobs_done; + pipe_condvar que_done_condvar; +}; + +static void +add_quad_job( struct quad_job_que *que, struct quad_header *quad, quad_job_routine routine ) +{ +#if INSTANT_NOTEMPTY_NOTIFY + boolean empty; +#endif + + /* Wait for empty slot, see if the que is empty. + */ + pipe_mutex_lock( que->que_mutex ); + while ((que->last + 1) % NUM_QUAD_JOBS == que->first) { +#if !INSTANT_NOTEMPTY_NOTIFY + pipe_condvar_broadcast( que->que_notempty_condvar ); +#endif + pipe_condvar_wait( que->que_notfull_condvar, que->que_mutex ); + } +#if INSTANT_NOTEMPTY_NOTIFY + empty = que->last == que->first; +#endif + que->jobs_added++; + pipe_mutex_unlock( que->que_mutex ); + + /* Submit new job. 
+ */ + que->jobs[que->last].input = quad->input; + que->jobs[que->last].inout = quad->inout; + que->jobs[que->last].routine = routine; + que->last = (que->last + 1) % NUM_QUAD_JOBS; + +#if INSTANT_NOTEMPTY_NOTIFY + /* If the que was empty, notify consumers there's a job to be done. + */ + if (empty) { + pipe_mutex_lock( que->que_mutex ); + pipe_condvar_broadcast( que->que_notempty_condvar ); + pipe_mutex_unlock( que->que_mutex ); + } +#endif +} + +#endif + +/** + * Triangle setup info (derived from draw_stage). + * Also used for line drawing (taking some liberties). + */ +struct setup_context { + struct softpipe_context *softpipe; + + /* Vertices are just an array of floats making up each attribute in + * turn. Currently fixed at 4 floats, but should change in time. + * Codegen will help cope with this. + */ + const float (*vmax)[4]; + const float (*vmid)[4]; + const float (*vmin)[4]; + const float (*vprovoke)[4]; + + struct edge ebot; + struct edge etop; + struct edge emaj; + + float oneoverarea; + + struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS]; + struct tgsi_interp_coef posCoef; /* For Z, W */ + struct quad_header quad; + +#if SP_NUM_QUAD_THREADS > 1 + struct quad_job_que que; + struct thread_info threads[SP_NUM_QUAD_THREADS]; +#endif + + struct { + int left[2]; /**< [0] = row0, [1] = row1 */ + int right[2]; + int y; + unsigned y_flags; + unsigned mask; /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */ + } span; + +#if DEBUG_FRAGS + uint numFragsEmitted; /**< per primitive */ + uint numFragsWritten; /**< per primitive */ +#endif + + unsigned winding; /* which winding to cull */ +}; + +#if SP_NUM_QUAD_THREADS > 1 + +static PIPE_THREAD_ROUTINE( quad_thread, param ) +{ + struct thread_info *info = (struct thread_info *) param; + struct quad_job_que *que = &info->setup->que; + + for (;;) { + struct quad_job job; + boolean full; + + /* Wait for an available job. + */ + pipe_mutex_lock( que->que_mutex ); + while (que->last == que->first) + pipe_condvar_wait( que->que_notempty_condvar, que->que_mutex ); + + /* See if the que is full. + */ + full = (que->last + 1) % NUM_QUAD_JOBS == que->first; + + /* Take a job and remove it from que. + */ + job = que->jobs[que->first]; + que->first = (que->first + 1) % NUM_QUAD_JOBS; + + /* Notify the producer if the que is not full. + */ + if (full) + pipe_condvar_signal( que->que_notfull_condvar ); + pipe_mutex_unlock( que->que_mutex ); + + job.routine( info->setup, info->id, &job ); + + /* Notify the producer if that's the last finished job. + */ + pipe_mutex_lock( que->que_mutex ); + que->jobs_done++; + if (que->jobs_added == que->jobs_done) + pipe_condvar_signal( que->que_done_condvar ); + pipe_mutex_unlock( que->que_mutex ); + } + + return NULL; +} + +#define WAIT_FOR_COMPLETION(setup) \ + do {\ + pipe_mutex_lock( setup->que.que_mutex );\ + if (!INSTANT_NOTEMPTY_NOTIFY)\ + pipe_condvar_broadcast( setup->que.que_notempty_condvar );\ + while (setup->que.jobs_added != setup->que.jobs_done)\ + pipe_condvar_wait( setup->que.que_done_condvar, setup->que.que_mutex );\ + pipe_mutex_unlock( setup->que.que_mutex );\ + } while (0) + +#else + +#define WAIT_FOR_COMPLETION(setup) ((void) 0) + +#endif + +/** + * Test if x is NaN or +/- infinity. 
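A note on the job queue above: it is a fixed-size ring buffer that deliberately leaves one slot unused so "full" and "empty" can be told apart, which means at most NUM_QUAD_JOBS - 1 quads are ever outstanding. jobs_added and jobs_done are only touched under que_mutex, and WAIT_FOR_COMPLETION() compares them to drain the queue at the end of each primitive. The two predicates, written out as hypothetical helpers for clarity (they do not exist in the file):

static INLINE boolean
que_is_empty(const struct quad_job_que *que)
{
   return que->first == que->last;
}

static INLINE boolean
que_is_full(const struct quad_job_que *que)
{
   /* one slot stays unused, so capacity is NUM_QUAD_JOBS - 1 jobs */
   return (que->last + 1) % NUM_QUAD_JOBS == que->first;
}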
+ */ +static INLINE boolean +is_inf_or_nan(float x) +{ + union fi tmp; + tmp.f = x; + return !(int)((unsigned int)((tmp.i & 0x7fffffff)-0x7f800000) >> 31); +} + + +static boolean cull_tri( struct setup_context *setup, + float det ) +{ + if (det != 0) + { + /* if (det < 0 then Z points toward camera and triangle is + * counter-clockwise winding. + */ + unsigned winding = (det < 0) ? PIPE_WINDING_CCW : PIPE_WINDING_CW; + + if ((winding & setup->winding) == 0) + return FALSE; + } + + /* Culled: + */ + return TRUE; +} + + + +/** + * Clip setup->quad against the scissor/surface bounds. + */ +static INLINE void +quad_clip( struct setup_context *setup, struct quad_header *quad ) +{ + const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect; + const int minx = (int) cliprect->minx; + const int maxx = (int) cliprect->maxx; + const int miny = (int) cliprect->miny; + const int maxy = (int) cliprect->maxy; + + if (quad->input.x0 >= maxx || + quad->input.y0 >= maxy || + quad->input.x0 + 1 < minx || + quad->input.y0 + 1 < miny) { + /* totally clipped */ + quad->inout.mask = 0x0; + return; + } + if (quad->input.x0 < minx) + quad->inout.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT); + if (quad->input.y0 < miny) + quad->inout.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT); + if (quad->input.x0 == maxx - 1) + quad->inout.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT); + if (quad->input.y0 == maxy - 1) + quad->inout.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT); +} + + +/** + * Emit a quad (pass to next stage) with clipping. + */ +static INLINE void +clip_emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread ) +{ + quad_clip( setup, quad ); + if (quad->inout.mask) { + struct softpipe_context *sp = setup->softpipe; + + sp->quad[thread].first->run( sp->quad[thread].first, quad ); + } +} + +#if SP_NUM_QUAD_THREADS > 1 + +static void +clip_emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job ) +{ + struct quad_header quad; + + quad.input = job->input; + quad.inout = job->inout; + quad.coef = setup->quad.coef; + quad.posCoef = setup->quad.posCoef; + quad.nr_attrs = setup->quad.nr_attrs; + clip_emit_quad( setup, &quad, thread ); +} + +#define CLIP_EMIT_QUAD(setup) add_quad_job( &setup->que, &setup->quad, clip_emit_quad_job ) + +#else + +#define CLIP_EMIT_QUAD(setup) clip_emit_quad( setup, &setup->quad, 0 ) + +#endif + +/** + * Emit a quad (pass to next stage). No clipping is done. 
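Throughout this rasterizer a "quad" is a 2x2 block of pixels anchored at an even (x0, y0), and inout.mask selects which of the four pixels are alive; quad_clip() above simply knocks bits out of that mask at the scissor edges. Judging from the bit arithmetic in plot() and flush_spans() further below, the bit for pixel (x0+ix, y0+iy) is (1 << ix) << (2*iy), and the MASK_TOP_LEFT/..._BOTTOM_RIGHT names are assumed to follow that same assignment in sp_quad.h. Stated as a hypothetical helper:

/* bit 0 = top-left, bit 1 = top-right, bit 2 = bottom-left, bit 3 = bottom-right */
static INLINE uint
quad_pixel_bit(int ix, int iy)
{
   return (1 << ix) << (2 * iy);
}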
+ */ +static INLINE void +emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread ) +{ + struct softpipe_context *sp = setup->softpipe; +#if DEBUG_FRAGS + uint mask = quad->inout.mask; +#endif + +#if DEBUG_FRAGS + if (mask & 1) setup->numFragsEmitted++; + if (mask & 2) setup->numFragsEmitted++; + if (mask & 4) setup->numFragsEmitted++; + if (mask & 8) setup->numFragsEmitted++; +#endif + sp->quad[thread].first->run( sp->quad[thread].first, quad ); +#if DEBUG_FRAGS + mask = quad->inout.mask; + if (mask & 1) setup->numFragsWritten++; + if (mask & 2) setup->numFragsWritten++; + if (mask & 4) setup->numFragsWritten++; + if (mask & 8) setup->numFragsWritten++; +#endif +} + +#if SP_NUM_QUAD_THREADS > 1 + +static void +emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job ) +{ + struct quad_header quad; + + quad.input = job->input; + quad.inout = job->inout; + quad.coef = setup->quad.coef; + quad.posCoef = setup->quad.posCoef; + quad.nr_attrs = setup->quad.nr_attrs; + emit_quad( setup, &quad, thread ); +} + +#define EMIT_QUAD(setup,x,y,mask) do {\ + setup->quad.input.x0 = x;\ + setup->quad.input.y0 = y;\ + setup->quad.inout.mask = mask;\ + add_quad_job( &setup->que, &setup->quad, emit_quad_job );\ + } while (0) + +#else + +#define EMIT_QUAD(setup,x,y,mask) do {\ + setup->quad.input.x0 = x;\ + setup->quad.input.y0 = y;\ + setup->quad.inout.mask = mask;\ + emit_quad( setup, &setup->quad, 0 );\ + } while (0) + +#endif + +/** + * Given an X or Y coordinate, return the block/quad coordinate that it + * belongs to. + */ +static INLINE int block( int x ) +{ + return x & ~1; +} + + +/** + * Render a horizontal span of quads + */ +static void flush_spans( struct setup_context *setup ) +{ + const int xleft0 = setup->span.left[0]; + const int xleft1 = setup->span.left[1]; + const int xright0 = setup->span.right[0]; + const int xright1 = setup->span.right[1]; + int minleft, maxright; + int x; + + switch (setup->span.y_flags) { + case 0x3: + /* both odd and even lines written (both quad rows) */ + minleft = block(MIN2(xleft0, xleft1)); + maxright = block(MAX2(xright0, xright1)); + for (x = minleft; x <= maxright; x += 2) { + /* determine which of the four pixels is inside the span bounds */ + uint mask = 0x0; + if (x >= xleft0 && x < xright0) + mask |= MASK_TOP_LEFT; + if (x >= xleft1 && x < xright1) + mask |= MASK_BOTTOM_LEFT; + if (x+1 >= xleft0 && x+1 < xright0) + mask |= MASK_TOP_RIGHT; + if (x+1 >= xleft1 && x+1 < xright1) + mask |= MASK_BOTTOM_RIGHT; + EMIT_QUAD( setup, x, setup->span.y, mask ); + } + break; + + case 0x1: + /* only even line written (quad top row) */ + minleft = block(xleft0); + maxright = block(xright0); + for (x = minleft; x <= maxright; x += 2) { + uint mask = 0x0; + if (x >= xleft0 && x < xright0) + mask |= MASK_TOP_LEFT; + if (x+1 >= xleft0 && x+1 < xright0) + mask |= MASK_TOP_RIGHT; + EMIT_QUAD( setup, x, setup->span.y, mask ); + } + break; + + case 0x2: + /* only odd line written (quad bottom row) */ + minleft = block(xleft1); + maxright = block(xright1); + for (x = minleft; x <= maxright; x += 2) { + uint mask = 0x0; + if (x >= xleft1 && x < xright1) + mask |= MASK_BOTTOM_LEFT; + if (x+1 >= xleft1 && x+1 < xright1) + mask |= MASK_BOTTOM_RIGHT; + EMIT_QUAD( setup, x, setup->span.y, mask ); + } + break; + + default: + return; + } + + setup->span.y = 0; + setup->span.y_flags = 0; + setup->span.right[0] = 0; + setup->span.right[1] = 0; +} + + +#if DEBUG_VERTS +static void print_vertex(const struct setup_context *setup, + const float 
(*v)[4]) +{ + int i; + debug_printf(" Vertex: (%p)\n", v); + for (i = 0; i < setup->quad.nr_attrs; i++) { + debug_printf(" %d: %f %f %f %f\n", i, + v[i][0], v[i][1], v[i][2], v[i][3]); + } +} +#endif + +/** + * \return FALSE if coords are inf/nan (cull the tri), TRUE otherwise + */ +static boolean setup_sort_vertices( struct setup_context *setup, + float det, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4] ) +{ + setup->vprovoke = v2; + + /* determine bottom to top order of vertices */ + { + float y0 = v0[0][1]; + float y1 = v1[0][1]; + float y2 = v2[0][1]; + if (y0 <= y1) { + if (y1 <= y2) { + /* y0<=y1<=y2 */ + setup->vmin = v0; + setup->vmid = v1; + setup->vmax = v2; + } + else if (y2 <= y0) { + /* y2<=y0<=y1 */ + setup->vmin = v2; + setup->vmid = v0; + setup->vmax = v1; + } + else { + /* y0<=y2<=y1 */ + setup->vmin = v0; + setup->vmid = v2; + setup->vmax = v1; + } + } + else { + if (y0 <= y2) { + /* y1<=y0<=y2 */ + setup->vmin = v1; + setup->vmid = v0; + setup->vmax = v2; + } + else if (y2 <= y1) { + /* y2<=y1<=y0 */ + setup->vmin = v2; + setup->vmid = v1; + setup->vmax = v0; + } + else { + /* y1<=y2<=y0 */ + setup->vmin = v1; + setup->vmid = v2; + setup->vmax = v0; + } + } + } + + setup->ebot.dx = setup->vmid[0][0] - setup->vmin[0][0]; + setup->ebot.dy = setup->vmid[0][1] - setup->vmin[0][1]; + setup->emaj.dx = setup->vmax[0][0] - setup->vmin[0][0]; + setup->emaj.dy = setup->vmax[0][1] - setup->vmin[0][1]; + setup->etop.dx = setup->vmax[0][0] - setup->vmid[0][0]; + setup->etop.dy = setup->vmax[0][1] - setup->vmid[0][1]; + + /* + * Compute triangle's area. Use 1/area to compute partial + * derivatives of attributes later. + * + * The area will be the same as prim->det, but the sign may be + * different depending on how the vertices get sorted above. + * + * To determine whether the primitive is front or back facing we + * use the prim->det value because its sign is correct. + */ + { + const float area = (setup->emaj.dx * setup->ebot.dy - + setup->ebot.dx * setup->emaj.dy); + + setup->oneoverarea = 1.0f / area; + + /* + debug_printf("%s one-over-area %f area %f det %f\n", + __FUNCTION__, setup->oneoverarea, area, det ); + */ + if (is_inf_or_nan(setup->oneoverarea)) + return FALSE; + } + + /* We need to know if this is a front or back-facing triangle for: + * - the GLSL gl_FrontFacing fragment attribute (bool) + * - two-sided stencil test + */ + setup->quad.input.facing = (det > 0.0) ^ (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW); + + return TRUE; +} + + +/** + * Compute a0 for a constant-valued coefficient (GL_FLAT shading). + * The value value comes from vertex[slot][i]. + * The result will be put into setup->coef[slot].a0[i]. + * \param slot which attribute slot + * \param i which component of the slot (0..3) + */ +static void const_coeff( struct setup_context *setup, + struct tgsi_interp_coef *coef, + uint vertSlot, uint i) +{ + assert(i <= 3); + + coef->dadx[i] = 0; + coef->dady[i] = 0; + + /* need provoking vertex info! + */ + coef->a0[i] = setup->vprovoke[vertSlot][i]; +} + + +/** + * Compute a0, dadx and dady for a linearly interpolated coefficient, + * for a triangle. 
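The arithmetic in tri_linear_coeff() below is Cramer's rule in disguise. Writing the attribute as a plane v(x,y) = a0 + dadx*x + dady*y, its change along the two edges leaving vmin must match the vertex deltas:

    majda = dadx * emaj.dx + dady * emaj.dy
    botda = dadx * ebot.dx + dady * ebot.dy

Solving this 2x2 system, whose determinant is area = emaj.dx * ebot.dy - ebot.dx * emaj.dy, gives

    dadx = (ebot.dy * majda - emaj.dy * botda) / area
    dady = (emaj.dx * botda - ebot.dx * majda) / area

which is exactly what the code computes, with 1/area cached as setup->oneoverarea by setup_sort_vertices() above.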
+ */ +static void tri_linear_coeff( struct setup_context *setup, + struct tgsi_interp_coef *coef, + uint vertSlot, uint i) +{ + float botda = setup->vmid[vertSlot][i] - setup->vmin[vertSlot][i]; + float majda = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i]; + float a = setup->ebot.dy * majda - botda * setup->emaj.dy; + float b = setup->emaj.dx * botda - majda * setup->ebot.dx; + float dadx = a * setup->oneoverarea; + float dady = b * setup->oneoverarea; + + assert(i <= 3); + + coef->dadx[i] = dadx; + coef->dady[i] = dady; + + /* calculate a0 as the value which would be sampled for the + * fragment at (0,0), taking into account that we want to sample at + * pixel centers, in other words (0.5, 0.5). + * + * this is neat but unfortunately not a good way to do things for + * triangles with very large values of dadx or dady as it will + * result in the subtraction and re-addition from a0 of a very + * large number, which means we'll end up loosing a lot of the + * fractional bits and precision from a0. the way to fix this is + * to define a0 as the sample at a pixel center somewhere near vmin + * instead - i'll switch to this later. + */ + coef->a0[i] = (setup->vmin[vertSlot][i] - + (dadx * (setup->vmin[0][0] - 0.5f) + + dady * (setup->vmin[0][1] - 0.5f))); + + /* + debug_printf("attr[%d].%c: %f dx:%f dy:%f\n", + slot, "xyzw"[i], + setup->coef[slot].a0[i], + setup->coef[slot].dadx[i], + setup->coef[slot].dady[i]); + */ +} + + +/** + * Compute a0, dadx and dady for a perspective-corrected interpolant, + * for a triangle. + * We basically multiply the vertex value by 1/w before computing + * the plane coefficients (a0, dadx, dady). + * Later, when we compute the value at a particular fragment position we'll + * divide the interpolated value by the interpolated W at that fragment. + */ +static void tri_persp_coeff( struct setup_context *setup, + struct tgsi_interp_coef *coef, + uint vertSlot, uint i) +{ + /* premultiply by 1/w (v[0][3] is always W): + */ + float mina = setup->vmin[vertSlot][i] * setup->vmin[0][3]; + float mida = setup->vmid[vertSlot][i] * setup->vmid[0][3]; + float maxa = setup->vmax[vertSlot][i] * setup->vmax[0][3]; + float botda = mida - mina; + float majda = maxa - mina; + float a = setup->ebot.dy * majda - botda * setup->emaj.dy; + float b = setup->emaj.dx * botda - majda * setup->ebot.dx; + float dadx = a * setup->oneoverarea; + float dady = b * setup->oneoverarea; + + /* + debug_printf("tri persp %d,%d: %f %f %f\n", vertSlot, i, + setup->vmin[vertSlot][i], + setup->vmid[vertSlot][i], + setup->vmax[vertSlot][i] + ); + */ + assert(i <= 3); + + coef->dadx[i] = dadx; + coef->dady[i] = dady; + coef->a0[i] = (mina - + (dadx * (setup->vmin[0][0] - 0.5f) + + dady * (setup->vmin[0][1] - 0.5f))); +} + + +/** + * Special coefficient setup for gl_FragCoord. + * X and Y are trivial, though Y has to be inverted for OpenGL. + * Z and W are copied from posCoef which should have already been computed. + * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask. 
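The a0 adjustment described in the comment above means that when a quad is eventually shaded, each coefficient can be evaluated directly at the fragment's integer window coordinates and the result is the value at that pixel's center. A minimal sketch of that evaluation (the real per-fragment code lives in the quad pipeline and TGSI executor, not in this file; for INTERP_PERSPECTIVE the result still has to be divided by the interpolated W term, as noted above):

static INLINE float
eval_coef(const struct tgsi_interp_coef *coef, uint chan, int x, int y)
{
   /* a0 already absorbed the (0.5, 0.5) pixel-center offset, so plain
    * integer coordinates land on pixel centers. */
   return coef->a0[chan] + coef->dadx[chan] * (float) x
                         + coef->dady[chan] * (float) y;
}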
+ */ +static void +setup_fragcoord_coeff(struct setup_context *setup, uint slot) +{ + /*X*/ + setup->coef[slot].a0[0] = 0; + setup->coef[slot].dadx[0] = 1.0; + setup->coef[slot].dady[0] = 0.0; + /*Y*/ + if (setup->softpipe->rasterizer->origin_lower_left) { + /* y=0=bottom */ + const int winHeight = setup->softpipe->framebuffer.height; + setup->coef[slot].a0[1] = (float) (winHeight - 1); + setup->coef[slot].dady[1] = -1.0; + } + else { + /* y=0=top */ + setup->coef[slot].a0[1] = 0.0; + setup->coef[slot].dady[1] = 1.0; + } + setup->coef[slot].dadx[1] = 0.0; + /*Z*/ + setup->coef[slot].a0[2] = setup->posCoef.a0[2]; + setup->coef[slot].dadx[2] = setup->posCoef.dadx[2]; + setup->coef[slot].dady[2] = setup->posCoef.dady[2]; + /*W*/ + setup->coef[slot].a0[3] = setup->posCoef.a0[3]; + setup->coef[slot].dadx[3] = setup->posCoef.dadx[3]; + setup->coef[slot].dady[3] = setup->posCoef.dady[3]; +} + + + +/** + * Compute the setup->coef[] array dadx, dady, a0 values. + * Must be called after setup->vmin,vmid,vmax,vprovoke are initialized. + */ +static void setup_tri_coefficients( struct setup_context *setup ) +{ + struct softpipe_context *softpipe = setup->softpipe; + const struct sp_fragment_shader *spfs = softpipe->fs; + const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe); + uint fragSlot; + + /* z and w are done by linear interpolation: + */ + tri_linear_coeff(setup, &setup->posCoef, 0, 2); + tri_linear_coeff(setup, &setup->posCoef, 0, 3); + + /* setup interpolation for all the remaining attributes: + */ + for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) { + const uint vertSlot = vinfo->attrib[fragSlot].src_index; + uint j; + + switch (vinfo->attrib[fragSlot].interp_mode) { + case INTERP_CONSTANT: + for (j = 0; j < NUM_CHANNELS; j++) + const_coeff(setup, &setup->coef[fragSlot], vertSlot, j); + break; + case INTERP_LINEAR: + for (j = 0; j < NUM_CHANNELS; j++) + tri_linear_coeff(setup, &setup->coef[fragSlot], vertSlot, j); + break; + case INTERP_PERSPECTIVE: + for (j = 0; j < NUM_CHANNELS; j++) + tri_persp_coeff(setup, &setup->coef[fragSlot], vertSlot, j); + break; + case INTERP_POS: + setup_fragcoord_coeff(setup, fragSlot); + break; + default: + assert(0); + } + + if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FOG) { + /* FOG.y = front/back facing XXX fix this */ + setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.input.facing; + setup->coef[fragSlot].dadx[1] = 0.0; + setup->coef[fragSlot].dady[1] = 0.0; + } + } +} + + + +static void setup_tri_edges( struct setup_context *setup ) +{ + float vmin_x = setup->vmin[0][0] + 0.5f; + float vmid_x = setup->vmid[0][0] + 0.5f; + + float vmin_y = setup->vmin[0][1] - 0.5f; + float vmid_y = setup->vmid[0][1] - 0.5f; + float vmax_y = setup->vmax[0][1] - 0.5f; + + setup->emaj.sy = ceilf(vmin_y); + setup->emaj.lines = (int) ceilf(vmax_y - setup->emaj.sy); + setup->emaj.dxdy = setup->emaj.dx / setup->emaj.dy; + setup->emaj.sx = vmin_x + (setup->emaj.sy - vmin_y) * setup->emaj.dxdy; + + setup->etop.sy = ceilf(vmid_y); + setup->etop.lines = (int) ceilf(vmax_y - setup->etop.sy); + setup->etop.dxdy = setup->etop.dx / setup->etop.dy; + setup->etop.sx = vmid_x + (setup->etop.sy - vmid_y) * setup->etop.dxdy; + + setup->ebot.sy = ceilf(vmin_y); + setup->ebot.lines = (int) ceilf(vmid_y - setup->ebot.sy); + setup->ebot.dxdy = setup->ebot.dx / setup->ebot.dy; + setup->ebot.sx = vmin_x + (setup->ebot.sy - vmin_y) * setup->ebot.dxdy; +} + + +/** + * Render the upper or lower half of a triangle. 
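setup_tri_edges() above and subtriangle() below implement the classic split of a triangle at the middle vertex's scanline: emaj runs the full height on one side while ebot (lower half) and etop (upper half) take turns on the other, and which side emaj lands on is decided by the sign of oneoverarea in setup_tri() further down. Per scanline the x intercept of an edge is recovered with a single multiply rather than by accumulating additions, roughly as in this hypothetical helper:

static INLINE float
edge_x_at_line(const struct edge *e, int line)
{
   /* 'line' counts scanlines from the edge's first sample row e->sy */
   return e->sx + (float) line * e->dxdy;
}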
+ * Scissoring/cliprect is applied here too. + */ +static void subtriangle( struct setup_context *setup, + struct edge *eleft, + struct edge *eright, + unsigned lines ) +{ + const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect; + const int minx = (int) cliprect->minx; + const int maxx = (int) cliprect->maxx; + const int miny = (int) cliprect->miny; + const int maxy = (int) cliprect->maxy; + int y, start_y, finish_y; + int sy = (int)eleft->sy; + + assert((int)eleft->sy == (int) eright->sy); + + /* clip top/bottom */ + start_y = sy; + finish_y = sy + lines; + + if (start_y < miny) + start_y = miny; + + if (finish_y > maxy) + finish_y = maxy; + + start_y -= sy; + finish_y -= sy; + + /* + debug_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y); + */ + + for (y = start_y; y < finish_y; y++) { + + /* avoid accumulating adds as floats don't have the precision to + * accurately iterate large triangle edges that way. luckily we + * can just multiply these days. + * + * this is all drowned out by the attribute interpolation anyway. + */ + int left = (int)(eleft->sx + y * eleft->dxdy); + int right = (int)(eright->sx + y * eright->dxdy); + + /* clip left/right */ + if (left < minx) + left = minx; + if (right > maxx) + right = maxx; + + if (left < right) { + int _y = sy + y; + if (block(_y) != setup->span.y) { + flush_spans(setup); + setup->span.y = block(_y); + } + + setup->span.left[_y&1] = left; + setup->span.right[_y&1] = right; + setup->span.y_flags |= 1<<(_y&1); + } + } + + + /* save the values so that emaj can be restarted: + */ + eleft->sx += lines * eleft->dxdy; + eright->sx += lines * eright->dxdy; + eleft->sy += lines; + eright->sy += lines; +} + + +/** + * Recalculate prim's determinant. This is needed as we don't have + * get this information through the vbuf_render interface & we must + * calculate it here. + */ +static float +calc_det( const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4] ) +{ + /* edge vectors e = v0 - v2, f = v1 - v2 */ + const float ex = v0[0][0] - v2[0][0]; + const float ey = v0[0][1] - v2[0][1]; + const float fx = v1[0][0] - v2[0][0]; + const float fy = v1[0][1] - v2[0][1]; + + /* det = cross(e,f).z */ + return ex * fy - ey * fx; +} + + +/** + * Do setup for triangle rasterization, then render the triangle. 
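A quick worked example of calc_det() above: for window-space positions v0 = (0, 0), v1 = (4, 0), v2 = (0, 4), the edge vectors are e = v0 - v2 = (0, -4) and f = v1 - v2 = (4, -4), so det = ex*fy - ey*fx = 0*(-4) - (-4)*4 = 16; swapping any two vertices flips the sign. setup_tri() below feeds this value to cull_tri(), which maps the sign to PIPE_WINDING_CW or PIPE_WINDING_CCW and rejects the triangle if that winding is selected in setup->winding, and to setup_sort_vertices(), which combines the sign with the rasterizer's front_winding to derive quad.input.facing for two-sided stencil and gl_FrontFacing.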
+ */ +void setup_tri( struct setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4] ) +{ + float det; + +#if DEBUG_VERTS + debug_printf("Setup triangle:\n"); + print_vertex(setup, v0); + print_vertex(setup, v1); + print_vertex(setup, v2); +#endif + + if (setup->softpipe->no_rast) + return; + + det = calc_det(v0, v1, v2); + /* + debug_printf("%s\n", __FUNCTION__ ); + */ + +#if DEBUG_FRAGS + setup->numFragsEmitted = 0; + setup->numFragsWritten = 0; +#endif + + if (cull_tri( setup, det )) + return; + + if (!setup_sort_vertices( setup, det, v0, v1, v2 )) + return; + setup_tri_coefficients( setup ); + setup_tri_edges( setup ); + + setup->quad.input.prim = PRIM_TRI; + + setup->span.y = 0; + setup->span.y_flags = 0; + setup->span.right[0] = 0; + setup->span.right[1] = 0; + /* setup->span.z_mode = tri_z_mode( setup->ctx ); */ + + /* init_constant_attribs( setup ); */ + + if (setup->oneoverarea < 0.0) { + /* emaj on left: + */ + subtriangle( setup, &setup->emaj, &setup->ebot, setup->ebot.lines ); + subtriangle( setup, &setup->emaj, &setup->etop, setup->etop.lines ); + } + else { + /* emaj on right: + */ + subtriangle( setup, &setup->ebot, &setup->emaj, setup->ebot.lines ); + subtriangle( setup, &setup->etop, &setup->emaj, setup->etop.lines ); + } + + flush_spans( setup ); + + WAIT_FOR_COMPLETION(setup); + +#if DEBUG_FRAGS + printf("Tri: %u frags emitted, %u written\n", + setup->numFragsEmitted, + setup->numFragsWritten); +#endif +} + + + +/** + * Compute a0, dadx and dady for a linearly interpolated coefficient, + * for a line. + */ +static void +line_linear_coeff(struct setup_context *setup, + struct tgsi_interp_coef *coef, + uint vertSlot, uint i) +{ + const float da = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i]; + const float dadx = da * setup->emaj.dx * setup->oneoverarea; + const float dady = da * setup->emaj.dy * setup->oneoverarea; + coef->dadx[i] = dadx; + coef->dady[i] = dady; + coef->a0[i] = (setup->vmin[vertSlot][i] - + (dadx * (setup->vmin[0][0] - 0.5f) + + dady * (setup->vmin[0][1] - 0.5f))); +} + + +/** + * Compute a0, dadx and dady for a perspective-corrected interpolant, + * for a line. + */ +static void +line_persp_coeff(struct setup_context *setup, + struct tgsi_interp_coef *coef, + uint vertSlot, uint i) +{ + /* XXX double-check/verify this arithmetic */ + const float a0 = setup->vmin[vertSlot][i] * setup->vmin[0][3]; + const float a1 = setup->vmax[vertSlot][i] * setup->vmax[0][3]; + const float da = a1 - a0; + const float dadx = da * setup->emaj.dx * setup->oneoverarea; + const float dady = da * setup->emaj.dy * setup->oneoverarea; + coef->dadx[i] = dadx; + coef->dady[i] = dady; + coef->a0[i] = (setup->vmin[vertSlot][i] - + (dadx * (setup->vmin[0][0] - 0.5f) + + dady * (setup->vmin[0][1] - 0.5f))); +} + + +/** + * Compute the setup->coef[] array dadx, dady, a0 values. + * Must be called after setup->vmin,vmax are initialized. 
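A sanity check on line_linear_coeff() above: setup_line_coefficients() below sets oneoverarea = 1 / (dx*dx + dy*dy) for the line direction (dx, dy) = (emaj.dx, emaj.dy), so

    dadx = da * dx / (dx*dx + dy*dy),   dady = da * dy / (dx*dx + dy*dy)

and walking the full length of the line accumulates dadx*dx + dady*dy = da, exactly the attribute delta between the two endpoints, while the gradient perpendicular to the line direction is zero.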
+ */ +static INLINE boolean +setup_line_coefficients(struct setup_context *setup, + const float (*v0)[4], + const float (*v1)[4]) +{ + struct softpipe_context *softpipe = setup->softpipe; + const struct sp_fragment_shader *spfs = softpipe->fs; + const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe); + uint fragSlot; + float area; + + /* use setup->vmin, vmax to point to vertices */ + setup->vprovoke = v1; + setup->vmin = v0; + setup->vmax = v1; + + setup->emaj.dx = setup->vmax[0][0] - setup->vmin[0][0]; + setup->emaj.dy = setup->vmax[0][1] - setup->vmin[0][1]; + + /* NOTE: this is not really area but something proportional to it */ + area = setup->emaj.dx * setup->emaj.dx + setup->emaj.dy * setup->emaj.dy; + if (area == 0.0f || is_inf_or_nan(area)) + return FALSE; + setup->oneoverarea = 1.0f / area; + + /* z and w are done by linear interpolation: + */ + line_linear_coeff(setup, &setup->posCoef, 0, 2); + line_linear_coeff(setup, &setup->posCoef, 0, 3); + + /* setup interpolation for all the remaining attributes: + */ + for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) { + const uint vertSlot = vinfo->attrib[fragSlot].src_index; + uint j; + + switch (vinfo->attrib[fragSlot].interp_mode) { + case INTERP_CONSTANT: + for (j = 0; j < NUM_CHANNELS; j++) + const_coeff(setup, &setup->coef[fragSlot], vertSlot, j); + break; + case INTERP_LINEAR: + for (j = 0; j < NUM_CHANNELS; j++) + line_linear_coeff(setup, &setup->coef[fragSlot], vertSlot, j); + break; + case INTERP_PERSPECTIVE: + for (j = 0; j < NUM_CHANNELS; j++) + line_persp_coeff(setup, &setup->coef[fragSlot], vertSlot, j); + break; + case INTERP_POS: + setup_fragcoord_coeff(setup, fragSlot); + break; + default: + assert(0); + } + + if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FOG) { + /* FOG.y = front/back facing XXX fix this */ + setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.input.facing; + setup->coef[fragSlot].dadx[1] = 0.0; + setup->coef[fragSlot].dady[1] = 0.0; + } + } + return TRUE; +} + + +/** + * Plot a pixel in a line segment. + */ +static INLINE void +plot(struct setup_context *setup, int x, int y) +{ + const int iy = y & 1; + const int ix = x & 1; + const int quadX = x - ix; + const int quadY = y - iy; + const int mask = (1 << ix) << (2 * iy); + + if (quadX != setup->quad.input.x0 || + quadY != setup->quad.input.y0) + { + /* flush prev quad, start new quad */ + + if (setup->quad.input.x0 != -1) + CLIP_EMIT_QUAD(setup); + + setup->quad.input.x0 = quadX; + setup->quad.input.y0 = quadY; + setup->quad.inout.mask = 0x0; + } + + setup->quad.inout.mask |= mask; +} + + +/** + * Do setup for line rasterization, then render the line. + * Single-pixel width, no stipple, etc. We rely on the 'draw' module + * to handle stippling and wide lines. 
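setup_line() below is a standard integer Bresenham walk. For an X-major example from (0,0) to (5,2): dx = 5, dy = 2, so errorInc = 4, error starts at 4 - 5 = -1 and errorDec = -1 - 5 = -6. The loop plots (0,0), (1,0), (2,1), (3,1), (4,2): whenever the error term is non-negative the minor coordinate steps and errorDec is applied, otherwise only errorInc is added. The loop runs dx times, so the final endpoint pixel (5,2) is not plotted, and the plotted pixels are gathered into 2x2 quads by plot() above rather than being emitted one at a time.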
+ */ +void +setup_line(struct setup_context *setup, + const float (*v0)[4], + const float (*v1)[4]) +{ + int x0 = (int) v0[0][0]; + int x1 = (int) v1[0][0]; + int y0 = (int) v0[0][1]; + int y1 = (int) v1[0][1]; + int dx = x1 - x0; + int dy = y1 - y0; + int xstep, ystep; + +#if DEBUG_VERTS + debug_printf("Setup line:\n"); + print_vertex(setup, v0); + print_vertex(setup, v1); +#endif + + if (setup->softpipe->no_rast) + return; + + if (dx == 0 && dy == 0) + return; + + if (!setup_line_coefficients(setup, v0, v1)) + return; + + assert(v0[0][0] < 1.0e9); + assert(v0[0][1] < 1.0e9); + assert(v1[0][0] < 1.0e9); + assert(v1[0][1] < 1.0e9); + + if (dx < 0) { + dx = -dx; /* make positive */ + xstep = -1; + } + else { + xstep = 1; + } + + if (dy < 0) { + dy = -dy; /* make positive */ + ystep = -1; + } + else { + ystep = 1; + } + + assert(dx >= 0); + assert(dy >= 0); + + setup->quad.input.x0 = setup->quad.input.y0 = -1; + setup->quad.inout.mask = 0x0; + setup->quad.input.prim = PRIM_LINE; + /* XXX temporary: set coverage to 1.0 so the line appears + * if AA mode happens to be enabled. + */ + setup->quad.input.coverage[0] = + setup->quad.input.coverage[1] = + setup->quad.input.coverage[2] = + setup->quad.input.coverage[3] = 1.0; + + if (dx > dy) { + /*** X-major line ***/ + int i; + const int errorInc = dy + dy; + int error = errorInc - dx; + const int errorDec = error - dx; + + for (i = 0; i < dx; i++) { + plot(setup, x0, y0); + + x0 += xstep; + if (error < 0) { + error += errorInc; + } + else { + error += errorDec; + y0 += ystep; + } + } + } + else { + /*** Y-major line ***/ + int i; + const int errorInc = dx + dx; + int error = errorInc - dy; + const int errorDec = error - dy; + + for (i = 0; i < dy; i++) { + plot(setup, x0, y0); + + y0 += ystep; + if (error < 0) { + error += errorInc; + } + else { + error += errorDec; + x0 += xstep; + } + } + } + + /* draw final quad */ + if (setup->quad.inout.mask) { + CLIP_EMIT_QUAD(setup); + } + + WAIT_FOR_COMPLETION(setup); +} + + +static void +point_persp_coeff(struct setup_context *setup, + const float (*vert)[4], + struct tgsi_interp_coef *coef, + uint vertSlot, uint i) +{ + assert(i <= 3); + coef->dadx[i] = 0.0F; + coef->dady[i] = 0.0F; + coef->a0[i] = vert[vertSlot][i] * vert[0][3]; +} + + +/** + * Do setup for point rasterization, then render the point. + * Round or square points... + * XXX could optimize a lot for 1-pixel points. + */ +void +setup_point( struct setup_context *setup, + const float (*v0)[4] ) +{ + struct softpipe_context *softpipe = setup->softpipe; + const struct sp_fragment_shader *spfs = softpipe->fs; + const int sizeAttr = setup->softpipe->psize_slot; + const float size + = sizeAttr > 0 ? v0[sizeAttr][0] + : setup->softpipe->rasterizer->point_size; + const float halfSize = 0.5F * size; + const boolean round = (boolean) setup->softpipe->rasterizer->point_smooth; + const float x = v0[0][0]; /* Note: data[0] is always position */ + const float y = v0[0][1]; + const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe); + uint fragSlot; + +#if DEBUG_VERTS + debug_printf("Setup point:\n"); + print_vertex(setup, v0); +#endif + + if (softpipe->no_rast) + return; + + /* For points, all interpolants are constant-valued. + * However, for point sprites, we'll need to setup texcoords appropriately. + * XXX: which coefficients are the texcoords??? + * We may do point sprites as textured quads... 
+ * + * KW: We don't know which coefficients are texcoords - ultimately + * the choice of what interpolation mode to use for each attribute + * should be determined by the fragment program, using + * per-attribute declaration statements that include interpolation + * mode as a parameter. So either the fragment program will have + * to be adjusted for pointsprite vs normal point behaviour, or + * otherwise a special interpolation mode will have to be defined + * which matches the required behaviour for point sprites. But - + * the latter is not a feature of normal hardware, and as such + * probably should be ruled out on that basis. + */ + setup->vprovoke = v0; + + /* setup Z, W */ + const_coeff(setup, &setup->posCoef, 0, 2); + const_coeff(setup, &setup->posCoef, 0, 3); + + for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) { + const uint vertSlot = vinfo->attrib[fragSlot].src_index; + uint j; + + switch (vinfo->attrib[fragSlot].interp_mode) { + case INTERP_CONSTANT: + /* fall-through */ + case INTERP_LINEAR: + for (j = 0; j < NUM_CHANNELS; j++) + const_coeff(setup, &setup->coef[fragSlot], vertSlot, j); + break; + case INTERP_PERSPECTIVE: + for (j = 0; j < NUM_CHANNELS; j++) + point_persp_coeff(setup, setup->vprovoke, + &setup->coef[fragSlot], vertSlot, j); + break; + case INTERP_POS: + setup_fragcoord_coeff(setup, fragSlot); + break; + default: + assert(0); + } + + if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FOG) { + /* FOG.y = front/back facing XXX fix this */ + setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.input.facing; + setup->coef[fragSlot].dadx[1] = 0.0; + setup->coef[fragSlot].dady[1] = 0.0; + } + } + + setup->quad.input.prim = PRIM_POINT; + + if (halfSize <= 0.5 && !round) { + /* special case for 1-pixel points */ + const int ix = ((int) x) & 1; + const int iy = ((int) y) & 1; + setup->quad.input.x0 = (int) x - ix; + setup->quad.input.y0 = (int) y - iy; + setup->quad.inout.mask = (1 << ix) << (2 * iy); + CLIP_EMIT_QUAD(setup); + } + else { + if (round) { + /* rounded points */ + const int ixmin = block((int) (x - halfSize)); + const int ixmax = block((int) (x + halfSize)); + const int iymin = block((int) (y - halfSize)); + const int iymax = block((int) (y + halfSize)); + const float rmin = halfSize - 0.7071F; /* 0.7071 = sqrt(2)/2 */ + const float rmax = halfSize + 0.7071F; + const float rmin2 = MAX2(0.0F, rmin * rmin); + const float rmax2 = rmax * rmax; + const float cscale = 1.0F / (rmax2 - rmin2); + int ix, iy; + + for (iy = iymin; iy <= iymax; iy += 2) { + for (ix = ixmin; ix <= ixmax; ix += 2) { + float dx, dy, dist2, cover; + + setup->quad.inout.mask = 0x0; + + dx = (ix + 0.5f) - x; + dy = (iy + 0.5f) - y; + dist2 = dx * dx + dy * dy; + if (dist2 <= rmax2) { + cover = 1.0F - (dist2 - rmin2) * cscale; + setup->quad.input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f); + setup->quad.inout.mask |= MASK_TOP_LEFT; + } + + dx = (ix + 1.5f) - x; + dy = (iy + 0.5f) - y; + dist2 = dx * dx + dy * dy; + if (dist2 <= rmax2) { + cover = 1.0F - (dist2 - rmin2) * cscale; + setup->quad.input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f); + setup->quad.inout.mask |= MASK_TOP_RIGHT; + } + + dx = (ix + 0.5f) - x; + dy = (iy + 1.5f) - y; + dist2 = dx * dx + dy * dy; + if (dist2 <= rmax2) { + cover = 1.0F - (dist2 - rmin2) * cscale; + setup->quad.input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f); + setup->quad.inout.mask |= MASK_BOTTOM_LEFT; + } + + dx = (ix + 1.5f) - x; + dy = (iy + 1.5f) - y; + dist2 = dx * dx + dy * dy; + if (dist2 <= rmax2) { + cover = 1.0F - 
(dist2 - rmin2) * cscale; + setup->quad.input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f); + setup->quad.inout.mask |= MASK_BOTTOM_RIGHT; + } + + if (setup->quad.inout.mask) { + setup->quad.input.x0 = ix; + setup->quad.input.y0 = iy; + CLIP_EMIT_QUAD(setup); + } + } + } + } + else { + /* square points */ + const int xmin = (int) (x + 0.75 - halfSize); + const int ymin = (int) (y + 0.25 - halfSize); + const int xmax = xmin + (int) size; + const int ymax = ymin + (int) size; + /* XXX could apply scissor to xmin,ymin,xmax,ymax now */ + const int ixmin = block(xmin); + const int ixmax = block(xmax - 1); + const int iymin = block(ymin); + const int iymax = block(ymax - 1); + int ix, iy; + + /* + debug_printf("(%f, %f) -> X:%d..%d Y:%d..%d\n", x, y, xmin, xmax,ymin,ymax); + */ + for (iy = iymin; iy <= iymax; iy += 2) { + uint rowMask = 0xf; + if (iy < ymin) { + /* above the top edge */ + rowMask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT); + } + if (iy + 1 >= ymax) { + /* below the bottom edge */ + rowMask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT); + } + + for (ix = ixmin; ix <= ixmax; ix += 2) { + uint mask = rowMask; + + if (ix < xmin) { + /* fragment is past left edge of point, turn off left bits */ + mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT); + } + if (ix + 1 >= xmax) { + /* past the right edge */ + mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT); + } + + setup->quad.inout.mask = mask; + setup->quad.input.x0 = ix; + setup->quad.input.y0 = iy; + CLIP_EMIT_QUAD(setup); + } + } + } + } + + WAIT_FOR_COMPLETION(setup); +} + +void setup_prepare( struct setup_context *setup ) +{ + struct softpipe_context *sp = setup->softpipe; + unsigned i; + + if (sp->dirty) { + softpipe_update_derived(sp); + } + + /* Mark surfaces as defined now */ + for (i = 0; i < sp->framebuffer.num_cbufs; i++){ + if (sp->framebuffer.cbufs[i]) { + sp->framebuffer.cbufs[i]->status = PIPE_SURFACE_STATUS_DEFINED; + } + } + if (sp->framebuffer.zsbuf) { + sp->framebuffer.zsbuf->status = PIPE_SURFACE_STATUS_DEFINED; + } + + /* Note: nr_attrs is only used for debugging (vertex printing) */ + setup->quad.nr_attrs = draw_num_vs_outputs(sp->draw); + + for (i = 0; i < SP_NUM_QUAD_THREADS; i++) { + sp->quad[i].first->begin( sp->quad[i].first ); + } + + if (sp->reduced_api_prim == PIPE_PRIM_TRIANGLES && + sp->rasterizer->fill_cw == PIPE_POLYGON_MODE_FILL && + sp->rasterizer->fill_ccw == PIPE_POLYGON_MODE_FILL) { + /* we'll do culling */ + setup->winding = sp->rasterizer->cull_mode; + } + else { + /* 'draw' will do culling */ + setup->winding = PIPE_WINDING_NONE; + } +} + + + +void setup_destroy_context( struct setup_context *setup ) +{ + FREE( setup ); +} + + +/** + * Create a new primitive setup/render stage. 
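Taken together with setup_create_context() below, the intended call sequence for this module looks roughly like the following hypothetical caller; in the real driver the setup context is owned and driven by the prim-setup/vbuf stages rather than used directly:

static void
example_rasterize_one_tri(struct softpipe_context *softpipe,
                          const float (*v0)[4],
                          const float (*v1)[4],
                          const float (*v2)[4])
{
   struct setup_context *setup = setup_create_context(softpipe);

   setup_prepare(setup);          /* validate derived state, begin the quad stages */
   setup_tri(setup, v0, v1, v2);  /* bin the triangle into 2x2 quads and shade them */

   setup_destroy_context(setup);
}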
+ */ +struct setup_context *setup_create_context( struct softpipe_context *softpipe ) +{ + struct setup_context *setup = CALLOC_STRUCT(setup_context); +#if SP_NUM_QUAD_THREADS > 1 + uint i; +#endif + + setup->softpipe = softpipe; + + setup->quad.coef = setup->coef; + setup->quad.posCoef = &setup->posCoef; + +#if SP_NUM_QUAD_THREADS > 1 + setup->que.first = 0; + setup->que.last = 0; + pipe_mutex_init( setup->que.que_mutex ); + pipe_condvar_init( setup->que.que_notfull_condvar ); + pipe_condvar_init( setup->que.que_notempty_condvar ); + setup->que.jobs_added = 0; + setup->que.jobs_done = 0; + pipe_condvar_init( setup->que.que_done_condvar ); + for (i = 0; i < SP_NUM_QUAD_THREADS; i++) { + setup->threads[i].setup = setup; + setup->threads[i].id = i; + setup->threads[i].handle = pipe_thread_create( quad_thread, &setup->threads[i] ); + } +#endif + + return setup; +} + diff --git a/src/gallium/drivers/softpipe/sp_setup.h b/src/gallium/drivers/softpipe/sp_setup.h new file mode 100644 index 0000000000..d54f334428 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_setup.h @@ -0,0 +1,53 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ +#ifndef SP_SETUP_H +#define SP_SETUP_H + +struct setup_context; +struct softpipe_context; + +void +setup_tri( struct setup_context *setup, + const float (*v0)[4], + const float (*v1)[4], + const float (*v2)[4] ); + +void +setup_line(struct setup_context *setup, + const float (*v0)[4], + const float (*v1)[4]); + +void +setup_point( struct setup_context *setup, + const float (*v0)[4] ); + + +struct setup_context *setup_create_context( struct softpipe_context *softpipe ); +void setup_prepare( struct setup_context *setup ); +void setup_destroy_context( struct setup_context *setup ); + +#endif diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h new file mode 100644 index 0000000000..3eff41ffa5 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state.h @@ -0,0 +1,206 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef SP_STATE_H +#define SP_STATE_H + +#include "pipe/p_state.h" +#include "tgsi/tgsi_scan.h" + + +#define SP_NEW_VIEWPORT 0x1 +#define SP_NEW_RASTERIZER 0x2 +#define SP_NEW_FS 0x4 +#define SP_NEW_BLEND 0x8 +#define SP_NEW_CLIP 0x10 +#define SP_NEW_SCISSOR 0x20 +#define SP_NEW_STIPPLE 0x40 +#define SP_NEW_FRAMEBUFFER 0x80 +#define SP_NEW_DEPTH_STENCIL_ALPHA 0x100 +#define SP_NEW_CONSTANTS 0x200 +#define SP_NEW_SAMPLER 0x400 +#define SP_NEW_TEXTURE 0x800 +#define SP_NEW_VERTEX 0x1000 +#define SP_NEW_VS 0x2000 +#define SP_NEW_QUERY 0x4000 + + +struct tgsi_sampler; +struct tgsi_exec_machine; +struct vertex_info; + + +/** + * Subclass of pipe_shader_state (though it doesn't really need to be). + * + * This is starting to look an awful lot like a quad pipeline stage... 
+ */ +struct sp_fragment_shader { + struct pipe_shader_state shader; + + struct tgsi_shader_info info; + + void (*prepare)( const struct sp_fragment_shader *shader, + struct tgsi_exec_machine *machine, + struct tgsi_sampler **samplers); + + /* Run the shader - this interface will get cleaned up in the + * future: + */ + unsigned (*run)( const struct sp_fragment_shader *shader, + struct tgsi_exec_machine *machine, + struct quad_header *quad ); + + + void (*delete)( struct sp_fragment_shader * ); +}; + + +/** Subclass of pipe_shader_state */ +struct sp_vertex_shader { + struct pipe_shader_state shader; /* Note: this field not actually used */ + struct draw_vertex_shader *draw_data; +}; + + + +void * +softpipe_create_blend_state(struct pipe_context *, + const struct pipe_blend_state *); +void softpipe_bind_blend_state(struct pipe_context *, + void *); +void softpipe_delete_blend_state(struct pipe_context *, + void *); + +void * +softpipe_create_sampler_state(struct pipe_context *, + const struct pipe_sampler_state *); +void softpipe_bind_sampler_states(struct pipe_context *, unsigned, void **); +void softpipe_delete_sampler_state(struct pipe_context *, void *); + +void * +softpipe_create_depth_stencil_state(struct pipe_context *, + const struct pipe_depth_stencil_alpha_state *); +void softpipe_bind_depth_stencil_state(struct pipe_context *, void *); +void softpipe_delete_depth_stencil_state(struct pipe_context *, void *); + +void * +softpipe_create_rasterizer_state(struct pipe_context *, + const struct pipe_rasterizer_state *); +void softpipe_bind_rasterizer_state(struct pipe_context *, void *); +void softpipe_delete_rasterizer_state(struct pipe_context *, void *); + +void softpipe_set_framebuffer_state( struct pipe_context *, + const struct pipe_framebuffer_state * ); + +void softpipe_set_blend_color( struct pipe_context *pipe, + const struct pipe_blend_color *blend_color ); + +void softpipe_set_clip_state( struct pipe_context *, + const struct pipe_clip_state * ); + +void softpipe_set_constant_buffer(struct pipe_context *, + uint shader, uint index, + const struct pipe_constant_buffer *buf); + +void *softpipe_create_fs_state(struct pipe_context *, + const struct pipe_shader_state *); +void softpipe_bind_fs_state(struct pipe_context *, void *); +void softpipe_delete_fs_state(struct pipe_context *, void *); +void *softpipe_create_vs_state(struct pipe_context *, + const struct pipe_shader_state *); +void softpipe_bind_vs_state(struct pipe_context *, void *); +void softpipe_delete_vs_state(struct pipe_context *, void *); + +void softpipe_set_polygon_stipple( struct pipe_context *, + const struct pipe_poly_stipple * ); + +void softpipe_set_scissor_state( struct pipe_context *, + const struct pipe_scissor_state * ); + +void softpipe_set_sampler_textures( struct pipe_context *, + unsigned num, + struct pipe_texture ** ); + +void softpipe_set_viewport_state( struct pipe_context *, + const struct pipe_viewport_state * ); + +void softpipe_set_vertex_elements(struct pipe_context *, + unsigned count, + const struct pipe_vertex_element *); + +void softpipe_set_vertex_buffers(struct pipe_context *, + unsigned count, + const struct pipe_vertex_buffer *); + + +void softpipe_update_derived( struct softpipe_context *softpipe ); + + +boolean softpipe_draw_arrays(struct pipe_context *pipe, unsigned mode, + unsigned start, unsigned count); + +boolean softpipe_draw_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned mode, unsigned start, unsigned count); 
+boolean +softpipe_draw_range_elements(struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned min_index, + unsigned max_index, + unsigned mode, unsigned start, unsigned count); + +void +softpipe_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags); + + +void +softpipe_map_surfaces(struct softpipe_context *sp); + +void +softpipe_unmap_surfaces(struct softpipe_context *sp); + +void +softpipe_map_texture_surfaces(struct softpipe_context *sp); + +void +softpipe_unmap_texture_surfaces(struct softpipe_context *sp); + + +struct vertex_info * +softpipe_get_vertex_info(struct softpipe_context *softpipe); + +struct vertex_info * +softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe); + + +#endif diff --git a/src/gallium/drivers/softpipe/sp_state_blend.c b/src/gallium/drivers/softpipe/sp_state_blend.c new file mode 100644 index 0000000000..384fe559af --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_blend.c @@ -0,0 +1,98 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_state.h" + + +void * +softpipe_create_blend_state(struct pipe_context *pipe, + const struct pipe_blend_state *blend) +{ + return mem_dup(blend, sizeof(*blend)); +} + +void softpipe_bind_blend_state( struct pipe_context *pipe, + void *blend ) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + + softpipe->blend = (const struct pipe_blend_state *)blend; + + softpipe->dirty |= SP_NEW_BLEND; +} + +void softpipe_delete_blend_state(struct pipe_context *pipe, + void *blend) +{ + FREE( blend ); +} + + +void softpipe_set_blend_color( struct pipe_context *pipe, + const struct pipe_blend_color *blend_color ) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + + softpipe->blend_color = *blend_color; + + softpipe->dirty |= SP_NEW_BLEND; +} + + +/** XXX move someday? Or consolidate all these simple state setters + * into one file. 
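The blend functions above are the template every constant-state object in this driver follows: create duplicates the state with mem_dup(), bind stores the pointer and raises the matching SP_NEW_* bit from sp_state.h, and delete frees the duplicate. Nothing is recomputed immediately; setup_prepare() checks sp->dirty and calls softpipe_update_derived() at the next draw (see sp_state_derived.c below). The depth/stencil/alpha functions that follow repeat the same pattern with SP_NEW_DEPTH_STENCIL_ALPHA.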
+ */ + + +void * +softpipe_create_depth_stencil_state(struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *depth_stencil) +{ + return mem_dup(depth_stencil, sizeof(*depth_stencil)); +} + +void +softpipe_bind_depth_stencil_state(struct pipe_context *pipe, + void *depth_stencil) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + + softpipe->depth_stencil = (const struct pipe_depth_stencil_alpha_state *)depth_stencil; + + softpipe->dirty |= SP_NEW_DEPTH_STENCIL_ALPHA; +} + +void +softpipe_delete_depth_stencil_state(struct pipe_context *pipe, void *depth) +{ + FREE( depth ); +} diff --git a/src/gallium/drivers/softpipe/sp_state_clip.c b/src/gallium/drivers/softpipe/sp_state_clip.c new file mode 100644 index 0000000000..4946c776e3 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_clip.c @@ -0,0 +1,79 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ +#include "sp_context.h" +#include "sp_state.h" +#include "draw/draw_context.h" + + +void softpipe_set_clip_state( struct pipe_context *pipe, + const struct pipe_clip_state *clip ) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + + /* pass the clip state to the draw module */ + draw_set_clip_state(softpipe->draw, clip); +} + + +void softpipe_set_viewport_state( struct pipe_context *pipe, + const struct pipe_viewport_state *viewport ) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + + /* pass the viewport info to the draw module */ + draw_set_viewport_state(softpipe->draw, viewport); + + softpipe->viewport = *viewport; /* struct copy */ + softpipe->dirty |= SP_NEW_VIEWPORT; +} + + +void softpipe_set_scissor_state( struct pipe_context *pipe, + const struct pipe_scissor_state *scissor ) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + + draw_flush(softpipe->draw); + + softpipe->scissor = *scissor; /* struct copy */ + softpipe->dirty |= SP_NEW_SCISSOR; +} + + +void softpipe_set_polygon_stipple( struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple ) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + + draw_flush(softpipe->draw); + + softpipe->poly_stipple = *stipple; /* struct copy */ + softpipe->dirty |= SP_NEW_STIPPLE; +} diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c new file mode 100644 index 0000000000..6b6a4c3ff3 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_derived.c @@ -0,0 +1,210 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_math.h" +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "draw/draw_context.h" +#include "draw/draw_vertex.h" +#include "draw/draw_private.h" +#include "sp_context.h" +#include "sp_state.h" + + +/** + * Mark the current vertex layout as "invalid". + * We'll validate the vertex layout later, when we start to actually + * render a point or line or tri. 
+ */ +static void +invalidate_vertex_layout(struct softpipe_context *softpipe) +{ + softpipe->vertex_info.num_attribs = 0; +} + + +/** + * The vertex info describes how to convert the post-transformed vertices + * (simple float[][4]) used by the 'draw' module into vertices for + * rasterization. + * + * This function validates the vertex layout and returns a pointer to a + * vertex_info object. + */ +struct vertex_info * +softpipe_get_vertex_info(struct softpipe_context *softpipe) +{ + struct vertex_info *vinfo = &softpipe->vertex_info; + + if (vinfo->num_attribs == 0) { + /* compute vertex layout now */ + const struct sp_fragment_shader *spfs = softpipe->fs; + const enum interp_mode colorInterp + = softpipe->rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR; + uint i; + + if (softpipe->vbuf) { + /* if using the post-transform vertex buffer, tell draw_vbuf to + * simply emit the whole post-xform vertex as-is: + */ + struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf; + const uint num = draw_num_vs_outputs(softpipe->draw); + uint i; + + /* No longer any need to try and emit draw vertex_header info. + */ + vinfo_vbuf->num_attribs = 0; + for (i = 0; i < num; i++) { + draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i); + } + draw_compute_vertex_size(vinfo_vbuf); + } + + /* + * Loop over fragment shader inputs, searching for the matching output + * from the vertex shader. + */ + vinfo->num_attribs = 0; + for (i = 0; i < spfs->info.num_inputs; i++) { + int src; + switch (spfs->info.input_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + src = draw_find_vs_output(softpipe->draw, + TGSI_SEMANTIC_POSITION, 0); + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_POS, src); + break; + + case TGSI_SEMANTIC_COLOR: + src = draw_find_vs_output(softpipe->draw, TGSI_SEMANTIC_COLOR, + spfs->info.input_semantic_index[i]); + draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src); + break; + + case TGSI_SEMANTIC_FOG: + src = draw_find_vs_output(softpipe->draw, TGSI_SEMANTIC_FOG, 0); + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); + break; + + case TGSI_SEMANTIC_GENERIC: + /* this includes texcoords and varying vars */ + src = draw_find_vs_output(softpipe->draw, TGSI_SEMANTIC_GENERIC, + spfs->info.input_semantic_index[i]); + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src); + break; + + default: + assert(0); + } + } + + softpipe->psize_slot = draw_find_vs_output(softpipe->draw, + TGSI_SEMANTIC_PSIZE, 0); + if (softpipe->psize_slot > 0) { + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, + softpipe->psize_slot); + } + + draw_compute_vertex_size(vinfo); + } + + return vinfo; +} + + +/** + * Called from vbuf module. + * + * Note that there's actually two different vertex layouts in softpipe. + * + * The normal one is computed in softpipe_get_vertex_info() above and is + * used by the point/line/tri "setup" code. + * + * The other one (this one) is only used by the vbuf module (which is + * not normally used by default but used in testing). For the vbuf module, + * we basically want to pass-through the draw module's vertex layout as-is. + * When the softpipe vbuf code begins drawing, the normal vertex layout + * will come into play again. + */ +struct vertex_info * +softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe) +{ + (void) softpipe_get_vertex_info(softpipe); + return &softpipe->vertex_info_vbuf; +} + + +/** + * Recompute cliprect from scissor bounds, scissor enable and surface size. 
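[Editorial sketch, not part of the patch.] softpipe_get_vertex_info() above walks the fragment shader's inputs and, for each semantic, looks up the matching vertex shader output slot before emitting a vertex attribute. The lookup itself is just a linear search over (semantic name, index) pairs; the sketch below shows that idea with an invented sem_* enum and a find_vs_output() helper, not the draw module's real draw_find_vs_output() signature.

#include <stdio.h>

enum semantic { SEM_POSITION, SEM_COLOR, SEM_GENERIC };   /* invented stand-ins */

struct vs_output { enum semantic name; unsigned index; };

/* Return the slot of the VS output matching (name, index), or 0 if absent. */
int find_vs_output(const struct vs_output *outputs, int num,
                   enum semantic name, unsigned index)
{
   int i;
   for (i = 0; i < num; i++)
      if (outputs[i].name == name && outputs[i].index == index)
         return i;
   return 0;
}

int main(void)
{
   const struct vs_output vs_outs[4] = {
      { SEM_POSITION, 0 }, { SEM_COLOR, 0 }, { SEM_GENERIC, 0 }, { SEM_GENERIC, 1 }
   };
   /* A fragment shader input GENERIC[1] (e.g. a texcoord) maps to VS slot 3: */
   printf("GENERIC[1] -> vs output slot %d\n",
          find_vs_output(vs_outs, 4, SEM_GENERIC, 1));
   return 0;
}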
+ */ +static void +compute_cliprect(struct softpipe_context *sp) +{ + uint surfWidth = sp->framebuffer.width; + uint surfHeight = sp->framebuffer.height; + + if (sp->rasterizer->scissor) { + /* clip to scissor rect */ + sp->cliprect.minx = MAX2(sp->scissor.minx, 0); + sp->cliprect.miny = MAX2(sp->scissor.miny, 0); + sp->cliprect.maxx = MIN2(sp->scissor.maxx, surfWidth); + sp->cliprect.maxy = MIN2(sp->scissor.maxy, surfHeight); + } + else { + /* clip to surface bounds */ + sp->cliprect.minx = 0; + sp->cliprect.miny = 0; + sp->cliprect.maxx = surfWidth; + sp->cliprect.maxy = surfHeight; + } +} + + +/* Hopefully this will remain quite simple, otherwise need to pull in + * something like the state tracker mechanism. + */ +void softpipe_update_derived( struct softpipe_context *softpipe ) +{ + if (softpipe->dirty & (SP_NEW_RASTERIZER | + SP_NEW_FS | + SP_NEW_VS)) + invalidate_vertex_layout( softpipe ); + + if (softpipe->dirty & (SP_NEW_SCISSOR | + SP_NEW_DEPTH_STENCIL_ALPHA | + SP_NEW_FRAMEBUFFER)) + compute_cliprect(softpipe); + + if (softpipe->dirty & (SP_NEW_BLEND | + SP_NEW_DEPTH_STENCIL_ALPHA | + SP_NEW_FRAMEBUFFER | + SP_NEW_RASTERIZER | + SP_NEW_FS | + SP_NEW_QUERY)) + sp_build_quad_pipeline(softpipe); + + softpipe->dirty = 0; +} diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c new file mode 100644 index 0000000000..e5b609cf6c --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_fs.c @@ -0,0 +1,161 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
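[Editorial sketch, not part of the patch.] compute_cliprect() above is a plain rectangle intersection: the scissor box clamped to the surface when scissoring is enabled, otherwise the whole surface. A standalone sketch with a worked case (a scissor that hangs off a 640x480 surface):

#include <stdio.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))
#define MIN2(a, b) ((a) < (b) ? (a) : (b))

struct rect { int minx, miny, maxx, maxy; };

/* Same intersection as compute_cliprect(): scissor clamped to the surface
 * when scissoring is on, otherwise the full surface bounds. */
struct rect cliprect(int scissor_on, struct rect scissor, int w, int h)
{
   struct rect r;
   if (scissor_on) {
      r.minx = MAX2(scissor.minx, 0);
      r.miny = MAX2(scissor.miny, 0);
      r.maxx = MIN2(scissor.maxx, w);
      r.maxy = MIN2(scissor.maxy, h);
   }
   else {
      r.minx = 0;
      r.miny = 0;
      r.maxx = w;
      r.maxy = h;
   }
   return r;
}

int main(void)
{
   struct rect s = { -5, 10, 700, 500 };
   struct rect r = cliprect(1, s, 640, 480);
   printf("cliprect: %d,%d .. %d,%d\n", r.minx, r.miny, r.maxx, r.maxy); /* 0,10 .. 640,480 */
   return 0;
}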
+ * + **************************************************************************/ + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_fs.h" + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" +#include "pipe/p_shader_tokens.h" +#include "draw/draw_context.h" +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_scan.h" + + +void * +softpipe_create_fs_state(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + struct sp_fragment_shader *state; + + /* debug */ + if (softpipe->dump_fs) + tgsi_dump(templ->tokens, 0); + + /* codegen */ + state = softpipe_create_fs_llvm( softpipe, templ ); + if (!state) { + state = softpipe_create_fs_sse( softpipe, templ ); + if (!state) { + state = softpipe_create_fs_exec( softpipe, templ ); + } + } + + assert(state); + + /* get/save the summary info for this shader */ + tgsi_scan_shader(templ->tokens, &state->info); + + return state; +} + + +void +softpipe_bind_fs_state(struct pipe_context *pipe, void *fs) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + + softpipe->fs = (struct sp_fragment_shader *) fs; + + softpipe->dirty |= SP_NEW_FS; +} + + +void +softpipe_delete_fs_state(struct pipe_context *pipe, void *fs) +{ + struct sp_fragment_shader *state = fs; + + assert(fs != softpipe_context(pipe)->fs); + + state->delete( state ); +} + + +void * +softpipe_create_vs_state(struct pipe_context *pipe, + const struct pipe_shader_state *templ) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + struct sp_vertex_shader *state; + + state = CALLOC_STRUCT(sp_vertex_shader); + if (state == NULL ) { + return NULL; + } + + state->draw_data = draw_create_vertex_shader(softpipe->draw, templ); + if (state->draw_data == NULL) { + FREE( state ); + return NULL; + } + + return state; +} + + +void +softpipe_bind_vs_state(struct pipe_context *pipe, void *vs) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + + softpipe->vs = (const struct sp_vertex_shader *)vs; + + draw_bind_vertex_shader(softpipe->draw, + (softpipe->vs ? softpipe->vs->draw_data : NULL)); + + softpipe->dirty |= SP_NEW_VS; +} + + +void +softpipe_delete_vs_state(struct pipe_context *pipe, void *vs) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + + struct sp_vertex_shader *state = + (struct sp_vertex_shader *)vs; + + draw_delete_vertex_shader(softpipe->draw, state->draw_data); + FREE( state ); +} + + + +void +softpipe_set_constant_buffer(struct pipe_context *pipe, + uint shader, uint index, + const struct pipe_constant_buffer *buf) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + struct pipe_winsys *ws = pipe->winsys; + + assert(shader < PIPE_SHADER_TYPES); + assert(index == 0); + + /* note: reference counting */ + winsys_buffer_reference(ws, + &softpipe->constants[shader].buffer, + buf ? buf->buffer : NULL); + softpipe->constants[shader].size = buf ? buf->size : 0; + + softpipe->dirty |= SP_NEW_CONSTANTS; +} diff --git a/src/gallium/drivers/softpipe/sp_state_rasterizer.c b/src/gallium/drivers/softpipe/sp_state_rasterizer.c new file mode 100644 index 0000000000..87b7219683 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_rasterizer.c @@ -0,0 +1,62 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
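[Editorial sketch, not part of the patch.] softpipe_create_fs_state() above tries the code-generating backends in order of preference, LLVM, then SSE, then the TGSI interpreter, and keeps the first one that succeeds. Below is a standalone sketch of that fallback chain; the create_fs_* stubs are invented stand-ins that simply simulate the first two backends being unavailable.

#include <stdio.h>
#include <stddef.h>

/* Invented stubs: each backend either returns a shader object or NULL. */
void *create_fs_llvm(const void *tokens) { (void) tokens; return NULL; } /* LLVM not built in */
void *create_fs_sse(const void *tokens)  { (void) tokens; return NULL; } /* no SSE support */

void *create_fs_exec(const void *tokens)
{
   static int dummy;
   (void) tokens;
   return &dummy;                         /* the interpreter always works */
}

/* Same fallback order as softpipe_create_fs_state(). */
void *create_fs(const void *tokens)
{
   void *fs = create_fs_llvm(tokens);
   if (!fs)
      fs = create_fs_sse(tokens);
   if (!fs)
      fs = create_fs_exec(tokens);
   return fs;
}

int main(void)
{
   printf("fragment shader backend chosen: %p\n", create_fs("tokens"));
   return 0;
}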
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "sp_context.h" +#include "sp_state.h" +#include "draw/draw_context.h" + + + +void * +softpipe_create_rasterizer_state(struct pipe_context *pipe, + const struct pipe_rasterizer_state *rast) +{ + return mem_dup(rast, sizeof(*rast)); +} + +void softpipe_bind_rasterizer_state(struct pipe_context *pipe, + void *setup) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + + /* pass-through to draw module */ + draw_set_rasterizer_state(softpipe->draw, setup); + + softpipe->rasterizer = (struct pipe_rasterizer_state *)setup; + + softpipe->dirty |= SP_NEW_RASTERIZER; +} + +void softpipe_delete_rasterizer_state(struct pipe_context *pipe, + void *rasterizer) +{ + FREE( rasterizer ); +} + + diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c new file mode 100644 index 0000000000..99a28c0d7e --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_sampler.c @@ -0,0 +1,118 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* Authors: + * Brian Paul + */ + +#include "util/u_memory.h" +#include "pipe/p_inlines.h" + +#include "draw/draw_context.h" + +#include "sp_context.h" +#include "sp_context.h" +#include "sp_state.h" +#include "sp_texture.h" +#include "sp_tile_cache.h" +#include "draw/draw_context.h" + + + +void * +softpipe_create_sampler_state(struct pipe_context *pipe, + const struct pipe_sampler_state *sampler) +{ + return mem_dup(sampler, sizeof(*sampler)); +} + + +void +softpipe_bind_sampler_states(struct pipe_context *pipe, + unsigned num, void **sampler) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + unsigned i; + + assert(num <= PIPE_MAX_SAMPLERS); + + /* Check for no-op */ + if (num == softpipe->num_samplers && + !memcmp(softpipe->sampler, sampler, num * sizeof(void *))) + return; + + draw_flush(softpipe->draw); + + for (i = 0; i < num; ++i) + softpipe->sampler[i] = sampler[i]; + for (i = num; i < PIPE_MAX_SAMPLERS; ++i) + softpipe->sampler[i] = NULL; + + softpipe->num_samplers = num; + + softpipe->dirty |= SP_NEW_SAMPLER; +} + + +void +softpipe_set_sampler_textures(struct pipe_context *pipe, + unsigned num, struct pipe_texture **texture) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + uint i; + + assert(num <= PIPE_MAX_SAMPLERS); + + /* Check for no-op */ + if (num == softpipe->num_textures && + !memcmp(softpipe->texture, texture, num * sizeof(struct pipe_texture *))) + return; + + draw_flush(softpipe->draw); + + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) { + struct pipe_texture *tex = i < num ? texture[i] : NULL; + + pipe_texture_reference(&softpipe->texture[i], tex); + sp_tile_cache_set_texture(pipe, softpipe->tex_cache[i], tex); + } + + softpipe->num_textures = num; + + softpipe->dirty |= SP_NEW_TEXTURE; +} + + +void +softpipe_delete_sampler_state(struct pipe_context *pipe, + void *sampler) +{ + FREE( sampler ); +} + + + diff --git a/src/gallium/drivers/softpipe/sp_state_surface.c b/src/gallium/drivers/softpipe/sp_state_surface.c new file mode 100644 index 0000000000..8877b18af9 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_surface.c @@ -0,0 +1,128 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
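[Editorial sketch, not part of the patch.] Both softpipe_bind_sampler_states() and softpipe_set_sampler_textures() above start with the same cheap no-op test: if the caller rebinds exactly what is already bound, skip the draw flush and the dirty flag. A reduced sketch of that test with a toy context (MAX_SAMPLERS stands in for PIPE_MAX_SAMPLERS):

#include <string.h>

#define MAX_SAMPLERS 8   /* stand-in for PIPE_MAX_SAMPLERS */

struct toy_sampler_ctx {
   void *sampler[MAX_SAMPLERS];
   unsigned num_samplers;
};

/* Returns nonzero when the requested binding is identical to the current one,
 * so the caller can return early without flushing the draw module. */
int samplers_unchanged(const struct toy_sampler_ctx *ctx,
                       unsigned num, void **sampler)
{
   return num == ctx->num_samplers &&
          memcmp(ctx->sampler, sampler, num * sizeof(void *)) == 0;
}

With this early-out, a state tracker that redundantly rebinds the same sampler array every frame pays only an integer compare plus a memcmp.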
+ * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ +#include "pipe/p_inlines.h" + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_surface.h" +#include "sp_tile_cache.h" + + +/** + * XXX this might get moved someday + * Set the framebuffer surface info: color buffers, zbuffer, stencil buffer. + * Here, we flush the old surfaces and update the tile cache to point to the new + * surfaces. + */ +void +softpipe_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct softpipe_context *sp = softpipe_context(pipe); + uint i; + + for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) { + /* check if changing cbuf */ + if (sp->framebuffer.cbufs[i] != fb->cbufs[i]) { + /* flush old */ + sp_flush_tile_cache(sp, sp->cbuf_cache[i]); + + /* assign new */ + sp->framebuffer.cbufs[i] = fb->cbufs[i]; + + /* update cache */ + sp_tile_cache_set_surface(sp->cbuf_cache[i], fb->cbufs[i]); + } + } + + sp->framebuffer.num_cbufs = fb->num_cbufs; + + /* zbuf changing? */ + if (sp->framebuffer.zsbuf != fb->zsbuf) { + /* flush old */ + sp_flush_tile_cache(sp, sp->zsbuf_cache); + + /* assign new */ + sp->framebuffer.zsbuf = fb->zsbuf; + + /* update cache */ + sp_tile_cache_set_surface(sp->zsbuf_cache, fb->zsbuf); + } + +#if 0 + /* XXX combined depth/stencil here */ + + /* sbuf changing? */ + if (sp->framebuffer.sbuf != fb->sbuf) { + /* flush old */ + sp_flush_tile_cache(sp, sp->sbuf_cache_sep); + + /* assign new */ + sp->framebuffer.sbuf = fb->sbuf; + + /* update cache */ + if (fb->sbuf != fb->zbuf) { + /* separate stencil buf */ + sp->sbuf_cache = sp->sbuf_cache_sep; + sp_tile_cache_set_surface(sp->sbuf_cache, fb->sbuf); + } + else { + /* combined depth/stencil */ + sp->sbuf_cache = sp->zbuf_cache; + sp_tile_cache_set_surface(sp->sbuf_cache, fb->sbuf); + } + } +#endif + + /* Tell draw module how deep the Z/depth buffer is */ + { + int depth_bits; + double mrd; + if (sp->framebuffer.zsbuf) { + depth_bits = pf_get_component_bits(sp->framebuffer.zsbuf->format, + PIPE_FORMAT_COMP_Z); + } + else { + depth_bits = 0; + } + if (depth_bits > 16) { + mrd = 0.0000001; + } + else { + mrd = 0.00002; + } + draw_set_mrd(sp->draw, mrd); + } + + sp->framebuffer.width = fb->width; + sp->framebuffer.height = fb->height; + + sp->dirty |= SP_NEW_FRAMEBUFFER; +} diff --git a/src/gallium/drivers/softpipe/sp_state_vertex.c b/src/gallium/drivers/softpipe/sp_state_vertex.c new file mode 100644 index 0000000000..46b6991195 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_vertex.c @@ -0,0 +1,73 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
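[Editorial sketch, not part of the patch.] softpipe_set_framebuffer_state() above derives the minimum resolvable depth it hands to draw_set_mrd() purely from the Z buffer's bit depth. The same policy as a tiny standalone helper, with the constants copied from the code above:

/* Minimum resolvable depth: a small epsilon for deep (>16 bit) Z buffers,
 * a coarser one for 16-bit or absent Z buffers. */
double minimum_resolvable_depth(int depth_bits)
{
   if (depth_bits > 16)
      return 0.0000001;
   return 0.00002;
}

The draw module uses this value when offsetting polygons, so the coarser epsilon for shallow Z buffers keeps depth offsets representable.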
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_surface.h" + +#include "draw/draw_context.h" + + +void +softpipe_set_vertex_elements(struct pipe_context *pipe, + unsigned count, + const struct pipe_vertex_element *attribs) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + + assert(count <= PIPE_MAX_ATTRIBS); + + memcpy(softpipe->vertex_element, attribs, + count * sizeof(struct pipe_vertex_element)); + softpipe->num_vertex_elements = count; + + softpipe->dirty |= SP_NEW_VERTEX; + + draw_set_vertex_elements(softpipe->draw, count, attribs); +} + + +void +softpipe_set_vertex_buffers(struct pipe_context *pipe, + unsigned count, + const struct pipe_vertex_buffer *buffers) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + + assert(count <= PIPE_MAX_ATTRIBS); + + memcpy(softpipe->vertex_buffer, buffers, count * sizeof(buffers[0])); + softpipe->num_vertex_buffers = count; + + softpipe->dirty |= SP_NEW_VERTEX; + + draw_set_vertex_buffers(softpipe->draw, count, buffers); +} diff --git a/src/gallium/drivers/softpipe/sp_surface.c b/src/gallium/drivers/softpipe/sp_surface.c new file mode 100644 index 0000000000..6ade732698 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_surface.c @@ -0,0 +1,38 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "util/u_rect.h" +#include "sp_context.h" + + + +void +sp_init_surface_functions(struct softpipe_context *sp) +{ + sp->pipe.surface_copy = util_surface_copy; + sp->pipe.surface_fill = util_surface_fill; +} diff --git a/src/gallium/drivers/softpipe/sp_surface.h b/src/gallium/drivers/softpipe/sp_surface.h new file mode 100644 index 0000000000..22de3ba43f --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_surface.h @@ -0,0 +1,42 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* Authors: Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef SP_SURFACE_H +#define SP_SURFACE_H + + +struct softpipe_context; + + +extern void +sp_init_surface_functions(struct softpipe_context *sp); + + +#endif /* SP_SURFACE_H */ diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c new file mode 100644 index 0000000000..631c60966c --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_tex_sample.c @@ -0,0 +1,1187 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Texture sampling + * + * Authors: + * Brian Paul + */ + +#include "sp_context.h" +#include "sp_headers.h" +#include "sp_surface.h" +#include "sp_texture.h" +#include "sp_tex_sample.h" +#include "sp_tile_cache.h" +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "tgsi/tgsi_exec.h" +#include "util/u_math.h" +#include "util/u_memory.h" + + +/* + * Note, the FRAC macro has to work perfectly. Otherwise you'll sometimes + * see 1-pixel bands of improperly weighted linear-filtered textures. + * The tests/texwrap.c demo is a good test. + * Also note, FRAC(x) doesn't truly return the fractional part of x for x < 0. + * Instead, if x < 0 then FRAC(x) = 1 - true_frac(x). + */ +#define FRAC(f) ((f) - util_ifloor(f)) + + +/** + * Linear interpolation macro + */ +static INLINE float +lerp(float a, float v0, float v1) +{ + return v0 + a * (v1 - v0); +} + + +/** + * Do 2D/biliner interpolation of float values. + * v00, v10, v01 and v11 are typically four texture samples in a square/box. + * a and b are the horizontal and vertical interpolants. + * It's important that this function is inlined when compiled with + * optimization! If we find that's not true on some systems, convert + * to a macro. + */ +static INLINE float +lerp_2d(float a, float b, + float v00, float v10, float v01, float v11) +{ + const float temp0 = lerp(a, v00, v10); + const float temp1 = lerp(a, v01, v11); + return lerp(b, temp0, temp1); +} + + +/** + * As above, but 3D interpolation of 8 values. + */ +static INLINE float +lerp_3d(float a, float b, float c, + float v000, float v100, float v010, float v110, + float v001, float v101, float v011, float v111) +{ + const float temp0 = lerp_2d(a, b, v000, v100, v010, v110); + const float temp1 = lerp_2d(a, b, v001, v101, v011, v111); + return lerp(c, temp0, temp1); +} + + + +/** + * If A is a signed integer, A % B doesn't give the right value for A < 0 + * (in terms of texture repeat). Just casting to unsigned fixes that. + */ +#define REMAINDER(A, B) ((unsigned) (A) % (unsigned) (B)) + + +/** + * Apply texture coord wrapping mode and return integer texture indexes + * for a vector of four texcoords (S or T or P). 
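[Editorial sketch, not part of the patch.] FRAC() and the lerp helpers above are the core of every linear filter in this file. A standalone sketch with worked values, including the FRAC(-0.25) = 0.75 behaviour the comment warns about; floorf stands in for util_ifloor here.

#include <stdio.h>
#include <math.h>

/* Same definition as above, but using floorf instead of util_ifloor. */
#define FRAC(f) ((f) - floorf(f))

float lerp(float a, float v0, float v1)
{
   return v0 + a * (v1 - v0);
}

/* Bilinear blend of four samples laid out v00 v10 / v01 v11. */
float lerp_2d(float a, float b, float v00, float v10, float v01, float v11)
{
   const float temp0 = lerp(a, v00, v10);
   const float temp1 = lerp(a, v01, v11);
   return lerp(b, temp0, temp1);
}

int main(void)
{
   printf("FRAC(1.25)  = %.2f\n", FRAC(1.25f));    /* 0.25 */
   printf("FRAC(-0.25) = %.2f\n", FRAC(-0.25f));   /* 0.75, not 0.25 */
   /* halfway in x, a quarter of the way in y, between texels 0,1,2,3: */
   printf("lerp_2d     = %.3f\n", lerp_2d(0.5f, 0.25f, 0.0f, 1.0f, 2.0f, 3.0f)); /* 1.000 */
   return 0;
}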
+ * \param wrapMode PIPE_TEX_WRAP_x + * \param s the incoming texcoords + * \param size the texture image size + * \param icoord returns the integer texcoords + * \return integer texture index + */ +static INLINE void +nearest_texcoord_4(unsigned wrapMode, const float s[4], unsigned size, + int icoord[4]) +{ + uint ch; + switch (wrapMode) { + case PIPE_TEX_WRAP_REPEAT: + /* s limited to [0,1) */ + /* i limited to [0,size-1] */ + for (ch = 0; ch < 4; ch++) { + int i = util_ifloor(s[ch] * size); + icoord[ch] = REMAINDER(i, size); + } + return; + case PIPE_TEX_WRAP_CLAMP: + /* s limited to [0,1] */ + /* i limited to [0,size-1] */ + for (ch = 0; ch < 4; ch++) { + if (s[ch] <= 0.0F) + icoord[ch] = 0; + else if (s[ch] >= 1.0F) + icoord[ch] = size - 1; + else + icoord[ch] = util_ifloor(s[ch] * size); + } + return; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + { + /* s limited to [min,max] */ + /* i limited to [0, size-1] */ + const float min = 1.0F / (2.0F * size); + const float max = 1.0F - min; + for (ch = 0; ch < 4; ch++) { + if (s[ch] < min) + icoord[ch] = 0; + else if (s[ch] > max) + icoord[ch] = size - 1; + else + icoord[ch] = util_ifloor(s[ch] * size); + } + } + return; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + { + /* s limited to [min,max] */ + /* i limited to [-1, size] */ + const float min = -1.0F / (2.0F * size); + const float max = 1.0F - min; + for (ch = 0; ch < 4; ch++) { + if (s[ch] <= min) + icoord[ch] = -1; + else if (s[ch] >= max) + icoord[ch] = size; + else + icoord[ch] = util_ifloor(s[ch] * size); + } + } + return; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + { + const float min = 1.0F / (2.0F * size); + const float max = 1.0F - min; + for (ch = 0; ch < 4; ch++) { + const int flr = util_ifloor(s[ch]); + float u; + if (flr & 1) + u = 1.0F - (s[ch] - (float) flr); + else + u = s[ch] - (float) flr; + if (u < min) + icoord[ch] = 0; + else if (u > max) + icoord[ch] = size - 1; + else + icoord[ch] = util_ifloor(u * size); + } + } + return; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + for (ch = 0; ch < 4; ch++) { + /* s limited to [0,1] */ + /* i limited to [0,size-1] */ + const float u = fabsf(s[ch]); + if (u <= 0.0F) + icoord[ch] = 0; + else if (u >= 1.0F) + icoord[ch] = size - 1; + else + icoord[ch] = util_ifloor(u * size); + } + return; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + { + /* s limited to [min,max] */ + /* i limited to [0, size-1] */ + const float min = 1.0F / (2.0F * size); + const float max = 1.0F - min; + for (ch = 0; ch < 4; ch++) { + const float u = fabsf(s[ch]); + if (u < min) + icoord[ch] = 0; + else if (u > max) + icoord[ch] = size - 1; + else + icoord[ch] = util_ifloor(u * size); + } + } + return; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + { + /* s limited to [min,max] */ + /* i limited to [0, size-1] */ + const float min = -1.0F / (2.0F * size); + const float max = 1.0F - min; + for (ch = 0; ch < 4; ch++) { + const float u = fabsf(s[ch]); + if (u < min) + icoord[ch] = -1; + else if (u > max) + icoord[ch] = size; + else + icoord[ch] = util_ifloor(u * size); + } + } + return; + default: + assert(0); + } +} + + +/** + * Used to compute texel locations for linear sampling for four texcoords. 
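[Editorial sketch, not part of the patch.] nearest_texcoord_4() above reduces, per wrap mode, to mapping one normalized coordinate to one texel index. A scalar sketch of the two most common modes, REPEAT and CLAMP_TO_EDGE, again with floorf standing in for util_ifloor:

#include <stdio.h>
#include <math.h>

/* REPEAT: wrap into [0, size-1]; the unsigned cast handles s < 0 the same
 * way as the REMAINDER macro above. */
int wrap_repeat_nearest(float s, unsigned size)
{
   int i = (int) floorf(s * size);
   return (int) ((unsigned) i % size);
}

/* CLAMP_TO_EDGE: clamp s to [half a texel, 1 - half a texel] first. */
int wrap_clamp_to_edge_nearest(float s, unsigned size)
{
   const float min = 1.0f / (2.0f * size);
   const float max = 1.0f - min;
   if (s < min)
      return 0;
   if (s > max)
      return (int) size - 1;
   return (int) floorf(s * size);
}

int main(void)
{
   printf("repeat(1.3, 8) = %d\n", wrap_repeat_nearest(1.3f, 8));         /* 2 */
   printf("edge(1.3, 8)   = %d\n", wrap_clamp_to_edge_nearest(1.3f, 8));  /* 7 */
   printf("edge(-0.2, 8)  = %d\n", wrap_clamp_to_edge_nearest(-0.2f, 8)); /* 0 */
   return 0;
}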
+ * \param wrapMode PIPE_TEX_WRAP_x + * \param s the texcoords + * \param size the texture image size + * \param icoord0 returns first texture indexes + * \param icoord1 returns second texture indexes (usually icoord0 + 1) + * \param w returns blend factor/weight between texture indexes + * \param icoord returns the computed integer texture coords + */ +static INLINE void +linear_texcoord_4(unsigned wrapMode, const float s[4], unsigned size, + int icoord0[4], int icoord1[4], float w[4]) +{ + uint ch; + + switch (wrapMode) { + case PIPE_TEX_WRAP_REPEAT: + for (ch = 0; ch < 4; ch++) { + float u = s[ch] * size - 0.5F; + icoord0[ch] = REMAINDER(util_ifloor(u), size); + icoord1[ch] = REMAINDER(icoord0[ch] + 1, size); + w[ch] = FRAC(u); + } + break;; + case PIPE_TEX_WRAP_CLAMP: + for (ch = 0; ch < 4; ch++) { + float u = CLAMP(s[ch], 0.0F, 1.0F); + u = u * size - 0.5f; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + w[ch] = FRAC(u); + } + break;; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + for (ch = 0; ch < 4; ch++) { + float u = CLAMP(s[ch], 0.0F, 1.0F); + u = u * size - 0.5f; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + if (icoord0[ch] < 0) + icoord0[ch] = 0; + if (icoord1[ch] >= (int) size) + icoord1[ch] = size - 1; + w[ch] = FRAC(u); + } + break;; + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + { + const float min = -1.0F / (2.0F * size); + const float max = 1.0F - min; + for (ch = 0; ch < 4; ch++) { + float u = CLAMP(s[ch], min, max); + u = u * size - 0.5f; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + w[ch] = FRAC(u); + } + } + break;; + case PIPE_TEX_WRAP_MIRROR_REPEAT: + for (ch = 0; ch < 4; ch++) { + const int flr = util_ifloor(s[ch]); + float u; + if (flr & 1) + u = 1.0F - (s[ch] - (float) flr); + else + u = s[ch] - (float) flr; + u = u * size - 0.5F; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + if (icoord0[ch] < 0) + icoord0[ch] = 0; + if (icoord1[ch] >= (int) size) + icoord1[ch] = size - 1; + w[ch] = FRAC(u); + } + break;; + case PIPE_TEX_WRAP_MIRROR_CLAMP: + for (ch = 0; ch < 4; ch++) { + float u = fabsf(s[ch]); + if (u >= 1.0F) + u = (float) size; + else + u *= size; + u -= 0.5F; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + w[ch] = FRAC(u); + } + break;; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: + for (ch = 0; ch < 4; ch++) { + float u = fabsf(s[ch]); + if (u >= 1.0F) + u = (float) size; + else + u *= size; + u -= 0.5F; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + if (icoord0[ch] < 0) + icoord0[ch] = 0; + if (icoord1[ch] >= (int) size) + icoord1[ch] = size - 1; + w[ch] = FRAC(u); + } + break;; + case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: + { + const float min = -1.0F / (2.0F * size); + const float max = 1.0F - min; + for (ch = 0; ch < 4; ch++) { + float u = fabsf(s[ch]); + if (u <= min) + u = min * size; + else if (u >= max) + u = max * size; + else + u *= size; + u -= 0.5F; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + w[ch] = FRAC(u); + } + } + break;; + default: + assert(0); + } +} + + +/** + * For RECT textures / unnormalized texcoords + * Only a subset of wrap modes supported. 
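[Editorial sketch, not part of the patch.] The linear variants above return two texel indices plus a blend weight. A scalar sketch of the CLAMP_TO_EDGE case with a worked value:

#include <stdio.h>
#include <math.h>

#define CLAMP(x, lo, hi) ((x) < (lo) ? (lo) : ((x) > (hi) ? (hi) : (x)))
#define FRAC(f) ((f) - floorf(f))

/* CLAMP_TO_EDGE, linear: centre the coordinate on texel centres
 * (u = s*size - 0.5), take floor(u) and floor(u)+1, clamp both to the
 * image, and use the fractional part as the blend weight. */
void linear_clamp_to_edge(float s, unsigned size, int *i0, int *i1, float *w)
{
   float u = CLAMP(s, 0.0f, 1.0f);
   u = u * size - 0.5f;
   *i0 = (int) floorf(u);
   *i1 = *i0 + 1;
   if (*i0 < 0)
      *i0 = 0;
   if (*i1 >= (int) size)
      *i1 = (int) size - 1;
   *w = FRAC(u);
}

int main(void)
{
   int i0, i1;
   float w;
   linear_clamp_to_edge(0.30f, 8, &i0, &i1, &w);
   /* 0.30 * 8 - 0.5 = 1.9 -> texels 1 and 2, weight 0.9 towards texel 2 */
   printf("texels %d,%d  weight %.2f\n", i0, i1, w);
   return 0;
}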
+ */ +static INLINE void +nearest_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size, + int icoord[4]) +{ + uint ch; + switch (wrapMode) { + case PIPE_TEX_WRAP_CLAMP: + for (ch = 0; ch < 4; ch++) { + int i = util_ifloor(s[ch]); + icoord[ch]= CLAMP(i, 0, (int) size-1); + } + return; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + /* fall-through */ + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + for (ch = 0; ch < 4; ch++) { + icoord[ch]= util_ifloor( CLAMP(s[ch], 0.5F, (float) size - 0.5F) ); + } + return; + default: + assert(0); + } +} + + +/** + * For RECT textures / unnormalized texcoords. + * Only a subset of wrap modes supported. + */ +static INLINE void +linear_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size, + int icoord0[4], int icoord1[4], float w[4]) +{ + uint ch; + switch (wrapMode) { + case PIPE_TEX_WRAP_CLAMP: + for (ch = 0; ch < 4; ch++) { + /* Not exactly what the spec says, but it matches NVIDIA output */ + float u = CLAMP(s[ch] - 0.5F, 0.0f, (float) size - 1.0f); + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + w[ch] = FRAC(u); + } + return; + case PIPE_TEX_WRAP_CLAMP_TO_EDGE: + /* fall-through */ + case PIPE_TEX_WRAP_CLAMP_TO_BORDER: + for (ch = 0; ch < 4; ch++) { + float u = CLAMP(s[ch], 0.5F, (float) size - 0.5F); + u -= 0.5F; + icoord0[ch] = util_ifloor(u); + icoord1[ch] = icoord0[ch] + 1; + if (icoord1[ch] > (int) size - 1) + icoord1[ch] = size - 1; + w[ch] = FRAC(u); + } + break; + default: + assert(0); + } +} + + +static unsigned +choose_cube_face(float rx, float ry, float rz, float *newS, float *newT) +{ + /* + major axis + direction target sc tc ma + ---------- ------------------------------- --- --- --- + +rx TEXTURE_CUBE_MAP_POSITIVE_X_EXT -rz -ry rx + -rx TEXTURE_CUBE_MAP_NEGATIVE_X_EXT +rz -ry rx + +ry TEXTURE_CUBE_MAP_POSITIVE_Y_EXT +rx +rz ry + -ry TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT +rx -rz ry + +rz TEXTURE_CUBE_MAP_POSITIVE_Z_EXT +rx -ry rz + -rz TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT -rx -ry rz + */ + const float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz); + unsigned face; + float sc, tc, ma; + + if (arx > ary && arx > arz) { + if (rx >= 0.0F) { + face = PIPE_TEX_FACE_POS_X; + sc = -rz; + tc = -ry; + ma = arx; + } + else { + face = PIPE_TEX_FACE_NEG_X; + sc = rz; + tc = -ry; + ma = arx; + } + } + else if (ary > arx && ary > arz) { + if (ry >= 0.0F) { + face = PIPE_TEX_FACE_POS_Y; + sc = rx; + tc = rz; + ma = ary; + } + else { + face = PIPE_TEX_FACE_NEG_Y; + sc = rx; + tc = -rz; + ma = ary; + } + } + else { + if (rz > 0.0F) { + face = PIPE_TEX_FACE_POS_Z; + sc = rx; + tc = -ry; + ma = arz; + } + else { + face = PIPE_TEX_FACE_NEG_Z; + sc = -rx; + tc = -ry; + ma = arz; + } + } + + *newS = ( sc / ma + 1.0F ) * 0.5F; + *newT = ( tc / ma + 1.0F ) * 0.5F; + + return face; +} + + +/** + * Examine the quad's texture coordinates to compute the partial + * derivatives w.r.t X and Y, then compute lambda (level of detail). + * + * This is only done for fragment shaders, not vertex shaders. 
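[Editorial sketch, not part of the patch.] choose_cube_face() above picks the face from the direction component with the largest magnitude, then remaps the other two components into [0,1] with (coord/ma + 1)/2. A worked example for one direction vector; the face name in the printout is informal shorthand, not the PIPE_TEX_FACE_* constant.

#include <stdio.h>
#include <math.h>

int main(void)
{
   /* Direction (0.2, -1.0, 0.4): |ry| is the major axis and ry < 0, so the
    * table above selects the -Y face with sc = +rx, tc = -rz, ma = |ry|. */
   const float rx = 0.2f, ry = -1.0f, rz = 0.4f;
   const float sc = rx, tc = -rz, ma = fabsf(ry);

   /* Remap sc/ma and tc/ma from [-1,1] to [0,1] face coordinates. */
   const float s = (sc / ma + 1.0f) * 0.5f;
   const float t = (tc / ma + 1.0f) * 0.5f;

   printf("face = NEG_Y, s = %.2f, t = %.2f\n", s, t);   /* s = 0.60, t = 0.30 */
   return 0;
}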
+ */ +static float +compute_lambda(const struct pipe_texture *tex, + const struct pipe_sampler_state *sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias) +{ + float rho, lambda; + + assert(sampler->normalized_coords); + + assert(s); + { + float dsdx = s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]; + float dsdy = s[QUAD_TOP_LEFT] - s[QUAD_BOTTOM_LEFT]; + dsdx = fabsf(dsdx); + dsdy = fabsf(dsdy); + rho = MAX2(dsdx, dsdy) * tex->width[0]; + } + if (t) { + float dtdx = t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]; + float dtdy = t[QUAD_TOP_LEFT] - t[QUAD_BOTTOM_LEFT]; + float max; + dtdx = fabsf(dtdx); + dtdy = fabsf(dtdy); + max = MAX2(dtdx, dtdy) * tex->height[0]; + rho = MAX2(rho, max); + } + if (p) { + float dpdx = p[QUAD_BOTTOM_RIGHT] - p[QUAD_BOTTOM_LEFT]; + float dpdy = p[QUAD_TOP_LEFT] - p[QUAD_BOTTOM_LEFT]; + float max; + dpdx = fabsf(dpdx); + dpdy = fabsf(dpdy); + max = MAX2(dpdx, dpdy) * tex->depth[0]; + rho = MAX2(rho, max); + } + + lambda = util_fast_log2(rho); + lambda += lodbias + sampler->lod_bias; + lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod); + + return lambda; +} + + +/** + * Do several things here: + * 1. Compute lambda from the texcoords, if needed + * 2. Determine if we're minifying or magnifying + * 3. If minifying, choose mipmap levels + * 4. Return image filter to use within mipmap images + * \param level0 Returns first mipmap level to sample from + * \param level1 Returns second mipmap level to sample from + * \param levelBlend Returns blend factor between levels, in [0,1] + * \param imgFilter Returns either the min or mag filter, depending on lambda + */ +static void +choose_mipmap_levels(const struct pipe_texture *texture, + const struct pipe_sampler_state *sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + unsigned *level0, unsigned *level1, float *levelBlend, + unsigned *imgFilter) +{ + if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) { + /* no mipmap selection needed */ + *level0 = *level1 = CLAMP((int) sampler->min_lod, + 0, (int) texture->last_level); + + if (sampler->min_img_filter != sampler->mag_img_filter) { + /* non-mipmapped texture, but still need to determine if doing + * minification or magnification. 
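[Editorial sketch, not part of the patch.] compute_lambda() above estimates how many texels the quad crosses per pixel (rho) from per-quad coordinate differences, then takes log2 and clamps to the sampler's LOD range. A one-coordinate sketch with a worked number; log2f replaces util_fast_log2:

#include <stdio.h>
#include <math.h>

#define MAX2(a, b) ((a) > (b) ? (a) : (b))
#define CLAMP(x, lo, hi) ((x) < (lo) ? (lo) : ((x) > (hi) ? (hi) : (x)))

/* One-coordinate version of compute_lambda(): rho is how many texels the
 * quad spans per pixel, lambda = log2(rho) plus the LOD bias, clamped to
 * the sampler's LOD range. */
float lambda_1d(float ds_dx, float ds_dy, unsigned width,
                float lodbias, float min_lod, float max_lod)
{
   const float rho = MAX2(fabsf(ds_dx), fabsf(ds_dy)) * width;
   float lambda = log2f(rho);
   lambda += lodbias;
   return CLAMP(lambda, min_lod, max_lod);
}

int main(void)
{
   /* s changes by 1/64 per pixel across a 256-texel texture:
    * rho = 4 texels/pixel, so lambda = 2 -> two mipmap levels down. */
   printf("lambda = %.1f\n", lambda_1d(1.0f / 64.0f, 0.0f, 256, 0.0f, 0.0f, 16.0f));
   return 0;
}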
+ */ + float lambda = compute_lambda(texture, sampler, s, t, p, lodbias); + if (lambda <= 0.0) { + *imgFilter = sampler->mag_img_filter; + } + else { + *imgFilter = sampler->min_img_filter; + } + } + else { + *imgFilter = sampler->mag_img_filter; + } + } + else { + float lambda; + + if (1) + /* fragment shader */ + lambda = compute_lambda(texture, sampler, s, t, p, lodbias); + else + /* vertex shader */ + lambda = lodbias; /* not really a bias, but absolute LOD */ + + if (lambda <= 0.0) { /* XXX threshold depends on the filter */ + /* magnifying */ + *imgFilter = sampler->mag_img_filter; + *level0 = *level1 = 0; + } + else { + /* minifying */ + *imgFilter = sampler->min_img_filter; + + /* choose mipmap level(s) and compute the blend factor between them */ + if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NEAREST) { + /* Nearest mipmap level */ + const int lvl = (int) (lambda + 0.5); + *level0 = + *level1 = CLAMP(lvl, 0, (int) texture->last_level); + } + else { + /* Linear interpolation between mipmap levels */ + const int lvl = (int) lambda; + *level0 = CLAMP(lvl, 0, (int) texture->last_level); + *level1 = CLAMP(lvl + 1, 0, (int) texture->last_level); + *levelBlend = FRAC(lambda); /* blending weight between levels */ + } + } + } +} + + +/** + * Get a texel from a texture, using the texture tile cache. + * + * \param face the cube face in 0..5 + * \param level the mipmap level + * \param x the x coord of texel within 2D image + * \param y the y coord of texel within 2D image + * \param z which slice of a 3D texture + * \param rgba the quad to put the texel/color into + * \param j which element of the rgba quad to write to + * + * XXX maybe move this into sp_tile_cache.c and merge with the + * sp_get_cached_tile_tex() function. Also, get 4 texels instead of 1... + */ +static void +get_texel(const struct tgsi_sampler *tgsi_sampler, + unsigned face, unsigned level, int x, int y, int z, + float rgba[NUM_CHANNELS][QUAD_SIZE], unsigned j) +{ + const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler); + struct softpipe_context *sp = samp->sp; + const uint unit = samp->unit; + const struct pipe_texture *texture = sp->texture[unit]; + const struct pipe_sampler_state *sampler = sp->sampler[unit]; + + if (x < 0 || x >= (int) texture->width[level] || + y < 0 || y >= (int) texture->height[level] || + z < 0 || z >= (int) texture->depth[level]) { + rgba[0][j] = sampler->border_color[0]; + rgba[1][j] = sampler->border_color[1]; + rgba[2][j] = sampler->border_color[2]; + rgba[3][j] = sampler->border_color[3]; + } + else { + const int tx = x % TILE_SIZE; + const int ty = y % TILE_SIZE; + const struct softpipe_cached_tile *tile + = sp_get_cached_tile_tex(sp, samp->cache, + x, y, z, face, level); + rgba[0][j] = tile->data.color[ty][tx][0]; + rgba[1][j] = tile->data.color[ty][tx][1]; + rgba[2][j] = tile->data.color[ty][tx][2]; + rgba[3][j] = tile->data.color[ty][tx][3]; + if (0) + { + debug_printf("Get texel %f %f %f %f from %s\n", + rgba[0][j], rgba[1][j], rgba[2][j], rgba[3][j], + pf_name(texture->format)); + } + } +} + + +/** + * Compare texcoord 'p' (aka R) against texture value 'rgba[0]' + * When we sampled the depth texture, the depth value was put into all + * RGBA channels. We look at the red channel here. 
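[Editorial sketch, not part of the patch.] choose_mipmap_levels() above turns lambda into either one rounded level (NEAREST mip filter) or two adjacent levels plus a blend factor (LINEAR). A reduced sketch of just that selection; the mip_filter enum is a local stand-in for the PIPE_TEX_MIPFILTER_* values.

#include <stdio.h>
#include <math.h>

#define CLAMP(x, lo, hi) ((x) < (lo) ? (lo) : ((x) > (hi) ? (hi) : (x)))

enum mip_filter { MIP_NEAREST, MIP_LINEAR };

/* Pick level0/level1 and the blend weight between them for a minified
 * texture, mirroring the selection logic above. */
void pick_levels(enum mip_filter filter, float lambda, unsigned last_level,
                 unsigned *level0, unsigned *level1, float *blend)
{
   if (filter == MIP_NEAREST) {
      const int lvl = (int) (lambda + 0.5f);       /* round to the nearest level */
      *level0 = *level1 = CLAMP(lvl, 0, (int) last_level);
      *blend = 0.0f;
   }
   else {
      const int lvl = (int) lambda;                /* blend two adjacent levels */
      *level0 = CLAMP(lvl, 0, (int) last_level);
      *level1 = CLAMP(lvl + 1, 0, (int) last_level);
      *blend = lambda - floorf(lambda);
   }
}

int main(void)
{
   unsigned l0, l1;
   float b;
   pick_levels(MIP_LINEAR, 2.3f, 8, &l0, &l1, &b);
   printf("levels %u and %u, blend %.1f\n", l0, l1, b);  /* levels 2 and 3, blend 0.3 */
   return 0;
}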
+ */ +static INLINE void +shadow_compare(uint compare_func, + float rgba[NUM_CHANNELS][QUAD_SIZE], + const float p[QUAD_SIZE], + uint j) +{ + int k; + switch (compare_func) { + case PIPE_FUNC_LESS: + k = p[j] < rgba[0][j]; + break; + case PIPE_FUNC_LEQUAL: + k = p[j] <= rgba[0][j]; + break; + case PIPE_FUNC_GREATER: + k = p[j] > rgba[0][j]; + break; + case PIPE_FUNC_GEQUAL: + k = p[j] >= rgba[0][j]; + break; + case PIPE_FUNC_EQUAL: + k = p[j] == rgba[0][j]; + break; + case PIPE_FUNC_NOTEQUAL: + k = p[j] != rgba[0][j]; + break; + case PIPE_FUNC_ALWAYS: + k = 1; + break; + case PIPE_FUNC_NEVER: + k = 0; + break; + default: + k = 0; + assert(0); + break; + } + + rgba[0][j] = rgba[1][j] = rgba[2][j] = (float) k; +} + + +/** + * Common code for sampling 1D/2D/cube textures. + * Could probably extend for 3D... + */ +static void +sp_get_samples_2d_common(const struct tgsi_sampler *tgsi_sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE], + const unsigned faces[4]) +{ + const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler); + const struct softpipe_context *sp = samp->sp; + const uint unit = samp->unit; + const struct pipe_texture *texture = sp->texture[unit]; + const struct pipe_sampler_state *sampler = sp->sampler[unit]; + const uint compare_func = sampler->compare_func; + unsigned level0, level1, j, imgFilter; + int width, height; + float levelBlend; + + choose_mipmap_levels(texture, sampler, s, t, p, lodbias, + &level0, &level1, &levelBlend, &imgFilter); + + assert(sampler->normalized_coords); + + width = texture->width[level0]; + height = texture->height[level0]; + + assert(width > 0); + + switch (imgFilter) { + case PIPE_TEX_FILTER_NEAREST: + { + int x[4], y[4]; + nearest_texcoord_4(sampler->wrap_s, s, width, x); + nearest_texcoord_4(sampler->wrap_t, t, height, y); + + for (j = 0; j < QUAD_SIZE; j++) { + get_texel(tgsi_sampler, faces[j], level0, x[j], y[j], 0, rgba, j); + if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + shadow_compare(compare_func, rgba, p, j); + } + + if (level0 != level1) { + /* get texels from second mipmap level and blend */ + float rgba2[4][4]; + unsigned c; + x[j] /= 2; + y[j] /= 2; + get_texel(tgsi_sampler, faces[j], level1, x[j], y[j], 0, + rgba2, j); + if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){ + shadow_compare(compare_func, rgba2, p, j); + } + + for (c = 0; c < NUM_CHANNELS; c++) { + rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]); + } + } + } + } + break; + case PIPE_TEX_FILTER_LINEAR: + case PIPE_TEX_FILTER_ANISO: + { + int x0[4], y0[4], x1[4], y1[4]; + float xw[4], yw[4]; /* weights */ + + linear_texcoord_4(sampler->wrap_s, s, width, x0, x1, xw); + linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw); + + for (j = 0; j < QUAD_SIZE; j++) { + float tx[4][4]; /* texels */ + int c; + get_texel(tgsi_sampler, faces[j], level0, x0[j], y0[j], 0, tx, 0); + get_texel(tgsi_sampler, faces[j], level0, x1[j], y0[j], 0, tx, 1); + get_texel(tgsi_sampler, faces[j], level0, x0[j], y1[j], 0, tx, 2); + get_texel(tgsi_sampler, faces[j], level0, x1[j], y1[j], 0, tx, 3); + if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + shadow_compare(compare_func, tx, p, 0); + shadow_compare(compare_func, tx, p, 1); + shadow_compare(compare_func, tx, p, 2); + shadow_compare(compare_func, tx, p, 3); + } + + /* interpolate R, G, B, A */ + for (c = 0; c < 4; c++) { + rgba[c][j] = lerp_2d(xw[j], yw[j], + tx[c][0], tx[c][1], + 
tx[c][2], tx[c][3]); + } + + if (level0 != level1) { + /* get texels from second mipmap level and blend */ + float rgba2[4][4]; + x0[j] /= 2; + y0[j] /= 2; + x1[j] /= 2; + y1[j] /= 2; + get_texel(tgsi_sampler, faces[j], level1, x0[j], y0[j], 0, tx, 0); + get_texel(tgsi_sampler, faces[j], level1, x1[j], y0[j], 0, tx, 1); + get_texel(tgsi_sampler, faces[j], level1, x0[j], y1[j], 0, tx, 2); + get_texel(tgsi_sampler, faces[j], level1, x1[j], y1[j], 0, tx, 3); + if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){ + shadow_compare(compare_func, tx, p, 0); + shadow_compare(compare_func, tx, p, 1); + shadow_compare(compare_func, tx, p, 2); + shadow_compare(compare_func, tx, p, 3); + } + + /* interpolate R, G, B, A */ + for (c = 0; c < 4; c++) { + rgba2[c][j] = lerp_2d(xw[j], yw[j], + tx[c][0], tx[c][1], tx[c][2], tx[c][3]); + } + + for (c = 0; c < NUM_CHANNELS; c++) { + rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]); + } + } + } + } + break; + default: + assert(0); + } +} + + +static INLINE void +sp_get_samples_1d(const struct tgsi_sampler *sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ + static const unsigned faces[4] = {0, 0, 0, 0}; + static const float tzero[4] = {0, 0, 0, 0}; + sp_get_samples_2d_common(sampler, s, tzero, NULL, lodbias, rgba, faces); +} + + +static INLINE void +sp_get_samples_2d(const struct tgsi_sampler *sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ + static const unsigned faces[4] = {0, 0, 0, 0}; + sp_get_samples_2d_common(sampler, s, t, p, lodbias, rgba, faces); +} + + +static INLINE void +sp_get_samples_3d(const struct tgsi_sampler *tgsi_sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ + const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler); + const struct softpipe_context *sp = samp->sp; + const uint unit = samp->unit; + const struct pipe_texture *texture = sp->texture[unit]; + const struct pipe_sampler_state *sampler = sp->sampler[unit]; + /* get/map pipe_surfaces corresponding to 3D tex slices */ + unsigned level0, level1, j, imgFilter; + int width, height, depth; + float levelBlend; + const uint face = 0; + + choose_mipmap_levels(texture, sampler, s, t, p, lodbias, + &level0, &level1, &levelBlend, &imgFilter); + + assert(sampler->normalized_coords); + + width = texture->width[level0]; + height = texture->height[level0]; + depth = texture->depth[level0]; + + assert(width > 0); + assert(height > 0); + assert(depth > 0); + + switch (imgFilter) { + case PIPE_TEX_FILTER_NEAREST: + { + int x[4], y[4], z[4]; + nearest_texcoord_4(sampler->wrap_s, s, width, x); + nearest_texcoord_4(sampler->wrap_t, t, height, y); + nearest_texcoord_4(sampler->wrap_r, p, depth, z); + for (j = 0; j < QUAD_SIZE; j++) { + get_texel(tgsi_sampler, face, level0, x[j], y[j], z[j], rgba, j); + if (level0 != level1) { + /* get texels from second mipmap level and blend */ + float rgba2[4][4]; + unsigned c; + x[j] /= 2; + y[j] /= 2; + z[j] /= 2; + get_texel(tgsi_sampler, face, level1, x[j], y[j], z[j], rgba2, j); + for (c = 0; c < NUM_CHANNELS; c++) { + rgba[c][j] = lerp(levelBlend, rgba2[c][j], rgba[c][j]); + } + } + } + } + break; + case PIPE_TEX_FILTER_LINEAR: + case PIPE_TEX_FILTER_ANISO: + { + int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4]; + float xw[4], yw[4], 
zw[4]; /* interpolation weights */ + linear_texcoord_4(sampler->wrap_s, s, width, x0, x1, xw); + linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw); + linear_texcoord_4(sampler->wrap_r, p, depth, z0, z1, zw); + + for (j = 0; j < QUAD_SIZE; j++) { + int c; + float tx0[4][4], tx1[4][4]; + get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z0[j], tx0, 0); + get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z0[j], tx0, 1); + get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z0[j], tx0, 2); + get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z0[j], tx0, 3); + get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z1[j], tx1, 0); + get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z1[j], tx1, 1); + get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z1[j], tx1, 2); + get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z1[j], tx1, 3); + + /* interpolate R, G, B, A */ + for (c = 0; c < 4; c++) { + rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j], + tx0[c][0], tx0[c][1], + tx0[c][2], tx0[c][3], + tx1[c][0], tx1[c][1], + tx1[c][2], tx1[c][3]); + } + + if (level0 != level1) { + /* get texels from second mipmap level and blend */ + float rgba2[4][4]; + x0[j] /= 2; + y0[j] /= 2; + z0[j] /= 2; + x1[j] /= 2; + y1[j] /= 2; + z1[j] /= 2; + get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z0[j], tx0, 0); + get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z0[j], tx0, 1); + get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z0[j], tx0, 2); + get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z0[j], tx0, 3); + get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z1[j], tx1, 0); + get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z1[j], tx1, 1); + get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z1[j], tx1, 2); + get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z1[j], tx1, 3); + + /* interpolate R, G, B, A */ + for (c = 0; c < 4; c++) { + rgba2[c][j] = lerp_3d(xw[j], yw[j], zw[j], + tx0[c][0], tx0[c][1], + tx0[c][2], tx0[c][3], + tx1[c][0], tx1[c][1], + tx1[c][2], tx1[c][3]); + } + + /* blend mipmap levels */ + for (c = 0; c < NUM_CHANNELS; c++) { + rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]); + } + } + } + } + break; + default: + assert(0); + } +} + + +static void +sp_get_samples_cube(const struct tgsi_sampler *sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ + unsigned faces[QUAD_SIZE], j; + float ssss[4], tttt[4]; + for (j = 0; j < QUAD_SIZE; j++) { + faces[j] = choose_cube_face(s[j], t[j], p[j], ssss + j, tttt + j); + } + sp_get_samples_2d_common(sampler, ssss, tttt, NULL, lodbias, rgba, faces); +} + + +static void +sp_get_samples_rect(const struct tgsi_sampler *tgsi_sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ + const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler); + const struct softpipe_context *sp = samp->sp; + const uint unit = samp->unit; + const struct pipe_texture *texture = sp->texture[unit]; + const struct pipe_sampler_state *sampler = sp->sampler[unit]; + const uint face = 0; + const uint compare_func = sampler->compare_func; + unsigned level0, level1, j, imgFilter; + int width, height; + float levelBlend; + + choose_mipmap_levels(texture, sampler, s, t, p, lodbias, + &level0, &level1, &levelBlend, &imgFilter); + + /* texture RECTS cannot be mipmapped */ + assert(level0 == level1); + + width = texture->width[level0]; + height = 
texture->height[level0]; + + assert(width > 0); + + switch (imgFilter) { + case PIPE_TEX_FILTER_NEAREST: + { + int x[4], y[4]; + nearest_texcoord_unnorm_4(sampler->wrap_s, s, width, x); + nearest_texcoord_unnorm_4(sampler->wrap_t, t, height, y); + for (j = 0; j < QUAD_SIZE; j++) { + get_texel(tgsi_sampler, face, level0, x[j], y[j], 0, rgba, j); + if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + shadow_compare(compare_func, rgba, p, j); + } + } + } + break; + case PIPE_TEX_FILTER_LINEAR: + case PIPE_TEX_FILTER_ANISO: + { + int x0[4], y0[4], x1[4], y1[4]; + float xw[4], yw[4]; /* weights */ + linear_texcoord_unnorm_4(sampler->wrap_s, s, width, x0, x1, xw); + linear_texcoord_unnorm_4(sampler->wrap_t, t, height, y0, y1, yw); + for (j = 0; j < QUAD_SIZE; j++) { + float tx[4][4]; /* texels */ + int c; + get_texel(tgsi_sampler, face, level0, x0[j], y0[j], 0, tx, 0); + get_texel(tgsi_sampler, face, level0, x1[j], y0[j], 0, tx, 1); + get_texel(tgsi_sampler, face, level0, x0[j], y1[j], 0, tx, 2); + get_texel(tgsi_sampler, face, level0, x1[j], y1[j], 0, tx, 3); + if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) { + shadow_compare(compare_func, tx, p, 0); + shadow_compare(compare_func, tx, p, 1); + shadow_compare(compare_func, tx, p, 2); + shadow_compare(compare_func, tx, p, 3); + } + for (c = 0; c < 4; c++) { + rgba[c][j] = lerp_2d(xw[j], yw[j], + tx[c][0], tx[c][1], tx[c][2], tx[c][3]); + } + } + } + break; + default: + assert(0); + } +} + + +/** + * Called via tgsi_sampler::get_samples() + * Get four filtered RGBA values from the sampler's texture. + */ +void +sp_get_samples(struct tgsi_sampler *tgsi_sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]) +{ + const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler); + const struct softpipe_context *sp = samp->sp; + const uint unit = samp->unit; + const struct pipe_texture *texture = sp->texture[unit]; + const struct pipe_sampler_state *sampler = sp->sampler[unit]; + + if (!texture) + return; + + switch (texture->target) { + case PIPE_TEXTURE_1D: + assert(sampler->normalized_coords); + sp_get_samples_1d(tgsi_sampler, s, t, p, lodbias, rgba); + break; + case PIPE_TEXTURE_2D: + if (sampler->normalized_coords) + sp_get_samples_2d(tgsi_sampler, s, t, p, lodbias, rgba); + else + sp_get_samples_rect(tgsi_sampler, s, t, p, lodbias, rgba); + break; + case PIPE_TEXTURE_3D: + assert(sampler->normalized_coords); + sp_get_samples_3d(tgsi_sampler, s, t, p, lodbias, rgba); + break; + case PIPE_TEXTURE_CUBE: + assert(sampler->normalized_coords); + sp_get_samples_cube(tgsi_sampler, s, t, p, lodbias, rgba); + break; + default: + assert(0); + } + +#if 0 /* DEBUG */ + { + int i; + printf("Sampled at %f, %f, %f:\n", s[0], t[0], p[0]); + for (i = 0; i < 4; i++) { + printf("Frag %d: %f %f %f %f\n", i, + rgba[0][i], + rgba[1][i], + rgba[2][i], + rgba[3][i]); + } + } +#endif +} + diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.h b/src/gallium/drivers/softpipe/sp_tex_sample.h new file mode 100644 index 0000000000..f0a8a78778 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_tex_sample.h @@ -0,0 +1,65 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef SP_TEX_SAMPLE_H +#define SP_TEX_SAMPLE_H + + +#include "tgsi/tgsi_exec.h" + + +/** + * Subclass of tgsi_sampler + */ +struct sp_shader_sampler +{ + struct tgsi_sampler base; /**< base class */ + + uint unit; + struct softpipe_context *sp; + struct softpipe_tile_cache *cache; +}; + + + +static INLINE const struct sp_shader_sampler * +sp_shader_sampler(const struct tgsi_sampler *sampler) +{ + return (const struct sp_shader_sampler *) sampler; +} + + +extern void +sp_get_samples(struct tgsi_sampler *sampler, + const float s[QUAD_SIZE], + const float t[QUAD_SIZE], + const float p[QUAD_SIZE], + float lodbias, + float rgba[NUM_CHANNELS][QUAD_SIZE]); + + +#endif /* SP_TEX_SAMPLE_H */ diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c new file mode 100644 index 0000000000..cb48035771 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_texture.c @@ -0,0 +1,356 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + * Michel Dänzer <michel@tungstengraphics.com> + */ + +#include "pipe/p_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "pipe/p_winsys.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_texture.h" +#include "sp_tile_cache.h" +#include "sp_screen.h" + + +/* Simple, maximally packed layout. + */ + +static unsigned minify( unsigned d ) +{ + return MAX2(1, d>>1); +} + + +/* Conventional allocation path for non-display textures: + */ +static boolean +softpipe_texture_layout(struct pipe_screen *screen, + struct softpipe_texture * spt) +{ + struct pipe_winsys *ws = screen->winsys; + struct pipe_texture *pt = &spt->base; + unsigned level; + unsigned width = pt->width[0]; + unsigned height = pt->height[0]; + unsigned depth = pt->depth[0]; + + unsigned buffer_size = 0; + + for (level = 0; level <= pt->last_level; level++) { + pt->width[level] = width; + pt->height[level] = height; + pt->depth[level] = depth; + pt->nblocksx[level] = pf_get_nblocksx(&pt->block, width); + pt->nblocksy[level] = pf_get_nblocksy(&pt->block, height); + spt->stride[level] = pt->nblocksx[level]*pt->block.size; + + spt->level_offset[level] = buffer_size; + + buffer_size += (pt->nblocksy[level] * + ((pt->target == PIPE_TEXTURE_CUBE) ? 6 : depth) * + spt->stride[level]); + + width = minify(width); + height = minify(height); + depth = minify(depth); + } + + spt->buffer = ws->buffer_create(ws, 32, + PIPE_BUFFER_USAGE_PIXEL, + buffer_size); + + return spt->buffer != NULL; +} + + + +/* Hack it up to use the old winsys->surface_alloc_storage() + * method for now: + */ +static boolean +softpipe_displaytarget_layout(struct pipe_screen *screen, + struct softpipe_texture * spt) +{ + struct pipe_winsys *ws = screen->winsys; + struct pipe_surface surf; + unsigned flags = (PIPE_BUFFER_USAGE_CPU_READ | + PIPE_BUFFER_USAGE_CPU_WRITE | + PIPE_BUFFER_USAGE_GPU_READ | + PIPE_BUFFER_USAGE_GPU_WRITE); + + + memset(&surf, 0, sizeof(surf)); + + ws->surface_alloc_storage( ws, + &surf, + spt->base.width[0], + spt->base.height[0], + spt->base.format, + flags, + spt->base.tex_usage); + + /* Now extract the goodies: + */ + spt->base.nblocksx[0] = pf_get_nblocksx(&spt->base.block, spt->base.width[0]); + spt->base.nblocksy[0] = pf_get_nblocksy(&spt->base.block, spt->base.height[0]); + spt->stride[0] = surf.stride; + spt->buffer = surf.buffer; + + return spt->buffer != NULL; +} + + + + + +static struct pipe_texture * +softpipe_texture_create(struct pipe_screen *screen, + const struct pipe_texture *templat) +{ + struct softpipe_texture *spt = CALLOC_STRUCT(softpipe_texture); + if (!spt) + return NULL; + + spt->base = *templat; + spt->base.refcount = 1; + spt->base.screen = screen; + + if (spt->base.tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET) { + if (!softpipe_displaytarget_layout(screen, spt)) + goto fail; + } + else { + if (!softpipe_texture_layout(screen, spt)) + goto fail; + } + + assert(spt->base.refcount == 1); + return &spt->base; + + fail: + FREE(spt); + return NULL; +} + + +static struct pipe_texture * +softpipe_texture_blanket(struct pipe_screen * screen, + const struct pipe_texture *base, + const unsigned *stride, + struct pipe_buffer *buffer) +{ + struct softpipe_texture *spt; + assert(screen); + + /* Only supports one type */ + if (base->target != PIPE_TEXTURE_2D || + base->last_level != 0 || + 
base->depth[0] != 1) { + return NULL; + } + + spt = CALLOC_STRUCT(softpipe_texture); + if (!spt) + return NULL; + + spt->base = *base; + spt->base.refcount = 1; + spt->base.screen = screen; + spt->base.nblocksx[0] = pf_get_nblocksx(&spt->base.block, spt->base.width[0]); + spt->base.nblocksy[0] = pf_get_nblocksy(&spt->base.block, spt->base.height[0]); + spt->stride[0] = stride[0]; + + pipe_buffer_reference(screen, &spt->buffer, buffer); + + return &spt->base; +} + + +static void +softpipe_texture_release(struct pipe_screen *screen, + struct pipe_texture **pt) +{ + if (!*pt) + return; + + if (--(*pt)->refcount <= 0) { + struct softpipe_texture *spt = softpipe_texture(*pt); + + pipe_buffer_reference(screen, &spt->buffer, NULL); + FREE(spt); + } + *pt = NULL; +} + + +static struct pipe_surface * +softpipe_get_tex_surface(struct pipe_screen *screen, + struct pipe_texture *pt, + unsigned face, unsigned level, unsigned zslice, + unsigned usage) +{ + struct pipe_winsys *ws = screen->winsys; + struct softpipe_texture *spt = softpipe_texture(pt); + struct pipe_surface *ps; + + assert(level <= pt->last_level); + + ps = ws->surface_alloc(ws); + if (ps) { + assert(ps->refcount); + assert(ps->winsys); + pipe_buffer_reference(screen, &ps->buffer, spt->buffer); + ps->format = pt->format; + ps->block = pt->block; + ps->width = pt->width[level]; + ps->height = pt->height[level]; + ps->nblocksx = pt->nblocksx[level]; + ps->nblocksy = pt->nblocksy[level]; + ps->stride = spt->stride[level]; + ps->offset = spt->level_offset[level]; + ps->usage = usage; + + /* Because we are softpipe, anything that the state tracker + * thought was going to be done with the GPU will actually get + * done with the CPU. Let's adjust the flags to take that into + * account. + */ + if (ps->usage & PIPE_BUFFER_USAGE_GPU_WRITE) { + /* GPU_WRITE means "render" and that can involve reads (blending) */ + ps->usage |= PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_CPU_READ; + } + + if (ps->usage & PIPE_BUFFER_USAGE_GPU_READ) + ps->usage |= PIPE_BUFFER_USAGE_CPU_READ; + + if (ps->usage & (PIPE_BUFFER_USAGE_CPU_WRITE | + PIPE_BUFFER_USAGE_GPU_WRITE)) { + /* Mark the surface as dirty. The tile cache will look for this. */ + spt->modified = TRUE; + } + + pipe_texture_reference(&ps->texture, pt); + ps->face = face; + ps->level = level; + ps->zslice = zslice; + + if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) { + ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) * + ps->nblocksy * + ps->stride; + } + else { + assert(face == 0); + assert(zslice == 0); + } + } + return ps; +} + + +static void +softpipe_tex_surface_release(struct pipe_screen *screen, + struct pipe_surface **s) +{ + /* Effectively do the texture_update work here - if texture images + * needed post-processing to put them into hardware layout, this is + * where it would happen. For softpipe, nothing to do. 
+ */ + assert ((*s)->texture); + pipe_texture_reference(&(*s)->texture, NULL); + + screen->winsys->surface_release(screen->winsys, s); +} + + +static void * +softpipe_surface_map( struct pipe_screen *screen, + struct pipe_surface *surface, + unsigned flags ) +{ + ubyte *map; + + if (flags & ~surface->usage) { + assert(0); + return NULL; + } + + map = pipe_buffer_map( screen, surface->buffer, flags ); + if (map == NULL) + return NULL; + + /* May want to different things here depending on read/write nature + * of the map: + */ + if (surface->texture && + (flags & PIPE_BUFFER_USAGE_CPU_WRITE)) + { + /* Do something to notify sharing contexts of a texture change. + * In softpipe, that would mean flushing the texture cache. + */ + softpipe_screen(screen)->timestamp++; + } + + return map + surface->offset; +} + + +static void +softpipe_surface_unmap(struct pipe_screen *screen, + struct pipe_surface *surface) +{ + pipe_buffer_unmap( screen, surface->buffer ); +} + + +void +softpipe_init_texture_funcs(struct softpipe_context *sp) +{ +} + + +void +softpipe_init_screen_texture_funcs(struct pipe_screen *screen) +{ + screen->texture_create = softpipe_texture_create; + screen->texture_blanket = softpipe_texture_blanket; + screen->texture_release = softpipe_texture_release; + + screen->get_tex_surface = softpipe_get_tex_surface; + screen->tex_surface_release = softpipe_tex_surface_release; + + screen->surface_map = softpipe_surface_map; + screen->surface_unmap = softpipe_surface_unmap; +} diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h new file mode 100644 index 0000000000..bf437a7c61 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_texture.h @@ -0,0 +1,70 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef SP_TEXTURE_H +#define SP_TEXTURE_H + + +#include "pipe/p_state.h" + + +struct pipe_context; +struct pipe_screen; +struct softpipe_context; + + +struct softpipe_texture +{ + struct pipe_texture base; + + unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS]; + unsigned long stride[PIPE_MAX_TEXTURE_LEVELS]; + + /* The data is held here: + */ + struct pipe_buffer *buffer; + + boolean modified; +}; + + +/** cast wrapper */ +static INLINE struct softpipe_texture * +softpipe_texture(struct pipe_texture *pt) +{ + return (struct softpipe_texture *) pt; +} + + +extern void +softpipe_init_texture_funcs( struct softpipe_context *softpipe ); + +extern void +softpipe_init_screen_texture_funcs(struct pipe_screen *screen); + + +#endif /* SP_TEXTURE */ diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c new file mode 100644 index 0000000000..78b0efa46d --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_tile_cache.c @@ -0,0 +1,614 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Framebuffer/surface tile caching. + * + * Author: + * Brian Paul + */ + +#include "pipe/p_inlines.h" +#include "util/u_memory.h" +#include "util/u_tile.h" +#include "sp_context.h" +#include "sp_surface.h" +#include "sp_texture.h" +#include "sp_tile_cache.h" + +#define NUM_ENTRIES 32 + + +/** XXX move these */ +#define MAX_WIDTH 2048 +#define MAX_HEIGHT 2048 + + +struct softpipe_tile_cache +{ + struct pipe_screen *screen; + struct pipe_surface *surface; /**< the surface we're caching */ + void *surface_map; + struct pipe_texture *texture; /**< if caching a texture */ + struct softpipe_cached_tile entries[NUM_ENTRIES]; + uint clear_flags[(MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32]; + float clear_color[4]; + uint clear_val; + boolean depth_stencil; /** Is the surface a depth/stencil format? */ + + struct pipe_surface *tex_surf; + void *tex_surf_map; + int tex_face, tex_level, tex_z; + + struct softpipe_cached_tile tile; /**< scratch tile for clears */ +}; + + +/** + * Return the position in the cache for the tile that contains win pos (x,y). 
+ * We currently use a direct mapped cache so this is like a hack key. + * At some point we should investige something more sophisticated, like + * a LRU replacement policy. + */ +#define CACHE_POS(x, y) \ + (((x) / TILE_SIZE + ((y) / TILE_SIZE) * 5) % NUM_ENTRIES) + + + +/** + * Is the tile at (x,y) in cleared state? + */ +static INLINE uint +is_clear_flag_set(const uint *bitvec, int x, int y) +{ + int pos, bit; + x /= TILE_SIZE; + y /= TILE_SIZE; + pos = y * (MAX_WIDTH / TILE_SIZE) + x; + assert(pos / 32 < (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32); + bit = bitvec[pos / 32] & (1 << (pos & 31)); + return bit; +} + + +/** + * Mark the tile at (x,y) as not cleared. + */ +static INLINE void +clear_clear_flag(uint *bitvec, int x, int y) +{ + int pos; + x /= TILE_SIZE; + y /= TILE_SIZE; + pos = y * (MAX_WIDTH / TILE_SIZE) + x; + assert(pos / 32 < (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32); + bitvec[pos / 32] &= ~(1 << (pos & 31)); +} + + +struct softpipe_tile_cache * +sp_create_tile_cache( struct pipe_screen *screen ) +{ + struct softpipe_tile_cache *tc; + uint pos; + + tc = CALLOC_STRUCT( softpipe_tile_cache ); + if (tc) { + tc->screen = screen; + for (pos = 0; pos < NUM_ENTRIES; pos++) { + tc->entries[pos].x = + tc->entries[pos].y = -1; + } + } + return tc; +} + + +void +sp_destroy_tile_cache(struct softpipe_tile_cache *tc) +{ + uint pos; + + for (pos = 0; pos < NUM_ENTRIES; pos++) { + /*assert(tc->entries[pos].x < 0);*/ + } + if (tc->surface) { + pipe_surface_reference(&tc->surface, NULL); + } + if (tc->tex_surf) { + pipe_surface_reference(&tc->tex_surf, NULL); + } + + FREE( tc ); +} + + +/** + * Specify the surface to cache. + */ +void +sp_tile_cache_set_surface(struct softpipe_tile_cache *tc, + struct pipe_surface *ps) +{ + assert(!tc->texture); + + if (tc->surface_map) { + tc->screen->surface_unmap(tc->screen, tc->surface); + tc->surface_map = NULL; + } + + pipe_surface_reference(&tc->surface, ps); + + if (tc->surface) { + if (tc->surface_map) /* XXX: this is always NULL!? */ + tc->surface_map = tc->screen->surface_map(tc->screen, tc->surface, + PIPE_BUFFER_USAGE_CPU_READ | + PIPE_BUFFER_USAGE_CPU_WRITE); + + tc->depth_stencil = (ps->format == PIPE_FORMAT_S8Z24_UNORM || + ps->format == PIPE_FORMAT_X8Z24_UNORM || + ps->format == PIPE_FORMAT_Z24S8_UNORM || + ps->format == PIPE_FORMAT_Z24X8_UNORM || + ps->format == PIPE_FORMAT_Z16_UNORM || + ps->format == PIPE_FORMAT_Z32_UNORM || + ps->format == PIPE_FORMAT_S8_UNORM); + } +} + + +/** + * Return the surface being cached. + */ +struct pipe_surface * +sp_tile_cache_get_surface(struct softpipe_tile_cache *tc) +{ + return tc->surface; +} + + +void +sp_tile_cache_map_surfaces(struct softpipe_tile_cache *tc) +{ + if (tc->surface && !tc->surface_map) + tc->surface_map = tc->screen->surface_map(tc->screen, tc->surface, + PIPE_BUFFER_USAGE_CPU_WRITE | + PIPE_BUFFER_USAGE_CPU_READ); + + if (tc->tex_surf && !tc->tex_surf_map) + tc->tex_surf_map = tc->screen->surface_map(tc->screen, tc->tex_surf, + PIPE_BUFFER_USAGE_CPU_READ); +} + + +void +sp_tile_cache_unmap_surfaces(struct softpipe_tile_cache *tc) +{ + if (tc->surface_map) { + tc->screen->surface_unmap(tc->screen, tc->surface); + tc->surface_map = NULL; + } + + if (tc->tex_surf_map) { + tc->screen->surface_unmap(tc->screen, tc->tex_surf); + tc->tex_surf_map = NULL; + } +} + + +/** + * Specify the texture to cache. 
+ */ +void +sp_tile_cache_set_texture(struct pipe_context *pipe, + struct softpipe_tile_cache *tc, + struct pipe_texture *texture) +{ + uint i; + + assert(!tc->surface); + + pipe_texture_reference(&tc->texture, texture); + + if (tc->tex_surf_map) { + tc->screen->surface_unmap(tc->screen, tc->tex_surf); + tc->tex_surf_map = NULL; + } + pipe_surface_reference(&tc->tex_surf, NULL); + + /* mark as entries as invalid/empty */ + /* XXX we should try to avoid this when the teximage hasn't changed */ + for (i = 0; i < NUM_ENTRIES; i++) { + tc->entries[i].x = -1; + } + + tc->tex_face = -1; /* any invalid value here */ +} + + +/** + * Set pixels in a tile to the given clear color/value, float. + */ +static void +clear_tile_rgba(struct softpipe_cached_tile *tile, + enum pipe_format format, + const float clear_value[4]) +{ + if (clear_value[0] == 0.0 && + clear_value[1] == 0.0 && + clear_value[2] == 0.0 && + clear_value[3] == 0.0) { + memset(tile->data.color, 0, sizeof(tile->data.color)); + } + else { + uint i, j; + for (i = 0; i < TILE_SIZE; i++) { + for (j = 0; j < TILE_SIZE; j++) { + tile->data.color[i][j][0] = clear_value[0]; + tile->data.color[i][j][1] = clear_value[1]; + tile->data.color[i][j][2] = clear_value[2]; + tile->data.color[i][j][3] = clear_value[3]; + } + } + } +} + + +/** + * Set a tile to a solid value/color. + */ +static void +clear_tile(struct softpipe_cached_tile *tile, + enum pipe_format format, + uint clear_value) +{ + uint i, j; + + switch (pf_get_size(format)) { + case 1: + memset(tile->data.any, 0, TILE_SIZE * TILE_SIZE); + break; + case 2: + if (clear_value == 0) { + memset(tile->data.any, 0, 2 * TILE_SIZE * TILE_SIZE); + } + else { + for (i = 0; i < TILE_SIZE; i++) { + for (j = 0; j < TILE_SIZE; j++) { + tile->data.depth16[i][j] = (ushort) clear_value; + } + } + } + break; + case 4: + if (clear_value == 0) { + memset(tile->data.any, 0, 4 * TILE_SIZE * TILE_SIZE); + } + else { + for (i = 0; i < TILE_SIZE; i++) { + for (j = 0; j < TILE_SIZE; j++) { + tile->data.color32[i][j] = clear_value; + } + } + } + break; + default: + assert(0); + } +} + + +/** + * Actually clear the tiles which were flagged as being in a clear state. + */ +static void +sp_tile_cache_flush_clear(struct pipe_context *pipe, + struct softpipe_tile_cache *tc) +{ + struct pipe_surface *ps = tc->surface; + const uint w = tc->surface->width; + const uint h = tc->surface->height; + uint x, y; + uint numCleared = 0; + + /* clear the scratch tile to the clear value */ + clear_tile(&tc->tile, ps->format, tc->clear_val); + + /* push the tile to all positions marked as clear */ + for (y = 0; y < h; y += TILE_SIZE) { + for (x = 0; x < w; x += TILE_SIZE) { + if (is_clear_flag_set(tc->clear_flags, x, y)) { + pipe_put_tile_raw(ps, + x, y, TILE_SIZE, TILE_SIZE, + tc->tile.data.color32, 0/*STRIDE*/); + + /* do this? */ + clear_clear_flag(tc->clear_flags, x, y); + + numCleared++; + } + } + } +#if 0 + debug_printf("num cleared: %u\n", numCleared); +#endif +} + + +/** + * Flush the tile cache: write all dirty tiles back to the surface. + * any tiles "flagged" as cleared will be "really" cleared. 
+ */ +void +sp_flush_tile_cache(struct softpipe_context *softpipe, + struct softpipe_tile_cache *tc) +{ + struct pipe_surface *ps = tc->surface; + int inuse = 0, pos; + + if (ps && ps->buffer) { + /* caching a drawing surface */ + for (pos = 0; pos < NUM_ENTRIES; pos++) { + struct softpipe_cached_tile *tile = tc->entries + pos; + if (tile->x >= 0) { + if (tc->depth_stencil) { + pipe_put_tile_raw(ps, + tile->x, tile->y, TILE_SIZE, TILE_SIZE, + tile->data.depth32, 0/*STRIDE*/); + } + else { + pipe_put_tile_rgba(ps, + tile->x, tile->y, TILE_SIZE, TILE_SIZE, + (float *) tile->data.color); + } + tile->x = tile->y = -1; /* mark as empty */ + inuse++; + } + } + +#if TILE_CLEAR_OPTIMIZATION + sp_tile_cache_flush_clear(&softpipe->pipe, tc); +#endif + } + else if (tc->texture) { + /* caching a texture, mark all entries as empty */ + for (pos = 0; pos < NUM_ENTRIES; pos++) { + tc->entries[pos].x = -1; + } + tc->tex_face = -1; + } + +#if 0 + debug_printf("flushed tiles in use: %d\n", inuse); +#endif +} + + +/** + * Get a tile from the cache. + * \param x, y position of tile, in pixels + */ +struct softpipe_cached_tile * +sp_get_cached_tile(struct softpipe_context *softpipe, + struct softpipe_tile_cache *tc, int x, int y) +{ + struct pipe_surface *ps = tc->surface; + + /* tile pos in framebuffer: */ + const int tile_x = x & ~(TILE_SIZE - 1); + const int tile_y = y & ~(TILE_SIZE - 1); + + /* cache pos/entry: */ + const int pos = CACHE_POS(x, y); + struct softpipe_cached_tile *tile = tc->entries + pos; + + if (tile_x != tile->x || + tile_y != tile->y) { + + if (tile->x != -1) { + /* put dirty tile back in framebuffer */ + if (tc->depth_stencil) { + pipe_put_tile_raw(ps, + tile->x, tile->y, TILE_SIZE, TILE_SIZE, + tile->data.depth32, 0/*STRIDE*/); + } + else { + pipe_put_tile_rgba(ps, + tile->x, tile->y, TILE_SIZE, TILE_SIZE, + (float *) tile->data.color); + } + } + + tile->x = tile_x; + tile->y = tile_y; + + if (is_clear_flag_set(tc->clear_flags, x, y)) { + /* don't get tile from framebuffer, just clear it */ + if (tc->depth_stencil) { + clear_tile(tile, ps->format, tc->clear_val); + } + else { + clear_tile_rgba(tile, ps->format, tc->clear_color); + } + clear_clear_flag(tc->clear_flags, x, y); + } + else { + /* get new tile data from surface */ + if (tc->depth_stencil) { + pipe_get_tile_raw(ps, + tile->x, tile->y, TILE_SIZE, TILE_SIZE, + tile->data.depth32, 0/*STRIDE*/); + } + else { + pipe_get_tile_rgba(ps, + tile->x, tile->y, TILE_SIZE, TILE_SIZE, + (float *) tile->data.color); + } + } + } + + return tile; +} + + +/** + * Given the texture face, level, zslice, x and y values, compute + * the cache entry position/index where we'd hope to find the + * cached texture tile. + * This is basically a direct-map cache. + * XXX There's probably lots of ways in which we can improve this. + */ +static INLINE uint +tex_cache_pos(int x, int y, int z, int face, int level) +{ + uint entry = x + y * 5 + z * 4 + face + level; + return entry % NUM_ENTRIES; +} + + +/** + * Similar to sp_get_cached_tile() but for textures. + * Tiles are read-only and indexed with more params. 
+ */ +const struct softpipe_cached_tile * +sp_get_cached_tile_tex(struct softpipe_context *sp, + struct softpipe_tile_cache *tc, int x, int y, int z, + int face, int level) +{ + struct pipe_screen *screen = sp->pipe.screen; + /* tile pos in framebuffer: */ + const int tile_x = x & ~(TILE_SIZE - 1); + const int tile_y = y & ~(TILE_SIZE - 1); + /* cache pos/entry: */ + const uint pos = tex_cache_pos(x / TILE_SIZE, y / TILE_SIZE, z, + face, level); + struct softpipe_cached_tile *tile = tc->entries + pos; + + if (tc->texture) { + struct softpipe_texture *spt = softpipe_texture(tc->texture); + if (spt->modified) { + /* texture was modified, force a cache reload */ + tile->x = -1; + spt->modified = FALSE; + } + } + + if (tile_x != tile->x || + tile_y != tile->y || + z != tile->z || + face != tile->face || + level != tile->level) { + /* cache miss */ + + /* check if we need to get a new surface */ + if (!tc->tex_surf || + tc->tex_face != face || + tc->tex_level != level || + tc->tex_z != z) { + /* get new surface (view into texture) */ + + if (tc->tex_surf_map) + tc->screen->surface_unmap(tc->screen, tc->tex_surf); + + tc->tex_surf = screen->get_tex_surface(screen, tc->texture, face, level, z, + PIPE_BUFFER_USAGE_CPU_READ); + tc->tex_surf_map = screen->surface_map(screen, tc->tex_surf, + PIPE_BUFFER_USAGE_CPU_READ); + + tc->tex_face = face; + tc->tex_level = level; + tc->tex_z = z; + } + + /* get tile from the surface (view into texture) */ + pipe_get_tile_rgba(tc->tex_surf, + tile_x, tile_y, TILE_SIZE, TILE_SIZE, + (float *) tile->data.color); + tile->x = tile_x; + tile->y = tile_y; + tile->z = z; + tile->face = face; + tile->level = level; + } + + return tile; +} + + +/** + * When a whole surface is being cleared to a value we can avoid + * fetching tiles above. + * Save the color and set a 'clearflag' for each tile of the screen. + */ +void +sp_tile_cache_clear(struct softpipe_tile_cache *tc, uint clearValue) +{ + uint r, g, b, a; + uint pos; + + tc->clear_val = clearValue; + + switch (tc->surface->format) { + case PIPE_FORMAT_R8G8B8A8_UNORM: + r = (clearValue >> 24) & 0xff; + g = (clearValue >> 16) & 0xff; + b = (clearValue >> 8) & 0xff; + a = (clearValue ) & 0xff; + break; + case PIPE_FORMAT_A8R8G8B8_UNORM: + r = (clearValue >> 16) & 0xff; + g = (clearValue >> 8) & 0xff; + b = (clearValue ) & 0xff; + a = (clearValue >> 24) & 0xff; + break; + case PIPE_FORMAT_B8G8R8A8_UNORM: + r = (clearValue >> 8) & 0xff; + g = (clearValue >> 16) & 0xff; + b = (clearValue >> 24) & 0xff; + a = (clearValue ) & 0xff; + break; + default: + r = g = b = a = 0; + } + + tc->clear_color[0] = r / 255.0f; + tc->clear_color[1] = g / 255.0f; + tc->clear_color[2] = b / 255.0f; + tc->clear_color[3] = a / 255.0f; + +#if TILE_CLEAR_OPTIMIZATION + /* set flags to indicate all the tiles are cleared */ + memset(tc->clear_flags, 255, sizeof(tc->clear_flags)); +#else + /* disable the optimization */ + memset(tc->clear_flags, 0, sizeof(tc->clear_flags)); +#endif + + for (pos = 0; pos < NUM_ENTRIES; pos++) { + struct softpipe_cached_tile *tile = tc->entries + pos; + tile->x = tile->y = -1; + } +} diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.h b/src/gallium/drivers/softpipe/sp_tile_cache.h new file mode 100644 index 0000000000..a66bb50bcc --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_tile_cache.h @@ -0,0 +1,105 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef SP_TILE_CACHE_H +#define SP_TILE_CACHE_H + +#define TILE_CLEAR_OPTIMIZATION 1 + + +#include "pipe/p_compiler.h" + + +struct softpipe_context; +struct softpipe_tile_cache; + + +/** + * Cache tile size (width and height). This needs to be a power of two. + */ +#define TILE_SIZE 64 + + + +struct softpipe_cached_tile +{ + int x, y; /**< pos of tile in window coords */ + int z, face, level; /**< Extra texture indexes */ + union { + float color[TILE_SIZE][TILE_SIZE][4]; + uint color32[TILE_SIZE][TILE_SIZE]; + uint depth32[TILE_SIZE][TILE_SIZE]; + ushort depth16[TILE_SIZE][TILE_SIZE]; + ubyte stencil8[TILE_SIZE][TILE_SIZE]; + ubyte any[1]; + } data; +}; + + +extern struct softpipe_tile_cache * +sp_create_tile_cache( struct pipe_screen *screen ); + +extern void +sp_destroy_tile_cache(struct softpipe_tile_cache *tc); + +extern void +sp_tile_cache_set_surface(struct softpipe_tile_cache *tc, + struct pipe_surface *sps); + +extern struct pipe_surface * +sp_tile_cache_get_surface(struct softpipe_tile_cache *tc); + +extern void +sp_tile_cache_map_surfaces(struct softpipe_tile_cache *tc); + +extern void +sp_tile_cache_unmap_surfaces(struct softpipe_tile_cache *tc); + +extern void +sp_tile_cache_set_texture(struct pipe_context *pipe, + struct softpipe_tile_cache *tc, + struct pipe_texture *texture); + +extern void +sp_flush_tile_cache(struct softpipe_context *softpipe, + struct softpipe_tile_cache *tc); + +extern void +sp_tile_cache_clear(struct softpipe_tile_cache *tc, uint clearValue); + +extern struct softpipe_cached_tile * +sp_get_cached_tile(struct softpipe_context *softpipe, + struct softpipe_tile_cache *tc, int x, int y); + +extern const struct softpipe_cached_tile * +sp_get_cached_tile_tex(struct softpipe_context *softpipe, + struct softpipe_tile_cache *tc, int x, int y, int z, + int face, int level); + + +#endif /* SP_TILE_CACHE_H */ + diff --git a/src/gallium/drivers/softpipe/sp_winsys.h b/src/gallium/drivers/softpipe/sp_winsys.h new file mode 100644 index 0000000000..4ab666486c --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_winsys.h @@ -0,0 +1,73 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* This is the interface that softpipe requires any window system + * hosting it to implement. This is the only include file in softpipe + * which is public. + */ + + +#ifndef SP_WINSYS_H +#define SP_WINSYS_H + + +#include "pipe/p_compiler.h" /* for boolean */ + + +#ifdef __cplusplus +extern "C" { +#endif + + +enum pipe_format; + +struct softpipe_winsys { + /** test if the given format is supported for front/back color bufs */ + boolean (*is_format_supported)( struct softpipe_winsys *sws, + enum pipe_format format ); + +}; + +struct pipe_screen; +struct pipe_winsys; +struct pipe_context; + + +struct pipe_context *softpipe_create( struct pipe_screen *, + struct pipe_winsys *, + void *unused ); + + +struct pipe_screen * +softpipe_create_screen(struct pipe_winsys *); + + +#ifdef __cplusplus +} +#endif + +#endif /* SP_WINSYS_H */ diff --git a/src/gallium/drivers/trace/README b/src/gallium/drivers/trace/README new file mode 100644 index 0000000000..f0e1cd596d --- /dev/null +++ b/src/gallium/drivers/trace/README @@ -0,0 +1,64 @@ + TRACE PIPE DRIVER + + += About = + +This directory contains a Gallium3D pipe driver which traces all incoming calls. + + += Build Instructions = + +To build, invoke scons on the top dir as + + scons statetrackers=mesa drivers=softpipe,i965simple,trace winsys=xlib + + += Usage = + +To use do + + ln -s libGL.so build/linux-x86-debug/gallium/winsys/xlib/libGL.so.1 + export LD_LIBRARY_PATH=$PWD/build/linux-x86-debug/gallium/winsys/xlib + +ensure the right libGL.so is being picked up by doing + + ldd progs/trivial/tri + +and then try running + + GALLIUM_TRACE=tri.trace progs/trivial/tri + +which should create a tri.trace file, which is an XML file. You can view it by copying +trace.xsl to the same directory and opening it with an XSLT-capable browser such as +Firefox or Internet Explorer. + + += Integrating = + +You can integrate the trace pipe driver either inside the state tracker or the +winsys. The procedure in both cases is the same. 
Let's assume you have a +pipe_screen and a pipe_context pair obtained by the usual means (variable and +function names are just for illustration purposes): + + real_screen = real_screen_create(...); + + real_context = real_context_create(...); + +The trace screen and pipe_context are then created by doing + + trace_screen = trace_screen_create(real_screen); + + trace_context = trace_context_create(trace_screen, real_context); + +You can then simply use trace_screen and trace_context instead of real_screen +and real_context. + +Do not call trace_winsys_create. Simply pass trace_screen->winsys or +trace_context->winsys in places you would pass winsys. + +You can create as many contexts as you wish. Just ensure that you don't mistake +trace_screen for real_screen when creating them. + + +-- +Jose Fonseca <jrfonseca@tungstengraphics.com> diff --git a/src/gallium/drivers/trace/SConscript b/src/gallium/drivers/trace/SConscript new file mode 100644 index 0000000000..0a6bfb8f4c --- /dev/null +++ b/src/gallium/drivers/trace/SConscript @@ -0,0 +1,16 @@ +Import('*') + +env = env.Clone() + +trace = env.ConvenienceLibrary( + target = 'trace', + source = [ + 'tr_context.c', + 'tr_dump.c', + 'tr_screen.c', + 'tr_state.c', + 'tr_texture.c', + 'tr_winsys.c', + ]) + +Export('trace')
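(Illustrative aside: the "Integrating" steps in the trace README above can be condensed into a short, hedged C sketch. Only trace_screen_create() and trace_context_create() come from the trace driver itself; the include paths, real_screen_create() and real_context_create() are placeholders for whatever the host state tracker or winsys already provides, and the signatures are inferred from the README rather than taken from the real headers.)

#include "pipe/p_context.h"
#include "pipe/p_screen.h"

#include "tr_screen.h"    /* assumed: declares trace_screen_create() */
#include "tr_context.h"   /* assumed: declares trace_context_create() */

/* Placeholder constructors standing in for the host's usual setup code. */
extern struct pipe_screen *real_screen_create(void);
extern struct pipe_context *real_context_create(struct pipe_screen *screen);

struct pipe_context *
create_traced_context(void)
{
   struct pipe_screen *real_screen;
   struct pipe_context *real_context;
   struct pipe_screen *trace_screen;
   struct pipe_context *trace_context;

   /* Obtain the real objects by the usual, driver-specific means. */
   real_screen = real_screen_create();
   real_context = real_context_create(real_screen);

   /* Wrap them: every call made through the trace objects is logged to
    * the XML file named by GALLIUM_TRACE and then forwarded on. */
   trace_screen = trace_screen_create(real_screen);
   trace_context = trace_context_create(trace_screen, real_context);

   /* Use trace_screen/trace_context wherever the real ones would be used. */
   return trace_context;
}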
\ No newline at end of file diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c new file mode 100644 index 0000000000..1dd7719379 --- /dev/null +++ b/src/gallium/drivers/trace/tr_context.c @@ -0,0 +1,1072 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_memory.h" +#include "pipe/p_screen.h" + +#include "tr_dump.h" +#include "tr_state.h" +#include "tr_screen.h" +#include "tr_texture.h" +#include "tr_winsys.h" +#include "tr_context.h" + + +static INLINE struct pipe_texture * +trace_texture_unwrap(struct trace_context *tr_ctx, + struct pipe_texture *texture) +{ + struct trace_screen *tr_scr = trace_screen(tr_ctx->base.screen); + struct trace_texture *tr_tex; + + if(!texture) + return NULL; + + tr_tex = trace_texture(tr_scr, texture); + + assert(tr_tex->texture); + assert(tr_tex->texture->screen == tr_scr->screen); + return tr_tex->texture; +} + + +static INLINE struct pipe_surface * +trace_surface_unwrap(struct trace_context *tr_ctx, + struct pipe_surface *surface) +{ + struct trace_screen *tr_scr = trace_screen(tr_ctx->base.screen); + struct trace_texture *tr_tex; + struct trace_surface *tr_surf; + + if(!surface) + return NULL; + + assert(surface->texture); + if(!surface->texture) + return surface; + + tr_tex = trace_texture(tr_scr, surface->texture); + tr_surf = trace_surface(tr_tex, surface); + + assert(tr_surf->surface); + assert(tr_surf->surface->texture->screen == tr_scr->screen); + return tr_surf->surface; +} + + +static INLINE void +trace_context_set_edgeflags(struct pipe_context *_pipe, + const unsigned *bitfield) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "set_edgeflags"); + + trace_dump_arg(ptr, pipe); + /* FIXME: we don't know how big this array is */ + trace_dump_arg(ptr, bitfield); + + pipe->set_edgeflags(pipe, bitfield);; + + trace_dump_call_end(); +} + + +static INLINE boolean +trace_context_draw_arrays(struct pipe_context *_pipe, + unsigned mode, unsigned start, unsigned count) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + boolean result; + 
+ trace_dump_call_begin("pipe_context", "draw_arrays"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(uint, mode); + trace_dump_arg(uint, start); + trace_dump_arg(uint, count); + + result = pipe->draw_arrays(pipe, mode, start, count);; + + trace_dump_ret(bool, result); + + trace_dump_call_end(); + + return result; +} + + +static INLINE boolean +trace_context_draw_elements(struct pipe_context *_pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned mode, unsigned start, unsigned count) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + boolean result; + + trace_winsys_user_buffer_update(_pipe->winsys, indexBuffer); + + trace_dump_call_begin("pipe_context", "draw_elements"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, indexBuffer); + trace_dump_arg(uint, indexSize); + trace_dump_arg(uint, mode); + trace_dump_arg(uint, start); + trace_dump_arg(uint, count); + + result = pipe->draw_elements(pipe, indexBuffer, indexSize, mode, start, count);; + + trace_dump_ret(bool, result); + + trace_dump_call_end(); + + return result; +} + + +static INLINE boolean +trace_context_draw_range_elements(struct pipe_context *_pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned minIndex, + unsigned maxIndex, + unsigned mode, + unsigned start, + unsigned count) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + boolean result; + + trace_winsys_user_buffer_update(_pipe->winsys, indexBuffer); + + trace_dump_call_begin("pipe_context", "draw_range_elements"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, indexBuffer); + trace_dump_arg(uint, indexSize); + trace_dump_arg(uint, minIndex); + trace_dump_arg(uint, maxIndex); + trace_dump_arg(uint, mode); + trace_dump_arg(uint, start); + trace_dump_arg(uint, count); + + result = pipe->draw_range_elements(pipe, + indexBuffer, + indexSize, minIndex, maxIndex, + mode, start, count); + + trace_dump_ret(bool, result); + + trace_dump_call_end(); + + return result; +} + + +static INLINE struct pipe_query * +trace_context_create_query(struct pipe_context *_pipe, + unsigned query_type) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + struct pipe_query *result; + + trace_dump_call_begin("pipe_context", "create_query"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(uint, query_type); + + result = pipe->create_query(pipe, query_type);; + + trace_dump_ret(ptr, result); + + trace_dump_call_end(); + + return result; +} + + +static INLINE void +trace_context_destroy_query(struct pipe_context *_pipe, + struct pipe_query *query) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "destroy_query"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, query); + + pipe->destroy_query(pipe, query);; + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_begin_query(struct pipe_context *_pipe, + struct pipe_query *query) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "begin_query"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, query); + + pipe->begin_query(pipe, query);; + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_end_query(struct pipe_context *_pipe, + struct pipe_query *query) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct 
pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "end_query"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, query); + + pipe->end_query(pipe, query); + + trace_dump_call_end(); +} + + +static INLINE boolean +trace_context_get_query_result(struct pipe_context *_pipe, + struct pipe_query *query, + boolean wait, + uint64 *presult) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + uint64 result; + boolean _result; + + trace_dump_call_begin("pipe_context", "get_query_result"); + + trace_dump_arg(ptr, pipe); + + _result = pipe->get_query_result(pipe, query, wait, presult);; + result = *presult; + + trace_dump_arg(uint, result); + trace_dump_ret(bool, _result); + + trace_dump_call_end(); + + return _result; +} + + +static INLINE void * +trace_context_create_blend_state(struct pipe_context *_pipe, + const struct pipe_blend_state *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + void * result; + + trace_dump_call_begin("pipe_context", "create_blend_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(blend_state, state); + + result = pipe->create_blend_state(pipe, state);; + + trace_dump_ret(ptr, result); + + trace_dump_call_end(); + + return result; +} + + +static INLINE void +trace_context_bind_blend_state(struct pipe_context *_pipe, + void *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "bind_blend_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, state); + + pipe->bind_blend_state(pipe, state);; + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_delete_blend_state(struct pipe_context *_pipe, + void *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "delete_blend_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, state); + + pipe->delete_blend_state(pipe, state);; + + trace_dump_call_end(); +} + + +static INLINE void * +trace_context_create_sampler_state(struct pipe_context *_pipe, + const struct pipe_sampler_state *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + void * result; + + trace_dump_call_begin("pipe_context", "create_sampler_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(sampler_state, state); + + result = pipe->create_sampler_state(pipe, state);; + + trace_dump_ret(ptr, result); + + trace_dump_call_end(); + + return result; +} + + +static INLINE void +trace_context_bind_sampler_states(struct pipe_context *_pipe, + unsigned num_states, void **states) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "bind_sampler_states"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(uint, num_states); + trace_dump_arg_array(ptr, states, num_states); + + pipe->bind_sampler_states(pipe, num_states, states);; + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_delete_sampler_state(struct pipe_context *_pipe, + void *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "delete_sampler_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, state); + + pipe->delete_sampler_state(pipe, state);; + + trace_dump_call_end(); +} + + 
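(Illustrative aside: every trace_context_* wrapper in this file follows the same shape, i.e. open a call record, dump the arguments, forward to the wrapped pipe_context, dump the return value, close the record. The sketch below shows how a state tracker might drive the query entry points wrapped above; the calls go through the same pipe_context vtable whether or not the context is a trace wrapper. PIPE_QUERY_OCCLUSION_COUNTER is assumed to be the usual query type from p_defines.h, and the draw calls being measured are elided.)

#include "pipe/p_context.h"
#include "pipe/p_defines.h"

/* Count samples that pass the depth test for whatever is drawn between
 * begin_query() and end_query().  'pipe' may be a real context or a
 * trace_context wrapper; the code is identical either way. */
static uint64
count_samples_passed(struct pipe_context *pipe)
{
   uint64 count = 0;
   struct pipe_query *q;

   q = pipe->create_query(pipe, PIPE_QUERY_OCCLUSION_COUNTER);

   pipe->begin_query(pipe, q);
   /* ... the draw calls being measured would go here ... */
   pipe->end_query(pipe, q);

   /* wait == TRUE blocks until the result is available */
   pipe->get_query_result(pipe, q, TRUE, &count);

   pipe->destroy_query(pipe, q);
   return count;
}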
+static INLINE void * +trace_context_create_rasterizer_state(struct pipe_context *_pipe, + const struct pipe_rasterizer_state *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + void * result; + + trace_dump_call_begin("pipe_context", "create_rasterizer_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(rasterizer_state, state); + + result = pipe->create_rasterizer_state(pipe, state);; + + trace_dump_ret(ptr, result); + + trace_dump_call_end(); + + return result; +} + + +static INLINE void +trace_context_bind_rasterizer_state(struct pipe_context *_pipe, + void *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "bind_rasterizer_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, state); + + pipe->bind_rasterizer_state(pipe, state);; + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_delete_rasterizer_state(struct pipe_context *_pipe, + void *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "delete_rasterizer_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, state); + + pipe->delete_rasterizer_state(pipe, state);; + + trace_dump_call_end(); +} + + +static INLINE void * +trace_context_create_depth_stencil_alpha_state(struct pipe_context *_pipe, + const struct pipe_depth_stencil_alpha_state *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + void * result; + + trace_dump_call_begin("pipe_context", "create_depth_stencil_alpha_state"); + + result = pipe->create_depth_stencil_alpha_state(pipe, state);; + + trace_dump_arg(ptr, pipe); + trace_dump_arg(depth_stencil_alpha_state, state); + + trace_dump_ret(ptr, result); + + trace_dump_call_end(); + + return result; +} + + +static INLINE void +trace_context_bind_depth_stencil_alpha_state(struct pipe_context *_pipe, + void *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "bind_depth_stencil_alpha_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, state); + + pipe->bind_depth_stencil_alpha_state(pipe, state);; + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe, + void *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "delete_depth_stencil_alpha_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, state); + + pipe->delete_depth_stencil_alpha_state(pipe, state);; + + trace_dump_call_end(); +} + + +static INLINE void * +trace_context_create_fs_state(struct pipe_context *_pipe, + const struct pipe_shader_state *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + void * result; + + trace_dump_call_begin("pipe_context", "create_fs_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(shader_state, state); + + result = pipe->create_fs_state(pipe, state);; + + trace_dump_ret(ptr, result); + + trace_dump_call_end(); + + return result; +} + + +static INLINE void +trace_context_bind_fs_state(struct pipe_context *_pipe, + void *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + 
trace_dump_call_begin("pipe_context", "bind_fs_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, state); + + pipe->bind_fs_state(pipe, state);; + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_delete_fs_state(struct pipe_context *_pipe, + void *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "delete_fs_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, state); + + pipe->delete_fs_state(pipe, state);; + + trace_dump_call_end(); +} + + +static INLINE void * +trace_context_create_vs_state(struct pipe_context *_pipe, + const struct pipe_shader_state *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + void * result; + + trace_dump_call_begin("pipe_context", "create_vs_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(shader_state, state); + + result = pipe->create_vs_state(pipe, state);; + + trace_dump_ret(ptr, result); + + trace_dump_call_end(); + + return result; +} + + +static INLINE void +trace_context_bind_vs_state(struct pipe_context *_pipe, + void *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "bind_vs_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, state); + + pipe->bind_vs_state(pipe, state);; + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_delete_vs_state(struct pipe_context *_pipe, + void *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "delete_vs_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, state); + + pipe->delete_vs_state(pipe, state);; + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_blend_color(struct pipe_context *_pipe, + const struct pipe_blend_color *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "set_blend_color"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(blend_color, state); + + pipe->set_blend_color(pipe, state);; + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_clip_state(struct pipe_context *_pipe, + const struct pipe_clip_state *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "set_clip_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(clip_state, state); + + pipe->set_clip_state(pipe, state);; + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_constant_buffer(struct pipe_context *_pipe, + uint shader, uint index, + const struct pipe_constant_buffer *buffer) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_winsys_user_buffer_update(_pipe->winsys, (struct pipe_buffer *)buffer); + + trace_dump_call_begin("pipe_context", "set_constant_buffer"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(uint, shader); + trace_dump_arg(uint, index); + trace_dump_arg(constant_buffer, buffer); + + pipe->set_constant_buffer(pipe, shader, index, buffer);; + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_framebuffer_state(struct pipe_context *_pipe, + const struct pipe_framebuffer_state *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + 
struct pipe_context *pipe = tr_ctx->pipe; + struct pipe_framebuffer_state unwrapped_state; + unsigned i; + + /* Unwrap the input state */ + memcpy(&unwrapped_state, state, sizeof(unwrapped_state)); + for(i = 0; i < state->num_cbufs; ++i) + unwrapped_state.cbufs[i] = trace_surface_unwrap(tr_ctx, state->cbufs[i]); + for(i = state->num_cbufs; i < PIPE_MAX_COLOR_BUFS; ++i) + unwrapped_state.cbufs[i] = NULL; + unwrapped_state.zsbuf = trace_surface_unwrap(tr_ctx, state->zsbuf); + state = &unwrapped_state; + + trace_dump_call_begin("pipe_context", "set_framebuffer_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(framebuffer_state, state); + + pipe->set_framebuffer_state(pipe, state); + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_polygon_stipple(struct pipe_context *_pipe, + const struct pipe_poly_stipple *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "set_polygon_stipple"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(poly_stipple, state); + + pipe->set_polygon_stipple(pipe, state); + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_scissor_state(struct pipe_context *_pipe, + const struct pipe_scissor_state *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "set_scissor_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(scissor_state, state); + + pipe->set_scissor_state(pipe, state); + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_viewport_state(struct pipe_context *_pipe, + const struct pipe_viewport_state *state) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "set_viewport_state"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(viewport_state, state); + + pipe->set_viewport_state(pipe, state); + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_sampler_textures(struct pipe_context *_pipe, + unsigned num_textures, + struct pipe_texture **textures) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + struct pipe_texture *unwrapped_textures[PIPE_MAX_SAMPLERS]; + unsigned i; + + for(i = 0; i < num_textures; ++i) + unwrapped_textures[i] = trace_texture_unwrap(tr_ctx, textures[i]); + textures = unwrapped_textures; + + trace_dump_call_begin("pipe_context", "set_sampler_textures"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(uint, num_textures); + trace_dump_arg_array(ptr, textures, num_textures); + + pipe->set_sampler_textures(pipe, num_textures, textures); + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_set_vertex_buffers(struct pipe_context *_pipe, + unsigned num_buffers, + const struct pipe_vertex_buffer *buffers) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + unsigned i; + + for(i = 0; i < num_buffers; ++i) + trace_winsys_user_buffer_update(_pipe->winsys, buffers[i].buffer); + + trace_dump_call_begin("pipe_context", "set_vertex_buffers"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(uint, num_buffers); + + trace_dump_arg_begin("buffers"); + trace_dump_struct_array(vertex_buffer, buffers, num_buffers); + trace_dump_arg_end(); + + pipe->set_vertex_buffers(pipe, num_buffers, buffers); + + trace_dump_call_end(); +} + + +static INLINE void 
+trace_context_set_vertex_elements(struct pipe_context *_pipe, + unsigned num_elements, + const struct pipe_vertex_element *elements) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "set_vertex_elements"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(uint, num_elements); + + trace_dump_arg_begin("elements"); + trace_dump_struct_array(vertex_element, elements, num_elements); + trace_dump_arg_end(); + + pipe->set_vertex_elements(pipe, num_elements, elements); + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_surface_copy(struct pipe_context *_pipe, + boolean do_flip, + struct pipe_surface *dest, + unsigned destx, unsigned desty, + struct pipe_surface *src, + unsigned srcx, unsigned srcy, + unsigned width, unsigned height) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + dest = trace_surface_unwrap(tr_ctx, dest); + src = trace_surface_unwrap(tr_ctx, src); + + trace_dump_call_begin("pipe_context", "surface_copy"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(bool, do_flip); + trace_dump_arg(ptr, dest); + trace_dump_arg(uint, destx); + trace_dump_arg(uint, desty); + trace_dump_arg(ptr, src); + trace_dump_arg(uint, srcx); + trace_dump_arg(uint, srcy); + trace_dump_arg(uint, width); + trace_dump_arg(uint, height); + + pipe->surface_copy(pipe, do_flip, + dest, destx, desty, + src, srcx, srcy, width, height); + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_surface_fill(struct pipe_context *_pipe, + struct pipe_surface *dst, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height, + unsigned value) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + dst = trace_surface_unwrap(tr_ctx, dst); + + trace_dump_call_begin("pipe_context", "surface_fill"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, dst); + trace_dump_arg(uint, dstx); + trace_dump_arg(uint, dsty); + trace_dump_arg(uint, width); + trace_dump_arg(uint, height); + trace_dump_arg(uint, value); + + pipe->surface_fill(pipe, dst, dstx, dsty, width, height, value); + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_clear(struct pipe_context *_pipe, + struct pipe_surface *surface, + unsigned clearValue) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + surface = trace_surface_unwrap(tr_ctx, surface); + + trace_dump_call_begin("pipe_context", "clear"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(ptr, surface); + trace_dump_arg(uint, clearValue); + + pipe->clear(pipe, surface, clearValue); + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_flush(struct pipe_context *_pipe, + unsigned flags, + struct pipe_fence_handle **fence) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "flush"); + + trace_dump_arg(ptr, pipe); + trace_dump_arg(uint, flags); + + pipe->flush(pipe, flags, fence); + + if(fence) + trace_dump_ret(ptr, *fence); + + trace_dump_call_end(); +} + + +static INLINE void +trace_context_destroy(struct pipe_context *_pipe) +{ + struct trace_context *tr_ctx = trace_context(_pipe); + struct pipe_context *pipe = tr_ctx->pipe; + + trace_dump_call_begin("pipe_context", "destroy"); + + trace_dump_arg(ptr, pipe); + + pipe->destroy(pipe); + + trace_dump_call_end(); + + FREE(tr_ctx); +} + + +struct pipe_context * 
+trace_context_create(struct pipe_screen *screen, + struct pipe_context *pipe) +{ + struct trace_context *tr_ctx; + + if(!pipe) + goto error1; + + if(!trace_dump_enabled()) + goto error1; + + tr_ctx = CALLOC_STRUCT(trace_context); + if(!tr_ctx) + goto error1; + + tr_ctx->base.winsys = screen->winsys; + tr_ctx->base.screen = screen; + tr_ctx->base.destroy = trace_context_destroy; + tr_ctx->base.set_edgeflags = trace_context_set_edgeflags; + tr_ctx->base.draw_arrays = trace_context_draw_arrays; + tr_ctx->base.draw_elements = trace_context_draw_elements; + tr_ctx->base.draw_range_elements = trace_context_draw_range_elements; + tr_ctx->base.create_query = trace_context_create_query; + tr_ctx->base.destroy_query = trace_context_destroy_query; + tr_ctx->base.begin_query = trace_context_begin_query; + tr_ctx->base.end_query = trace_context_end_query; + tr_ctx->base.get_query_result = trace_context_get_query_result; + tr_ctx->base.create_blend_state = trace_context_create_blend_state; + tr_ctx->base.bind_blend_state = trace_context_bind_blend_state; + tr_ctx->base.delete_blend_state = trace_context_delete_blend_state; + tr_ctx->base.create_sampler_state = trace_context_create_sampler_state; + tr_ctx->base.bind_sampler_states = trace_context_bind_sampler_states; + tr_ctx->base.delete_sampler_state = trace_context_delete_sampler_state; + tr_ctx->base.create_rasterizer_state = trace_context_create_rasterizer_state; + tr_ctx->base.bind_rasterizer_state = trace_context_bind_rasterizer_state; + tr_ctx->base.delete_rasterizer_state = trace_context_delete_rasterizer_state; + tr_ctx->base.create_depth_stencil_alpha_state = trace_context_create_depth_stencil_alpha_state; + tr_ctx->base.bind_depth_stencil_alpha_state = trace_context_bind_depth_stencil_alpha_state; + tr_ctx->base.delete_depth_stencil_alpha_state = trace_context_delete_depth_stencil_alpha_state; + tr_ctx->base.create_fs_state = trace_context_create_fs_state; + tr_ctx->base.bind_fs_state = trace_context_bind_fs_state; + tr_ctx->base.delete_fs_state = trace_context_delete_fs_state; + tr_ctx->base.create_vs_state = trace_context_create_vs_state; + tr_ctx->base.bind_vs_state = trace_context_bind_vs_state; + tr_ctx->base.delete_vs_state = trace_context_delete_vs_state; + tr_ctx->base.set_blend_color = trace_context_set_blend_color; + tr_ctx->base.set_clip_state = trace_context_set_clip_state; + tr_ctx->base.set_constant_buffer = trace_context_set_constant_buffer; + tr_ctx->base.set_framebuffer_state = trace_context_set_framebuffer_state; + tr_ctx->base.set_polygon_stipple = trace_context_set_polygon_stipple; + tr_ctx->base.set_scissor_state = trace_context_set_scissor_state; + tr_ctx->base.set_viewport_state = trace_context_set_viewport_state; + tr_ctx->base.set_sampler_textures = trace_context_set_sampler_textures; + tr_ctx->base.set_vertex_buffers = trace_context_set_vertex_buffers; + tr_ctx->base.set_vertex_elements = trace_context_set_vertex_elements; + tr_ctx->base.surface_copy = trace_context_surface_copy; + tr_ctx->base.surface_fill = trace_context_surface_fill; + tr_ctx->base.clear = trace_context_clear; + tr_ctx->base.flush = trace_context_flush; + + tr_ctx->pipe = pipe; + + trace_dump_call_begin("", "pipe_context_create"); + trace_dump_arg_begin("screen"); + trace_dump_ptr(pipe->screen); + trace_dump_arg_end(); + trace_dump_ret(ptr, pipe); + trace_dump_call_end(); + + return &tr_ctx->base; + +error1: + return pipe; +} diff --git a/src/gallium/drivers/trace/tr_context.h b/src/gallium/drivers/trace/tr_context.h new file mode 100644 index 
0000000000..7831900ec2 --- /dev/null +++ b/src/gallium/drivers/trace/tr_context.h @@ -0,0 +1,68 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TR_CONTEXT_H_ +#define TR_CONTEXT_H_ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "pipe/p_context.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct trace_context +{ + struct pipe_context base; + + struct pipe_context *pipe; +}; + + +static INLINE struct trace_context * +trace_context(struct pipe_context *pipe) +{ + assert(pipe); + return (struct trace_context *)pipe; +} + + + +struct pipe_context * +trace_context_create(struct pipe_screen *screen, + struct pipe_context *pipe); + + +#ifdef __cplusplus +} +#endif + +#endif /* TR_CONTEXT_H_ */ diff --git a/src/gallium/drivers/trace/tr_dump.c b/src/gallium/drivers/trace/tr_dump.c new file mode 100644 index 0000000000..a0ead0ded3 --- /dev/null +++ b/src/gallium/drivers/trace/tr_dump.c @@ -0,0 +1,404 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * @file + * Trace dumping functions. + * + * For now we just use standard XML for dumping the trace calls, as this is + * simple to write, parse, and visually inspect, but the actual representation + * is abstracted out of this file, so that we can switch to a binary + * representation if/when it becomes justified. + * + * @author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + +#include "pipe/p_config.h" + +#if defined(PIPE_OS_LINUX) +#include <stdlib.h> +#endif + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "util/u_memory.h" +#include "util/u_string.h" +#include "util/u_stream.h" + +#include "tr_dump.h" + + +static struct util_stream *stream = NULL; +static unsigned refcount = 0; + + +static INLINE void +trace_dump_write(const char *buf, size_t size) +{ + if(stream) + util_stream_write(stream, buf, size); +} + + +static INLINE void +trace_dump_writes(const char *s) +{ + trace_dump_write(s, strlen(s)); +} + + +static INLINE void +trace_dump_writef(const char *format, ...) +{ + static char buf[1024]; + unsigned len; + va_list ap; + va_start(ap, format); + len = util_vsnprintf(buf, sizeof(buf), format, ap); + va_end(ap); + trace_dump_write(buf, len); +} + + +static INLINE void +trace_dump_escape(const char *str) +{ + const unsigned char *p = (const unsigned char *)str; + unsigned char c; + while((c = *p++) != 0) { + if(c == '<') + trace_dump_writes("&lt;"); + else if(c == '>') + trace_dump_writes("&gt;"); + else if(c == '&') + trace_dump_writes("&amp;"); + else if(c == '\'') + trace_dump_writes("&apos;"); + else if(c == '\"') + trace_dump_writes("&quot;"); + else if(c >= 0x20 && c <= 0x7e) + trace_dump_writef("%c", c); + else + trace_dump_writef("&#%u;", c); + } +} + + +static INLINE void +trace_dump_indent(unsigned level) +{ + unsigned i; + for(i = 0; i < level; ++i) + trace_dump_writes("\t"); +} + + +static INLINE void +trace_dump_newline(void) +{ + trace_dump_writes("\n"); +} + + +static INLINE void +trace_dump_tag(const char *name) +{ + trace_dump_writes("<"); + trace_dump_writes(name); + trace_dump_writes("/>"); +} + + +static INLINE void +trace_dump_tag_begin(const char *name) +{ + trace_dump_writes("<"); + trace_dump_writes(name); + trace_dump_writes(">"); +} + +static INLINE void +trace_dump_tag_begin1(const char *name, + const char *attr1, const char *value1) +{ + trace_dump_writes("<"); + trace_dump_writes(name); + trace_dump_writes(" "); + trace_dump_writes(attr1); + trace_dump_writes("='"); + trace_dump_escape(value1); + trace_dump_writes("'>"); +} + + +static INLINE void +trace_dump_tag_begin2(const char *name, + const char *attr1, const char *value1, + const char *attr2, const char *value2) +{ + trace_dump_writes("<"); + trace_dump_writes(name); + trace_dump_writes(" "); + trace_dump_writes(attr1); + trace_dump_writes("=\'"); + trace_dump_escape(value1); + trace_dump_writes("\' "); + trace_dump_writes(attr2); + trace_dump_writes("=\'"); + trace_dump_escape(value2); + trace_dump_writes("\'>"); +} + + +static INLINE void +trace_dump_tag_begin3(const char *name, + const char *attr1, const char *value1, + const char *attr2, const char *value2, + const char *attr3, const char *value3) +{ + 
trace_dump_writes("<"); + trace_dump_writes(name); + trace_dump_writes(" "); + trace_dump_writes(attr1); + trace_dump_writes("=\'"); + trace_dump_escape(value1); + trace_dump_writes("\' "); + trace_dump_writes(attr2); + trace_dump_writes("=\'"); + trace_dump_escape(value2); + trace_dump_writes("\' "); + trace_dump_writes(attr3); + trace_dump_writes("=\'"); + trace_dump_escape(value3); + trace_dump_writes("\'>"); +} + + +static INLINE void +trace_dump_tag_end(const char *name) +{ + trace_dump_writes("</"); + trace_dump_writes(name); + trace_dump_writes(">"); +} + +static void +trace_dump_trace_close(void) +{ + if(stream) { + trace_dump_writes("</trace>\n"); + util_stream_close(stream); + stream = NULL; + refcount = 0; + } +} + +boolean trace_dump_trace_begin() +{ + const char *filename; + + filename = debug_get_option("GALLIUM_TRACE", NULL); + if(!filename) + return FALSE; + + if(!stream) { + + stream = util_stream_create(filename, 0); + if(!stream) + return FALSE; + + trace_dump_writes("<?xml version='1.0' encoding='UTF-8'?>\n"); + trace_dump_writes("<?xml-stylesheet type='text/xsl' href='trace.xsl'?>\n"); + trace_dump_writes("<trace version='0.1'>\n"); + +#if defined(PIPE_OS_LINUX) + /* Linux applications rarely cleanup GL / Gallium resources so catch + * application exit here */ + atexit(trace_dump_trace_close); +#endif + } + + ++refcount; + + return TRUE; +} + +boolean trace_dump_enabled(void) +{ + return stream ? TRUE : FALSE; +} + +void trace_dump_trace_end(void) +{ + if(stream) + if(!--refcount) + trace_dump_trace_close(); +} + +void trace_dump_call_begin(const char *klass, const char *method) +{ + trace_dump_indent(1); + trace_dump_tag_begin2("call", "class", klass, "method", method); + trace_dump_newline(); +} + +void trace_dump_call_end(void) +{ + trace_dump_indent(1); + trace_dump_tag_end("call"); + trace_dump_newline(); + util_stream_flush(stream); +} + +void trace_dump_arg_begin(const char *name) +{ + trace_dump_indent(2); + trace_dump_tag_begin1("arg", "name", name); +} + +void trace_dump_arg_end(void) +{ + trace_dump_tag_end("arg"); + trace_dump_newline(); +} + +void trace_dump_ret_begin(void) +{ + trace_dump_indent(2); + trace_dump_tag_begin("ret"); +} + +void trace_dump_ret_end(void) +{ + trace_dump_tag_end("ret"); + trace_dump_newline(); +} + +void trace_dump_bool(int value) +{ + trace_dump_writef("<bool>%c</bool>", value ? 
'1' : '0'); +} + +void trace_dump_int(long long int value) +{ + trace_dump_writef("<int>%lli</int>", value); +} + +void trace_dump_uint(long long unsigned value) +{ + trace_dump_writef("<uint>%llu</uint>", value); +} + +void trace_dump_float(double value) +{ + trace_dump_writef("<float>%g</float>", value); +} + +void trace_dump_bytes(const void *data, + long unsigned size) +{ + static const char hex_table[16] = "0123456789ABCDEF"; + const uint8_t *p = data; + long unsigned i; + trace_dump_writes("<bytes>"); + for(i = 0; i < size; ++i) { + uint8_t byte = *p++; + char hex[2]; + hex[0] = hex_table[byte >> 4]; + hex[1] = hex_table[byte & 0xf]; + trace_dump_write(hex, 2); + } + trace_dump_writes("</bytes>"); +} + +void trace_dump_string(const char *str) +{ + trace_dump_writes("<string>"); + trace_dump_escape(str); + trace_dump_writes("</string>"); +} + +void trace_dump_enum(const char *value) +{ + trace_dump_writes("<enum>"); + trace_dump_escape(value); + trace_dump_writes("</enum>"); +} + +void trace_dump_array_begin(void) +{ + trace_dump_writes("<array>"); +} + +void trace_dump_array_end(void) +{ + trace_dump_writes("</array>"); +} + +void trace_dump_elem_begin(void) +{ + trace_dump_writes("<elem>"); +} + +void trace_dump_elem_end(void) +{ + trace_dump_writes("</elem>"); +} + +void trace_dump_struct_begin(const char *name) +{ + trace_dump_writef("<struct name='%s'>", name); +} + +void trace_dump_struct_end(void) +{ + trace_dump_writes("</struct>"); +} + +void trace_dump_member_begin(const char *name) +{ + trace_dump_writef("<member name='%s'>", name); +} + +void trace_dump_member_end(void) +{ + trace_dump_writes("</member>"); +} + +void trace_dump_null(void) +{ + trace_dump_writes("<null/>"); +} + +void trace_dump_ptr(const void *value) +{ + if(value) + trace_dump_writef("<ptr>0x%08lx</ptr>", (unsigned long)(uintptr_t)value); + else + trace_dump_null(); +} diff --git a/src/gallium/drivers/trace/tr_dump.h b/src/gallium/drivers/trace/tr_dump.h new file mode 100644 index 0000000000..76a53731b3 --- /dev/null +++ b/src/gallium/drivers/trace/tr_dump.h @@ -0,0 +1,132 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Trace data dumping primitives. 
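+ * + * A rough sketch of how these primitives compose (mirroring their use in + * tr_context.c; the concrete values below are illustrative only): + * + *    trace_dump_call_begin("pipe_context", "flush"); + *    trace_dump_arg(uint, flags); + *    pipe->flush(pipe, flags, fence); + *    trace_dump_ret(ptr, *fence); + *    trace_dump_call_end(); + * + * appends a tab-indented XML fragment of roughly this shape to the trace: + * + *    <call class='pipe_context' method='flush'> + *       <arg name='flags'><uint>1</uint></arg> + *       <ret><ptr>0x12345678</ptr></ret> + *    </call> 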
+ */ + +#ifndef TR_DUMP_H +#define TR_DUMP_H + + +#include "pipe/p_compiler.h" + + +boolean trace_dump_trace_begin(void); +boolean trace_dump_enabled(void); +void trace_dump_trace_end(void); +void trace_dump_call_begin(const char *klass, const char *method); +void trace_dump_call_end(void); +void trace_dump_arg_begin(const char *name); +void trace_dump_arg_end(void); +void trace_dump_ret_begin(void); +void trace_dump_ret_end(void); +void trace_dump_bool(int value); +void trace_dump_int(long long int value); +void trace_dump_uint(long long unsigned value); +void trace_dump_float(double value); +void trace_dump_bytes(const void *data, long unsigned size); +void trace_dump_string(const char *str); +void trace_dump_enum(const char *value); +void trace_dump_array_begin(void); +void trace_dump_array_end(void); +void trace_dump_elem_begin(void); +void trace_dump_elem_end(void); +void trace_dump_struct_begin(const char *name); +void trace_dump_struct_end(void); +void trace_dump_member_begin(const char *name); +void trace_dump_member_end(void); +void trace_dump_null(void); +void trace_dump_ptr(const void *value); + + +/* + * Code saving macros. + */ + +#define trace_dump_arg(_type, _arg) \ + do { \ + trace_dump_arg_begin(#_arg); \ + trace_dump_##_type(_arg); \ + trace_dump_arg_end(); \ + } while(0) + +#define trace_dump_ret(_type, _arg) \ + do { \ + trace_dump_ret_begin(); \ + trace_dump_##_type(_arg); \ + trace_dump_ret_end(); \ + } while(0) + +#define trace_dump_array(_type, _obj, _size) \ + do { \ + unsigned long idx; \ + trace_dump_array_begin(); \ + for(idx = 0; idx < (_size); ++idx) { \ + trace_dump_elem_begin(); \ + trace_dump_##_type((_obj)[idx]); \ + trace_dump_elem_end(); \ + } \ + trace_dump_array_end(); \ + } while(0) + +#define trace_dump_struct_array(_type, _obj, _size) \ + do { \ + unsigned long idx; \ + trace_dump_array_begin(); \ + for(idx = 0; idx < (_size); ++idx) { \ + trace_dump_elem_begin(); \ + trace_dump_##_type(&(_obj)[idx]); \ + trace_dump_elem_end(); \ + } \ + trace_dump_array_end(); \ + } while(0) + +#define trace_dump_member(_type, _obj, _member) \ + do { \ + trace_dump_member_begin(#_member); \ + trace_dump_##_type((_obj)->_member); \ + trace_dump_member_end(); \ + } while(0) + +#define trace_dump_arg_array(_type, _arg, _size) \ + do { \ + trace_dump_arg_begin(#_arg); \ + trace_dump_array(_type, _arg, _size); \ + trace_dump_arg_end(); \ + } while(0) + +#define trace_dump_member_array(_type, _obj, _member) \ + do { \ + trace_dump_member_begin(#_member); \ + trace_dump_array(_type, (_obj)->_member, sizeof((_obj)->_member)/sizeof((_obj)->_member[0])); \ + trace_dump_member_end(); \ + } while(0) + + +#endif /* TR_DUMP_H */ diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c new file mode 100644 index 0000000000..8789f86b1a --- /dev/null +++ b/src/gallium/drivers/trace/tr_screen.c @@ -0,0 +1,469 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "util/u_memory.h" + +#include "tr_dump.h" +#include "tr_state.h" +#include "tr_winsys.h" +#include "tr_texture.h" +#include "tr_screen.h" + + +static const char * +trace_screen_get_name(struct pipe_screen *_screen) +{ + struct trace_screen *tr_scr = trace_screen(_screen); + struct pipe_screen *screen = tr_scr->screen; + const char *result; + + trace_dump_call_begin("pipe_screen", "get_name"); + + trace_dump_arg(ptr, screen); + + result = screen->get_name(screen); + + trace_dump_ret(string, result); + + trace_dump_call_end(); + + return result; +} + + +static const char * +trace_screen_get_vendor(struct pipe_screen *_screen) +{ + struct trace_screen *tr_scr = trace_screen(_screen); + struct pipe_screen *screen = tr_scr->screen; + const char *result; + + trace_dump_call_begin("pipe_screen", "get_vendor"); + + trace_dump_arg(ptr, screen); + + result = screen->get_vendor(screen); + + trace_dump_ret(string, result); + + trace_dump_call_end(); + + return result; +} + + +static int +trace_screen_get_param(struct pipe_screen *_screen, + int param) +{ + struct trace_screen *tr_scr = trace_screen(_screen); + struct pipe_screen *screen = tr_scr->screen; + int result; + + trace_dump_call_begin("pipe_screen", "get_param"); + + trace_dump_arg(ptr, screen); + trace_dump_arg(int, param); + + result = screen->get_param(screen, param); + + trace_dump_ret(int, result); + + trace_dump_call_end(); + + return result; +} + + +static float +trace_screen_get_paramf(struct pipe_screen *_screen, + int param) +{ + struct trace_screen *tr_scr = trace_screen(_screen); + struct pipe_screen *screen = tr_scr->screen; + float result; + + trace_dump_call_begin("pipe_screen", "get_paramf"); + + trace_dump_arg(ptr, screen); + trace_dump_arg(int, param); + + result = screen->get_paramf(screen, param); + + trace_dump_ret(float, result); + + trace_dump_call_end(); + + return result; +} + + +static boolean +trace_screen_is_format_supported(struct pipe_screen *_screen, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, + unsigned geom_flags) +{ + struct trace_screen *tr_scr = trace_screen(_screen); + struct pipe_screen *screen = tr_scr->screen; + boolean result; + + trace_dump_call_begin("pipe_screen", "is_format_supported"); + + trace_dump_arg(ptr, screen); + trace_dump_arg(format, 
format); + trace_dump_arg(int, target); + trace_dump_arg(uint, tex_usage); + trace_dump_arg(uint, geom_flags); + + result = screen->is_format_supported(screen, format, target, tex_usage, geom_flags); + + trace_dump_ret(bool, result); + + trace_dump_call_end(); + + return result; +} + + +static struct pipe_texture * +trace_screen_texture_create(struct pipe_screen *_screen, + const struct pipe_texture *templat) +{ + struct trace_screen *tr_scr = trace_screen(_screen); + struct pipe_screen *screen = tr_scr->screen; + struct pipe_texture *result; + + trace_dump_call_begin("pipe_screen", "texture_create"); + + trace_dump_arg(ptr, screen); + trace_dump_arg(template, templat); + + result = screen->texture_create(screen, templat); + + trace_dump_ret(ptr, result); + + trace_dump_call_end(); + + result = trace_texture_create(tr_scr, result); + + return result; +} + + +static struct pipe_texture * +trace_screen_texture_blanket(struct pipe_screen *_screen, + const struct pipe_texture *templat, + const unsigned *ppitch, + struct pipe_buffer *buffer) +{ + struct trace_screen *tr_scr = trace_screen(_screen); + struct pipe_screen *screen = tr_scr->screen; + unsigned pitch = *ppitch; + struct pipe_texture *result; + + trace_dump_call_begin("pipe_screen", "texture_blanket"); + + trace_dump_arg(ptr, screen); + trace_dump_arg(template, templat); + trace_dump_arg(uint, pitch); + trace_dump_arg(ptr, buffer); + + result = screen->texture_blanket(screen, templat, ppitch, buffer); + + trace_dump_ret(ptr, result); + + trace_dump_call_end(); + + result = trace_texture_create(tr_scr, result); + + return result; +} + + +static void +trace_screen_texture_release(struct pipe_screen *_screen, + struct pipe_texture **ptexture) +{ + struct trace_screen *tr_scr = trace_screen(_screen); + struct pipe_screen *screen = tr_scr->screen; + struct trace_texture *tr_tex; + struct pipe_texture *texture; + + assert(ptexture); + if(*ptexture) { + tr_tex = trace_texture(tr_scr, *ptexture); + texture = tr_tex->texture; + assert(texture->screen == screen); + } + else + texture = NULL; + + if (*ptexture) { + if (!--(*ptexture)->refcount) { + trace_dump_call_begin("pipe_screen", "texture_destroy"); + + trace_dump_arg(ptr, screen); + trace_dump_arg(ptr, texture); + + trace_texture_destroy(tr_scr, *ptexture); + + trace_dump_call_end(); + } + + *ptexture = NULL; + } +} + + +static struct pipe_surface * +trace_screen_get_tex_surface(struct pipe_screen *_screen, + struct pipe_texture *texture, + unsigned face, unsigned level, + unsigned zslice, + unsigned usage) +{ + struct trace_screen *tr_scr = trace_screen(_screen); + struct pipe_screen *screen = tr_scr->screen; + struct trace_texture *tr_tex; + struct pipe_surface *result; + + assert(texture); + tr_tex = trace_texture(tr_scr, texture); + texture = tr_tex->texture; + assert(texture->screen == screen); + + trace_dump_call_begin("pipe_screen", "get_tex_surface"); + + trace_dump_arg(ptr, screen); + trace_dump_arg(ptr, texture); + trace_dump_arg(uint, face); + trace_dump_arg(uint, level); + trace_dump_arg(uint, zslice); + trace_dump_arg(uint, usage); + + result = screen->get_tex_surface(screen, texture, face, level, zslice, usage); + + trace_dump_ret(ptr, result); + + trace_dump_call_end(); + + result = trace_surface_create(tr_tex, result); + + return result; +} + + +static void +trace_screen_tex_surface_release(struct pipe_screen *_screen, + struct pipe_surface **psurface) +{ + struct trace_screen *tr_scr = trace_screen(_screen); + struct pipe_screen *screen = tr_scr->screen; + struct 
trace_texture *tr_tex; + struct trace_surface *tr_surf; + struct pipe_surface *surface; + + assert(psurface); + if(*psurface) { + tr_tex = trace_texture(tr_scr, (*psurface)->texture); + tr_surf = trace_surface(tr_tex, *psurface); + surface = tr_surf->surface; + } + else + surface = NULL; + + if (*psurface) { + if (!--(*psurface)->refcount) { + trace_dump_call_begin("pipe_screen", "tex_surface_destroy"); + + trace_dump_arg(ptr, screen); + trace_dump_arg(ptr, surface); + + trace_surface_destroy(tr_tex, *psurface); + + trace_dump_call_end(); + } + + *psurface = NULL; + } +} + + +static void * +trace_screen_surface_map(struct pipe_screen *_screen, + struct pipe_surface *surface, + unsigned flags) +{ + struct trace_screen *tr_scr = trace_screen(_screen); + struct pipe_screen *screen = tr_scr->screen; + struct trace_texture *tr_tex; + struct trace_surface *tr_surf; + void *map; + + tr_tex = trace_texture(tr_scr, surface->texture); + tr_surf = trace_surface(tr_tex, surface); + surface = tr_surf->surface; + + map = screen->surface_map(screen, surface, flags); + if(map) { + if(flags & PIPE_BUFFER_USAGE_CPU_WRITE) { + assert(!tr_surf->map); + tr_surf->map = map; + } + } + + return map; +} + + +static void +trace_screen_surface_unmap(struct pipe_screen *_screen, + struct pipe_surface *surface) +{ + struct trace_screen *tr_scr = trace_screen(_screen); + struct pipe_screen *screen = tr_scr->screen; + struct trace_texture *tr_tex; + struct trace_surface *tr_surf; + + tr_tex = trace_texture(tr_scr, surface->texture); + tr_surf = trace_surface(tr_tex, surface); + surface = tr_surf->surface; + + if(tr_surf->map) { + size_t size = surface->nblocksy * surface->stride; + + trace_dump_call_begin("pipe_winsys", "surface_write"); + + trace_dump_arg(ptr, screen); + + trace_dump_arg(ptr, surface); + + trace_dump_arg_begin("data"); + trace_dump_bytes(tr_surf->map, size); + trace_dump_arg_end(); + + trace_dump_arg_begin("stride"); + trace_dump_uint(surface->stride); + trace_dump_arg_end(); + + trace_dump_arg_begin("size"); + trace_dump_uint(size); + trace_dump_arg_end(); + + trace_dump_call_end(); + + tr_surf->map = NULL; + } + + screen->surface_unmap(screen, surface); +} + + +static void +trace_screen_destroy(struct pipe_screen *_screen) +{ + struct trace_screen *tr_scr = trace_screen(_screen); + struct pipe_screen *screen = tr_scr->screen; + + trace_dump_call_begin("pipe_screen", "destroy"); + + trace_dump_arg(ptr, screen); + + screen->destroy(screen); + + trace_dump_call_end(); + + trace_dump_trace_end(); + + FREE(tr_scr); +} + + +struct pipe_screen * +trace_screen_create(struct pipe_screen *screen) +{ + struct trace_screen *tr_scr; + struct pipe_winsys *winsys; + + if(!screen) + goto error1; + + if(!trace_dump_trace_begin()) + goto error1; + + tr_scr = CALLOC_STRUCT(trace_screen); + if(!tr_scr) + goto error2; + + winsys = trace_winsys_create(screen->winsys); + if(!winsys) + goto error3; + + tr_scr->base.winsys = winsys; + tr_scr->base.destroy = trace_screen_destroy; + tr_scr->base.get_name = trace_screen_get_name; + tr_scr->base.get_vendor = trace_screen_get_vendor; + tr_scr->base.get_param = trace_screen_get_param; + tr_scr->base.get_paramf = trace_screen_get_paramf; + tr_scr->base.is_format_supported = trace_screen_is_format_supported; + tr_scr->base.texture_create = trace_screen_texture_create; + tr_scr->base.texture_blanket = trace_screen_texture_blanket; + tr_scr->base.texture_release = trace_screen_texture_release; + tr_scr->base.get_tex_surface = trace_screen_get_tex_surface; + 
tr_scr->base.tex_surface_release = trace_screen_tex_surface_release; + tr_scr->base.surface_map = trace_screen_surface_map; + tr_scr->base.surface_unmap = trace_screen_surface_unmap; + + tr_scr->screen = screen; + + trace_dump_call_begin("", "pipe_screen_create"); + trace_dump_arg_begin("winsys"); + trace_dump_ptr(screen->winsys); + trace_dump_arg_end(); + trace_dump_ret(ptr, screen); + trace_dump_call_end(); + + return &tr_scr->base; + +error3: + FREE(tr_scr); +error2: + trace_dump_trace_end(); +error1: + return screen; +} + + +struct trace_screen * +trace_screen(struct pipe_screen *screen) +{ + assert(screen); + assert(screen->destroy == trace_screen_destroy); + return (struct trace_screen *)screen; +} diff --git a/src/gallium/drivers/trace/tr_screen.h b/src/gallium/drivers/trace/tr_screen.h new file mode 100644 index 0000000000..93fefdb9a5 --- /dev/null +++ b/src/gallium/drivers/trace/tr_screen.h @@ -0,0 +1,60 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TR_SCREEN_H_ +#define TR_SCREEN_H_ + + +#include "pipe/p_screen.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct trace_screen +{ + struct pipe_screen base; + + struct pipe_screen *screen; +}; + + +struct trace_screen * +trace_screen(struct pipe_screen *screen); + + +struct pipe_screen * +trace_screen_create(struct pipe_screen *screen); + + +#ifdef __cplusplus +} +#endif + +#endif /* TR_SCREEN_H_ */ diff --git a/src/gallium/drivers/trace/tr_state.c b/src/gallium/drivers/trace/tr_state.c new file mode 100644 index 0000000000..986d939e0c --- /dev/null +++ b/src/gallium/drivers/trace/tr_state.c @@ -0,0 +1,464 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "pipe/p_compiler.h" +#include "util/u_memory.h" +#include "tgsi/tgsi_dump.h" + +#include "tr_dump.h" +#include "tr_state.h" + + +void trace_dump_format(enum pipe_format format) +{ + trace_dump_enum(pf_name(format) ); +} + + +void trace_dump_block(const struct pipe_format_block *block) +{ + trace_dump_struct_begin("pipe_format_block"); + trace_dump_member(uint, block, size); + trace_dump_member(uint, block, width); + trace_dump_member(uint, block, height); + trace_dump_struct_end(); +} + + +void trace_dump_template(const struct pipe_texture *templat) +{ + if(!templat) { + trace_dump_null(); + return; + } + + trace_dump_struct_begin("pipe_texture"); + + trace_dump_member(int, templat, target); + trace_dump_member(format, templat, format); + + trace_dump_member_begin("width"); + trace_dump_array(uint, templat->width, 1); + trace_dump_member_end(); + + trace_dump_member_begin("height"); + trace_dump_array(uint, templat->height, 1); + trace_dump_member_end(); + + trace_dump_member_begin("depth"); + trace_dump_array(uint, templat->depth, 1); + trace_dump_member_end(); + + trace_dump_member_begin("block"); + trace_dump_block(&templat->block); + trace_dump_member_end(); + + trace_dump_member(uint, templat, last_level); + trace_dump_member(uint, templat, tex_usage); + + trace_dump_struct_end(); +} + + +void trace_dump_rasterizer_state(const struct pipe_rasterizer_state *state) +{ + if(!state) { + trace_dump_null(); + return; + } + + trace_dump_struct_begin("pipe_rasterizer_state"); + + trace_dump_member(bool, state, flatshade); + trace_dump_member(bool, state, light_twoside); + trace_dump_member(uint, state, front_winding); + trace_dump_member(uint, state, cull_mode); + trace_dump_member(uint, state, fill_cw); + trace_dump_member(uint, state, fill_ccw); + trace_dump_member(bool, state, offset_cw); + trace_dump_member(bool, state, offset_ccw); + trace_dump_member(bool, state, scissor); + trace_dump_member(bool, state, poly_smooth); + trace_dump_member(bool, state, poly_stipple_enable); + trace_dump_member(bool, state, point_smooth); + trace_dump_member(bool, state, point_sprite); + trace_dump_member(bool, state, point_size_per_vertex); + trace_dump_member(bool, state, multisample); + trace_dump_member(bool, state, line_smooth); + trace_dump_member(bool, state, line_stipple_enable); + 
trace_dump_member(uint, state, line_stipple_factor); + trace_dump_member(uint, state, line_stipple_pattern); + trace_dump_member(bool, state, line_last_pixel); + trace_dump_member(bool, state, bypass_clipping); + trace_dump_member(bool, state, bypass_vs); + trace_dump_member(bool, state, origin_lower_left); + trace_dump_member(bool, state, flatshade_first); + trace_dump_member(bool, state, gl_rasterization_rules); + + trace_dump_member(float, state, line_width); + trace_dump_member(float, state, point_size); + trace_dump_member(float, state, point_size_min); + trace_dump_member(float, state, point_size_max); + trace_dump_member(float, state, offset_units); + trace_dump_member(float, state, offset_scale); + + trace_dump_member_array(uint, state, sprite_coord_mode); + + trace_dump_struct_end(); +} + + +void trace_dump_poly_stipple(const struct pipe_poly_stipple *state) +{ + if(!state) { + trace_dump_null(); + return; + } + + trace_dump_struct_begin("pipe_poly_stipple"); + + trace_dump_member_begin("stipple"); + trace_dump_array(uint, + state->stipple, + Elements(state->stipple)); + trace_dump_member_end(); + + trace_dump_struct_end(); +} + + +void trace_dump_viewport_state(const struct pipe_viewport_state *state) +{ + if(!state) { + trace_dump_null(); + return; + } + + trace_dump_struct_begin("pipe_viewport_state"); + + trace_dump_member_array(float, state, scale); + trace_dump_member_array(float, state, translate); + + trace_dump_struct_end(); +} + + +void trace_dump_scissor_state(const struct pipe_scissor_state *state) +{ + if(!state) { + trace_dump_null(); + return; + } + + trace_dump_struct_begin("pipe_scissor_state"); + + trace_dump_member(uint, state, minx); + trace_dump_member(uint, state, miny); + trace_dump_member(uint, state, maxx); + trace_dump_member(uint, state, maxy); + + trace_dump_struct_end(); +} + + +void trace_dump_clip_state(const struct pipe_clip_state *state) +{ + unsigned i; + + if(!state) { + trace_dump_null(); + return; + } + + trace_dump_struct_begin("pipe_clip_state"); + + trace_dump_member_begin("ucp"); + trace_dump_array_begin(); + for(i = 0; i < PIPE_MAX_CLIP_PLANES; ++i) { + trace_dump_elem_begin(); + trace_dump_array(float, state->ucp[i], 4); + trace_dump_elem_end(); + } + trace_dump_array_end(); + trace_dump_member_end(); + + trace_dump_member(uint, state, nr); + + trace_dump_struct_end(); +} + + +void trace_dump_constant_buffer(const struct pipe_constant_buffer *state) +{ + if(!state) { + trace_dump_null(); + return; + } + + trace_dump_struct_begin("pipe_constant_buffer"); + + trace_dump_member(ptr, state, buffer); + trace_dump_member(uint, state, size); + + trace_dump_struct_end(); +} + + +void trace_dump_shader_state(const struct pipe_shader_state *state) +{ + static char str[8192]; + + if(!state) { + trace_dump_null(); + return; + } + + tgsi_dump_str(state->tokens, 0, str, sizeof(str)); + + trace_dump_struct_begin("pipe_shader_state"); + + trace_dump_member_begin("tokens"); + trace_dump_string(str); + trace_dump_member_end(); + + trace_dump_struct_end(); +} + + +void trace_dump_depth_stencil_alpha_state(const struct pipe_depth_stencil_alpha_state *state) +{ + unsigned i; + + if(!state) { + trace_dump_null(); + return; + } + + trace_dump_struct_begin("pipe_depth_stencil_alpha_state"); + + trace_dump_member_begin("depth"); + trace_dump_struct_begin("pipe_depth_state"); + trace_dump_member(bool, &state->depth, enabled); + trace_dump_member(bool, &state->depth, writemask); + trace_dump_member(uint, &state->depth, func); + trace_dump_member(bool, 
&state->depth, occlusion_count); + trace_dump_struct_end(); + trace_dump_member_end(); + + trace_dump_member_begin("stencil"); + trace_dump_array_begin(); + for(i = 0; i < Elements(state->stencil); ++i) { + trace_dump_elem_begin(); + trace_dump_struct_begin("pipe_stencil_state"); + trace_dump_member(bool, &state->stencil[i], enabled); + trace_dump_member(uint, &state->stencil[i], func); + trace_dump_member(uint, &state->stencil[i], fail_op); + trace_dump_member(uint, &state->stencil[i], zpass_op); + trace_dump_member(uint, &state->stencil[i], zfail_op); + trace_dump_member(uint, &state->stencil[i], ref_value); + trace_dump_member(uint, &state->stencil[i], value_mask); + trace_dump_member(uint, &state->stencil[i], write_mask); + trace_dump_struct_end(); + trace_dump_elem_end(); + } + trace_dump_array_end(); + trace_dump_member_end(); + + trace_dump_member_begin("alpha"); + trace_dump_struct_begin("pipe_alpha_state"); + trace_dump_member(bool, &state->alpha, enabled); + trace_dump_member(uint, &state->alpha, func); + trace_dump_member(float, &state->alpha, ref); + trace_dump_struct_end(); + trace_dump_member_end(); + + trace_dump_struct_end(); +} + + +void trace_dump_blend_state(const struct pipe_blend_state *state) +{ + if(!state) { + trace_dump_null(); + return; + } + + trace_dump_struct_begin("pipe_blend_state"); + + trace_dump_member(bool, state, blend_enable); + + trace_dump_member(uint, state, rgb_func); + trace_dump_member(uint, state, rgb_src_factor); + trace_dump_member(uint, state, rgb_dst_factor); + + trace_dump_member(uint, state, alpha_func); + trace_dump_member(uint, state, alpha_src_factor); + trace_dump_member(uint, state, alpha_dst_factor); + + trace_dump_member(bool, state, logicop_enable); + trace_dump_member(uint, state, logicop_func); + + trace_dump_member(uint, state, colormask); + trace_dump_member(bool, state, dither); + + trace_dump_struct_end(); +} + + +void trace_dump_blend_color(const struct pipe_blend_color *state) +{ + if(!state) { + trace_dump_null(); + return; + } + + trace_dump_struct_begin("pipe_blend_color"); + + trace_dump_member_array(float, state, color); + + trace_dump_struct_end(); +} + + +void trace_dump_framebuffer_state(const struct pipe_framebuffer_state *state) +{ + trace_dump_struct_begin("pipe_framebuffer_state"); + + trace_dump_member(uint, state, width); + trace_dump_member(uint, state, height); + trace_dump_member(uint, state, num_cbufs); + trace_dump_member_array(ptr, state, cbufs); + trace_dump_member(ptr, state, zsbuf); + + trace_dump_struct_end(); +} + + +void trace_dump_sampler_state(const struct pipe_sampler_state *state) +{ + if(!state) { + trace_dump_null(); + return; + } + + trace_dump_struct_begin("pipe_sampler_state"); + + trace_dump_member(uint, state, wrap_s); + trace_dump_member(uint, state, wrap_t); + trace_dump_member(uint, state, wrap_r); + trace_dump_member(uint, state, min_img_filter); + trace_dump_member(uint, state, min_mip_filter); + trace_dump_member(uint, state, mag_img_filter); + trace_dump_member(bool, state, compare_mode); + trace_dump_member(uint, state, compare_func); + trace_dump_member(bool, state, normalized_coords); + trace_dump_member(uint, state, prefilter); + trace_dump_member(float, state, shadow_ambient); + trace_dump_member(float, state, lod_bias); + trace_dump_member(float, state, min_lod); + trace_dump_member(float, state, max_lod); + trace_dump_member_array(float, state, border_color); + trace_dump_member(float, state, max_anisotropy); + + trace_dump_struct_end(); +} + + +void trace_dump_surface(const 
struct pipe_surface *state) +{ + if(!state) { + trace_dump_null(); + return; + } + + trace_dump_struct_begin("pipe_surface"); + + trace_dump_member(ptr, state, buffer); + trace_dump_member(format, state, format); + trace_dump_member(uint, state, status); + trace_dump_member(uint, state, clear_value); + trace_dump_member(uint, state, width); + trace_dump_member(uint, state, height); + + trace_dump_member_begin("block"); + trace_dump_block(&state->block); + trace_dump_member_end(); + + trace_dump_member(uint, state, nblocksx); + trace_dump_member(uint, state, nblocksy); + trace_dump_member(uint, state, stride); + trace_dump_member(uint, state, layout); + trace_dump_member(uint, state, offset); + trace_dump_member(uint, state, refcount); + trace_dump_member(uint, state, usage); + + trace_dump_member(ptr, state, texture); + trace_dump_member(uint, state, face); + trace_dump_member(uint, state, level); + trace_dump_member(uint, state, zslice); + + trace_dump_struct_end(); +} + + +void trace_dump_vertex_buffer(const struct pipe_vertex_buffer *state) +{ + if(!state) { + trace_dump_null(); + return; + } + + trace_dump_struct_begin("pipe_vertex_buffer"); + + trace_dump_member(uint, state, pitch); + trace_dump_member(uint, state, max_index); + trace_dump_member(uint, state, buffer_offset); + trace_dump_member(ptr, state, buffer); + + trace_dump_struct_end(); +} + + +void trace_dump_vertex_element(const struct pipe_vertex_element *state) +{ + if(!state) { + trace_dump_null(); + return; + } + + trace_dump_struct_begin("pipe_vertex_element"); + + trace_dump_member(uint, state, src_offset); + + trace_dump_member(uint, state, vertex_buffer_index); + trace_dump_member(uint, state, nr_components); + + trace_dump_member(format, state, src_format); + + trace_dump_struct_end(); +} diff --git a/src/gallium/drivers/trace/tr_state.h b/src/gallium/drivers/trace/tr_state.h new file mode 100644 index 0000000000..5ae533dc66 --- /dev/null +++ b/src/gallium/drivers/trace/tr_state.h @@ -0,0 +1,76 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef TR_STATE_H +#define TR_STATE_H + +#include "pipe/p_format.h" +#include "pipe/p_state.h" +#include "pipe/p_shader_tokens.h" + + +void trace_dump_format(enum pipe_format format); + +void trace_dump_block(const struct pipe_format_block *block); + +void trace_dump_template(const struct pipe_texture *templat); + + +void trace_dump_rasterizer_state(const struct pipe_rasterizer_state *state); + +void trace_dump_poly_stipple(const struct pipe_poly_stipple *state); + +void trace_dump_viewport_state(const struct pipe_viewport_state *state); + +void trace_dump_scissor_state(const struct pipe_scissor_state *state); + +void trace_dump_clip_state(const struct pipe_clip_state *state); + +void trace_dump_constant_buffer(const struct pipe_constant_buffer *state); + +void trace_dump_token(const struct tgsi_token *token); + +void trace_dump_shader_state(const struct pipe_shader_state *state); + +void trace_dump_depth_stencil_alpha_state(const struct pipe_depth_stencil_alpha_state *state); + +void trace_dump_blend_state(const struct pipe_blend_state *state); + +void trace_dump_blend_color(const struct pipe_blend_color *state); + +void trace_dump_framebuffer_state(const struct pipe_framebuffer_state *state); + +void trace_dump_sampler_state(const struct pipe_sampler_state *state); + +void trace_dump_surface(const struct pipe_surface *state); + +void trace_dump_vertex_buffer(const struct pipe_vertex_buffer *state); + +void trace_dump_vertex_element(const struct pipe_vertex_element *state); + + +#endif /* TR_STATE_H */ diff --git a/src/gallium/drivers/trace/tr_texture.c b/src/gallium/drivers/trace/tr_texture.c new file mode 100644 index 0000000000..440a78704a --- /dev/null +++ b/src/gallium/drivers/trace/tr_texture.c @@ -0,0 +1,112 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "pipe/p_inlines.h" +#include "util/u_hash_table.h" +#include "util/u_memory.h" + +#include "tr_screen.h" +#include "tr_texture.h" + + +struct pipe_texture * +trace_texture_create(struct trace_screen *tr_scr, + struct pipe_texture *texture) +{ + struct trace_texture *tr_tex; + + if(!texture) + goto error; + + assert(texture->screen == tr_scr->screen); + + tr_tex = CALLOC_STRUCT(trace_texture); + if(!tr_tex) + goto error; + + memcpy(&tr_tex->base, texture, sizeof(struct pipe_texture)); + tr_tex->base.screen = &tr_scr->base; + tr_tex->texture = texture; + + return &tr_tex->base; + +error: + pipe_texture_reference(&texture, NULL); + return NULL; +} + + +void +trace_texture_destroy(struct trace_screen *tr_scr, + struct pipe_texture *texture) +{ + struct trace_texture *tr_tex = trace_texture(tr_scr, texture); + pipe_texture_reference(&tr_tex->texture, NULL); + FREE(tr_tex); +} + + +struct pipe_surface * +trace_surface_create(struct trace_texture *tr_tex, + struct pipe_surface *surface) +{ + struct trace_surface *tr_surf; + + if(!surface) + goto error; + + assert(surface->texture == tr_tex->texture); + + tr_surf = CALLOC_STRUCT(trace_surface); + if(!tr_surf) + goto error; + + memcpy(&tr_surf->base, surface, sizeof(struct pipe_surface)); + + tr_surf->base.winsys = tr_tex->base.screen->winsys; + tr_surf->base.texture = NULL; + pipe_texture_reference(&tr_surf->base.texture, &tr_tex->base); + tr_surf->surface = surface; + + return &tr_surf->base; + +error: + pipe_surface_reference(&surface, NULL); + return NULL; +} + + +void +trace_surface_destroy(struct trace_texture *tr_tex, + struct pipe_surface *surface) +{ + struct trace_surface *tr_surf = trace_surface(tr_tex, surface); + pipe_texture_reference(&tr_surf->base.texture, NULL); + pipe_surface_reference(&tr_surf->surface, NULL); + FREE(tr_surf); +} + diff --git a/src/gallium/drivers/trace/tr_texture.h b/src/gallium/drivers/trace/tr_texture.h new file mode 100644 index 0000000000..9e72edb8a3 --- /dev/null +++ b/src/gallium/drivers/trace/tr_texture.h @@ -0,0 +1,95 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef TR_TEXTURE_H_ +#define TR_TEXTURE_H_ + + +#include "pipe/p_compiler.h" +#include "pipe/p_state.h" + +#include "tr_screen.h" + + +struct trace_texture +{ + struct pipe_texture base; + + struct pipe_texture *texture; +}; + + +struct trace_surface +{ + struct pipe_surface base; + + struct pipe_surface *surface; + + void *map; +}; + + +static INLINE struct trace_texture * +trace_texture(struct trace_screen *tr_scr, + struct pipe_texture *texture) +{ + if(!texture) + return NULL; + assert(texture->screen == &tr_scr->base); + return (struct trace_texture *)texture; +} + + +static INLINE struct trace_surface * +trace_surface(struct trace_texture *tr_tex, + struct pipe_surface *surface) +{ + if(!surface) + return NULL; + assert(surface->texture == &tr_tex->base); + return (struct trace_surface *)surface; +} + + +struct pipe_texture * +trace_texture_create(struct trace_screen *tr_scr, + struct pipe_texture *texture); + +void +trace_texture_destroy(struct trace_screen *tr_scr, + struct pipe_texture *texture); + +struct pipe_surface * +trace_surface_create(struct trace_texture *tr_tex, + struct pipe_surface *surface); + +void +trace_surface_destroy(struct trace_texture *tr_tex, + struct pipe_surface *surface); + + +#endif /* TR_TEXTURE_H_ */ diff --git a/src/gallium/drivers/trace/tr_winsys.c b/src/gallium/drivers/trace/tr_winsys.c new file mode 100644 index 0000000000..177835854e --- /dev/null +++ b/src/gallium/drivers/trace/tr_winsys.c @@ -0,0 +1,497 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "util/u_memory.h" +#include "util/u_hash_table.h" + +#include "tr_dump.h" +#include "tr_state.h" +#include "tr_screen.h" +#include "tr_texture.h" +#include "tr_winsys.h" + + +static unsigned trace_buffer_hash(void *buffer) +{ + return (unsigned)(uintptr_t)buffer; +} + + +static int trace_buffer_compare(void *buffer1, void *buffer2) +{ + return (char *)buffer2 - (char *)buffer1; +} + + +static const char * +trace_winsys_get_name(struct pipe_winsys *_winsys) +{ + struct trace_winsys *tr_ws = trace_winsys(_winsys); + struct pipe_winsys *winsys = tr_ws->winsys; + const char *result; + + trace_dump_call_begin("pipe_winsys", "get_name"); + + trace_dump_arg(ptr, winsys); + + result = winsys->get_name(winsys); + + trace_dump_ret(string, result); + + trace_dump_call_end(); + + return result; +} + + +static void +trace_winsys_flush_frontbuffer(struct pipe_winsys *_winsys, + struct pipe_surface *surface, + void *context_private) +{ + struct trace_winsys *tr_ws = trace_winsys(_winsys); + struct pipe_winsys *winsys = tr_ws->winsys; + + assert(surface); + if(surface->texture) { + struct trace_screen *tr_scr = trace_screen(surface->texture->screen); + struct trace_texture *tr_tex = trace_texture(tr_scr, surface->texture); + struct trace_surface *tr_surf = trace_surface(tr_tex, surface); + surface = tr_surf->surface; + } + + trace_dump_call_begin("pipe_winsys", "flush_frontbuffer"); + + trace_dump_arg(ptr, winsys); + trace_dump_arg(ptr, surface); + /* XXX: hide, as there is nothing we can do with this + trace_dump_arg(ptr, context_private); + */ + + winsys->flush_frontbuffer(winsys, surface, context_private); + + trace_dump_call_end(); +} + + +static struct pipe_surface * +trace_winsys_surface_alloc(struct pipe_winsys *_winsys) +{ + struct trace_winsys *tr_ws = trace_winsys(_winsys); + struct pipe_winsys *winsys = tr_ws->winsys; + struct pipe_surface *result; + + trace_dump_call_begin("pipe_winsys", "surface_alloc"); + + trace_dump_arg(ptr, winsys); + + result = winsys->surface_alloc(winsys); + + trace_dump_ret(ptr, result); + + trace_dump_call_end(); + + assert(!result || !result->texture); + + return result; +} + + +static int +trace_winsys_surface_alloc_storage(struct pipe_winsys *_winsys, + struct pipe_surface *surface, + unsigned width, unsigned height, + enum pipe_format format, + unsigned flags, + unsigned tex_usage) +{ + struct trace_winsys *tr_ws = trace_winsys(_winsys); + struct pipe_winsys *winsys = tr_ws->winsys; + int result; + + assert(surface && !surface->texture); + + trace_dump_call_begin("pipe_winsys", "surface_alloc_storage"); + + trace_dump_arg(ptr, winsys); + trace_dump_arg(ptr, surface); + trace_dump_arg(uint, width); + trace_dump_arg(uint, height); + trace_dump_arg(format, format); + trace_dump_arg(uint, flags); + trace_dump_arg(uint, tex_usage); + + result = winsys->surface_alloc_storage(winsys, + surface, + width, height, + format, + flags, + tex_usage); + + trace_dump_ret(int, result); + + trace_dump_call_end(); + + return result; +} + + +static void +trace_winsys_surface_release(struct pipe_winsys *_winsys, + struct pipe_surface **psurface) +{ + struct trace_winsys *tr_ws = trace_winsys(_winsys); + struct pipe_winsys *winsys = tr_ws->winsys; + struct pipe_surface *surface = *psurface; + + assert(psurface && *psurface && !(*psurface)->texture); + + trace_dump_call_begin("pipe_winsys", "surface_release"); + + trace_dump_arg(ptr, winsys); + trace_dump_arg(ptr, surface); + + 
winsys->surface_release(winsys, psurface); + + trace_dump_call_end(); +} + + +static struct pipe_buffer * +trace_winsys_buffer_create(struct pipe_winsys *_winsys, + unsigned alignment, + unsigned usage, + unsigned size) +{ + struct trace_winsys *tr_ws = trace_winsys(_winsys); + struct pipe_winsys *winsys = tr_ws->winsys; + struct pipe_buffer *buffer; + + trace_dump_call_begin("pipe_winsys", "buffer_create"); + + trace_dump_arg(ptr, winsys); + trace_dump_arg(uint, alignment); + trace_dump_arg(uint, usage); + trace_dump_arg(uint, size); + + buffer = winsys->buffer_create(winsys, alignment, usage, size); + + trace_dump_ret(ptr, buffer); + + trace_dump_call_end(); + + /* Zero the buffer to avoid dumping uninitialized memory */ + if(buffer->usage & PIPE_BUFFER_USAGE_CPU_WRITE) { + void *map; + map = winsys->buffer_map(winsys, buffer, PIPE_BUFFER_USAGE_CPU_WRITE); + if(map) { + memset(map, 0, buffer->size); + winsys->buffer_unmap(winsys, buffer); + } + } + + return buffer; +} + + +static struct pipe_buffer * +trace_winsys_user_buffer_create(struct pipe_winsys *_winsys, + void *data, + unsigned size) +{ + struct trace_winsys *tr_ws = trace_winsys(_winsys); + struct pipe_winsys *winsys = tr_ws->winsys; + struct pipe_buffer *result; + + trace_dump_call_begin("pipe_winsys", "user_buffer_create"); + + trace_dump_arg(ptr, winsys); + trace_dump_arg_begin("data"); + trace_dump_bytes(data, size); + trace_dump_arg_end(); + trace_dump_arg(uint, size); + + result = winsys->user_buffer_create(winsys, data, size); + + trace_dump_ret(ptr, result); + + trace_dump_call_end(); + + /* XXX: Mark the user buffers. (we should wrap pipe_buffers, but is is + * impossible to do so while texture-less surfaces are still around */ + if(result) { + assert(!(result->usage & TRACE_BUFFER_USAGE_USER)); + result->usage |= TRACE_BUFFER_USAGE_USER; + } + + return result; +} + + +void +trace_winsys_user_buffer_update(struct pipe_winsys *_winsys, + struct pipe_buffer *buffer) +{ + struct trace_winsys *tr_ws = trace_winsys(_winsys); + struct pipe_winsys *winsys = tr_ws->winsys; + const void *map; + + if(buffer && buffer->usage & TRACE_BUFFER_USAGE_USER) { + map = winsys->buffer_map(winsys, buffer, PIPE_BUFFER_USAGE_CPU_READ); + if(map) { + trace_dump_call_begin("pipe_winsys", "buffer_write"); + + trace_dump_arg(ptr, winsys); + + trace_dump_arg(ptr, buffer); + + trace_dump_arg_begin("data"); + trace_dump_bytes(map, buffer->size); + trace_dump_arg_end(); + + trace_dump_arg_begin("size"); + trace_dump_uint(buffer->size); + trace_dump_arg_end(); + + trace_dump_call_end(); + + winsys->buffer_unmap(winsys, buffer); + } + } +} + + +static void * +trace_winsys_buffer_map(struct pipe_winsys *_winsys, + struct pipe_buffer *buffer, + unsigned usage) +{ + struct trace_winsys *tr_ws = trace_winsys(_winsys); + struct pipe_winsys *winsys = tr_ws->winsys; + void *map; + + map = winsys->buffer_map(winsys, buffer, usage); + if(map) { + if(usage & PIPE_BUFFER_USAGE_CPU_WRITE) { + assert(!hash_table_get(tr_ws->buffer_maps, buffer)); + hash_table_set(tr_ws->buffer_maps, buffer, map); + } + } + + return map; +} + + +static void +trace_winsys_buffer_unmap(struct pipe_winsys *_winsys, + struct pipe_buffer *buffer) +{ + struct trace_winsys *tr_ws = trace_winsys(_winsys); + struct pipe_winsys *winsys = tr_ws->winsys; + const void *map; + + map = hash_table_get(tr_ws->buffer_maps, buffer); + if(map) { + trace_dump_call_begin("pipe_winsys", "buffer_write"); + + trace_dump_arg(ptr, winsys); + + trace_dump_arg(ptr, buffer); + + trace_dump_arg_begin("data"); + 
trace_dump_bytes(map, buffer->size); + trace_dump_arg_end(); + + trace_dump_arg_begin("size"); + trace_dump_uint(buffer->size); + trace_dump_arg_end(); + + trace_dump_call_end(); + + hash_table_remove(tr_ws->buffer_maps, buffer); + } + + winsys->buffer_unmap(winsys, buffer); +} + + +static void +trace_winsys_buffer_destroy(struct pipe_winsys *_winsys, + struct pipe_buffer *buffer) +{ + struct trace_winsys *tr_ws = trace_winsys(_winsys); + struct pipe_winsys *winsys = tr_ws->winsys; + + trace_dump_call_begin("pipe_winsys", "buffer_destroy"); + + trace_dump_arg(ptr, winsys); + trace_dump_arg(ptr, buffer); + + winsys->buffer_destroy(winsys, buffer); + + trace_dump_call_end(); +} + + +static void +trace_winsys_fence_reference(struct pipe_winsys *_winsys, + struct pipe_fence_handle **pdst, + struct pipe_fence_handle *src) +{ + struct trace_winsys *tr_ws = trace_winsys(_winsys); + struct pipe_winsys *winsys = tr_ws->winsys; + struct pipe_fence_handle *dst = *pdst; + + trace_dump_call_begin("pipe_winsys", "fence_reference"); + + trace_dump_arg(ptr, winsys); + trace_dump_arg(ptr, dst); + trace_dump_arg(ptr, src); + + winsys->fence_reference(winsys, pdst, src); + + trace_dump_call_end(); +} + + +static int +trace_winsys_fence_signalled(struct pipe_winsys *_winsys, + struct pipe_fence_handle *fence, + unsigned flag) +{ + struct trace_winsys *tr_ws = trace_winsys(_winsys); + struct pipe_winsys *winsys = tr_ws->winsys; + int result; + + trace_dump_call_begin("pipe_winsys", "fence_signalled"); + + trace_dump_arg(ptr, winsys); + trace_dump_arg(ptr, fence); + trace_dump_arg(uint, flag); + + result = winsys->fence_signalled(winsys, fence, flag); + + trace_dump_ret(int, result); + + trace_dump_call_end(); + + return result; +} + + +static int +trace_winsys_fence_finish(struct pipe_winsys *_winsys, + struct pipe_fence_handle *fence, + unsigned flag) +{ + struct trace_winsys *tr_ws = trace_winsys(_winsys); + struct pipe_winsys *winsys = tr_ws->winsys; + int result; + + trace_dump_call_begin("pipe_winsys", "fence_finish"); + + trace_dump_arg(ptr, winsys); + trace_dump_arg(ptr, fence); + trace_dump_arg(uint, flag); + + result = winsys->fence_finish(winsys, fence, flag); + + trace_dump_ret(int, result); + + trace_dump_call_end(); + + return result; +} + + +static void +trace_winsys_destroy(struct pipe_winsys *_winsys) +{ + struct trace_winsys *tr_ws = trace_winsys(_winsys); + struct pipe_winsys *winsys = tr_ws->winsys; + + trace_dump_call_begin("pipe_winsys", "destroy"); + + trace_dump_arg(ptr, winsys); + + /* + winsys->destroy(winsys); + */ + + trace_dump_call_end(); + + hash_table_destroy(tr_ws->buffer_maps); + + FREE(tr_ws); +} + + +struct pipe_winsys * +trace_winsys_create(struct pipe_winsys *winsys) +{ + struct trace_winsys *tr_ws; + + if(!winsys) + goto error1; + + tr_ws = CALLOC_STRUCT(trace_winsys); + if(!tr_ws) + goto error1; + + tr_ws->base.destroy = trace_winsys_destroy; + tr_ws->base.get_name = trace_winsys_get_name; + tr_ws->base.flush_frontbuffer = trace_winsys_flush_frontbuffer; + tr_ws->base.surface_alloc = trace_winsys_surface_alloc; + tr_ws->base.surface_alloc_storage = trace_winsys_surface_alloc_storage; + tr_ws->base.surface_release = trace_winsys_surface_release; + tr_ws->base.buffer_create = trace_winsys_buffer_create; + tr_ws->base.user_buffer_create = trace_winsys_user_buffer_create; + tr_ws->base.buffer_map = trace_winsys_buffer_map; + tr_ws->base.buffer_unmap = trace_winsys_buffer_unmap; + tr_ws->base.buffer_destroy = trace_winsys_buffer_destroy; + tr_ws->base.fence_reference = 
trace_winsys_fence_reference; + tr_ws->base.fence_signalled = trace_winsys_fence_signalled; + tr_ws->base.fence_finish = trace_winsys_fence_finish; + + tr_ws->winsys = winsys; + + tr_ws->buffer_maps = hash_table_create(trace_buffer_hash, + trace_buffer_compare); + if(!tr_ws->buffer_maps) + goto error2; + + trace_dump_call_begin("", "pipe_winsys_create"); + trace_dump_ret(ptr, winsys); + trace_dump_call_end(); + + return &tr_ws->base; + +error2: + FREE(tr_ws); +error1: + return winsys; +} diff --git a/src/gallium/drivers/trace/tr_winsys.h b/src/gallium/drivers/trace/tr_winsys.h new file mode 100644 index 0000000000..062ddf66a0 --- /dev/null +++ b/src/gallium/drivers/trace/tr_winsys.h @@ -0,0 +1,76 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TR_WINSYS_H_ +#define TR_WINSYS_H_ + + +#include "pipe/p_compiler.h" +#include "pipe/p_debug.h" +#include "pipe/p_winsys.h" + + +/** + * It often happens that new data is written directly to the user buffers + * without mapping/unmapping. This flag marks user buffers, so that their + * contents can be dumpped before being used by the pipe context. + */ +#define TRACE_BUFFER_USAGE_USER (1 << 31) + + +struct hash_table; + + +struct trace_winsys +{ + struct pipe_winsys base; + + struct pipe_winsys *winsys; + + struct hash_table *buffer_maps; +}; + + +static INLINE struct trace_winsys * +trace_winsys(struct pipe_winsys *winsys) +{ + assert(winsys); + return (struct trace_winsys *)winsys; +} + + + +struct pipe_winsys * +trace_winsys_create(struct pipe_winsys *winsys); + + +void +trace_winsys_user_buffer_update(struct pipe_winsys *winsys, + struct pipe_buffer *buffer); + + +#endif /* TR_WINSYS_H_ */ diff --git a/src/gallium/drivers/trace/trace.xsl b/src/gallium/drivers/trace/trace.xsl new file mode 100644 index 0000000000..9cd621e7ab --- /dev/null +++ b/src/gallium/drivers/trace/trace.xsl @@ -0,0 +1,185 @@ +<?xml version="1.0"?> + +<!-- + +Copyright 2008 Tungsten Graphics, Inc. + +This program is free software: you can redistribute it and/or modify it +under the terms of the GNU Lesser General Public License as published +by the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. 
+ +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. + +!--> + +<xsl:transform version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"> + + <xsl:output method="html" /> + + <xsl:strip-space elements="*" /> + + <xsl:template match="/trace"> + <html> + <head> + <title>Gallium Trace</title> + </head> + <style> + body { + font-family: verdana, sans-serif; + font-size: 11px; + font-weight: normal; + text-align : left; + } + + .fun { + font-weight: bold; + } + + .var { + font-style: italic; + } + + .typ { + display: none; + } + + .lit { + color: #0000ff; + } + + .ptr { + color: #008000; + } + </style> + <body> + <ol class="calls"> + <xsl:apply-templates/> + </ol> + </body> + </html> + </xsl:template> + + <xsl:template match="call"> + <li> + <span class="fun"> + <xsl:value-of select="@class"/> + <xsl:text>::</xsl:text> + <xsl:value-of select="@method"/> + </span> + <xsl:text>(</xsl:text> + <xsl:apply-templates select="arg"/> + <xsl:text>)</xsl:text> + <xsl:apply-templates select="ret"/> + </li> + </xsl:template> + + <xsl:template match="arg|member"> + <xsl:apply-templates select="@name"/> + <xsl:text> = </xsl:text> + <xsl:apply-templates /> + <xsl:if test="position() != last()"> + <xsl:text>, </xsl:text> + </xsl:if> + </xsl:template> + + <xsl:template match="ret"> + <xsl:text> = </xsl:text> + <xsl:apply-templates /> + </xsl:template> + + <xsl:template match="bool|int|uint|float|enum"> + <span class="lit"> + <xsl:value-of select="text()"/> + </span> + </xsl:template> + + <xsl:template match="bytes"> + <span class="lit"> + <xsl:text>...</xsl:text> + </span> + </xsl:template> + + <xsl:template match="string"> + <span class="lit"> + <xsl:text>"</xsl:text> + <xsl:call-template name="break"> + <xsl:with-param name="text" select="text()"/> + </xsl:call-template> + <xsl:text>"</xsl:text> + </span> + </xsl:template> + + <xsl:template match="array|struct"> + <xsl:text>{</xsl:text> + <xsl:apply-templates /> + <xsl:text>}</xsl:text> + </xsl:template> + + <xsl:template match="elem"> + <xsl:apply-templates /> + <xsl:if test="position() != last()"> + <xsl:text>, </xsl:text> + </xsl:if> + </xsl:template> + + <xsl:template match="null"> + <span class="ptr"> + <xsl:text>NULL</xsl:text> + </span> + </xsl:template> + + <xsl:template match="ptr"> + <span class="ptr"> + <xsl:value-of select="text()"/> + </span> + </xsl:template> + + <xsl:template match="@name"> + <span class="var"> + <xsl:value-of select="."/> + </span> + </xsl:template> + + <xsl:template name="break"> + <xsl:param name="text" select="."/> + <xsl:choose> + <xsl:when test="contains($text, '
')"> + <xsl:value-of select="substring-before($text, '
')"/> + <br/> + <xsl:call-template name="break"> + <xsl:with-param name="text" select="substring-after($text, '
')"/> + </xsl:call-template> + </xsl:when> + <xsl:otherwise> + <xsl:value-of select="$text"/> + </xsl:otherwise> + </xsl:choose> + </xsl:template> + + <xsl:template name="replace"> + <xsl:param name="text"/> + <xsl:param name="from"/> + <xsl:param name="to"/> + <xsl:choose> + <xsl:when test="contains($text,$from)"> + <xsl:value-of select="concat(substring-before($text,$from),$to)"/> + <xsl:call-template name="replace"> + <xsl:with-param name="text" select="substring-after($text,$from)"/> + <xsl:with-param name="from" select="$from"/> + <xsl:with-param name="to" select="$to"/> + </xsl:call-template> + </xsl:when> + <xsl:otherwise> + <xsl:value-of select="$text"/> + </xsl:otherwise> + </xsl:choose> + </xsl:template> + +</xsl:transform> diff --git a/src/gallium/include/pipe/p_compiler.h b/src/gallium/include/pipe/p_compiler.h new file mode 100644 index 0000000000..7bcebd3d6b --- /dev/null +++ b/src/gallium/include/pipe/p_compiler.h @@ -0,0 +1,157 @@ +/************************************************************************** + * + * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef P_COMPILER_H +#define P_COMPILER_H + + +#include "p_config.h" + +#ifndef XFree86Server +#include <stdlib.h> +#include <string.h> +#else +#include "xf86_ansic.h" +#include "xf86_libc.h" +#endif + + +#if defined(_WIN32) && !defined(__WIN32__) +#define __WIN32__ +#endif + +#if defined(_MSC_VER) + +/* Avoid 'expression is always true' warning */ +#pragma warning(disable: 4296) + +#endif /* _MSC_VER */ + + +#if defined(_MSC_VER) + +typedef __int8 int8_t; +typedef unsigned __int8 uint8_t; +typedef __int16 int16_t; +typedef unsigned __int16 uint16_t; +#ifndef __eglplatform_h_ +typedef __int32 int32_t; +#endif +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; + +#if defined(_WIN64) +typedef __int64 intptr_t; +typedef unsigned __int64 uintptr_t; +#else +typedef __int32 intptr_t; +typedef unsigned __int32 uintptr_t; +#endif + +#define INT64_C(__val) __val##i64 +#define UINT64_C(__val) __val##ui64 + +#ifndef __cplusplus +#define false 0 +#define true 1 +#define bool _Bool +typedef int _Bool; +#define __bool_true_false_are_defined 1 +#endif /* !__cplusplus */ + +#else +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif +#include <stdint.h> +#include <stdbool.h> +#endif + + +typedef unsigned int uint; +typedef unsigned char ubyte; +typedef unsigned short ushort; +typedef uint64_t uint64; + +#if 0 +#define boolean bool +#else +typedef unsigned char boolean; +#endif +#ifndef TRUE +#define TRUE true +#endif +#ifndef FALSE +#define FALSE false +#endif + + +/* Function inlining */ +#ifdef __cplusplus +# define INLINE inline +#elif defined(__GNUC__) +# define INLINE __inline__ +#elif defined(_MSC_VER) +# define INLINE __inline +#elif defined(__ICL) +# define INLINE __inline +#elif defined(__INTEL_COMPILER) +# define INLINE inline +#elif defined(__WATCOMC__) && (__WATCOMC__ >= 1100) +# define INLINE __inline +#else +# define INLINE +#endif + + +/* This should match linux gcc cdecl semantics everywhere, so that we + * just codegen one calling convention on all platforms. + */ +#ifdef _MSC_VER +#define PIPE_CDECL __cdecl +#else +#define PIPE_CDECL +#endif + + + +#if defined(__GNUC__) +#define ALIGN16_DECL(TYPE, NAME, SIZE) TYPE NAME##___aligned[SIZE] __attribute__(( aligned( 16 ) )) +#define ALIGN16_ASSIGN(NAME) NAME##___aligned +#define ALIGN16_ATTRIB __attribute__(( aligned( 16 ) )) +#define ALIGN8_ATTRIB __attribute__(( aligned( 8 ) )) +#else +#define ALIGN16_DECL(TYPE, NAME, SIZE) TYPE NAME##___unaligned[SIZE + 1] +#define ALIGN16_ASSIGN(NAME) align16(NAME##___unaligned) +#define ALIGN16_ATTRIB +#define ALIGN8_ATTRIB +#endif + + + +#endif /* P_COMPILER_H */ diff --git a/src/gallium/include/pipe/p_config.h b/src/gallium/include/pipe/p_config.h new file mode 100644 index 0000000000..05cbd2fc4d --- /dev/null +++ b/src/gallium/include/pipe/p_config.h @@ -0,0 +1,148 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Gallium configuration defines. + * + * This header file sets several defines based on the compiler, processor + * architecture, and operating system being used. These defines should be used + * throughout the code to facilitate porting to new platforms. It is likely that + * this file is auto-generated by an autoconf-like tool at some point, as some + * things cannot be determined by pre-defined environment alone. + * + * See also: + * - http://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html + * - echo | gcc -dM -E - | sort + * - http://msdn.microsoft.com/en-us/library/b0084kay.aspx + * + * @author José Fonseca <jrfonseca@tungstengraphics.com> + */ + +#ifndef P_CONFIG_H_ +#define P_CONFIG_H_ + + +/* + * Compiler + */ + +#if defined(__GNUC__) +#define PIPE_CC_GCC +#endif + +/* + * Meaning of _MSC_VER value: + * - 1400: Visual C++ 2005 + * - 1310: Visual C++ .NET 2003 + * - 1300: Visual C++ .NET 2002 + * + * __MSC__ seems to be an old macro -- it is not pre-defined on recent MSVC + * versions. + */ +#if defined(_MSC_VER) || defined(__MSC__) +#define PIPE_CC_MSVC +#endif + +#if defined(__ICL) +#define PIPE_CC_ICL +#endif + + +/* + * Processor architecture + */ + +#if defined(__i386__) /* gcc */ || defined(_M_IX86) /* msvc */ || defined(_X86_) || defined(__386__) || defined(i386) +#define PIPE_ARCH_X86 +#endif + +#if defined(__x86_64__) /* gcc */ || defined(_M_X64) /* msvc */ || defined(_M_AMD64) /* msvc */ +#define PIPE_ARCH_X86_64 +#endif + +#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64) +#if defined(PIPE_CC_GCC) && !defined(__SSE2__) +/* #warning SSE2 support requires -msse -msse2 compiler options */ +#else +#define PIPE_ARCH_SSE +#endif +#endif + +#if defined(__PPC__) +#define PIPE_ARCH_PPC +#if defined(__PPC64__) +#define PIPE_ARCH_PPC_64 +#endif +#endif + + +/* + * Operating system family. + * + * See subsystem below for a more fine-grained distinction. + */ + +#if defined(__linux__) +#define PIPE_OS_LINUX +#endif + +#if defined(_WIN32) || defined(WIN32) +#define PIPE_OS_WINDOWS +#endif + + +/* + * Subsystem. + * + * NOTE: There is no way to auto-detect most of these. 
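+ *
+ * A minimal sketch of what that note implies (assumed usage, not required
+ * by this header): a Windows user-space build has to name its subsystem
+ * explicitly on the compiler command line, for example
+ *
+ *   cl /DPIPE_SUBSYSTEM_WINDOWS_USER ...
+ *
+ * otherwise a non-CE Windows build falls through to the #error below.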
+ */ + +#if defined(PIPE_OS_LINUX) +#define PIPE_SUBSYSTEM_DRI +#endif /* PIPE_OS_LINUX */ + +#if defined(PIPE_OS_WINDOWS) +#if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) +/* Windows 2000/XP Display Driver */ +#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT) +/* Windows 2000/XP Miniport Driver */ +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) +/* Windows User-space Library */ +#elif defined(PIPE_SUBSYSTEM_WINDOWS_CE) +/* Windows CE 5.0/6.0 */ +#else +#ifdef _WIN32_WCE +#define PIPE_SUBSYSTEM_WINDOWS_CE +#else /* !_WIN32_WCE */ +#error No PIPE_SUBSYSTEM_WINDOWS_xxx subsystem defined. +#endif /* !_WIN32_WCE */ +#endif +#endif /* PIPE_OS_WINDOWS */ + + +#endif /* P_CONFIG_H_ */ diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h new file mode 100644 index 0000000000..2646706ff2 --- /dev/null +++ b/src/gallium/include/pipe/p_context.h @@ -0,0 +1,226 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef PIPE_CONTEXT_H +#define PIPE_CONTEXT_H + +#include "p_state.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +struct pipe_screen; +struct pipe_fence_handle; +struct pipe_state_cache; +struct pipe_query; + + +/** + * Gallium rendering context. Basically: + * - state setting functions + * - VBO drawing functions + * - surface functions + */ +struct pipe_context { + struct pipe_winsys *winsys; + struct pipe_screen *screen; + + void *priv; /**< context private data (for DRI for example) */ + void *draw; /**< private, for draw module (temporary?) */ + + void (*destroy)( struct pipe_context * ); + + + /* Possible interface for setting edgeflags. These aren't really + * vertex elements, so don't fit there. 
+ */ + void (*set_edgeflags)( struct pipe_context *, + const unsigned *bitfield ); + + + /** + * VBO drawing (return false on fallbacks (temporary??)) + */ + /*@{*/ + boolean (*draw_arrays)( struct pipe_context *pipe, + unsigned mode, unsigned start, unsigned count); + + boolean (*draw_elements)( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned mode, unsigned start, unsigned count); + + /* XXX: this is (probably) a temporary entrypoint, as the range + * information should be available from the vertex_buffer state. + * Using this to quickly evaluate a specialized path in the draw + * module. + */ + boolean (*draw_range_elements)( struct pipe_context *pipe, + struct pipe_buffer *indexBuffer, + unsigned indexSize, + unsigned minIndex, + unsigned maxIndex, + unsigned mode, + unsigned start, + unsigned count); + /*@}*/ + + + /** + * Query objects + */ + /*@{*/ + struct pipe_query *(*create_query)( struct pipe_context *pipe, + unsigned query_type ); + + void (*destroy_query)(struct pipe_context *pipe, + struct pipe_query *q); + + void (*begin_query)(struct pipe_context *pipe, struct pipe_query *q); + void (*end_query)(struct pipe_context *pipe, struct pipe_query *q); + + boolean (*get_query_result)(struct pipe_context *pipe, + struct pipe_query *q, + boolean wait, + uint64 *result); + /*@}*/ + + /** + * State functions (create/bind/destroy state objects) + */ + /*@{*/ + void * (*create_blend_state)(struct pipe_context *, + const struct pipe_blend_state *); + void (*bind_blend_state)(struct pipe_context *, void *); + void (*delete_blend_state)(struct pipe_context *, void *); + + void * (*create_sampler_state)(struct pipe_context *, + const struct pipe_sampler_state *); + void (*bind_sampler_states)(struct pipe_context *, unsigned num, void **); + void (*delete_sampler_state)(struct pipe_context *, void *); + + void * (*create_rasterizer_state)(struct pipe_context *, + const struct pipe_rasterizer_state *); + void (*bind_rasterizer_state)(struct pipe_context *, void *); + void (*delete_rasterizer_state)(struct pipe_context *, void *); + + void * (*create_depth_stencil_alpha_state)(struct pipe_context *, + const struct pipe_depth_stencil_alpha_state *); + void (*bind_depth_stencil_alpha_state)(struct pipe_context *, void *); + void (*delete_depth_stencil_alpha_state)(struct pipe_context *, void *); + + void * (*create_fs_state)(struct pipe_context *, + const struct pipe_shader_state *); + void (*bind_fs_state)(struct pipe_context *, void *); + void (*delete_fs_state)(struct pipe_context *, void *); + + void * (*create_vs_state)(struct pipe_context *, + const struct pipe_shader_state *); + void (*bind_vs_state)(struct pipe_context *, void *); + void (*delete_vs_state)(struct pipe_context *, void *); + /*@}*/ + + /** + * Parameter-like state (or properties) + */ + /*@{*/ + void (*set_blend_color)( struct pipe_context *, + const struct pipe_blend_color * ); + + void (*set_clip_state)( struct pipe_context *, + const struct pipe_clip_state * ); + + void (*set_constant_buffer)( struct pipe_context *, + uint shader, uint index, + const struct pipe_constant_buffer *buf ); + + void (*set_framebuffer_state)( struct pipe_context *, + const struct pipe_framebuffer_state * ); + + void (*set_polygon_stipple)( struct pipe_context *, + const struct pipe_poly_stipple * ); + + void (*set_scissor_state)( struct pipe_context *, + const struct pipe_scissor_state * ); + + void (*set_viewport_state)( struct pipe_context *, + const struct pipe_viewport_state * ); + + void 
(*set_sampler_textures)( struct pipe_context *, + unsigned num_textures, + struct pipe_texture ** ); + + void (*set_vertex_buffers)( struct pipe_context *, + unsigned num_buffers, + const struct pipe_vertex_buffer * ); + + void (*set_vertex_elements)( struct pipe_context *, + unsigned num_elements, + const struct pipe_vertex_element * ); + /*@}*/ + + + /** + * Surface functions + */ + /*@{*/ + void (*surface_copy)(struct pipe_context *pipe, + boolean do_flip,/**< flip surface contents vertically */ + struct pipe_surface *dest, + unsigned destx, unsigned desty, + struct pipe_surface *src, /* don't make this const - + need to map/unmap */ + unsigned srcx, unsigned srcy, + unsigned width, unsigned height); + + void (*surface_fill)(struct pipe_context *pipe, + struct pipe_surface *dst, + unsigned dstx, unsigned dsty, + unsigned width, unsigned height, + unsigned value); + + void (*clear)(struct pipe_context *pipe, + struct pipe_surface *ps, + unsigned clearValue); + /*@}*/ + + + /** Flush rendering (flags = bitmask of PIPE_FLUSH_x tokens) */ + void (*flush)( struct pipe_context *pipe, + unsigned flags, + struct pipe_fence_handle **fence ); +}; + + +#ifdef __cplusplus +} +#endif + +#endif /* PIPE_CONTEXT_H */ diff --git a/src/gallium/include/pipe/p_debug.h b/src/gallium/include/pipe/p_debug.h new file mode 100644 index 0000000000..3b00fb9aa8 --- /dev/null +++ b/src/gallium/include/pipe/p_debug.h @@ -0,0 +1,356 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Cross-platform debugging helpers. + * + * For now it just has assert and printf replacements, but it might be extended + * with stack trace reports and more advanced logging in the near future. + * + * @author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + +#ifndef P_DEBUG_H_ +#define P_DEBUG_H_ + + +#include <stdarg.h> + +#include "p_compiler.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +#if defined(DBG) || defined(DEBUG) +#ifndef DEBUG +#define DEBUG 1 +#endif +#else +#ifndef NDEBUG +#define NDEBUG 1 +#endif +#endif + + +/* MSVC bebore VC7 does not have the __FUNCTION__ macro */ +#if defined(_MSC_VER) && _MSC_VER < 1300 +#define __FUNCTION__ "???" 
+#endif + + +void _debug_vprintf(const char *format, va_list ap); + + +static INLINE void +_debug_printf(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + _debug_vprintf(format, ap); + va_end(ap); +} + + +/** + * Print debug messages. + * + * The actual channel used to output debug message is platform specific. To + * avoid misformating or truncation, follow these rules of thumb: + * - output whole lines + * - avoid outputing large strings (512 bytes is the current maximum length + * that is guaranteed to be printed in all platforms) + */ +static INLINE void +debug_printf(const char *format, ...) +{ +#ifdef DEBUG + va_list ap; + va_start(ap, format); + _debug_vprintf(format, ap); + va_end(ap); +#else + (void) format; /* silence warning */ +#endif +} + + +#ifdef DEBUG +#define debug_vprintf(_format, _ap) _debug_vprintf(_format, _ap) +#else +#define debug_vprintf(_format, _ap) ((void)0) +#endif + + +#ifdef DEBUG +/** + * Dump a blob in hex to the same place that debug_printf sends its + * messages. + */ +void debug_print_blob( const char *name, const void *blob, unsigned size ); + +/* Print a message along with a prettified format string + */ +void debug_print_format(const char *msg, unsigned fmt ); +#else +#define debug_print_blob(_name, _blob, _size) ((void)0) +#define debug_print_format(_msg, _fmt) ((void)0) +#endif + + +void _debug_break(void); + + +/** + * Hard-coded breakpoint. + */ +#ifdef DEBUG +#if defined(PIPE_ARCH_X86) && defined(PIPE_CC_GCC) +#define debug_break() __asm("int3") +#elif defined(PIPE_ARCH_X86) && defined(PIPE_CC_MSVC) +#define debug_break() do { _asm {int 3} } while(0) +#else +#define debug_break() _debug_break() +#endif +#else /* !DEBUG */ +#define debug_break() ((void)0) +#endif /* !DEBUG */ + + +long +debug_get_num_option(const char *name, long dfault); + +void _debug_assert_fail(const char *expr, + const char *file, + unsigned line, + const char *function); + + +/** + * Assert macro + * + * Do not expect that the assert call terminates -- errors must be handled + * regardless of assert behavior. + */ +#ifdef DEBUG +#define debug_assert(expr) ((expr) ? (void)0 : _debug_assert_fail(#expr, __FILE__, __LINE__, __FUNCTION__)) +#else +#define debug_assert(expr) ((void)0) +#endif + + +/** Override standard assert macro */ +#ifdef assert +#undef assert +#endif +#define assert(expr) debug_assert(expr) + + +/** + * Output the current function name. + */ +#ifdef DEBUG +#define debug_checkpoint() \ + _debug_printf("%s\n", __FUNCTION__) +#else +#define debug_checkpoint() \ + ((void)0) +#endif + + +/** + * Output the full source code position. + */ +#ifdef DEBUG +#define debug_checkpoint_full() \ + _debug_printf("%s:%u:%s", __FILE__, __LINE__, __FUNCTION__) +#else +#define debug_checkpoint_full() \ + ((void)0) +#endif + + +/** + * Output a warning message. Muted on release version. + */ +#ifdef DEBUG +#define debug_warning(__msg) \ + _debug_printf("%s:%u:%s: warning: %s\n", __FILE__, __LINE__, __FUNCTION__, __msg) +#else +#define debug_warning(__msg) \ + ((void)0) +#endif + + +/** + * Output an error message. Not muted on release version. + */ +#ifdef DEBUG +#define debug_error(__msg) \ + _debug_printf("%s:%u:%s: error: %s\n", __FILE__, __LINE__, __FUNCTION__, __msg) +#else +#define debug_error(__msg) \ + _debug_printf("error: %s\n", __msg) +#endif + + +/** + * Used by debug_dump_enum and debug_dump_flags to describe symbols. 
+ */ +struct debug_named_value +{ + const char *name; + unsigned long value; +}; + + +/** + * Some C pre-processor magic to simplify creating named values. + * + * Example: + * @code + * static const debug_named_value my_names[] = { + * DEBUG_NAMED_VALUE(MY_ENUM_VALUE_X), + * DEBUG_NAMED_VALUE(MY_ENUM_VALUE_Y), + * DEBUG_NAMED_VALUE(MY_ENUM_VALUE_Z), + * DEBUG_NAMED_VALUE_END + * }; + * + * ... + * debug_printf("%s = %s\n", + * name, + * debug_dump_enum(my_names, my_value)); + * ... + * @endcode + */ +#define DEBUG_NAMED_VALUE(__symbol) {#__symbol, (unsigned long)__symbol} +#define DEBUG_NAMED_VALUE_END {NULL, 0} + + +/** + * Convert a enum value to a string. + */ +const char * +debug_dump_enum(const struct debug_named_value *names, + unsigned long value); + + +/** + * Convert binary flags value to a string. + */ +const char * +debug_dump_flags(const struct debug_named_value *names, + unsigned long value); + + +/** + * Get option. + * + * It is an alias for getenv on Linux. + * + * On Windows it reads C:\gallium.cfg, which is a text file with CR+LF line + * endings with one option per line as + * + * NAME=value + * + * This file must be terminated with an extra empty line. + */ +const char * +debug_get_option(const char *name, const char *dfault); + +boolean +debug_get_bool_option(const char *name, boolean dfault); + +long +debug_get_num_option(const char *name, long dfault); + +unsigned long +debug_get_flags_option(const char *name, + const struct debug_named_value *flags, + unsigned long dfault); + + +void * +debug_malloc(const char *file, unsigned line, const char *function, + size_t size); + +void +debug_free(const char *file, unsigned line, const char *function, + void *ptr); + +void * +debug_calloc(const char *file, unsigned line, const char *function, + size_t count, size_t size ); + +void * +debug_realloc(const char *file, unsigned line, const char *function, + void *old_ptr, size_t old_size, size_t new_size ); + +unsigned long +debug_memory_begin(void); + +void +debug_memory_end(unsigned long beginning); + + +#if defined(PROFILE) && defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) + +void +debug_profile_start(void); + +void +debug_profile_stop(void); + +#endif + + +#ifdef DEBUG +struct pipe_surface; +void debug_dump_image(const char *prefix, + unsigned format, unsigned cpp, + unsigned width, unsigned height, + unsigned stride, + const void *data); +void debug_dump_surface(const char *prefix, + struct pipe_surface *surface); +void debug_dump_surface_bmp(const char *filename, + struct pipe_surface *surface); +#else +#define debug_dump_image(prefix, format, cpp, width, height, stride, data) ((void)0) +#define debug_dump_surface(prefix, surface) ((void)0) +#define debug_dump_surface_bmp(filename, surface) ((void)0) +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* P_DEBUG_H_ */ diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h new file mode 100644 index 0000000000..6bfac581f9 --- /dev/null +++ b/src/gallium/include/pipe/p_defines.h @@ -0,0 +1,305 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef PIPE_DEFINES_H +#define PIPE_DEFINES_H + +#include "p_format.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define PIPE_BLENDFACTOR_ONE 0x1 +#define PIPE_BLENDFACTOR_SRC_COLOR 0x2 +#define PIPE_BLENDFACTOR_SRC_ALPHA 0x3 +#define PIPE_BLENDFACTOR_DST_ALPHA 0x4 +#define PIPE_BLENDFACTOR_DST_COLOR 0x5 +#define PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE 0x6 +#define PIPE_BLENDFACTOR_CONST_COLOR 0x7 +#define PIPE_BLENDFACTOR_CONST_ALPHA 0x8 +#define PIPE_BLENDFACTOR_SRC1_COLOR 0x9 +#define PIPE_BLENDFACTOR_SRC1_ALPHA 0x0A +#define PIPE_BLENDFACTOR_ZERO 0x11 +#define PIPE_BLENDFACTOR_INV_SRC_COLOR 0x12 +#define PIPE_BLENDFACTOR_INV_SRC_ALPHA 0x13 +#define PIPE_BLENDFACTOR_INV_DST_ALPHA 0x14 +#define PIPE_BLENDFACTOR_INV_DST_COLOR 0x15 +#define PIPE_BLENDFACTOR_INV_CONST_COLOR 0x17 +#define PIPE_BLENDFACTOR_INV_CONST_ALPHA 0x18 +#define PIPE_BLENDFACTOR_INV_SRC1_COLOR 0x19 +#define PIPE_BLENDFACTOR_INV_SRC1_ALPHA 0x1A + +#define PIPE_BLEND_ADD 0 +#define PIPE_BLEND_SUBTRACT 1 +#define PIPE_BLEND_REVERSE_SUBTRACT 2 +#define PIPE_BLEND_MIN 3 +#define PIPE_BLEND_MAX 4 + +#define PIPE_LOGICOP_CLEAR 0 +#define PIPE_LOGICOP_NOR 1 +#define PIPE_LOGICOP_AND_INVERTED 2 +#define PIPE_LOGICOP_COPY_INVERTED 3 +#define PIPE_LOGICOP_AND_REVERSE 4 +#define PIPE_LOGICOP_INVERT 5 +#define PIPE_LOGICOP_XOR 6 +#define PIPE_LOGICOP_NAND 7 +#define PIPE_LOGICOP_AND 8 +#define PIPE_LOGICOP_EQUIV 9 +#define PIPE_LOGICOP_NOOP 10 +#define PIPE_LOGICOP_OR_INVERTED 11 +#define PIPE_LOGICOP_COPY 12 +#define PIPE_LOGICOP_OR_REVERSE 13 +#define PIPE_LOGICOP_OR 14 +#define PIPE_LOGICOP_SET 15 + +#define PIPE_MASK_R 0x1 +#define PIPE_MASK_G 0x2 +#define PIPE_MASK_B 0x4 +#define PIPE_MASK_A 0x8 +#define PIPE_MASK_RGBA 0xf + + +/** + * Inequality functions. Used for depth test, stencil compare, alpha + * test, shadow compare, etc. 
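+ *
+ * As a purely illustrative example (not taken from this header), a
+ * conventional depth test that passes fragments whose depth is less than
+ * or equal to the stored value would pick PIPE_FUNC_LEQUAL as its compare
+ * function.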
+ */ +#define PIPE_FUNC_NEVER 0 +#define PIPE_FUNC_LESS 1 +#define PIPE_FUNC_EQUAL 2 +#define PIPE_FUNC_LEQUAL 3 +#define PIPE_FUNC_GREATER 4 +#define PIPE_FUNC_NOTEQUAL 5 +#define PIPE_FUNC_GEQUAL 6 +#define PIPE_FUNC_ALWAYS 7 + +/** Polygon fill mode */ +#define PIPE_POLYGON_MODE_FILL 0 +#define PIPE_POLYGON_MODE_LINE 1 +#define PIPE_POLYGON_MODE_POINT 2 + +/** Polygon front/back window, also for culling */ +#define PIPE_WINDING_NONE 0 +#define PIPE_WINDING_CW 1 +#define PIPE_WINDING_CCW 2 +#define PIPE_WINDING_BOTH (PIPE_WINDING_CW | PIPE_WINDING_CCW) + +/** Stencil ops */ +#define PIPE_STENCIL_OP_KEEP 0 +#define PIPE_STENCIL_OP_ZERO 1 +#define PIPE_STENCIL_OP_REPLACE 2 +#define PIPE_STENCIL_OP_INCR 3 +#define PIPE_STENCIL_OP_DECR 4 +#define PIPE_STENCIL_OP_INCR_WRAP 5 +#define PIPE_STENCIL_OP_DECR_WRAP 6 +#define PIPE_STENCIL_OP_INVERT 7 + +/** Texture types */ +enum pipe_texture_target { + PIPE_TEXTURE_1D = 0, + PIPE_TEXTURE_2D = 1, + PIPE_TEXTURE_3D = 2, + PIPE_TEXTURE_CUBE = 3 +}; + +#define PIPE_TEX_FACE_POS_X 0 +#define PIPE_TEX_FACE_NEG_X 1 +#define PIPE_TEX_FACE_POS_Y 2 +#define PIPE_TEX_FACE_NEG_Y 3 +#define PIPE_TEX_FACE_POS_Z 4 +#define PIPE_TEX_FACE_NEG_Z 5 + +#define PIPE_TEX_WRAP_REPEAT 0 +#define PIPE_TEX_WRAP_CLAMP 1 +#define PIPE_TEX_WRAP_CLAMP_TO_EDGE 2 +#define PIPE_TEX_WRAP_CLAMP_TO_BORDER 3 +#define PIPE_TEX_WRAP_MIRROR_REPEAT 4 +#define PIPE_TEX_WRAP_MIRROR_CLAMP 5 +#define PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE 6 +#define PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER 7 + +/* Between mipmaps, ie mipfilter + */ +#define PIPE_TEX_MIPFILTER_NEAREST 0 +#define PIPE_TEX_MIPFILTER_LINEAR 1 +#define PIPE_TEX_MIPFILTER_NONE 2 + +/* Within a mipmap, ie min/mag filter + */ +#define PIPE_TEX_FILTER_NEAREST 0 +#define PIPE_TEX_FILTER_LINEAR 1 +#define PIPE_TEX_FILTER_ANISO 2 + + +#define PIPE_TEX_COMPARE_NONE 0 +#define PIPE_TEX_COMPARE_R_TO_TEXTURE 1 + +#define PIPE_TEX_FACE_POS_X 0 +#define PIPE_TEX_FACE_NEG_X 1 +#define PIPE_TEX_FACE_POS_Y 2 +#define PIPE_TEX_FACE_NEG_Y 3 +#define PIPE_TEX_FACE_POS_Z 4 +#define PIPE_TEX_FACE_NEG_Z 5 +#define PIPE_TEX_FACE_MAX 6 + +#define PIPE_TEXTURE_USAGE_RENDER_TARGET 0x1 +#define PIPE_TEXTURE_USAGE_DISPLAY_TARGET 0x2 /* ie a backbuffer */ +#define PIPE_TEXTURE_USAGE_PRIMARY 0x4 /* ie a frontbuffer */ +#define PIPE_TEXTURE_USAGE_DEPTH_STENCIL 0x8 +#define PIPE_TEXTURE_USAGE_SAMPLER 0x10 +/** Pipe driver custom usage flags should be greater or equal to this value */ +#define PIPE_TEXTURE_USAGE_CUSTOM (1 << 16) + +#define PIPE_TEXTURE_GEOM_NON_SQUARE 0x1 +#define PIPE_TEXTURE_GEOM_NON_POWER_OF_TWO 0x2 + + +/** + * Surface layout + */ +#define PIPE_SURFACE_LAYOUT_LINEAR 0 + + +/** + * Surface status + */ +#define PIPE_SURFACE_STATUS_UNDEFINED 0 +#define PIPE_SURFACE_STATUS_DEFINED 1 +#define PIPE_SURFACE_STATUS_CLEAR 2 + + +/** + * Buffer usage flags + */ +#define PIPE_BUFFER_USAGE_CPU_READ (1 << 0) +#define PIPE_BUFFER_USAGE_CPU_WRITE (1 << 1) +#define PIPE_BUFFER_USAGE_GPU_READ (1 << 2) +#define PIPE_BUFFER_USAGE_GPU_WRITE (1 << 3) +#define PIPE_BUFFER_USAGE_PIXEL (1 << 4) +#define PIPE_BUFFER_USAGE_VERTEX (1 << 5) +#define PIPE_BUFFER_USAGE_INDEX (1 << 6) +#define PIPE_BUFFER_USAGE_CONSTANT (1 << 7) +/** Pipe driver custom usage flags should be greater or equal to this value */ +#define PIPE_BUFFER_USAGE_CUSTOM (1 << 16) + +/* Convenient shortcuts */ +#define PIPE_BUFFER_USAGE_CPU_READ_WRITE \ + ( PIPE_BUFFER_USAGE_CPU_READ | PIPE_BUFFER_USAGE_CPU_WRITE ) +#define PIPE_BUFFER_USAGE_GPU_READ_WRITE \ + ( PIPE_BUFFER_USAGE_GPU_READ | 
PIPE_BUFFER_USAGE_GPU_WRITE ) +#define PIPE_BUFFER_USAGE_WRITE \ + ( PIPE_BUFFER_USAGE_CPU_WRITE | PIPE_BUFFER_USAGE_GPU_WRITE ) + + +/** + * Flush types: + */ +#define PIPE_FLUSH_RENDER_CACHE 0x1 +#define PIPE_FLUSH_TEXTURE_CACHE 0x2 +#define PIPE_FLUSH_SWAPBUFFERS 0x4 +#define PIPE_FLUSH_FRAME 0x8 /**< Mark the end of a frame */ + + +/** + * Shaders + */ +#define PIPE_SHADER_VERTEX 0 +#define PIPE_SHADER_FRAGMENT 1 +#define PIPE_SHADER_TYPES 2 + + +/** + * Primitive types: + */ +#define PIPE_PRIM_POINTS 0 +#define PIPE_PRIM_LINES 1 +#define PIPE_PRIM_LINE_LOOP 2 +#define PIPE_PRIM_LINE_STRIP 3 +#define PIPE_PRIM_TRIANGLES 4 +#define PIPE_PRIM_TRIANGLE_STRIP 5 +#define PIPE_PRIM_TRIANGLE_FAN 6 +#define PIPE_PRIM_QUADS 7 +#define PIPE_PRIM_QUAD_STRIP 8 +#define PIPE_PRIM_POLYGON 9 + + +/** + * Query object types + */ +#define PIPE_QUERY_OCCLUSION_COUNTER 0 +#define PIPE_QUERY_PRIMITIVES_GENERATED 1 +#define PIPE_QUERY_PRIMITIVES_EMITTED 2 +#define PIPE_QUERY_TYPES 3 + + +/** + * Point sprite coord modes + */ +#define PIPE_SPRITE_COORD_NONE 0 +#define PIPE_SPRITE_COORD_UPPER_LEFT 1 +#define PIPE_SPRITE_COORD_LOWER_LEFT 2 + + +/** + * Implementation capabilities/limits + * Passed to pipe->get_param() + * XXX this will need some fine tuning... + */ +#define PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS 1 +#define PIPE_CAP_NPOT_TEXTURES 2 +#define PIPE_CAP_TWO_SIDED_STENCIL 3 +#define PIPE_CAP_GLSL 4 /* XXX need something better */ +#define PIPE_CAP_S3TC 5 +#define PIPE_CAP_ANISOTROPIC_FILTER 6 +#define PIPE_CAP_POINT_SPRITE 7 +#define PIPE_CAP_MAX_RENDER_TARGETS 8 +#define PIPE_CAP_OCCLUSION_QUERY 9 +#define PIPE_CAP_TEXTURE_SHADOW_MAP 10 +#define PIPE_CAP_MAX_TEXTURE_2D_LEVELS 11 +#define PIPE_CAP_MAX_TEXTURE_3D_LEVELS 12 +#define PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS 13 +#define PIPE_CAP_MAX_LINE_WIDTH 14 +#define PIPE_CAP_MAX_LINE_WIDTH_AA 15 +#define PIPE_CAP_MAX_POINT_WIDTH 16 +#define PIPE_CAP_MAX_POINT_WIDTH_AA 17 +#define PIPE_CAP_MAX_TEXTURE_ANISOTROPY 18 +#define PIPE_CAP_MAX_TEXTURE_LOD_BIAS 19 +#define PIPE_CAP_GUARD_BAND_LEFT 20 /*< float */ +#define PIPE_CAP_GUARD_BAND_TOP 21 /*< float */ +#define PIPE_CAP_GUARD_BAND_RIGHT 22 /*< float */ +#define PIPE_CAP_GUARD_BAND_BOTTOM 23 /*< float */ +#define PIPE_CAP_TEXTURE_MIRROR_CLAMP 24 +#define PIPE_CAP_TEXTURE_MIRROR_REPEAT 25 +#define PIPE_CAP_MAX_VERTEX_TEXTURE_UNITS 26 + + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gallium/include/pipe/p_error.h b/src/gallium/include/pipe/p_error.h new file mode 100644 index 0000000000..b865b22635 --- /dev/null +++ b/src/gallium/include/pipe/p_error.h @@ -0,0 +1,65 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * Gallium error codes. + * + * @author José Fonseca <jrfonseca@tungstengraphics.com> + */ + +#ifndef P_ERROR_H_ +#define P_ERROR_H_ + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * Gallium error codes. + * + * - A zero value always means success. + * - A negative value always means failure. + * - The meaning of a positive value is function dependent. + */ +enum pipe_error { + PIPE_OK = 0, + PIPE_ERROR = -1, /**< Generic error */ + PIPE_ERROR_BAD_INPUT = -2, + PIPE_ERROR_OUT_OF_MEMORY = -3, + PIPE_ERROR_RETRY = -4 + /* TODO */ +}; + + +#ifdef __cplusplus +} +#endif + +#endif /* P_ERROR_H_ */ diff --git a/src/gallium/include/pipe/p_format.h b/src/gallium/include/pipe/p_format.h new file mode 100644 index 0000000000..6bd55d7735 --- /dev/null +++ b/src/gallium/include/pipe/p_format.h @@ -0,0 +1,586 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (c) 2008 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef PIPE_FORMAT_H +#define PIPE_FORMAT_H + +#include "p_compiler.h" +#include "p_debug.h" + +#include "util/u_string.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * The pipe_format enum is a 32-bit wide bitfield that encodes all the + * information needed to uniquely describe a pixel format. + */ + +/** + * Possible format layouts are encoded in the first 2 bits. + * The interpretation of the remaining 30 bits depends on a particular + * format layout. + */ +#define PIPE_FORMAT_LAYOUT_RGBAZS 0 +#define PIPE_FORMAT_LAYOUT_YCBCR 1 +#define PIPE_FORMAT_LAYOUT_DXT 2 /**< XXX temporary? 
*/ +#define PIPE_FORMAT_LAYOUT_MIXED 3 + +static INLINE uint pf_layout(uint f) /**< PIPE_FORMAT_LAYOUT_ */ +{ + return f & 0x3; +} + +/** + * RGBAZS Format Layout. + */ + +/** + * Format component selectors for RGBAZS & MIXED layout. + */ +#define PIPE_FORMAT_COMP_R 0 +#define PIPE_FORMAT_COMP_G 1 +#define PIPE_FORMAT_COMP_B 2 +#define PIPE_FORMAT_COMP_A 3 +#define PIPE_FORMAT_COMP_0 4 +#define PIPE_FORMAT_COMP_1 5 +#define PIPE_FORMAT_COMP_Z 6 +#define PIPE_FORMAT_COMP_S 7 + +/** + * Format types for RGBAZS layout. + */ +#define PIPE_FORMAT_TYPE_UNKNOWN 0 +#define PIPE_FORMAT_TYPE_FLOAT 1 /**< 16/32/64-bit/channel formats */ +#define PIPE_FORMAT_TYPE_UNORM 2 /**< uints, normalized to [0,1] */ +#define PIPE_FORMAT_TYPE_SNORM 3 /**< ints, normalized to [-1,1] */ +#define PIPE_FORMAT_TYPE_USCALED 4 /**< uints, not normalized */ +#define PIPE_FORMAT_TYPE_SSCALED 5 /**< ints, not normalized */ +#define PIPE_FORMAT_TYPE_SRGB 6 /**< sRGB colorspace */ +#define PIPE_FORMAT_TYPE_FIXED 7 /**< 16.16 fixed point */ + + +/** + * Because the destination vector is assumed to be RGBA FLOAT, we + * need to know how to swizzle and expand components from the source + * vector. + * Let's take U_A1_R5_G5_B5 as an example. X swizzle is A, X size + * is 1 bit and type is UNORM. So we take the most significant bit + * from source vector, convert 0 to 0.0 and 1 to 1.0 and save it + * in the last component of the destination RGBA component. + * Next, Y swizzle is R, Y size is 5 and type is UNORM. We normalize + * those 5 bits into [0.0; 1.0] range and put it into second + * component of the destination vector. Rinse and repeat for + * components Z and W. + * If any of size fields is zero, it means the source format contains + * less than four components. + * If any swizzle is 0 or 1, the corresponding destination component + * should be filled with 0.0 and 1.0, respectively. + */ +typedef uint pipe_format_rgbazs_t; + +static INLINE uint pf_get(pipe_format_rgbazs_t f, uint shift, uint mask) +{ + return (f >> shift) & mask; +} + +#define pf_swizzle_x(f) pf_get(f, 2, 0x7) /**< PIPE_FORMAT_COMP_ */ +#define pf_swizzle_y(f) pf_get(f, 5, 0x7) /**< PIPE_FORMAT_COMP_ */ +#define pf_swizzle_z(f) pf_get(f, 8, 0x7) /**< PIPE_FORMAT_COMP_ */ +#define pf_swizzle_w(f) pf_get(f, 11, 0x7) /**< PIPE_FORMAT_COMP_ */ +#define pf_swizzle_xyzw(f,i) pf_get(f, 2+((i)*3), 0x7) +#define pf_size_x(f) pf_get(f, 14, 0x7) /**< Size of X */ +#define pf_size_y(f) pf_get(f, 17, 0x7) /**< Size of Y */ +#define pf_size_z(f) pf_get(f, 20, 0x7) /**< Size of Z */ +#define pf_size_w(f) pf_get(f, 23, 0x7) /**< Size of W */ +#define pf_size_xyzw(f,i) pf_get(f, 14+((i)*3), 0x7) +#define pf_exp2(f) pf_get(f, 26, 0x7) /**< Scale size by 2 ^ exp2 */ +#define pf_type(f) pf_get(f, 29, 0x7) /**< PIPE_FORMAT_TYPE_ */ + +/** + * Helper macro to encode the above structure into a 32-bit value. + */ +#define _PIPE_FORMAT_RGBAZS( SWZ, SIZEX, SIZEY, SIZEZ, SIZEW, EXP2, TYPE ) (\ + (PIPE_FORMAT_LAYOUT_RGBAZS << 0) |\ + ((SWZ) << 2) |\ + ((SIZEX) << 14) |\ + ((SIZEY) << 17) |\ + ((SIZEZ) << 20) |\ + ((SIZEW) << 23) |\ + ((EXP2) << 26) |\ + ((TYPE) << 29) ) + +/** + * Helper macro to encode the swizzle part of the structure above. + */ +#define _PIPE_FORMAT_SWZ( SWZX, SWZY, SWZZ, SWZW ) (((SWZX) << 0) | ((SWZY) << 3) | ((SWZZ) << 6) | ((SWZW) << 9)) + +/** + * Shorthand macro for RGBAZS layout with component sizes in 1-bit units. 
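To make the bit layout described above concrete, here is a small hedged sketch that checks how one of the 1-bit-unit formats defined further down in this header, PIPE_FORMAT_A1R5G5B5_UNORM (the same example used in the comment above), decodes through the pf_* accessors; the helper name is illustrative.

#include "pipe/p_format.h"

/* Sketch: PIPE_FORMAT_A1R5G5B5_UNORM is encoded as
 * _PIPE_FORMAT_RGBAZS_1(_PIPE_FORMAT_ARGB, 1, 5, 5, 5, PIPE_FORMAT_TYPE_UNORM),
 * so the X slot selects the A component (1 bit) and Y/Z/W select R/G/B
 * (5 bits each), with component sizes expressed in 1-bit units (exp2 == 0).
 */
static boolean
a1r5g5b5_encoding_is_sane(void)
{
   enum pipe_format f = PIPE_FORMAT_A1R5G5B5_UNORM;

   return pf_layout(f) == PIPE_FORMAT_LAYOUT_RGBAZS &&
          pf_swizzle_x(f) == PIPE_FORMAT_COMP_A && pf_size_x(f) == 1 &&
          pf_swizzle_y(f) == PIPE_FORMAT_COMP_R && pf_size_y(f) == 5 &&
          pf_swizzle_z(f) == PIPE_FORMAT_COMP_G && pf_size_z(f) == 5 &&
          pf_swizzle_w(f) == PIPE_FORMAT_COMP_B && pf_size_w(f) == 5 &&
          pf_type(f) == PIPE_FORMAT_TYPE_UNORM &&
          pf_exp2(f) == 0;
}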
+ */ +#define _PIPE_FORMAT_RGBAZS_1( SWZ, SIZEX, SIZEY, SIZEZ, SIZEW, TYPE )\ + _PIPE_FORMAT_RGBAZS( SWZ, SIZEX, SIZEY, SIZEZ, SIZEW, 0, TYPE ) + +/** + * Shorthand macro for RGBAZS layout with component sizes in 2-bit units. + */ +#define _PIPE_FORMAT_RGBAZS_2( SWZ, SIZEX, SIZEY, SIZEZ, SIZEW, TYPE )\ + _PIPE_FORMAT_RGBAZS( SWZ, SIZEX, SIZEY, SIZEZ, SIZEW, 1, TYPE ) + +/** + * Shorthand macro for RGBAZS layout with component sizes in 8-bit units. + */ +#define _PIPE_FORMAT_RGBAZS_8( SWZ, SIZEX, SIZEY, SIZEZ, SIZEW, TYPE )\ + _PIPE_FORMAT_RGBAZS( SWZ, SIZEX, SIZEY, SIZEZ, SIZEW, 3, TYPE ) + +/** + * Shorthand macro for RGBAZS layout with component sizes in 64-bit units. + */ +#define _PIPE_FORMAT_RGBAZS_64( SWZ, SIZEX, SIZEY, SIZEZ, SIZEW, TYPE )\ + _PIPE_FORMAT_RGBAZS( SWZ, SIZEX, SIZEY, SIZEZ, SIZEW, 6, TYPE ) + +typedef uint pipe_format_mixed_t; + +/* NOTE: Use pf_swizzle_* and pf_size_* macros for swizzles and sizes. + */ + +#define pf_mixed_sign_x(f) pf_get( f, 26, 0x1 ) /*< Sign of X */ +#define pf_mixed_sign_y(f) pf_get( f, 27, 0x1 ) /*< Sign of Y */ +#define pf_mixed_sign_z(f) pf_get( f, 28, 0x1 ) /*< Sign of Z */ +#define pf_mixed_sign_w(f) pf_get( f, 29, 0x1 ) /*< Sign of W */ +#define pf_mixed_sign_xyzw(f, i) pf_get( f, 26 + (i), 0x1 ) +#define pf_mixed_normalized(f) pf_get( f, 30, 0x1 ) /*< Type is NORM (1) or SCALED (0) */ +#define pf_mixed_scale8(f) pf_get( f, 31, 0x1 ) /*< Scale size by either one (0) or eight (1) */ + +/** + * Helper macro to encode the above structure into a 32-bit value. + */ +#define _PIPE_FORMAT_MIXED( SWZ, SIZEX, SIZEY, SIZEZ, SIZEW, SIGNX, SIGNY, SIGNZ, SIGNW, NORMALIZED, SCALE8 ) (\ + (PIPE_FORMAT_LAYOUT_MIXED << 0) |\ + ((SWZ) << 2) |\ + ((SIZEX) << 14) |\ + ((SIZEY) << 17) |\ + ((SIZEZ) << 20) |\ + ((SIZEW) << 23) |\ + ((SIGNX) << 26) |\ + ((SIGNY) << 27) |\ + ((SIGNZ) << 28) |\ + ((SIGNW) << 29) |\ + ((NORMALIZED) << 30) |\ + ((SCALE8) << 31) ) + +/** + * Shorthand macro for common format swizzles. 
+ */ +#define _PIPE_FORMAT_R001 _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_1 ) +#define _PIPE_FORMAT_RG01 _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_G, PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_1 ) +#define _PIPE_FORMAT_RGB1 _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_G, PIPE_FORMAT_COMP_B, PIPE_FORMAT_COMP_1 ) +#define _PIPE_FORMAT_RGBA _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_G, PIPE_FORMAT_COMP_B, PIPE_FORMAT_COMP_A ) +#define _PIPE_FORMAT_ARGB _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_A, PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_G, PIPE_FORMAT_COMP_B ) +#define _PIPE_FORMAT_ABGR _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_A, PIPE_FORMAT_COMP_B, PIPE_FORMAT_COMP_G, PIPE_FORMAT_COMP_R ) +#define _PIPE_FORMAT_BGRA _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_B, PIPE_FORMAT_COMP_G, PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_A ) +#define _PIPE_FORMAT_1RGB _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_1, PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_G, PIPE_FORMAT_COMP_B ) +#define _PIPE_FORMAT_1BGR _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_1, PIPE_FORMAT_COMP_B, PIPE_FORMAT_COMP_G, PIPE_FORMAT_COMP_R ) +#define _PIPE_FORMAT_BGR1 _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_B, PIPE_FORMAT_COMP_G, PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_1 ) +#define _PIPE_FORMAT_0000 _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_0 ) +#define _PIPE_FORMAT_000R _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_R ) +#define _PIPE_FORMAT_RRR1 _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_1 ) +#define _PIPE_FORMAT_RRRR _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_R ) +#define _PIPE_FORMAT_RRRG _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_R, PIPE_FORMAT_COMP_G ) +#define _PIPE_FORMAT_Z000 _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_Z, PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_0 ) +#define _PIPE_FORMAT_0Z00 _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_Z, PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_0 ) +#define _PIPE_FORMAT_SZ00 _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_S, PIPE_FORMAT_COMP_Z, PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_0 ) +#define _PIPE_FORMAT_ZS00 _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_Z, PIPE_FORMAT_COMP_S, PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_0 ) +#define _PIPE_FORMAT_S000 _PIPE_FORMAT_SWZ( PIPE_FORMAT_COMP_S, PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_0, PIPE_FORMAT_COMP_0 ) + +/** + * YCBCR Format Layout. + */ + +/** + * This only contains a flag that indicates whether the format is reversed or + * not. + */ +typedef uint pipe_format_ycbcr_t; + +/** + * Helper macro to encode the above structure into a 32-bit value. + */ +#define _PIPE_FORMAT_YCBCR( REV ) (\ + (PIPE_FORMAT_LAYOUT_YCBCR << 0) |\ + ((REV) << 2) ) + +static INLINE uint pf_rev(pipe_format_ycbcr_t f) +{ + return (f >> 2) & 0x1; +} + + +/** + * Compresssed format layouts (this will probably change) + */ +#define _PIPE_FORMAT_DXT( LEVEL, RSIZE, GSIZE, BSIZE, ASIZE, TYPE ) \ + ((PIPE_FORMAT_LAYOUT_DXT << 0) | \ + ((LEVEL) << 2) | \ + ((RSIZE) << 5) | \ + ((GSIZE) << 8) | \ + ((BSIZE) << 11) | \ + ((ASIZE) << 14) | \ + ((TYPE) << 29)) + + + +/** + * Texture/surface image formats (preliminary) + */ + +/* KW: Added lots of surface formats to support vertex element layout + * definitions, and eventually render-to-vertex-buffer. 
Could + * consider making float/int/uint/scaled/normalized a separate + * parameter, but on the other hand there are special cases like + * z24s8, compressed textures, ycbcr, etc that won't fit that model. + */ + +enum pipe_format { + PIPE_FORMAT_NONE = _PIPE_FORMAT_RGBAZS_1 ( _PIPE_FORMAT_0000, 0, 0, 0, 0, PIPE_FORMAT_TYPE_UNKNOWN ), + PIPE_FORMAT_A8R8G8B8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_ARGB, 1, 1, 1, 1, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_X8R8G8B8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_1RGB, 1, 1, 1, 1, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_B8G8R8A8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_BGRA, 1, 1, 1, 1, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_B8G8R8X8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_BGR1, 1, 1, 1, 1, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_A1R5G5B5_UNORM = _PIPE_FORMAT_RGBAZS_1 ( _PIPE_FORMAT_ARGB, 1, 5, 5, 5, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_A4R4G4B4_UNORM = _PIPE_FORMAT_RGBAZS_1 ( _PIPE_FORMAT_ARGB, 4, 4, 4, 4, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_R5G6B5_UNORM = _PIPE_FORMAT_RGBAZS_1 ( _PIPE_FORMAT_RGB1, 5, 6, 5, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_A2B10G10R10_UNORM = _PIPE_FORMAT_RGBAZS_2 ( _PIPE_FORMAT_ABGR, 1, 5, 5, 5, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_L8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RRR1, 1, 1, 1, 0, PIPE_FORMAT_TYPE_UNORM ), /**< ubyte luminance */ + PIPE_FORMAT_A8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_000R, 0, 0, 0, 1, PIPE_FORMAT_TYPE_UNORM ), /**< ubyte alpha */ + PIPE_FORMAT_I8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RRRR, 1, 1, 1, 1, PIPE_FORMAT_TYPE_UNORM ), /**< ubyte intensity */ + PIPE_FORMAT_A8L8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RRRG, 1, 1, 1, 1, PIPE_FORMAT_TYPE_UNORM ), /**< ubyte alpha, luminance */ + PIPE_FORMAT_L16_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RRR1, 2, 2, 2, 0, PIPE_FORMAT_TYPE_UNORM ), /**< ushort luminance */ + PIPE_FORMAT_YCBCR = _PIPE_FORMAT_YCBCR( 0 ), + PIPE_FORMAT_YCBCR_REV = _PIPE_FORMAT_YCBCR( 1 ), + PIPE_FORMAT_Z16_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_Z000, 2, 0, 0, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_Z32_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_Z000, 4, 0, 0, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_Z32_FLOAT = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_Z000, 4, 0, 0, 0, PIPE_FORMAT_TYPE_FLOAT ), + PIPE_FORMAT_S8Z24_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_SZ00, 1, 3, 0, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_Z24S8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_ZS00, 3, 1, 0, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_X8Z24_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_0Z00, 1, 3, 0, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_Z24X8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_Z000, 3, 1, 0, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_S8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_S000, 1, 0, 0, 0, PIPE_FORMAT_TYPE_UNORM ), /**< ubyte stencil */ + PIPE_FORMAT_R64_FLOAT = _PIPE_FORMAT_RGBAZS_64( _PIPE_FORMAT_R001, 1, 0, 0, 0, PIPE_FORMAT_TYPE_FLOAT ), + PIPE_FORMAT_R64G64_FLOAT = _PIPE_FORMAT_RGBAZS_64( _PIPE_FORMAT_RG01, 1, 1, 0, 0, PIPE_FORMAT_TYPE_FLOAT ), + PIPE_FORMAT_R64G64B64_FLOAT = _PIPE_FORMAT_RGBAZS_64( _PIPE_FORMAT_RGB1, 1, 1, 1, 0, PIPE_FORMAT_TYPE_FLOAT ), + PIPE_FORMAT_R64G64B64A64_FLOAT = _PIPE_FORMAT_RGBAZS_64( _PIPE_FORMAT_RGBA, 1, 1, 1, 1, PIPE_FORMAT_TYPE_FLOAT ), + PIPE_FORMAT_R32_FLOAT = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_R001, 4, 0, 0, 0, PIPE_FORMAT_TYPE_FLOAT ), + PIPE_FORMAT_R32G32_FLOAT = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RG01, 4, 4, 0, 0, PIPE_FORMAT_TYPE_FLOAT ), + 
PIPE_FORMAT_R32G32B32_FLOAT = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 4, 4, 4, 0, PIPE_FORMAT_TYPE_FLOAT ), + PIPE_FORMAT_R32G32B32A32_FLOAT = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGBA, 4, 4, 4, 4, PIPE_FORMAT_TYPE_FLOAT ), + PIPE_FORMAT_R32_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_R001, 4, 0, 0, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_R32G32_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RG01, 4, 4, 0, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_R32G32B32_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 4, 4, 4, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_R32G32B32A32_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGBA, 4, 4, 4, 4, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_R32_USCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_R001, 4, 0, 0, 0, PIPE_FORMAT_TYPE_USCALED ), + PIPE_FORMAT_R32G32_USCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RG01, 4, 4, 0, 0, PIPE_FORMAT_TYPE_USCALED ), + PIPE_FORMAT_R32G32B32_USCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 4, 4, 4, 0, PIPE_FORMAT_TYPE_USCALED ), + PIPE_FORMAT_R32G32B32A32_USCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGBA, 4, 4, 4, 4, PIPE_FORMAT_TYPE_USCALED ), + PIPE_FORMAT_R32_SNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_R001, 4, 0, 0, 0, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_R32G32_SNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RG01, 4, 4, 0, 0, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_R32G32B32_SNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 4, 4, 4, 0, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_R32G32B32A32_SNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGBA, 4, 4, 4, 4, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_R32_SSCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_R001, 4, 0, 0, 0, PIPE_FORMAT_TYPE_SSCALED ), + PIPE_FORMAT_R32G32_SSCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RG01, 4, 4, 0, 0, PIPE_FORMAT_TYPE_SSCALED ), + PIPE_FORMAT_R32G32B32_SSCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 4, 4, 4, 0, PIPE_FORMAT_TYPE_SSCALED ), + PIPE_FORMAT_R32G32B32A32_SSCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGBA, 4, 4, 4, 4, PIPE_FORMAT_TYPE_SSCALED ), + PIPE_FORMAT_R16_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_R001, 2, 0, 0, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_R16G16_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RG01, 2, 2, 0, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_R16G16B16_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 2, 2, 2, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_R16G16B16A16_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGBA, 2, 2, 2, 2, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_R16_USCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_R001, 2, 0, 0, 0, PIPE_FORMAT_TYPE_USCALED ), + PIPE_FORMAT_R16G16_USCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RG01, 2, 2, 0, 0, PIPE_FORMAT_TYPE_USCALED ), + PIPE_FORMAT_R16G16B16_USCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 2, 2, 2, 0, PIPE_FORMAT_TYPE_USCALED ), + PIPE_FORMAT_R16G16B16A16_USCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGBA, 2, 2, 2, 2, PIPE_FORMAT_TYPE_USCALED ), + PIPE_FORMAT_R16_SNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_R001, 2, 0, 0, 0, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_R16G16_SNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RG01, 2, 2, 0, 0, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_R16G16B16_SNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 2, 2, 2, 0, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_R16G16B16A16_SNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGBA, 2, 2, 2, 2, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_R16_SSCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_R001, 2, 0, 0, 0, PIPE_FORMAT_TYPE_SSCALED ), + PIPE_FORMAT_R16G16_SSCALED = 
_PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RG01, 2, 2, 0, 0, PIPE_FORMAT_TYPE_SSCALED ), + PIPE_FORMAT_R16G16B16_SSCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 2, 2, 2, 0, PIPE_FORMAT_TYPE_SSCALED ), + PIPE_FORMAT_R16G16B16A16_SSCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGBA, 2, 2, 2, 2, PIPE_FORMAT_TYPE_SSCALED ), + PIPE_FORMAT_R8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_R001, 1, 0, 0, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_R8G8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RG01, 1, 1, 0, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_R8G8B8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 1, 1, 1, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_R8G8B8A8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGBA, 1, 1, 1, 1, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_R8G8B8X8_UNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 1, 1, 1, 1, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_R8_USCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_R001, 1, 0, 0, 0, PIPE_FORMAT_TYPE_USCALED ), + PIPE_FORMAT_R8G8_USCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RG01, 1, 1, 0, 0, PIPE_FORMAT_TYPE_USCALED ), + PIPE_FORMAT_R8G8B8_USCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 1, 1, 1, 0, PIPE_FORMAT_TYPE_USCALED ), + PIPE_FORMAT_R8G8B8A8_USCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGBA, 1, 1, 1, 1, PIPE_FORMAT_TYPE_USCALED ), + PIPE_FORMAT_R8G8B8X8_USCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 1, 1, 1, 1, PIPE_FORMAT_TYPE_USCALED ), + PIPE_FORMAT_R8_SNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_R001, 1, 0, 0, 0, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_R8G8_SNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RG01, 1, 1, 0, 0, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_R8G8B8_SNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 1, 1, 1, 0, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_R8G8B8A8_SNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGBA, 1, 1, 1, 1, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_R8G8B8X8_SNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 1, 1, 1, 1, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_B6G5R5_SNORM = _PIPE_FORMAT_RGBAZS_1 ( _PIPE_FORMAT_BGR1, 6, 5, 5, 0, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_A8B8G8R8_SNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_BGRA, 1, 1, 1, 1, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_X8B8G8R8_SNORM = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 1, 1, 1, 1, PIPE_FORMAT_TYPE_SNORM ), + PIPE_FORMAT_R8_SSCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_R001, 1, 0, 0, 0, PIPE_FORMAT_TYPE_SSCALED ), + PIPE_FORMAT_R8G8_SSCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RG01, 1, 1, 0, 0, PIPE_FORMAT_TYPE_SSCALED ), + PIPE_FORMAT_R8G8B8_SSCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 1, 1, 1, 0, PIPE_FORMAT_TYPE_SSCALED ), + PIPE_FORMAT_R8G8B8A8_SSCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGBA, 1, 1, 1, 1, PIPE_FORMAT_TYPE_SSCALED ), + PIPE_FORMAT_R8G8B8X8_SSCALED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 1, 1, 1, 1, PIPE_FORMAT_TYPE_SSCALED ), + PIPE_FORMAT_R32_FIXED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_R001, 4, 0, 0, 0, PIPE_FORMAT_TYPE_FIXED ), + PIPE_FORMAT_R32G32_FIXED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RG01, 4, 4, 0, 0, PIPE_FORMAT_TYPE_FIXED ), + PIPE_FORMAT_R32G32B32_FIXED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 4, 4, 4, 0, PIPE_FORMAT_TYPE_FIXED ), + PIPE_FORMAT_R32G32B32A32_FIXED = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGBA, 4, 4, 4, 4, PIPE_FORMAT_TYPE_FIXED ), + /* sRGB formats */ + PIPE_FORMAT_L8_SRGB = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RRR1, 1, 1, 1, 0, PIPE_FORMAT_TYPE_SRGB ), + PIPE_FORMAT_A8L8_SRGB = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RRRG, 1, 1, 1, 1, 
PIPE_FORMAT_TYPE_SRGB ), + PIPE_FORMAT_R8G8B8_SRGB = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 1, 1, 1, 0, PIPE_FORMAT_TYPE_SRGB ), + PIPE_FORMAT_R8G8B8A8_SRGB = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGBA, 1, 1, 1, 1, PIPE_FORMAT_TYPE_SRGB ), + PIPE_FORMAT_R8G8B8X8_SRGB = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_RGB1, 1, 1, 1, 1, PIPE_FORMAT_TYPE_SRGB ), + PIPE_FORMAT_A8R8G8B8_SRGB = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_ARGB, 1, 1, 1, 1, PIPE_FORMAT_TYPE_SRGB ), + PIPE_FORMAT_X8R8G8B8_SRGB = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_1RGB, 1, 1, 1, 1, PIPE_FORMAT_TYPE_SRGB ), + PIPE_FORMAT_B8G8R8A8_SRGB = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_BGRA, 1, 1, 1, 1, PIPE_FORMAT_TYPE_SRGB ), + PIPE_FORMAT_B8G8R8X8_SRGB = _PIPE_FORMAT_RGBAZS_8 ( _PIPE_FORMAT_BGR1, 1, 1, 1, 1, PIPE_FORMAT_TYPE_SRGB ), + + /* mixed formats */ + PIPE_FORMAT_X8UB8UG8SR8S_NORM = _PIPE_FORMAT_MIXED( _PIPE_FORMAT_1BGR, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1 ), + PIPE_FORMAT_B6UG5SR5S_NORM = _PIPE_FORMAT_MIXED( _PIPE_FORMAT_BGR1, 6, 5, 5, 0, 0, 1, 1, 0, 1, 0 ), + + /* compressed formats */ + PIPE_FORMAT_DXT1_RGB = _PIPE_FORMAT_DXT( 1, 8, 8, 8, 0, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_DXT1_RGBA = _PIPE_FORMAT_DXT( 1, 8, 8, 8, 8, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_DXT3_RGBA = _PIPE_FORMAT_DXT( 3, 8, 8, 8, 8, PIPE_FORMAT_TYPE_UNORM ), + PIPE_FORMAT_DXT5_RGBA = _PIPE_FORMAT_DXT( 5, 8, 8, 8, 8, PIPE_FORMAT_TYPE_UNORM ), + + /* sRGB, compressed */ + PIPE_FORMAT_DXT1_SRGB = _PIPE_FORMAT_DXT( 1, 8, 8, 8, 0, PIPE_FORMAT_TYPE_SRGB ), + PIPE_FORMAT_DXT1_SRGBA = _PIPE_FORMAT_DXT( 1, 8, 8, 8, 8, PIPE_FORMAT_TYPE_SRGB ), + PIPE_FORMAT_DXT3_SRGBA = _PIPE_FORMAT_DXT( 3, 8, 8, 8, 8, PIPE_FORMAT_TYPE_SRGB ), + PIPE_FORMAT_DXT5_SRGBA = _PIPE_FORMAT_DXT( 5, 8, 8, 8, 8, PIPE_FORMAT_TYPE_SRGB ) +}; + +/** + * Builds pipe format name from format token. + */ +extern const char *pf_name( enum pipe_format format ); + +/** + * Return bits for a particular component. + * \param comp component index, starting at 0 + */ +static INLINE uint pf_get_component_bits( enum pipe_format format, uint comp ) +{ + uint size; + + if (pf_swizzle_x(format) == comp) { + size = pf_size_x(format); + } + else if (pf_swizzle_y(format) == comp) { + size = pf_size_y(format); + } + else if (pf_swizzle_z(format) == comp) { + size = pf_size_z(format); + } + else if (pf_swizzle_w(format) == comp) { + size = pf_size_w(format); + } + else { + size = 0; + } + if (pf_layout( format ) == PIPE_FORMAT_LAYOUT_RGBAZS) + return size << pf_exp2( format ); + return size << (pf_mixed_scale8( format ) * 3); +} + +/** + * Return total bits needed for the pixel format. + */ +static INLINE uint pf_get_bits( enum pipe_format format ) +{ + switch (pf_layout(format)) { + case PIPE_FORMAT_LAYOUT_RGBAZS: + case PIPE_FORMAT_LAYOUT_MIXED: + return + pf_get_component_bits( format, PIPE_FORMAT_COMP_0 ) + + pf_get_component_bits( format, PIPE_FORMAT_COMP_1 ) + + pf_get_component_bits( format, PIPE_FORMAT_COMP_R ) + + pf_get_component_bits( format, PIPE_FORMAT_COMP_G ) + + pf_get_component_bits( format, PIPE_FORMAT_COMP_B ) + + pf_get_component_bits( format, PIPE_FORMAT_COMP_A ) + + pf_get_component_bits( format, PIPE_FORMAT_COMP_Z ) + + pf_get_component_bits( format, PIPE_FORMAT_COMP_S ); + case PIPE_FORMAT_LAYOUT_YCBCR: + assert( format == PIPE_FORMAT_YCBCR || format == PIPE_FORMAT_YCBCR_REV ); + /* return effective bits per pixel */ + return 16; + default: + assert( 0 ); + return 0; + } +} + +/** + * Return bytes per pixel for the given format. 
+ */ +static INLINE uint pf_get_size( enum pipe_format format ) +{ + assert(pf_get_bits(format) % 8 == 0); + return pf_get_bits(format) / 8; +} + +/** + * Describe accurately the pixel format. + * + * The chars-per-pixel concept falls apart with compressed and yuv images, where + * more than one pixel are coded in a single data block. This structure + * describes that block. + * + * Simple pixel formats are effectively a 1x1xcpp block. + */ +struct pipe_format_block +{ + /** Block size in bytes */ + unsigned size; + + /** Block width in pixels */ + unsigned width; + + /** Block height in pixels */ + unsigned height; +}; + +/** + * Describe pixel format's block. + * + * @sa http://msdn2.microsoft.com/en-us/library/ms796147.aspx + */ +static INLINE void +pf_get_block(enum pipe_format format, struct pipe_format_block *block) +{ + switch(format) { + case PIPE_FORMAT_DXT1_RGBA: + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_SRGBA: + case PIPE_FORMAT_DXT1_SRGB: + block->size = 8; + block->width = 4; + block->height = 4; + break; + case PIPE_FORMAT_DXT3_RGBA: + case PIPE_FORMAT_DXT5_RGBA: + case PIPE_FORMAT_DXT3_SRGBA: + case PIPE_FORMAT_DXT5_SRGBA: + block->size = 16; + block->width = 4; + block->height = 4; + break; + case PIPE_FORMAT_YCBCR: + case PIPE_FORMAT_YCBCR_REV: + block->size = 4; /* 2*cpp */ + block->width = 2; + block->height = 1; + break; + default: + block->size = pf_get_size(format); + block->width = 1; + block->height = 1; + break; + } +} + +static INLINE unsigned +pf_get_nblocksx(const struct pipe_format_block *block, unsigned x) +{ + return (x + block->width - 1)/block->width; +} + +static INLINE unsigned +pf_get_nblocksy(const struct pipe_format_block *block, unsigned y) +{ + return (y + block->height - 1)/block->height; +} + +static INLINE unsigned +pf_get_nblocks(const struct pipe_format_block *block, unsigned width, unsigned height) +{ + return pf_get_nblocksx(block, width)*pf_get_nblocksy(block, height); +} + +static INLINE boolean +pf_is_compressed( enum pipe_format format ) +{ + return pf_layout(format) == PIPE_FORMAT_LAYOUT_DXT ? TRUE : FALSE; +} + +static INLINE boolean +pf_is_ycbcr( enum pipe_format format ) +{ + return pf_layout(format) == PIPE_FORMAT_LAYOUT_YCBCR ? TRUE : FALSE; +} + +static INLINE boolean +pf_has_alpha( enum pipe_format format ) +{ + switch (pf_layout(format)) { + case PIPE_FORMAT_LAYOUT_RGBAZS: + case PIPE_FORMAT_LAYOUT_MIXED: + /* FIXME: pf_get_component_bits( PIPE_FORMAT_A8L8_UNORM, PIPE_FORMAT_COMP_A ) should not return 0 right? */ + if(format == PIPE_FORMAT_A8_UNORM || + format == PIPE_FORMAT_A8L8_UNORM || + format == PIPE_FORMAT_A8L8_SRGB) + return TRUE; + return pf_get_component_bits( format, PIPE_FORMAT_COMP_A ) ? TRUE : FALSE; + case PIPE_FORMAT_LAYOUT_YCBCR: + return FALSE; + case PIPE_FORMAT_LAYOUT_DXT: + switch (format) { + case PIPE_FORMAT_DXT1_RGBA: + case PIPE_FORMAT_DXT3_RGBA: + case PIPE_FORMAT_DXT5_RGBA: + case PIPE_FORMAT_DXT1_SRGBA: + case PIPE_FORMAT_DXT3_SRGBA: + case PIPE_FORMAT_DXT5_SRGBA: + return TRUE; + default: + return FALSE; + } + default: + assert( 0 ); + return FALSE; + } +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gallium/include/pipe/p_inlines.h b/src/gallium/include/pipe/p_inlines.h new file mode 100644 index 0000000000..5e79b7f485 --- /dev/null +++ b/src/gallium/include/pipe/p_inlines.h @@ -0,0 +1,222 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
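As a quick illustration of how the block helpers above compose (a hedged sketch: the helper name is made up and row stride/alignment handling is intentionally omitted), the size in bytes of one tightly packed image can be computed like this:

#include "pipe/p_format.h"

/* Sketch: bytes needed for a tightly packed width x height image.
 * For ordinary formats the block is 1x1 pixels, so this reduces to
 * width * height * pf_get_size(format); for DXT and YCbCr formats the
 * rounding up to whole blocks is what matters (e.g. a 16x16
 * PIPE_FORMAT_DXT1_RGB image is 4x4 blocks of 8 bytes = 128 bytes).
 */
static unsigned
image_size_in_bytes(enum pipe_format format, unsigned width, unsigned height)
{
   struct pipe_format_block block;

   pf_get_block(format, &block);
   return pf_get_nblocksx(&block, width) *
          pf_get_nblocksy(&block, height) *
          block.size;
}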
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef P_INLINES_H +#define P_INLINES_H + +#include "p_context.h" +#include "p_defines.h" +#include "p_screen.h" +#include "p_winsys.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/* XXX: these are a kludge. will fix when all surfaces are views into + * textures, and free-floating winsys surfaces go away. + */ +static INLINE void * +pipe_surface_map( struct pipe_surface *surf, unsigned flags ) +{ + if (surf->texture) { + struct pipe_screen *screen = surf->texture->screen; + return surf->texture->screen->surface_map( screen, surf, flags ); + } + else { + struct pipe_winsys *winsys = surf->winsys; + char *map = (char *)winsys->buffer_map( winsys, surf->buffer, flags ); + if (map == NULL) + return NULL; + return (void *)(map + surf->offset); + } +} + +static INLINE void +pipe_surface_unmap( struct pipe_surface *surf ) +{ + if (surf->texture) { + struct pipe_screen *screen = surf->texture->screen; + surf->texture->screen->surface_unmap( screen, surf ); + } + else { + struct pipe_winsys *winsys = surf->winsys; + winsys->buffer_unmap( winsys, surf->buffer ); + } +} + + + +/** + * Set 'ptr' to point to 'surf' and update reference counting. + * The old thing pointed to, if any, will be unreferenced first. + * 'surf' may be NULL. + */ +static INLINE void +pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf) +{ + /* bump the refcount first */ + if (surf) { + assert(surf->refcount); + surf->refcount++; + } + + if (*ptr) { + assert((*ptr)->refcount); + + /* There are currently two sorts of surfaces... This needs to be + * fixed so that all surfaces are views into a texture. + */ + if ((*ptr)->texture) { + struct pipe_screen *screen = (*ptr)->texture->screen; + screen->tex_surface_release( screen, ptr ); + } + else { + struct pipe_winsys *winsys = (*ptr)->winsys; + winsys->surface_release(winsys, ptr); + } + + assert(!*ptr); + } + + *ptr = surf; +} + + +/* XXX: thread safety issues! 
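The map/unmap pair above is the usual way to touch surface memory from the CPU; the sketch below shows the expected call pattern (an illustrative helper: the byte count is supplied by the caller and no stride handling is attempted).

#include "pipe/p_defines.h"
#include "pipe/p_inlines.h"

/* Sketch: write through a CPU mapping of a surface, then unmap it.
 * PIPE_BUFFER_USAGE_CPU_WRITE comes from p_defines.h; a byte-wise loop is
 * used here only to keep the example free of extra dependencies.
 */
static void
fill_surface_with_zeros(struct pipe_surface *surf, unsigned nbytes)
{
   unsigned char *map =
      (unsigned char *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);

   if (map) {
      unsigned i;
      for (i = 0; i < nbytes; i++)
         map[i] = 0;
      pipe_surface_unmap(surf);   /* every successful map needs an unmap */
   }
}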
+ */ +static INLINE void +winsys_buffer_reference(struct pipe_winsys *winsys, + struct pipe_buffer **ptr, + struct pipe_buffer *buf) +{ + if (buf) { + assert(buf->refcount); + buf->refcount++; + } + + if (*ptr) { + assert((*ptr)->refcount); + if(--(*ptr)->refcount == 0) + winsys->buffer_destroy( winsys, *ptr ); + } + + *ptr = buf; +} + + + +/** + * \sa pipe_surface_reference + */ +static INLINE void +pipe_texture_reference(struct pipe_texture **ptr, + struct pipe_texture *pt) +{ + assert(ptr); + + if (pt) { + assert(pt->refcount); + pt->refcount++; + } + + if (*ptr) { + struct pipe_screen *screen = (*ptr)->screen; + assert(screen); + assert((*ptr)->refcount); + screen->texture_release(screen, ptr); + + assert(!*ptr); + } + + *ptr = pt; +} + + +static INLINE void +pipe_texture_release(struct pipe_texture **ptr) +{ + struct pipe_screen *screen; + assert(ptr); + screen = (*ptr)->screen; + assert((*ptr)->refcount); + screen->texture_release(screen, ptr); + *ptr = NULL; +} + + +/** + * Convenience wrappers for winsys buffer functions. + */ + +static INLINE struct pipe_buffer * +pipe_buffer_create( struct pipe_screen *screen, + unsigned alignment, unsigned usage, unsigned size ) +{ + return screen->winsys->buffer_create(screen->winsys, alignment, usage, size); +} + +static INLINE struct pipe_buffer * +pipe_user_buffer_create( struct pipe_screen *screen, void *ptr, unsigned size ) +{ + return screen->winsys->user_buffer_create(screen->winsys, ptr, size); +} + +static INLINE void * +pipe_buffer_map(struct pipe_screen *screen, + struct pipe_buffer *buf, + unsigned usage) +{ + return screen->winsys->buffer_map(screen->winsys, buf, usage); +} + +static INLINE void +pipe_buffer_unmap(struct pipe_screen *screen, + struct pipe_buffer *buf) +{ + screen->winsys->buffer_unmap(screen->winsys, buf); +} + +/* XXX when we're using this everywhere, get rid of + * winsys_buffer_reference() above. + */ +static INLINE void +pipe_buffer_reference(struct pipe_screen *screen, + struct pipe_buffer **ptr, + struct pipe_buffer *buf) +{ + winsys_buffer_reference(screen->winsys, ptr, buf); +} + + +#ifdef __cplusplus +} +#endif + +#endif /* P_INLINES_H */ diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h new file mode 100644 index 0000000000..3bedc75294 --- /dev/null +++ b/src/gallium/include/pipe/p_screen.h @@ -0,0 +1,139 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
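To show how the convenience wrappers above fit together (a sketch only: error handling is minimal, the alignment value is arbitrary, and a real caller would track ownership with pipe_buffer_reference()), uploading some vertex data might look like this:

#include "pipe/p_defines.h"
#include "pipe/p_inlines.h"

/* Sketch: create a vertex buffer, copy user data into it, unmap it. */
static struct pipe_buffer *
upload_vertex_data(struct pipe_screen *screen,
                   const void *data, unsigned nbytes)
{
   struct pipe_buffer *buf =
      pipe_buffer_create(screen, 32 /* alignment */,
                         PIPE_BUFFER_USAGE_VERTEX, nbytes);

   if (buf) {
      void *map = pipe_buffer_map(screen, buf, PIPE_BUFFER_USAGE_CPU_WRITE);
      if (map) {
         const unsigned char *src = (const unsigned char *) data;
         unsigned char *dst = (unsigned char *) map;
         unsigned i;
         for (i = 0; i < nbytes; i++)   /* byte copy to avoid extra includes */
            dst[i] = src[i];
         pipe_buffer_unmap(screen, buf);
      }
   }
   return buf;
}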
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * + * Screen, Adapter or GPU + * + * These are driver functions/facilities that are context independent. + */ + + +#ifndef P_SCREEN_H +#define P_SCREEN_H + + +#include "pipe/p_compiler.h" +#include "pipe/p_state.h" + + + +#ifdef __cplusplus +extern "C" { +#endif + + + +/** + * Gallium screen/adapter context. Basically everything + * hardware-specific that doesn't actually require a rendering + * context. + */ +struct pipe_screen { + struct pipe_winsys *winsys; + + void (*destroy)( struct pipe_screen * ); + + + const char *(*get_name)( struct pipe_screen * ); + + const char *(*get_vendor)( struct pipe_screen * ); + + /** + * Query an integer-valued capability/parameter/limit + * \param param one of PIPE_CAP_x + */ + int (*get_param)( struct pipe_screen *, int param ); + + /** + * Query a float-valued capability/parameter/limit + * \param param one of PIPE_CAP_x + */ + float (*get_paramf)( struct pipe_screen *, int param ); + + /** + * Check if the given pipe_format is supported as a texture or + * drawing surface. + * \param tex_usage bitmask of PIPE_TEXTURE_USAGE_* + * \param flags bitmask of PIPE_TEXTURE_GEOM_* + */ + boolean (*is_format_supported)( struct pipe_screen *, + enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, + unsigned geom_flags ); + + /** + * Create a new texture object, using the given template info. + */ + struct pipe_texture * (*texture_create)(struct pipe_screen *, + const struct pipe_texture *templat); + + /** + * Create a new texture object, using the given template info, but on top of + * existing memory. + * + * It is assumed that the buffer data is laid out in the layout expected + * by the hardware. NULL will be returned if any inconsistency is found. + */ + struct pipe_texture * (*texture_blanket)(struct pipe_screen *, + const struct pipe_texture *templat, + const unsigned *pitch, + struct pipe_buffer *buffer); + + void (*texture_release)(struct pipe_screen *, + struct pipe_texture **pt); + + /** Get a surface which is a "view" into a texture */ + struct pipe_surface *(*get_tex_surface)(struct pipe_screen *, + struct pipe_texture *texture, + unsigned face, unsigned level, + unsigned zslice, + unsigned usage ); + + /* Surfaces allocated by the above must be released here: + */ + void (*tex_surface_release)( struct pipe_screen *, + struct pipe_surface ** ); + + + void *(*surface_map)( struct pipe_screen *, + struct pipe_surface *surface, + unsigned flags ); + + void (*surface_unmap)( struct pipe_screen *, + struct pipe_surface *surface ); + +}; + + +#ifdef __cplusplus +} +#endif + +#endif /* P_SCREEN_H */ diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h new file mode 100644 index 0000000000..ad43d799f3 --- /dev/null +++ b/src/gallium/include/pipe/p_shader_tokens.h @@ -0,0 +1,811 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
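A typical first use of the screen interface above is a format query before any texture is created; the sketch below sticks to the entry points declared above (the helper name is illustrative), and on success a caller would then fill in a pipe_texture template and hand it to texture_create().

#include "pipe/p_defines.h"
#include "pipe/p_screen.h"

/* Sketch: ask the driver whether A8R8G8B8 can be used as a 2D render
 * target.  tex_usage is a PIPE_TEXTURE_USAGE_* bitmask and geom_flags a
 * PIPE_TEXTURE_GEOM_* bitmask (0 = no special geometry requirements).
 */
static boolean
supports_argb8888_render_target(struct pipe_screen *screen)
{
   return screen->is_format_supported(screen,
                                      PIPE_FORMAT_A8R8G8B8_UNORM,
                                      PIPE_TEXTURE_2D,
                                      PIPE_TEXTURE_USAGE_RENDER_TARGET,
                                      0);
}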
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef TGSI_TOKEN_H +#define TGSI_TOKEN_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "p_compiler.h" + +struct tgsi_version +{ + unsigned MajorVersion : 8; + unsigned MinorVersion : 8; + unsigned Padding : 16; +}; + +struct tgsi_header +{ + unsigned HeaderSize : 8; + unsigned BodySize : 24; +}; + +#define TGSI_PROCESSOR_FRAGMENT 0 +#define TGSI_PROCESSOR_VERTEX 1 +#define TGSI_PROCESSOR_GEOMETRY 2 + +struct tgsi_processor +{ + unsigned Processor : 4; /* TGSI_PROCESSOR_ */ + unsigned Padding : 28; +}; + +#define TGSI_TOKEN_TYPE_DECLARATION 0 +#define TGSI_TOKEN_TYPE_IMMEDIATE 1 +#define TGSI_TOKEN_TYPE_INSTRUCTION 2 + +struct tgsi_token +{ + unsigned Type : 4; /**< TGSI_TOKEN_TYPE_x */ + unsigned Size : 8; /**< UINT */ + unsigned Padding : 19; + unsigned Extended : 1; /**< BOOL */ +}; + +enum tgsi_file_type { + TGSI_FILE_NULL =0, + TGSI_FILE_CONSTANT =1, + TGSI_FILE_INPUT =2, + TGSI_FILE_OUTPUT =3, + TGSI_FILE_TEMPORARY =4, + TGSI_FILE_SAMPLER =5, + TGSI_FILE_ADDRESS =6, + TGSI_FILE_IMMEDIATE =7, + TGSI_FILE_COUNT /**< how many TGSI_FILE_ types */ +}; + + +#define TGSI_WRITEMASK_NONE 0x00 +#define TGSI_WRITEMASK_X 0x01 +#define TGSI_WRITEMASK_Y 0x02 +#define TGSI_WRITEMASK_XY 0x03 +#define TGSI_WRITEMASK_Z 0x04 +#define TGSI_WRITEMASK_XZ 0x05 +#define TGSI_WRITEMASK_YZ 0x06 +#define TGSI_WRITEMASK_XYZ 0x07 +#define TGSI_WRITEMASK_W 0x08 +#define TGSI_WRITEMASK_XW 0x09 +#define TGSI_WRITEMASK_YW 0x0A +#define TGSI_WRITEMASK_XYW 0x0B +#define TGSI_WRITEMASK_ZW 0x0C +#define TGSI_WRITEMASK_XZW 0x0D +#define TGSI_WRITEMASK_YZW 0x0E +#define TGSI_WRITEMASK_XYZW 0x0F + +#define TGSI_INTERPOLATE_CONSTANT 0 +#define TGSI_INTERPOLATE_LINEAR 1 +#define TGSI_INTERPOLATE_PERSPECTIVE 2 +#define TGSI_INTERPOLATE_COUNT 3 + +struct tgsi_declaration +{ + unsigned Type : 4; /**< TGSI_TOKEN_TYPE_DECLARATION */ + unsigned Size : 8; /**< UINT */ + unsigned File : 4; /**< one of TGSI_FILE_x */ + unsigned UsageMask : 4; /**< bitmask of TGSI_WRITEMASK_x flags */ + unsigned Interpolate : 4; /**< one of TGSI_INTERPOLATE_x */ + unsigned Semantic : 1; /**< BOOL, any semantic info? */ + unsigned Centroid : 1; /**< centroid sampling? */ + unsigned Invariant : 1; /**< invariant optimization? 
*/ + unsigned Padding : 4; + unsigned Extended : 1; /**< BOOL */ +}; + +struct tgsi_declaration_range +{ + unsigned First : 16; /**< UINT */ + unsigned Last : 16; /**< UINT */ +}; + +#define TGSI_SEMANTIC_POSITION 0 +#define TGSI_SEMANTIC_COLOR 1 +#define TGSI_SEMANTIC_BCOLOR 2 /**< back-face color */ +#define TGSI_SEMANTIC_FOG 3 +#define TGSI_SEMANTIC_PSIZE 4 +#define TGSI_SEMANTIC_GENERIC 5 +#define TGSI_SEMANTIC_NORMAL 6 +#define TGSI_SEMANTIC_COUNT 7 /**< number of semantic values */ + +struct tgsi_declaration_semantic +{ + unsigned SemanticName : 8; /**< one of TGSI_SEMANTIC_x */ + unsigned SemanticIndex : 16; /**< UINT */ + unsigned Padding : 8; +}; + +#define TGSI_IMM_FLOAT32 0 + +struct tgsi_immediate +{ + unsigned Type : 4; /**< TGSI_TOKEN_TYPE_IMMEDIATE */ + unsigned Size : 8; /**< UINT */ + unsigned DataType : 4; /**< one of TGSI_IMM_x */ + unsigned Padding : 15; + unsigned Extended : 1; /**< BOOL */ +}; + +struct tgsi_immediate_float32 +{ + float Float; +}; + +/* + * GL_NV_vertex_program + */ +#define TGSI_OPCODE_ARL 0 +#define TGSI_OPCODE_MOV 1 +#define TGSI_OPCODE_LIT 2 +#define TGSI_OPCODE_RCP 3 +#define TGSI_OPCODE_RSQ 4 +#define TGSI_OPCODE_EXP 5 +#define TGSI_OPCODE_LOG 6 +#define TGSI_OPCODE_MUL 7 +#define TGSI_OPCODE_ADD 8 +#define TGSI_OPCODE_DP3 9 +#define TGSI_OPCODE_DP4 10 +#define TGSI_OPCODE_DST 11 +#define TGSI_OPCODE_MIN 12 +#define TGSI_OPCODE_MAX 13 +#define TGSI_OPCODE_SLT 14 +#define TGSI_OPCODE_SGE 15 +#define TGSI_OPCODE_MAD 16 + +/* + * GL_ATI_fragment_shader + */ +#define TGSI_OPCODE_SUB 17 +#define TGSI_OPCODE_DOT3 TGSI_OPCODE_DP3 +#define TGSI_OPCODE_DOT4 TGSI_OPCODE_DP4 +#define TGSI_OPCODE_LERP 18 +#define TGSI_OPCODE_CND 19 +#define TGSI_OPCODE_CND0 20 +#define TGSI_OPCODE_DOT2ADD 21 + +/* + * GL_EXT_vertex_shader + */ +#define TGSI_OPCODE_INDEX 22 +#define TGSI_OPCODE_NEGATE 23 +#define TGSI_OPCODE_MADD TGSI_OPCODE_MAD +#define TGSI_OPCODE_FRAC 24 +#define TGSI_OPCODE_SETGE TGSI_OPCODE_SGE +#define TGSI_OPCODE_SETLT TGSI_OPCODE_SLT +#define TGSI_OPCODE_CLAMP 25 +#define TGSI_OPCODE_FLOOR 26 +#define TGSI_OPCODE_ROUND 27 +#define TGSI_OPCODE_EXPBASE2 28 +#define TGSI_OPCODE_LOGBASE2 29 +#define TGSI_OPCODE_POWER 30 +#define TGSI_OPCODE_RECIP TGSI_OPCODE_RCP +#define TGSI_OPCODE_RECIPSQRT TGSI_OPCODE_RSQ +#define TGSI_OPCODE_CROSSPRODUCT 31 +#define TGSI_OPCODE_MULTIPLYMATRIX 32 + +/* + * GL_NV_vertex_program1_1 + */ +#define TGSI_OPCODE_ABS 33 +#define TGSI_OPCODE_RCC 34 +#define TGSI_OPCODE_DPH 35 + +/* + * GL_NV_fragment_program + */ +#define TGSI_OPCODE_COS 36 +#define TGSI_OPCODE_DDX 37 +#define TGSI_OPCODE_DDY 38 +#define TGSI_OPCODE_EX2 TGSI_OPCODE_EXPBASE2 +#define TGSI_OPCODE_FLR TGSI_OPCODE_FLOOR +#define TGSI_OPCODE_FRC TGSI_OPCODE_FRAC +#define TGSI_OPCODE_KILP 39 /* predicated kill */ +#define TGSI_OPCODE_LG2 TGSI_OPCODE_LOGBASE2 +#define TGSI_OPCODE_LRP TGSI_OPCODE_LERP +#define TGSI_OPCODE_PK2H 40 +#define TGSI_OPCODE_PK2US 41 +#define TGSI_OPCODE_PK4B 42 +#define TGSI_OPCODE_PK4UB 43 +#define TGSI_OPCODE_POW TGSI_OPCODE_POWER +#define TGSI_OPCODE_RFL 44 +#define TGSI_OPCODE_SEQ 45 +#define TGSI_OPCODE_SFL 46 +#define TGSI_OPCODE_SGT 47 +#define TGSI_OPCODE_SIN 48 +#define TGSI_OPCODE_SLE 49 +#define TGSI_OPCODE_SNE 50 +#define TGSI_OPCODE_STR 51 +#define TGSI_OPCODE_TEX 52 +#define TGSI_OPCODE_TXD 53 +#define TGSI_OPCODE_TXP 54 +#define TGSI_OPCODE_UP2H 55 +#define TGSI_OPCODE_UP2US 56 +#define TGSI_OPCODE_UP4B 57 +#define TGSI_OPCODE_UP4UB 58 +#define TGSI_OPCODE_X2D 59 + +/* + * GL_NV_vertex_program2 + */ +#define TGSI_OPCODE_ARA 60 
+#define TGSI_OPCODE_ARR 61 +#define TGSI_OPCODE_BRA 62 +#define TGSI_OPCODE_CAL 63 +#define TGSI_OPCODE_RET 64 +#define TGSI_OPCODE_SSG 65 + +/* + * GL_ARB_vertex_program + */ +#define TGSI_OPCODE_SWZ 118 +#define TGSI_OPCODE_XPD TGSI_OPCODE_CROSSPRODUCT + +/* + * GL_ARB_fragment_program + */ +#define TGSI_OPCODE_CMP 66 +#define TGSI_OPCODE_KIL 116 /* conditional kill */ +#define TGSI_OPCODE_SCS 67 +#define TGSI_OPCODE_TXB 68 + +/* + * GL_NV_fragment_program_option + */ +/* No new opcode */ + +/* + * GL_NV_fragment_program2 + */ +#define TGSI_OPCODE_NRM 69 +#define TGSI_OPCODE_DIV 70 +#define TGSI_OPCODE_DP2 71 +#define TGSI_OPCODE_DP2A TGSI_OPCODE_DOT2ADD +#define TGSI_OPCODE_TXL 72 +#define TGSI_OPCODE_BRK 73 +#define TGSI_OPCODE_IF 74 +#define TGSI_OPCODE_LOOP 75 +#define TGSI_OPCODE_REP 76 +#define TGSI_OPCODE_ELSE 77 +#define TGSI_OPCODE_ENDIF 78 +#define TGSI_OPCODE_ENDLOOP 79 +#define TGSI_OPCODE_ENDREP 80 + +/* + * GL_NV_vertex_program2_option + */ + +/* + * GL_NV_vertex_program3 + */ +#define TGSI_OPCODE_PUSHA 81 +#define TGSI_OPCODE_POPA 82 + +/* + * GL_NV_gpu_program4 + */ +#define TGSI_OPCODE_CEIL 83 +#define TGSI_OPCODE_I2F 84 +#define TGSI_OPCODE_NOT 85 +#define TGSI_OPCODE_TRUNC 86 +#define TGSI_OPCODE_SHL 87 +#define TGSI_OPCODE_SHR 88 +#define TGSI_OPCODE_AND 89 +#define TGSI_OPCODE_OR 90 +#define TGSI_OPCODE_MOD 91 +#define TGSI_OPCODE_XOR 92 +#define TGSI_OPCODE_SAD 93 +#define TGSI_OPCODE_TXF 94 +#define TGSI_OPCODE_TXQ 95 +#define TGSI_OPCODE_CONT 96 + +/* + * GL_NV_vertex_program4 + */ +/* Same as GL_NV_gpu_program4 */ + +/* + * GL_NV_fragment_program4 + */ +/* Same as GL_NV_gpu_program4 */ + +/* + * GL_NV_geometry_program4 + */ +/* Same as GL_NV_gpu_program4 */ +#define TGSI_OPCODE_EMIT 97 +#define TGSI_OPCODE_ENDPRIM 98 + +/* + * GLSL + */ +#define TGSI_OPCODE_BGNLOOP2 99 +#define TGSI_OPCODE_BGNSUB 100 +#define TGSI_OPCODE_ENDLOOP2 101 +#define TGSI_OPCODE_ENDSUB 102 +#define TGSI_OPCODE_INT TGSI_OPCODE_TRUNC +#define TGSI_OPCODE_NOISE1 103 +#define TGSI_OPCODE_NOISE2 104 +#define TGSI_OPCODE_NOISE3 105 +#define TGSI_OPCODE_NOISE4 106 +#define TGSI_OPCODE_NOP 107 + +/* + * ps_1_1 + */ +#define TGSI_OPCODE_TEXKILL TGSI_OPCODE_KIL + +/* + * ps_1_2 + */ +/* CMP - use TGSI_OPCODE_CND0 */ + +/* + * ps_1_3 + */ +/* CMP - use TGSI_OPCODE_CND0 */ + +/* + * ps_1_4 + */ +#define TGSI_OPCODE_TEXLD TGSI_OPCODE_TEX + +/* + * ps_2_0 + */ +#define TGSI_OPCODE_M4X4 TGSI_OPCODE_MULTIPLYMATRIX +#define TGSI_OPCODE_M4X3 108 +#define TGSI_OPCODE_M3X4 109 +#define TGSI_OPCODE_M3X3 110 +#define TGSI_OPCODE_M3X2 111 +#define TGSI_OPCODE_CRS TGSI_OPCODE_XPD +#define TGSI_OPCODE_NRM4 112 +#define TGSI_OPCODE_SINCOS TGSI_OPCODE_SCS +#define TGSI_OPCODE_TEXLDB TGSI_OPCODE_TXB +#define TGSI_OPCODE_DP2ADD TGSI_OPCODE_DP2A + +/* + * ps_2_x + */ +#define TGSI_OPCODE_CALL TGSI_OPCODE_CAL +#define TGSI_OPCODE_CALLNZ 113 +#define TGSI_OPCODE_IFC 114 +#define TGSI_OPCODE_BREAK TGSI_OPCODE_BRK +#define TGSI_OPCODE_BREAKC 115 +#define TGSI_OPCODE_DSX TGSI_OPCODE_DDX +#define TGSI_OPCODE_DSY TGSI_OPCODE_DDY +#define TGSI_OPCODE_TEXLDD TGSI_OPCODE_TXD + +/* + * vs_1_1 + */ +#define TGSI_OPCODE_EXPP TGSI_OPCODE_EXP +#define TGSI_OPCODE_LOGP TGSI_OPCODE_LG2 + +/* + * vs_2_0 + */ +#define TGSI_OPCODE_SGN TGSI_OPCODE_SSG +#define TGSI_OPCODE_MOVA TGSI_OPCODE_ARR +/* EXPP - use TGSI_OPCODE_EX2 */ + +/* + * vs_2_x + */ + +#define TGSI_OPCODE_END 117 /* aka HALT */ + +#define TGSI_OPCODE_LAST 119 + +#define TGSI_SAT_NONE 0 /* do not saturate */ +#define TGSI_SAT_ZERO_ONE 1 /* clamp to [0,1] */ +#define 
TGSI_SAT_MINUS_PLUS_ONE 2 /* clamp to [-1,1] */ + +/** + * Opcode is the operation code to execute. A given operation defines the + * semantics of how the source registers (if any) are interpreted and what is + * written to the destination registers (if any) as a result of execution. + * + * NumDstRegs and NumSrcRegs are the number of destination and source registers, + * respectively. For a given operation code, those numbers are fixed and are + * present here only for convenience. + * + * If Extended is TRUE, a tgsi_instruction_ext token follows. + * + * Saturate controls how the final results in the destination registers are modified. + */ + +struct tgsi_instruction +{ + unsigned Type : 4; /* TGSI_TOKEN_TYPE_INSTRUCTION */ + unsigned Size : 8; /* UINT */ + unsigned Opcode : 8; /* TGSI_OPCODE_ */ + unsigned Saturate : 2; /* TGSI_SAT_ */ + unsigned NumDstRegs : 2; /* UINT */ + unsigned NumSrcRegs : 4; /* UINT */ + unsigned Padding : 3; + unsigned Extended : 1; /* BOOL */ +}; + +/* + * If tgsi_instruction::Extended is TRUE, tgsi_instruction_ext follows. + * + * Then, tgsi_instruction::NumDstRegs of tgsi_dst_register follow. + * + * Then, tgsi_instruction::NumSrcRegs of tgsi_src_register follow. + * + * tgsi_instruction::Size contains the total number of words that make the + * instruction, including the instruction word. + */ + +#define TGSI_INSTRUCTION_EXT_TYPE_NV 0 +#define TGSI_INSTRUCTION_EXT_TYPE_LABEL 1 +#define TGSI_INSTRUCTION_EXT_TYPE_TEXTURE 2 +#define TGSI_INSTRUCTION_EXT_TYPE_PREDICATE 3 + +struct tgsi_instruction_ext +{ + unsigned Type : 4; /* TGSI_INSTRUCTION_EXT_TYPE_ */ + unsigned Padding : 27; + unsigned Extended : 1; /* BOOL */ +}; + +/* + * If tgsi_instruction_ext::Type is TGSI_INSTRUCTION_EXT_TYPE_NV, it should + * be cast to tgsi_instruction_ext_nv. + * + * If tgsi_instruction_ext::Type is TGSI_INSTRUCTION_EXT_TYPE_LABEL, it + * should be cast to tgsi_instruction_ext_label. + * + * If tgsi_instruction_ext::Type is TGSI_INSTRUCTION_EXT_TYPE_TEXTURE, it + * should be cast to tgsi_instruction_ext_texture. + * + * If tgsi_instruction_ext::Type is TGSI_INSTRUCTION_EXT_TYPE_PREDICATE, it + * should be cast to tgsi_instruction_ext_predicate. + * + * If tgsi_instruction_ext::Extended is TRUE, another tgsi_instruction_ext + * follows. + */ + +#define TGSI_PRECISION_DEFAULT 0 +#define TGSI_PRECISION_FLOAT32 1 +#define TGSI_PRECISION_FLOAT16 2 +#define TGSI_PRECISION_FIXED12 3 + +#define TGSI_CC_GT 0 +#define TGSI_CC_EQ 1 +#define TGSI_CC_LT 2 +#define TGSI_CC_GE 3 +#define TGSI_CC_LE 4 +#define TGSI_CC_NE 5 +#define TGSI_CC_TR 6 +#define TGSI_CC_FL 7 + +#define TGSI_SWIZZLE_X 0 +#define TGSI_SWIZZLE_Y 1 +#define TGSI_SWIZZLE_Z 2 +#define TGSI_SWIZZLE_W 3 + +/** + * Precision controls the precision at which the operation should be executed. + * + * CondDstUpdate enables condition code register writes. When this field is + * TRUE, CondDstIndex specifies the index of the condition code register to + * update. + * + * CondFlowEnable enables conditional execution of the operation. When this + * field is TRUE, CondFlowIndex specifies the index of the condition code + * register to test against CondMask with component swizzle controlled by + * CondSwizzleX, CondSwizzleY, CondSwizzleZ and CondSwizzleW. If the test fails, + * the operation is not executed. 
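To make the token layout described above concrete, here is a hedged sketch of how a plain MOV (one destination, one source, no extension words) would be filled in; the register choices are arbitrary, the destination/source structs are declared further down in this header, and all three structs are assumed to be zero-initialized by the caller so padding and unused flag bits start out clear.

#include "pipe/p_shader_tokens.h"

/* Sketch: "MOV OUT[0], IN[0]" occupies three 32-bit words in the token
 * stream: the instruction word, then one tgsi_dst_register, then one
 * tgsi_src_register.  Size counts all of them, including the
 * instruction word itself.
 */
static void
fill_mov_tokens(struct tgsi_instruction *inst,
                struct tgsi_dst_register *dst,
                struct tgsi_src_register *src)
{
   inst->Type = TGSI_TOKEN_TYPE_INSTRUCTION;
   inst->Opcode = TGSI_OPCODE_MOV;
   inst->Saturate = TGSI_SAT_NONE;
   inst->NumDstRegs = 1;
   inst->NumSrcRegs = 1;
   inst->Size = 3;
   inst->Extended = FALSE;        /* no tgsi_instruction_ext word follows */

   dst->File = TGSI_FILE_OUTPUT;
   dst->Index = 0;
   dst->WriteMask = TGSI_WRITEMASK_XYZW;

   src->File = TGSI_FILE_INPUT;
   src->Index = 0;
   src->SwizzleX = TGSI_SWIZZLE_X;
   src->SwizzleY = TGSI_SWIZZLE_Y;
   src->SwizzleZ = TGSI_SWIZZLE_Z;
   src->SwizzleW = TGSI_SWIZZLE_W;
}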
+ */ + +struct tgsi_instruction_ext_nv +{ + unsigned Type : 4; /* TGSI_INSTRUCTION_EXT_TYPE_NV */ + unsigned Precision : 4; /* TGSI_PRECISION_ */ + unsigned CondDstIndex : 4; /* UINT */ + unsigned CondFlowIndex : 4; /* UINT */ + unsigned CondMask : 4; /* TGSI_CC_ */ + unsigned CondSwizzleX : 2; /* TGSI_SWIZZLE_ */ + unsigned CondSwizzleY : 2; /* TGSI_SWIZZLE_ */ + unsigned CondSwizzleZ : 2; /* TGSI_SWIZZLE_ */ + unsigned CondSwizzleW : 2; /* TGSI_SWIZZLE_ */ + unsigned CondDstUpdate : 1; /* BOOL */ + unsigned CondFlowEnable : 1; /* BOOL */ + unsigned Padding : 1; + unsigned Extended : 1; /* BOOL */ +}; + +struct tgsi_instruction_ext_label +{ + unsigned Type : 4; /* TGSI_INSTRUCTION_EXT_TYPE_LABEL */ + unsigned Label : 24; /* UINT */ + unsigned Padding : 3; + unsigned Extended : 1; /* BOOL */ +}; + +#define TGSI_TEXTURE_UNKNOWN 0 +#define TGSI_TEXTURE_1D 1 +#define TGSI_TEXTURE_2D 2 +#define TGSI_TEXTURE_3D 3 +#define TGSI_TEXTURE_CUBE 4 +#define TGSI_TEXTURE_RECT 5 +#define TGSI_TEXTURE_SHADOW1D 6 +#define TGSI_TEXTURE_SHADOW2D 7 +#define TGSI_TEXTURE_SHADOWRECT 8 +#define TGSI_TEXTURE_COUNT 9 + +struct tgsi_instruction_ext_texture +{ + unsigned Type : 4; /* TGSI_INSTRUCTION_EXT_TYPE_TEXTURE */ + unsigned Texture : 8; /* TGSI_TEXTURE_ */ + unsigned Padding : 19; + unsigned Extended : 1; /* BOOL */ +}; + +struct tgsi_instruction_ext_predicate +{ + unsigned Type : 4; /* TGSI_INSTRUCTION_EXT_TYPE_PREDICATE */ + unsigned PredDstIndex : 4; /* UINT */ + unsigned PredWriteMask : 4; /* TGSI_WRITEMASK_ */ + unsigned Padding : 19; + unsigned Extended : 1; /* BOOL */ +}; + +/** + * File specifies the register array to access. + * + * Index specifies the element number of a register in the register file. + * + * If Indirect is TRUE, Index should be offset by the X component of a source + * register that follows. The register can be now fetched into local storage + * for further processing. + * + * If Negate is TRUE, all components of the fetched register are negated. + * + * The fetched register components are swizzled according to SwizzleX, SwizzleY, + * SwizzleZ and SwizzleW. + * + * If Extended is TRUE, any further modifications to the source register are + * made to this temporary storage. + */ + +struct tgsi_src_register +{ + unsigned File : 4; /* TGSI_FILE_ */ + unsigned SwizzleX : 2; /* TGSI_SWIZZLE_ */ + unsigned SwizzleY : 2; /* TGSI_SWIZZLE_ */ + unsigned SwizzleZ : 2; /* TGSI_SWIZZLE_ */ + unsigned SwizzleW : 2; /* TGSI_SWIZZLE_ */ + unsigned Negate : 1; /* BOOL */ + unsigned Indirect : 1; /* BOOL */ + unsigned Dimension : 1; /* BOOL */ + int Index : 16; /* SINT */ + unsigned Extended : 1; /* BOOL */ +}; + +/** + * If tgsi_src_register::Extended is TRUE, tgsi_src_register_ext follows. + * + * Then, if tgsi_src_register::Indirect is TRUE, another tgsi_src_register + * follows. + * + * Then, if tgsi_src_register::Dimension is TRUE, tgsi_dimension follows. + */ + +#define TGSI_SRC_REGISTER_EXT_TYPE_SWZ 0 +#define TGSI_SRC_REGISTER_EXT_TYPE_MOD 1 + +struct tgsi_src_register_ext +{ + unsigned Type : 4; /* TGSI_SRC_REGISTER_EXT_TYPE_ */ + unsigned Padding : 27; + unsigned Extended : 1; /* BOOL */ +}; + +/** + * If tgsi_src_register_ext::Type is TGSI_SRC_REGISTER_EXT_TYPE_SWZ, + * it should be cast to tgsi_src_register_ext_swz. + * + * If tgsi_src_register_ext::Type is TGSI_SRC_REGISTER_EXT_TYPE_MOD, + * it should be cast to tgsi_src_register_ext_mod. + * + * If tgsi_dst_register_ext::Extended is TRUE, another tgsi_dst_register_ext + * follows. 
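Instructions that need extra information chain extension words exactly as described above; the sketch below shows a 2D texture fetch, where the texture target travels in a tgsi_instruction_ext_texture word between the instruction and its register words. The two-source coordinate/sampler convention and the helper name are assumptions here, and the structs are again assumed to be zero-initialized by the caller.

#include "pipe/p_shader_tokens.h"

/* Sketch: a TEX instruction with an extension word.  Token order is:
 * instruction word (Extended = TRUE), tgsi_instruction_ext_texture,
 * then the destination and source register words.
 */
static unsigned
fill_tex_2d_tokens(struct tgsi_instruction *inst,
                   struct tgsi_instruction_ext_texture *tex)
{
   inst->Type = TGSI_TOKEN_TYPE_INSTRUCTION;
   inst->Opcode = TGSI_OPCODE_TEX;
   inst->Saturate = TGSI_SAT_NONE;
   inst->NumDstRegs = 1;
   inst->NumSrcRegs = 2;          /* coordinate + sampler (assumed convention) */
   inst->Extended = TRUE;         /* tgsi_instruction_ext_texture follows */

   tex->Type = TGSI_INSTRUCTION_EXT_TYPE_TEXTURE;
   tex->Texture = TGSI_TEXTURE_2D;
   tex->Extended = FALSE;         /* no further extension words */

   /* instruction + extension + 1 dst + 2 src = 5 words in total */
   inst->Size = 5;
   return inst->Size;
}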
+ */ + +#define TGSI_EXTSWIZZLE_X TGSI_SWIZZLE_X +#define TGSI_EXTSWIZZLE_Y TGSI_SWIZZLE_Y +#define TGSI_EXTSWIZZLE_Z TGSI_SWIZZLE_Z +#define TGSI_EXTSWIZZLE_W TGSI_SWIZZLE_W +#define TGSI_EXTSWIZZLE_ZERO 4 +#define TGSI_EXTSWIZZLE_ONE 5 + +/** + * ExtSwizzleX, ExtSwizzleY, ExtSwizzleZ and ExtSwizzleW swizzle the source + * register in an extended manner. + * + * NegateX, NegateY, NegateZ and NegateW negate individual components of the + * source register. + * + * NOTE: To simplify matter, if this token is present, the corresponding Swizzle + * and Negate fields in tgsi_src_register should be set to X,Y,Z,W + * and FALSE, respectively. + */ + +struct tgsi_src_register_ext_swz +{ + unsigned Type : 4; /* TGSI_SRC_REGISTER_EXT_TYPE_SWZ */ + unsigned ExtSwizzleX : 4; /* TGSI_EXTSWIZZLE_ */ + unsigned ExtSwizzleY : 4; /* TGSI_EXTSWIZZLE_ */ + unsigned ExtSwizzleZ : 4; /* TGSI_EXTSWIZZLE_ */ + unsigned ExtSwizzleW : 4; /* TGSI_EXTSWIZZLE_ */ + unsigned NegateX : 1; /* BOOL */ + unsigned NegateY : 1; /* BOOL */ + unsigned NegateZ : 1; /* BOOL */ + unsigned NegateW : 1; /* BOOL */ + unsigned Padding : 7; + unsigned Extended : 1; /* BOOL */ +}; + +/** + * Extra src register modifiers + * + * If Complement is TRUE, the source register is modified by subtracting it + * from 1.0. + * + * If Bias is TRUE, the source register is modified by subtracting 0.5 from it. + * + * If Scale2X is TRUE, the source register is modified by multiplying it by 2.0. + * + * If Absolute is TRUE, the source register is modified by removing the sign. + * + * If Negate is TRUE, the source register is modified by negating it. + */ + +struct tgsi_src_register_ext_mod +{ + unsigned Type : 4; /* TGSI_SRC_REGISTER_EXT_TYPE_MOD */ + unsigned Complement : 1; /* BOOL */ + unsigned Bias : 1; /* BOOL */ + unsigned Scale2X : 1; /* BOOL */ + unsigned Absolute : 1; /* BOOL */ + unsigned Negate : 1; /* BOOL */ + unsigned Padding : 22; + unsigned Extended : 1; /* BOOL */ +}; + +struct tgsi_dimension +{ + unsigned Indirect : 1; /* BOOL */ + unsigned Dimension : 1; /* BOOL */ + unsigned Padding : 13; + int Index : 16; /* SINT */ + unsigned Extended : 1; /* BOOL */ +}; + +struct tgsi_dst_register +{ + unsigned File : 4; /* TGSI_FILE_ */ + unsigned WriteMask : 4; /* TGSI_WRITEMASK_ */ + unsigned Indirect : 1; /* BOOL */ + unsigned Dimension : 1; /* BOOL */ + int Index : 16; /* SINT */ + unsigned Padding : 5; + unsigned Extended : 1; /* BOOL */ +}; + +/* + * If tgsi_dst_register::Extended is TRUE, tgsi_dst_register_ext follows. + * + * Then, if tgsi_dst_register::Indirect is TRUE, tgsi_src_register follows. + */ + +#define TGSI_DST_REGISTER_EXT_TYPE_CONDCODE 0 +#define TGSI_DST_REGISTER_EXT_TYPE_MODULATE 1 +#define TGSI_DST_REGISTER_EXT_TYPE_PREDICATE 2 + +struct tgsi_dst_register_ext +{ + unsigned Type : 4; /* TGSI_DST_REGISTER_EXT_TYPE_ */ + unsigned Padding : 27; + unsigned Extended : 1; /* BOOL */ +}; + +/** + * Extra destination register modifiers + * + * If tgsi_dst_register_ext::Type is TGSI_DST_REGISTER_EXT_TYPE_CONDCODE, + * it should be cast to tgsi_dst_register_ext_condcode. + * + * If tgsi_dst_register_ext::Type is TGSI_DST_REGISTER_EXT_TYPE_MODULATE, + * it should be cast to tgsi_dst_register_ext_modulate. + * + * If tgsi_dst_register_ext::Type is TGSI_DST_REGISTER_EXT_TYPE_PREDICATE, + * it should be cast to tgsi_dst_register_ext_predicate. + * + * If tgsi_dst_register_ext::Extended is TRUE, another tgsi_dst_register_ext + * follows. 
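Taken together, the extended swizzle and the modifier flags amount to a simple per-component transform. A sketch of that transform, assuming TGSI_EXTSWIZZLE_ZERO and TGSI_EXTSWIZZLE_ONE select the constants 0.0 and 1.0, and applying the modifiers in the order the fields are documented (the actual evaluation order is defined by the executor, not by this header):

#include <math.h>

/* Select one component through an extended swizzle; 0..3 are X..W per the
 * TGSI_SWIZZLE_ defines, 4 and 5 are the EXTSWIZZLE ZERO/ONE constants. */
static float
ext_swizzle(const float comp[4], unsigned extswz)
{
   switch (extswz) {
   case 4:  return 0.0f;   /* TGSI_EXTSWIZZLE_ZERO */
   case 5:  return 1.0f;   /* TGSI_EXTSWIZZLE_ONE  */
   default: return comp[extswz];
   }
}

/* Apply the tgsi_src_register_ext_mod flags to one value. */
static float
apply_src_mod(float v, unsigned complement, unsigned bias,
              unsigned scale2x, unsigned absolute, unsigned negate)
{
   if (complement) v = 1.0f - v;
   if (bias)       v = v - 0.5f;
   if (scale2x)    v = v * 2.0f;
   if (absolute)   v = fabsf(v);
   if (negate)     v = -v;
   return v;
}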
+ */ +struct tgsi_dst_register_ext_concode +{ + unsigned Type : 4; /* TGSI_DST_REGISTER_EXT_TYPE_CONDCODE */ + unsigned CondMask : 4; /* TGSI_CC_ */ + unsigned CondSwizzleX : 2; /* TGSI_SWIZZLE_ */ + unsigned CondSwizzleY : 2; /* TGSI_SWIZZLE_ */ + unsigned CondSwizzleZ : 2; /* TGSI_SWIZZLE_ */ + unsigned CondSwizzleW : 2; /* TGSI_SWIZZLE_ */ + unsigned CondSrcIndex : 4; /* UINT */ + unsigned Padding : 11; + unsigned Extended : 1; /* BOOL */ +}; + +#define TGSI_MODULATE_1X 0 +#define TGSI_MODULATE_2X 1 +#define TGSI_MODULATE_4X 2 +#define TGSI_MODULATE_8X 3 +#define TGSI_MODULATE_HALF 4 +#define TGSI_MODULATE_QUARTER 5 +#define TGSI_MODULATE_EIGHTH 6 +#define TGSI_MODULATE_COUNT 7 + +struct tgsi_dst_register_ext_modulate +{ + unsigned Type : 4; /* TGSI_DST_REGISTER_EXT_TYPE_MODULATE */ + unsigned Modulate : 4; /* TGSI_MODULATE_ */ + unsigned Padding : 23; + unsigned Extended : 1; /* BOOL */ +}; + +/* + * Currently, the following constraints apply. + * + * - PredSwizzleXYZW is either set to identity or replicate. + * - PredSrcIndex is 0. + */ + +struct tgsi_dst_register_ext_predicate +{ + unsigned Type : 4; /* TGSI_DST_REGISTER_EXT_TYPE_PREDICATE */ + unsigned PredSwizzleX : 2; /* TGSI_SWIZZLE_ */ + unsigned PredSwizzleY : 2; /* TGSI_SWIZZLE_ */ + unsigned PredSwizzleZ : 2; /* TGSI_SWIZZLE_ */ + unsigned PredSwizzleW : 2; /* TGSI_SWIZZLE_ */ + unsigned PredSrcIndex : 4; /* UINT */ + unsigned Negate : 1; /* BOOL */ + unsigned Padding : 14; + unsigned Extended : 1; /* BOOL */ +}; + + +#ifdef __cplusplus +} +#endif + +#endif /* TGSI_TOKEN_H */ + diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h new file mode 100644 index 0000000000..342f17260a --- /dev/null +++ b/src/gallium/include/pipe/p_state.h @@ -0,0 +1,368 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * @file + * + * Abstract graphics pipe state objects. + * + * Basic notes: + * 1. Want compact representations, so we use bitfields. + * 2. Put bitfields before other (GLfloat) fields. 
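One way to keep an eye on the compact-representation goal noted above is a C89-style compile-time size check, since the portability guidelines rule out C99 static assertions. The expected size below is an illustrative assumption about how pipe_scissor_state, declared further down in this header, packs; it is not something the header promises:

#include "pipe/p_state.h"

/* Fails to compile (negative array size) if the packing assumption breaks:
 * four 16-bit bitfields are expected to pack into two 32-bit words. */
typedef char scissor_state_is_two_words[
   sizeof(struct pipe_scissor_state) == 2 * sizeof(unsigned) ? 1 : -1];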
+ */ + + +#ifndef PIPE_STATE_H +#define PIPE_STATE_H + +#include "p_compiler.h" +#include "p_defines.h" +#include "p_format.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** + * Implementation limits + */ +#define PIPE_MAX_ATTRIBS 32 +#define PIPE_MAX_CLIP_PLANES 6 +#define PIPE_MAX_COLOR_BUFS 8 +#define PIPE_MAX_CONSTANT 32 +#define PIPE_MAX_SAMPLERS 16 +#define PIPE_MAX_SHADER_INPUTS 16 +#define PIPE_MAX_SHADER_OUTPUTS 16 +#define PIPE_MAX_TEXTURE_LEVELS 16 + + +/* fwd decls */ +struct pipe_screen; +struct pipe_surface; +struct pipe_winsys; + + + +/** + * The driver will certainly subclass this to include actual memory + * management information. + */ +struct pipe_buffer +{ + unsigned alignment; + unsigned usage; + unsigned size; + + /** Reference count */ + unsigned refcount; +}; + + +/** + * Primitive (point/line/tri) rasterization info + */ +struct pipe_rasterizer_state +{ + unsigned flatshade:1; + unsigned light_twoside:1; + unsigned front_winding:2; /**< PIPE_WINDING_x */ + unsigned cull_mode:2; /**< PIPE_WINDING_x */ + unsigned fill_cw:2; /**< PIPE_POLYGON_MODE_x */ + unsigned fill_ccw:2; /**< PIPE_POLYGON_MODE_x */ + unsigned offset_cw:1; + unsigned offset_ccw:1; + unsigned scissor:1; + unsigned poly_smooth:1; + unsigned poly_stipple_enable:1; + unsigned point_smooth:1; + unsigned point_sprite:1; + unsigned point_size_per_vertex:1; /**< size computed in vertex shader */ + unsigned multisample:1; /* XXX maybe more ms state in future */ + unsigned line_smooth:1; + unsigned line_stipple_enable:1; + unsigned line_stipple_factor:8; /**< [1..256] actually */ + unsigned line_stipple_pattern:16; + unsigned line_last_pixel:1; + unsigned bypass_clipping:1; + unsigned bypass_vs:1; /**< Skip the vertex shader. Note that the shader is + still needed though, to indicate inputs/outputs */ + unsigned origin_lower_left:1; /**< Is (0,0) the lower-left corner? */ + unsigned flatshade_first:1; /**< take color attribute from the first vertex of a primitive */ + unsigned gl_rasterization_rules:1; /**< enable tweaks for GL rasterization? */ + + float line_width; + float point_size; /**< used when no per-vertex size */ + float point_size_min; /* XXX - temporary, will go away */ + float point_size_max; /* XXX - temporary, will go away */ + float offset_units; + float offset_scale; + ubyte sprite_coord_mode[PIPE_MAX_SHADER_OUTPUTS]; /**< PIPE_SPRITE_COORD_ */ +}; + + +struct pipe_poly_stipple +{ + unsigned stipple[32]; +}; + + +struct pipe_viewport_state +{ + float scale[4]; + float translate[4]; +}; + + +struct pipe_scissor_state +{ + unsigned minx:16; + unsigned miny:16; + unsigned maxx:16; + unsigned maxy:16; +}; + + +struct pipe_clip_state +{ + float ucp[PIPE_MAX_CLIP_PLANES][4]; + unsigned nr; +}; + + +/** + * Constants for vertex/fragment shaders + */ +struct pipe_constant_buffer +{ + struct pipe_buffer *buffer; + unsigned size; /** in bytes (XXX: redundant!) */ +}; + + +struct pipe_shader_state +{ + const struct tgsi_token *tokens; +}; + + +struct pipe_depth_state { + unsigned enabled:1; /**< depth test enabled? */ + unsigned writemask:1; /**< allow depth buffer writes? */ + unsigned func:3; /**< depth test func (PIPE_FUNC_x) */ + unsigned occlusion_count:1; /**< do occlusion counting? 
*/ +}; + + +struct pipe_stencil_state { + unsigned enabled:1; /**< stencil[0]: stencil enabled, stencil[1]: two-side enabled */ + unsigned func:3; /**< PIPE_FUNC_x */ + unsigned fail_op:3; /**< PIPE_STENCIL_OP_x */ + unsigned zpass_op:3; /**< PIPE_STENCIL_OP_x */ + unsigned zfail_op:3; /**< PIPE_STENCIL_OP_x */ + ubyte ref_value; + ubyte value_mask; + ubyte write_mask; +}; + + +struct pipe_alpha_state { + unsigned enabled:1; + unsigned func:3; /**< PIPE_FUNC_x */ + float ref; /**< reference value */ +}; + + +struct pipe_depth_stencil_alpha_state +{ + struct pipe_depth_state depth; + struct pipe_stencil_state stencil[2]; /**< [0] = front, [1] = back */ + struct pipe_alpha_state alpha; +}; + + +struct pipe_blend_state +{ + unsigned blend_enable:1; + + unsigned rgb_func:3; /**< PIPE_BLEND_x */ + unsigned rgb_src_factor:5; /**< PIPE_BLENDFACTOR_x */ + unsigned rgb_dst_factor:5; /**< PIPE_BLENDFACTOR_x */ + + unsigned alpha_func:3; /**< PIPE_BLEND_x */ + unsigned alpha_src_factor:5; /**< PIPE_BLENDFACTOR_x */ + unsigned alpha_dst_factor:5; /**< PIPE_BLENDFACTOR_x */ + + unsigned logicop_enable:1; + unsigned logicop_func:4; /**< PIPE_LOGICOP_x */ + + unsigned colormask:4; /**< bitmask of PIPE_MASK_R/G/B/A */ + unsigned dither:1; +}; + + +struct pipe_blend_color +{ + float color[4]; +}; + + +struct pipe_framebuffer_state +{ + unsigned width, height; + + /** multiple colorbuffers for multiple render targets */ + unsigned num_cbufs; + struct pipe_surface *cbufs[PIPE_MAX_COLOR_BUFS]; + + struct pipe_surface *zsbuf; /**< Z/stencil buffer */ +}; + + +/** + * Texture sampler state. + */ +struct pipe_sampler_state +{ + unsigned wrap_s:3; /**< PIPE_TEX_WRAP_x */ + unsigned wrap_t:3; /**< PIPE_TEX_WRAP_x */ + unsigned wrap_r:3; /**< PIPE_TEX_WRAP_x */ + unsigned min_img_filter:2; /**< PIPE_TEX_FILTER_x */ + unsigned min_mip_filter:2; /**< PIPE_TEX_MIPFILTER_x */ + unsigned mag_img_filter:2; /**< PIPE_TEX_FILTER_x */ + unsigned compare_mode:1; /**< PIPE_TEX_COMPARE_x */ + unsigned compare_func:3; /**< PIPE_FUNC_x */ + unsigned normalized_coords:1; /**< Are coords normalized to [0,1]? */ + unsigned prefilter:4; /**< Wierd sampling state exposed by some api's */ + float shadow_ambient; /**< shadow test fail color/intensity */ + float lod_bias; /**< LOD/lambda bias */ + float min_lod, max_lod; /**< LOD clamp range, after bias */ + float border_color[4]; + float max_anisotropy; +}; + + +/** + * 2D surface. This is basically a view into a memory buffer. + * May be a renderbuffer, texture mipmap level, etc. + */ +struct pipe_surface +{ + struct pipe_buffer *buffer; /**< surface's buffer/memory */ + enum pipe_format format; /**< PIPE_FORMAT_x */ + unsigned status; /**< PIPE_SURFACE_STATUS_x */ + unsigned clear_value; /**< XXX may be temporary */ + unsigned width; /**< logical width in pixels */ + unsigned height; /**< logical height in pixels */ + struct pipe_format_block block; + unsigned nblocksx; /**< allocated width in blocks */ + unsigned nblocksy; /**< allocated height in blocks */ + unsigned stride; /**< stride in bytes between rows of blocks */ + unsigned layout; /**< PIPE_SURFACE_LAYOUT_x */ + unsigned offset; /**< offset from start of buffer, in bytes */ + unsigned refcount; + unsigned usage; /**< PIPE_BUFFER_USAGE_* */ + + struct pipe_winsys *winsys; /**< winsys which owns/created the surface */ + + struct pipe_texture *texture; /**< optional texture into which this is a view */ + unsigned face; + unsigned level; + unsigned zslice; +}; + + +/** + * Texture object. 
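The pipe_surface fields above combine into straightforward byte addressing. A sketch under the assumption that block.size is the byte size of one block, as the stride / block.size computations in the g3dvl code later in this change suggest (both helpers are hypothetical, not part of p_state.h):

#include "pipe/p_state.h"

/* Byte offset of block (bx, by) within the surface's buffer. */
static unsigned
surface_block_offset(const struct pipe_surface *surf, unsigned bx, unsigned by)
{
   return surf->offset + by * surf->stride + bx * surf->block.size;
}

/* Bytes covered by the surface's data: one stride per allocated block row. */
static unsigned
surface_data_bytes(const struct pipe_surface *surf)
{
   return surf->nblocksy * surf->stride;
}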
+ */ +struct pipe_texture +{ + enum pipe_texture_target target; /**< PIPE_TEXTURE_x */ + enum pipe_format format; /**< PIPE_FORMAT_x */ + + unsigned width[PIPE_MAX_TEXTURE_LEVELS]; + unsigned height[PIPE_MAX_TEXTURE_LEVELS]; + unsigned depth[PIPE_MAX_TEXTURE_LEVELS]; + + struct pipe_format_block block; + unsigned nblocksx[PIPE_MAX_TEXTURE_LEVELS]; /**< allocated width in blocks */ + unsigned nblocksy[PIPE_MAX_TEXTURE_LEVELS]; /**< allocated height in blocks */ + + unsigned last_level:8; /**< Index of last mipmap level present/defined */ + unsigned compressed:1; + + unsigned nr_samples:8; /**< for multisampled surfaces, nr of samples */ + + unsigned tex_usage; /* PIPE_TEXTURE_USAGE_* */ + + /* These are also refcounted: + */ + unsigned refcount; + + struct pipe_screen *screen; /**< screen that this texture belongs to */ +}; + + +/** + * A vertex buffer. Typically, all the vertex data/attributes for + * drawing something will be in one buffer. But it's also possible, for + * example, to put colors in one buffer and texcoords in another. + */ +struct pipe_vertex_buffer +{ + unsigned pitch; /**< stride to same attrib in next vertex, in bytes */ + unsigned max_index; /**< number of vertices in this buffer */ + unsigned buffer_offset; /**< offset to start of data in buffer, in bytes */ + struct pipe_buffer *buffer; /**< the actual buffer */ +}; + + +/** + * Information to describe a vertex attribute (position, color, etc) + */ +struct pipe_vertex_element +{ + /** Offset of this attribute, in bytes, from the start of the vertex */ + unsigned src_offset; + + /** Which vertex_buffer (as given to pipe->set_vertex_buffer()) does + * this attribute live in? + */ + unsigned vertex_buffer_index:8; + unsigned nr_components:8; + + enum pipe_format src_format; /**< PIPE_FORMAT_* */ +}; + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gallium/include/pipe/p_thread.h b/src/gallium/include/pipe/p_thread.h new file mode 100644 index 0000000000..8af3cd958b --- /dev/null +++ b/src/gallium/include/pipe/p_thread.h @@ -0,0 +1,276 @@ +/************************************************************************** + * + * Copyright 1999-2006 Brian Paul + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +/** + * @file + * + * Thread, mutex, condition var and thread-specific data functions. 
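Before the per-platform definitions, a usage sketch may make the intended call-site shape clearer. It assumes one of the pthreads or Windows paths below, since the dummy fallback defines no pipe_thread_create(); worker and counter are made-up names:

#include "pipe/p_thread.h"

static pipe_mutex counter_mutex;
static int counter = 0;

/* Thread entry point, declared through the portability macro. */
static PIPE_THREAD_ROUTINE(worker, param)
{
   pipe_mutex_lock(counter_mutex);
   counter++;
   pipe_mutex_unlock(counter_mutex);
   return NULL;
}

static void
run_worker(void)
{
   pipe_thread t;

   pipe_mutex_init(counter_mutex);
   t = pipe_thread_create(worker, NULL);
   pipe_thread_wait(t);
   pipe_mutex_destroy(counter_mutex);
}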
+ */ + + +#ifndef _P_THREAD2_H_ +#define _P_THREAD2_H_ + + +#include "pipe/p_compiler.h" + + +#if defined(PIPE_OS_LINUX) + +#include <pthread.h> /* POSIX threads headers */ +#include <stdio.h> /* for perror() */ + +typedef pthread_t pipe_thread; + +#define PIPE_THREAD_ROUTINE( name, param ) \ + void *name( void *param ) + +static INLINE pipe_thread pipe_thread_create( void *(* routine)( void *), void *param ) +{ + pipe_thread thread; + if (pthread_create( &thread, NULL, routine, param )) + return 0; + return thread; +} + +static INLINE int pipe_thread_wait( pipe_thread thread ) +{ + return pthread_join( thread, NULL ); +} + +static INLINE int pipe_thread_destroy( pipe_thread thread ) +{ + return pthread_detach( thread ); +} + +typedef pthread_mutex_t pipe_mutex; +typedef pthread_cond_t pipe_condvar; + +#define pipe_static_mutex(mutex) \ + static pipe_mutex mutex = PTHREAD_MUTEX_INITIALIZER + +#define pipe_mutex_init(mutex) \ + pthread_mutex_init(&(mutex), NULL) + +#define pipe_mutex_destroy(mutex) \ + pthread_mutex_destroy(&(mutex)) + +#define pipe_mutex_lock(mutex) \ + (void) pthread_mutex_lock(&(mutex)) + +#define pipe_mutex_unlock(mutex) \ + (void) pthread_mutex_unlock(&(mutex)) + +#define pipe_static_condvar(mutex) \ + static pipe_condvar mutex = PTHREAD_COND_INITIALIZER + +#define pipe_condvar_init(cond) \ + pthread_cond_init(&(cond), NULL) + +#define pipe_condvar_destroy(cond) \ + pthread_cond_destroy(&(cond)) + +#define pipe_condvar_wait(cond, mutex) \ + pthread_cond_wait(&(cond), &(mutex)) + +#define pipe_condvar_signal(cond) \ + pthread_cond_signal(&(cond)) + +#define pipe_condvar_broadcast(cond) \ + pthread_cond_broadcast(&(cond)) + + +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) + +#include <windows.h> + +typedef HANDLE pipe_thread; + +#define PIPE_THREAD_ROUTINE( name, param ) \ + void * WINAPI name( void *param ) + +static INLINE pipe_thread pipe_thread_create( void *(WINAPI * routine)( void *), void *param ) +{ + DWORD id; + return CreateThread( NULL, 0, (LPTHREAD_START_ROUTINE) routine, param, 0, &id ); +} + +static INLINE int pipe_thread_wait( pipe_thread thread ) +{ + if (WaitForSingleObject( thread, INFINITE ) == WAIT_OBJECT_0) + return 0; + return -1; +} + +static INLINE int pipe_thread_destroy( pipe_thread thread ) +{ + if (CloseHandle( thread )) + return 0; + return -1; +} + +typedef CRITICAL_SECTION pipe_mutex; + +#define pipe_static_mutex(name) \ + /*static*/ pipe_mutex name = {0,0,0,0,0,0} + +#define pipe_mutex_init(name) \ + InitializeCriticalSection(&name) + +#define pipe_mutex_destroy(name) \ + DeleteCriticalSection(&name) + +#define pipe_mutex_lock(name) \ + EnterCriticalSection(&name) + +#define pipe_mutex_unlock(name) \ + LeaveCriticalSection(&name) + +/* XXX: dummy definitions, make it compile */ + +typedef unsigned pipe_condvar; + +#define pipe_condvar_init(condvar) \ + (void) condvar + +#define pipe_condvar_broadcast(condvar) \ + (void) condvar + +#else + +/** Dummy definitions */ + +typedef unsigned pipe_thread; +typedef unsigned pipe_mutex; +typedef unsigned pipe_condvar; + +#define pipe_static_mutex(mutex) \ + static pipe_mutex mutex = 0 + +#define pipe_mutex_init(mutex) \ + (void) mutex + +#define pipe_mutex_destroy(mutex) \ + (void) mutex + +#define pipe_mutex_lock(mutex) \ + (void) mutex + +#define pipe_mutex_unlock(mutex) \ + (void) mutex + +#define pipe_static_condvar(condvar) \ + static unsigned condvar = 0 + +#define pipe_condvar_init(condvar) \ + (void) condvar + +#define pipe_condvar_destroy(condvar) \ + (void) condvar + +#define 
pipe_condvar_wait(condvar, mutex) \ + (void) condvar + +#define pipe_condvar_signal(condvar) \ + (void) condvar + +#define pipe_condvar_broadcast(condvar) \ + (void) condvar + + +#endif /* PIPE_OS_? */ + + + +/* + * Thread-specific data. + */ + +typedef struct { +#if defined(PIPE_OS_LINUX) + pthread_key_t key; +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) + DWORD key; +#endif + int initMagic; +} pipe_tsd; + + +#define PIPE_TSD_INIT_MAGIC 0xff8adc98 + + +static INLINE void +pipe_tsd_init(pipe_tsd *tsd) +{ +#if defined(PIPE_OS_LINUX) + if (pthread_key_create(&tsd->key, NULL/*free*/) != 0) { + perror("pthread_key_create(): failed to allocate key for thread specific data"); + exit(-1); + } +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) + assert(0); +#endif + tsd->initMagic = PIPE_TSD_INIT_MAGIC; +} + +static INLINE void * +pipe_tsd_get(pipe_tsd *tsd) +{ + if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) { + pipe_tsd_init(tsd); + } +#if defined(PIPE_OS_LINUX) + return pthread_getspecific(tsd->key); +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) + assert(0); + return NULL; +#else + assert(0); + return NULL; +#endif +} + +static INLINE void +pipe_tsd_set(pipe_tsd *tsd, void *value) +{ + if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) { + pipe_tsd_init(tsd); + } +#if defined(PIPE_OS_LINUX) + if (pthread_setspecific(tsd->key, value) != 0) { + perror("pthread_set_specific() failed"); + exit(-1); + } +#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER) + assert(0); +#else + assert(0); +#endif +} + + + +#endif /* _P_THREAD2_H_ */ diff --git a/src/gallium/include/pipe/p_winsys.h b/src/gallium/include/pipe/p_winsys.h new file mode 100644 index 0000000000..5d18291dc6 --- /dev/null +++ b/src/gallium/include/pipe/p_winsys.h @@ -0,0 +1,186 @@ + /************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * \file + * This is the interface that Gallium3D requires any window system + * hosting it to implement. This is the only include file in Gallium3D + * which is public. + */ + +#ifndef P_WINSYS_H +#define P_WINSYS_H + + +#include "p_format.h" + + +#ifdef __cplusplus +extern "C" { +#endif + + +/** Opaque type */ +struct pipe_fence_handle; + +struct pipe_surface; + + +/** + * Gallium3D drivers are (meant to be!) 
independent of both GL and the + * window system. The window system provides a buffer manager and a + * set of additional hooks for things like command buffer submission, + * etc. + * + * There clearly has to be some agreement between the window system + * driver and the hardware driver about the format of command buffers, + * etc. + */ +struct pipe_winsys +{ + void (*destroy)( struct pipe_winsys *ws ); + + /** Returns name of this winsys interface */ + const char *(*get_name)( struct pipe_winsys *ws ); + + /** + * Do any special operations to ensure frontbuffer contents are + * displayed, eg copy fake frontbuffer. + */ + void (*flush_frontbuffer)( struct pipe_winsys *ws, + struct pipe_surface *surf, + void *context_private ); + + + /** allocate a new surface (no context dependency) */ + struct pipe_surface *(*surface_alloc)(struct pipe_winsys *ws); + + /** + * Allocate storage for a pipe_surface. + * \param flags XXX unused, remove someday + * \return 0 if succeeds. + */ + int (*surface_alloc_storage)(struct pipe_winsys *ws, + struct pipe_surface *surf, + unsigned width, unsigned height, + enum pipe_format format, + unsigned flags, + unsigned tex_usage); + + void (*surface_release)(struct pipe_winsys *ws, struct pipe_surface **s); + + + /** + * Buffer management. Buffer attributes are mostly fixed over its lifetime. + * + * Remember that gallium gets to choose the interface it needs, and the + * window systems must then implement that interface (rather than the + * other way around...). + * + * usage is a bitmask of PIPE_BUFFER_USAGE_PIXEL/VERTEX/INDEX/CONSTANT. This + * usage argument is only an optimization hint, not a guarantee, therefore + * proper behavior must be observed in all circumstances. + * + * alignment indicates the client's alignment requirements, eg for + * SSE instructions. + */ + struct pipe_buffer *(*buffer_create)( struct pipe_winsys *ws, + unsigned alignment, + unsigned usage, + unsigned size ); + + /** + * Create a buffer that wraps user-space data. + * + * Effectively this schedules a delayed call to buffer_create + * followed by an upload of the data at *some point in the future*, + * or perhaps never. Basically the allocate/upload is delayed + * until the buffer is actually passed to hardware. + * + * The intention is to provide a quick way to turn regular data + * into a buffer, and secondly to avoid a copy operation if that + * data subsequently turns out to be only accessed by the CPU. + * + * Common example is OpenGL vertex buffers that are subsequently + * processed either by software TNL in the driver or by passing to + * hardware. + * + * XXX: What happens if the delayed call to buffer_create() fails? + * + * Note that ptr may be accessed at any time upto the time when the + * buffer is destroyed, so the data must not be freed before then. + */ + struct pipe_buffer *(*user_buffer_create)(struct pipe_winsys *ws, + void *ptr, + unsigned bytes); + + /** + * Map the entire data store of a buffer object into the client's address. + * flags is bitmask of PIPE_BUFFER_USAGE_CPU_READ/WRITE flags. 
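The buffer callbacks in this interface imply a create/map/write/unmap idiom, which the g3dvl state tracker later in this change follows. A sketch of that idiom (upload_bytes() is hypothetical, and the PIPE_BUFFER_USAGE_* flags are assumed to come from p_defines.h, as elsewhere in this diff):

#include <string.h>
#include "pipe/p_state.h"
#include "pipe/p_winsys.h"

/* Create a buffer, copy caller data into it, and return it. */
static struct pipe_buffer *
upload_bytes(struct pipe_winsys *ws, const void *data, unsigned size)
{
   struct pipe_buffer *buf;
   void *map;

   buf = ws->buffer_create(ws, 1 /* alignment */, PIPE_BUFFER_USAGE_VERTEX, size);
   if (!buf)
      return NULL;

   map = ws->buffer_map(ws, buf, PIPE_BUFFER_USAGE_CPU_WRITE);
   if (!map) {
      ws->buffer_destroy(ws, buf);
      return NULL;
   }

   memcpy(map, data, size);
   ws->buffer_unmap(ws, buf);
   return buf;
}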
+ */ + void *(*buffer_map)( struct pipe_winsys *ws, + struct pipe_buffer *buf, + unsigned usage ); + + void (*buffer_unmap)( struct pipe_winsys *ws, + struct pipe_buffer *buf ); + + void (*buffer_destroy)( struct pipe_winsys *ws, + struct pipe_buffer *buf ); + + + /** Set ptr = fence, with reference counting */ + void (*fence_reference)( struct pipe_winsys *ws, + struct pipe_fence_handle **ptr, + struct pipe_fence_handle *fence ); + + /** + * Checks whether the fence has been signalled. + * \param flags driver-specific meaning + * \return zero on success. + */ + int (*fence_signalled)( struct pipe_winsys *ws, + struct pipe_fence_handle *fence, + unsigned flag ); + + /** + * Wait for the fence to finish. + * \param flags driver-specific meaning + * \return zero on success. + */ + int (*fence_finish)( struct pipe_winsys *ws, + struct pipe_fence_handle *fence, + unsigned flag ); + +}; + + +#ifdef __cplusplus +} +#endif + +#endif /* P_WINSYS_H */ diff --git a/src/gallium/state_trackers/README b/src/gallium/state_trackers/README new file mode 100644 index 0000000000..28dd27bbd5 --- /dev/null +++ b/src/gallium/state_trackers/README @@ -0,0 +1,2 @@ +This directory is a placeholder for incubating state-trackers. Mesa's +state-tracker is in src/mesa. diff --git a/src/gallium/state_trackers/g3dvl/Makefile b/src/gallium/state_trackers/g3dvl/Makefile new file mode 100644 index 0000000000..4f7a953484 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/Makefile @@ -0,0 +1,18 @@ +TARGET = libg3dvl.a +OBJECTS = vl_display.o vl_screen.o vl_context.o vl_surface.o vl_shader_build.o vl_util.o vl_basic_csc.o \ + vl_r16snorm_mc.o vl_r16snorm_mc_buf.o +GALLIUMDIR = ../.. + +CFLAGS += -g -Wall -fPIC -I${GALLIUMDIR}/include -I${GALLIUMDIR}/auxiliary -I${GALLIUMDIR}/winsys/g3dvl + +############################################# + +.PHONY = all clean + +all: ${TARGET} + +${TARGET}: ${OBJECTS} + ar rcs $@ $^ + +clean: + rm -rf ${OBJECTS} ${TARGET} diff --git a/src/gallium/state_trackers/g3dvl/vl_basic_csc.c b/src/gallium/state_trackers/g3dvl/vl_basic_csc.c new file mode 100644 index 0000000000..e3b3d03256 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_basic_csc.c @@ -0,0 +1,712 @@ +#define VL_INTERNAL +#include "vl_basic_csc.h" +#include <assert.h> +#include <stdlib.h> +#include <pipe/p_context.h> +#include <pipe/p_winsys.h> +#include <pipe/p_state.h> +#include <pipe/p_inlines.h> +#include <tgsi/tgsi_parse.h> +#include <tgsi/tgsi_build.h> +#include "vl_csc.h" +#include "vl_surface.h" +#include "vl_shader_build.h" +#include "vl_types.h" + +struct vlVertexShaderConsts +{ + struct vlVertex4f dst_scale; + struct vlVertex4f dst_trans; + struct vlVertex4f src_scale; + struct vlVertex4f src_trans; +}; + +struct vlFragmentShaderConsts +{ + struct vlVertex4f bias; + float matrix[16]; +}; + +struct vlBasicCSC +{ + struct vlCSC base; + + struct pipe_context *pipe; + struct pipe_viewport_state viewport; + struct pipe_framebuffer_state framebuffer; + struct pipe_texture *framebuffer_tex; + void *sampler; + void *vertex_shader, *fragment_shader; + struct pipe_vertex_buffer vertex_bufs[2]; + struct pipe_vertex_element vertex_elems[2]; + struct pipe_constant_buffer vs_const_buf, fs_const_buf; +}; + +static int vlResizeFrameBuffer +( + struct vlCSC *csc, + unsigned int width, + unsigned int height +) +{ + struct vlBasicCSC *basic_csc; + struct pipe_context *pipe; + struct pipe_texture template; + + assert(csc); + + basic_csc = (struct vlBasicCSC*)csc; + pipe = basic_csc->pipe; + + if (basic_csc->framebuffer.width == width && 
basic_csc->framebuffer.height == height) + return 0; + + basic_csc->viewport.scale[0] = width; + basic_csc->viewport.scale[1] = height; + basic_csc->viewport.scale[2] = 1; + basic_csc->viewport.scale[3] = 1; + basic_csc->viewport.translate[0] = 0; + basic_csc->viewport.translate[1] = 0; + basic_csc->viewport.translate[2] = 0; + basic_csc->viewport.translate[3] = 0; + + if (basic_csc->framebuffer_tex) + pipe_texture_release(&basic_csc->framebuffer_tex); + + memset(&template, 0, sizeof(struct pipe_texture)); + template.target = PIPE_TEXTURE_2D; + template.format = PIPE_FORMAT_A8R8G8B8_UNORM; + template.last_level = 0; + template.width[0] = width; + template.height[0] = height; + template.depth[0] = 1; + template.compressed = 0; + pf_get_block(template.format, &template.block); + template.tex_usage = PIPE_TEXTURE_USAGE_DISPLAY_TARGET; + + basic_csc->framebuffer_tex = pipe->screen->texture_create(pipe->screen, &template); + + basic_csc->framebuffer.width = width; + basic_csc->framebuffer.height = height; + basic_csc->framebuffer.cbufs[0] = pipe->screen->get_tex_surface + ( + pipe->screen, + basic_csc->framebuffer_tex, + 0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE + ); + + /* Clear to black, in case video doesn't fill the entire window */ + pipe->clear(pipe, basic_csc->framebuffer.cbufs[0], 0); + + return 0; +} + +static int vlBegin +( + struct vlCSC *csc +) +{ + struct vlBasicCSC *basic_csc; + struct pipe_context *pipe; + + assert(csc); + + basic_csc = (struct vlBasicCSC*)csc; + pipe = basic_csc->pipe; + + pipe->set_framebuffer_state(pipe, &basic_csc->framebuffer); + pipe->set_viewport_state(pipe, &basic_csc->viewport); + pipe->bind_sampler_states(pipe, 1, (void**)&basic_csc->sampler); + /* Source texture set in vlPutPictureCSC() */ + pipe->bind_vs_state(pipe, basic_csc->vertex_shader); + pipe->bind_fs_state(pipe, basic_csc->fragment_shader); + pipe->set_vertex_buffers(pipe, 2, basic_csc->vertex_bufs); + pipe->set_vertex_elements(pipe, 2, basic_csc->vertex_elems); + pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &basic_csc->vs_const_buf); + pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &basic_csc->fs_const_buf); + + return 0; +} + +static int vlPutPictureCSC +( + struct vlCSC *csc, + struct vlSurface *surface, + int srcx, + int srcy, + int srcw, + int srch, + int destx, + int desty, + int destw, + int desth, + enum vlPictureType picture_type +) +{ + struct vlBasicCSC *basic_csc; + struct pipe_context *pipe; + struct vlVertexShaderConsts *vs_consts; + + assert(csc); + assert(surface); + + basic_csc = (struct vlBasicCSC*)csc; + pipe = basic_csc->pipe; + + vs_consts = pipe->winsys->buffer_map + ( + pipe->winsys, + basic_csc->vs_const_buf.buffer, + PIPE_BUFFER_USAGE_CPU_WRITE + ); + + vs_consts->dst_scale.x = destw / (float)basic_csc->framebuffer.cbufs[0]->width; + vs_consts->dst_scale.y = desth / (float)basic_csc->framebuffer.cbufs[0]->height; + vs_consts->dst_scale.z = 1; + vs_consts->dst_scale.w = 1; + vs_consts->dst_trans.x = destx / (float)basic_csc->framebuffer.cbufs[0]->width; + vs_consts->dst_trans.y = desty / (float)basic_csc->framebuffer.cbufs[0]->height; + vs_consts->dst_trans.z = 0; + vs_consts->dst_trans.w = 0; + + vs_consts->src_scale.x = srcw / (float)surface->texture->width[0]; + vs_consts->src_scale.y = srch / (float)surface->texture->height[0]; + vs_consts->src_scale.z = 1; + vs_consts->src_scale.w = 1; + vs_consts->src_trans.x = srcx / (float)surface->texture->width[0]; + vs_consts->src_trans.y = srcy / (float)surface->texture->height[0]; 
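   /* Illustrative worked example (not from the original source): with a
    * 640x480 frame buffer, destx=0, desty=0, destw=640, desth=480 yields
    * dst_scale=(1.0, 1.0) and dst_trans=(0.0, 0.0), so the unit quad covers
    * the whole target, while a quarter-size picture placed at (320, 240)
    * yields dst_scale=(0.5, 0.5) and dst_trans=(0.5, 0.5).  src_scale and
    * src_trans apply the same normalization to the texcoords against
    * texture->width[0]/height[0]. */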
+ vs_consts->src_trans.z = 0; + vs_consts->src_trans.w = 0; + + pipe->winsys->buffer_unmap(pipe->winsys, basic_csc->vs_const_buf.buffer); + + pipe->set_sampler_textures(pipe, 1, &surface->texture); + pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLE_STRIP, 0, 4); + + return 0; +} + +static int vlEnd +( + struct vlCSC *csc +) +{ + assert(csc); + + return 0; +} + +static struct pipe_surface* vlGetFrameBuffer +( + struct vlCSC *csc +) +{ + struct vlBasicCSC *basic_csc; + + assert(csc); + + basic_csc = (struct vlBasicCSC*)csc; + + return basic_csc->framebuffer.cbufs[0]; +} + +static int vlDestroy +( + struct vlCSC *csc +) +{ + struct vlBasicCSC *basic_csc; + struct pipe_context *pipe; + unsigned int i; + + assert(csc); + + basic_csc = (struct vlBasicCSC*)csc; + pipe = basic_csc->pipe; + + if (basic_csc->framebuffer_tex) + pipe_texture_release(&basic_csc->framebuffer_tex); + + pipe->delete_sampler_state(pipe, basic_csc->sampler); + pipe->delete_vs_state(pipe, basic_csc->vertex_shader); + pipe->delete_fs_state(pipe, basic_csc->fragment_shader); + + for (i = 0; i < 2; ++i) + pipe->winsys->buffer_destroy(pipe->winsys, basic_csc->vertex_bufs[i].buffer); + + pipe->winsys->buffer_destroy(pipe->winsys, basic_csc->vs_const_buf.buffer); + pipe->winsys->buffer_destroy(pipe->winsys, basic_csc->fs_const_buf.buffer); + + free(basic_csc); + + return 0; +} + +/* + * Represents 2 triangles in a strip in normalized coords. + * Used to render the surface onto the frame buffer. + */ +static const struct vlVertex2f surface_verts[4] = +{ + {0.0f, 0.0f}, + {0.0f, 1.0f}, + {1.0f, 0.0f}, + {1.0f, 1.0f} +}; + +/* + * Represents texcoords for the above. We can use the position values directly. + * TODO: Duplicate these in the shader, no need to create a buffer. + */ +static const struct vlVertex2f *surface_texcoords = surface_verts; + +/* + * Identity color conversion constants, for debugging + */ +static const struct vlFragmentShaderConsts identity = +{ + { + 0.0f, 0.0f, 0.0f, 0.0f + }, + { + 1.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 1.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 1.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f + } +}; + +/* + * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where: + * Y is in [16,235], Cb and Cr are in [16,240] + * R, G, and B are in [16,235] + */ +static const struct vlFragmentShaderConsts bt_601 = +{ + { + 0.0f, 0.501960784f, 0.501960784f, 0.0f + }, + { + 1.0f, 0.0f, 1.371f, 0.0f, + 1.0f, -0.336f, -0.698f, 0.0f, + 1.0f, 1.732f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f + } +}; + +/* + * Converts ITU-R BT.601 YCbCr pixels to RGB pixels where: + * Y is in [16,235], Cb and Cr are in [16,240] + * R, G, and B are in [0,255] + */ +static const struct vlFragmentShaderConsts bt_601_full = +{ + { + 0.062745098f, 0.501960784f, 0.501960784f, 0.0f + }, + { + 1.164f, 0.0f, 1.596f, 0.0f, + 1.164f, -0.391f, -0.813f, 0.0f, + 1.164f, 2.018f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f + } +}; + +/* + * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where: + * Y is in [16,235], Cb and Cr are in [16,240] + * R, G, and B are in [16,235] + */ +static const struct vlFragmentShaderConsts bt_709 = +{ + { + 0.0f, 0.501960784f, 0.501960784f, 0.0f + }, + { + 1.0f, 0.0f, 1.540f, 0.0f, + 1.0f, -0.183f, -0.459f, 0.0f, + 1.0f, 1.816f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f + } +}; + +/* + * Converts ITU-R BT.709 YCbCr pixels to RGB pixels where: + * Y is in [16,235], Cb and Cr are in [16,240] + * R, G, and B are in [0,255] + */ +const struct vlFragmentShaderConsts bt_709_full = +{ + { + 0.062745098f, 0.501960784f, 0.501960784f, 0.0f + }, + { + 1.164f, 0.0f, 1.793f, 
0.0f, + 1.164f, -0.213f, -0.534f, 0.0f, + 1.164f, 2.115f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 1.0f + } +}; + +static int vlCreateVertexShader +( + struct vlBasicCSC *csc +) +{ + const unsigned int max_tokens = 50; + + struct pipe_context *pipe; + struct pipe_shader_state vs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(context); + + pipe = csc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + + ti = 3; + + /* + * decl i0 ; Vertex pos + * decl i1 ; Vertex texcoords + */ + for (i = 0; i < 2; i++) + { + decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * decl c0 ; Scaling vector to scale vertex pos rect to destination size + * decl c1 ; Translation vector to move vertex pos rect into position + * decl c2 ; Scaling vector to scale texcoord rect to source size + * decl c3 ; Translation vector to move texcoord rect into position + */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 3); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl o0 ; Vertex pos + * decl o1 ; Vertex texcoords + */ + for (i = 0; i < 2; i++) + { + decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* decl t0, t1 */ + decl = vl_decl_temps(0, 1); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * madd o0, i0, c0, c1 ; Scale and translate unit output rect to destination size and pos + * madd o1, i1, c2, c3 ; Scale and translate unit texcoord rect to source size and pos + */ + for (i = 0; i < 2; ++i) + { + inst = vl_inst4(TGSI_OPCODE_MADD, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i, TGSI_FILE_CONSTANT, i * 2, TGSI_FILE_CONSTANT, i * 2 + 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + vs.tokens = tokens; + csc->vertex_shader = pipe->create_vs_state(pipe, &vs); + free(tokens); + + return 0; +} + +static int vlCreateFragmentShader +( + struct vlBasicCSC *csc +) +{ + const unsigned int max_tokens = 50; + + struct pipe_context *pipe; + struct pipe_shader_state fs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(context); + + pipe = csc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + + ti = 3; + + /* decl i0 ; Texcoords for s0 */ + decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, 1, 0, 0, TGSI_INTERPOLATE_LINEAR); + ti += 
tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl c0 ; Bias vector for CSC + * decl c1-c4 ; CSC matrix c1-c4 + */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 4); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl o0 ; Fragment color */ + decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl t0 */ + decl = vl_decl_temps(0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl s0 ; Sampler for tex containing picture to display */ + decl = vl_decl_samplers(0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* tex2d t0, i0, s0 ; Read src pixel */ + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_SAMPLER, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* sub t0, t0, c0 ; Subtract bias vector from pixel */ + inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* + * dp4 o0.x, t0, c1 ; Multiply pixel by the color conversion matrix + * dp4 o0.y, t0, c2 + * dp4 o0.z, t0, c3 + */ + for (i = 0; i < 3; ++i) + { + inst = vl_inst3(TGSI_OPCODE_DP4, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i + 1); + inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + fs.tokens = tokens; + csc->fragment_shader = pipe->create_fs_state(pipe, &fs); + free(tokens); + + return 0; +} + +static int vlCreateDataBufs +( + struct vlBasicCSC *csc +) +{ + struct pipe_context *pipe; + + assert(csc); + + pipe = csc->pipe; + + /* + * Create our vertex buffer and vertex buffer element + * VB contains 4 vertices that render a quad covering the entire window + * to display a rendered surface + * Quad is rendered as a tri strip + */ + csc->vertex_bufs[0].pitch = sizeof(struct vlVertex2f); + csc->vertex_bufs[0].max_index = 3; + csc->vertex_bufs[0].buffer_offset = 0; + csc->vertex_bufs[0].buffer = pipe->winsys->buffer_create + ( + pipe->winsys, + 1, + PIPE_BUFFER_USAGE_VERTEX, + sizeof(struct vlVertex2f) * 4 + ); + + memcpy + ( + pipe->winsys->buffer_map(pipe->winsys, csc->vertex_bufs[0].buffer, PIPE_BUFFER_USAGE_CPU_WRITE), + surface_verts, + sizeof(struct vlVertex2f) * 4 + ); + + pipe->winsys->buffer_unmap(pipe->winsys, csc->vertex_bufs[0].buffer); + + csc->vertex_elems[0].src_offset = 0; + csc->vertex_elems[0].vertex_buffer_index = 0; + csc->vertex_elems[0].nr_components = 2; + csc->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT; + + /* + * Create our texcoord buffer and texcoord buffer element + * Texcoord buffer contains the TCs for mapping the rendered surface to the 4 vertices + */ + csc->vertex_bufs[1].pitch = sizeof(struct vlVertex2f); + csc->vertex_bufs[1].max_index = 3; + csc->vertex_bufs[1].buffer_offset = 0; + csc->vertex_bufs[1].buffer = pipe->winsys->buffer_create + ( + pipe->winsys, + 1, + PIPE_BUFFER_USAGE_VERTEX, + sizeof(struct vlVertex2f) * 4 + ); + + memcpy + ( + pipe->winsys->buffer_map(pipe->winsys, csc->vertex_bufs[1].buffer, PIPE_BUFFER_USAGE_CPU_WRITE), + surface_texcoords, + 
sizeof(struct vlVertex2f) * 4 + ); + + pipe->winsys->buffer_unmap(pipe->winsys, csc->vertex_bufs[1].buffer); + + csc->vertex_elems[1].src_offset = 0; + csc->vertex_elems[1].vertex_buffer_index = 1; + csc->vertex_elems[1].nr_components = 2; + csc->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT; + + /* + * Create our vertex shader's constant buffer + * Const buffer contains scaling and translation vectors + */ + csc->vs_const_buf.size = sizeof(struct vlVertexShaderConsts); + csc->vs_const_buf.buffer = pipe->winsys->buffer_create + ( + pipe->winsys, + 1, + PIPE_BUFFER_USAGE_CONSTANT, + csc->vs_const_buf.size + ); + + /* + * Create our fragment shader's constant buffer + * Const buffer contains the color conversion matrix and bias vectors + */ + csc->fs_const_buf.size = sizeof(struct vlFragmentShaderConsts); + csc->fs_const_buf.buffer = pipe->winsys->buffer_create + ( + pipe->winsys, + 1, + PIPE_BUFFER_USAGE_CONSTANT, + csc->fs_const_buf.size + ); + + /* + * TODO: Refactor this into a seperate function, + * allow changing the CSC matrix at runtime to switch between regular & full versions + */ + memcpy + ( + pipe->winsys->buffer_map(pipe->winsys, csc->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE), + &bt_601, + sizeof(struct vlFragmentShaderConsts) + ); + + pipe->winsys->buffer_unmap(pipe->winsys, csc->fs_const_buf.buffer); + + return 0; +} + +static int vlInit +( + struct vlBasicCSC *csc +) +{ + struct pipe_context *pipe; + struct pipe_sampler_state sampler; + + assert(csc); + + pipe = csc->pipe; + + /* Delay creating the FB until vlPutPictureCSC() so we know window size */ + csc->framebuffer_tex = NULL; + csc->framebuffer.width = 0; + csc->framebuffer.height = 0; + csc->framebuffer.num_cbufs = 1; + csc->framebuffer.cbufs[0] = NULL; + csc->framebuffer.zsbuf = NULL; + + sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + sampler.min_img_filter = PIPE_TEX_FILTER_LINEAR; + sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE; + sampler.mag_img_filter = PIPE_TEX_FILTER_LINEAR; + sampler.compare_mode = PIPE_TEX_COMPARE_NONE; + sampler.compare_func = PIPE_FUNC_ALWAYS; + sampler.normalized_coords = 1; + /*sampler.prefilter = ;*/ + /*sampler.shadow_ambient = ;*/ + /*sampler.lod_bias = ;*/ + /*sampler.min_lod = ;*/ + /*sampler.max_lod = ;*/ + /*sampler.border_color[i] = ;*/ + /*sampler.max_anisotropy = ;*/ + csc->sampler = pipe->create_sampler_state(pipe, &sampler); + + vlCreateVertexShader(csc); + vlCreateFragmentShader(csc); + vlCreateDataBufs(csc); + + return 0; +} + +int vlCreateBasicCSC +( + struct pipe_context *pipe, + struct vlCSC **csc +) +{ + struct vlBasicCSC *basic_csc; + + assert(pipe); + assert(csc); + + basic_csc = calloc(1, sizeof(struct vlBasicCSC)); + + if (!basic_csc) + return 1; + + basic_csc->base.vlResizeFrameBuffer = &vlResizeFrameBuffer; + basic_csc->base.vlBegin = &vlBegin; + basic_csc->base.vlPutPicture = &vlPutPictureCSC; + basic_csc->base.vlEnd = &vlEnd; + basic_csc->base.vlGetFrameBuffer = &vlGetFrameBuffer; + basic_csc->base.vlDestroy = &vlDestroy; + basic_csc->pipe = pipe; + + vlInit(basic_csc); + + *csc = &basic_csc->base; + + return 0; +} diff --git a/src/gallium/state_trackers/g3dvl/vl_basic_csc.h b/src/gallium/state_trackers/g3dvl/vl_basic_csc.h new file mode 100644 index 0000000000..2e17f1d814 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_basic_csc.h @@ -0,0 +1,13 @@ +#ifndef vl_basic_csc_h +#define vl_basic_csc_h + +struct pipe_context; +struct vlCSC; + +int 
vlCreateBasicCSC +( + struct pipe_context *pipe, + struct vlCSC **csc +); + +#endif diff --git a/src/gallium/state_trackers/g3dvl/vl_context.c b/src/gallium/state_trackers/g3dvl/vl_context.c new file mode 100644 index 0000000000..fe107e406d --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_context.c @@ -0,0 +1,210 @@ +#define VL_INTERNAL +#include "vl_context.h" +#include <assert.h> +#include <stdlib.h> +#include <pipe/p_context.h> +#include <pipe/p_state.h> +#include "vl_render.h" +#include "vl_r16snorm_mc.h" +#include "vl_r16snorm_mc_buf.h" +#include "vl_csc.h" +#include "vl_basic_csc.h" + +static int vlInitCommon(struct vlContext *context) +{ + struct pipe_context *pipe; + struct pipe_rasterizer_state rast; + struct pipe_blend_state blend; + struct pipe_depth_stencil_alpha_state dsa; + unsigned int i; + + assert(context); + + pipe = context->pipe; + + rast.flatshade = 1; + rast.flatshade_first = 0; + rast.light_twoside = 0; + rast.front_winding = PIPE_WINDING_CCW; + rast.cull_mode = PIPE_WINDING_CW; + rast.fill_cw = PIPE_POLYGON_MODE_FILL; + rast.fill_ccw = PIPE_POLYGON_MODE_FILL; + rast.offset_cw = 0; + rast.offset_ccw = 0; + rast.scissor = 0; + rast.poly_smooth = 0; + rast.poly_stipple_enable = 0; + rast.point_sprite = 0; + rast.point_size_per_vertex = 0; + rast.multisample = 0; + rast.line_smooth = 0; + rast.line_stipple_enable = 0; + rast.line_stipple_factor = 0; + rast.line_stipple_pattern = 0; + rast.line_last_pixel = 0; + /* Don't need clipping, but viewport mapping done here */ + rast.bypass_clipping = 0; + rast.bypass_vs = 0; + rast.origin_lower_left = 0; + rast.line_width = 1; + rast.point_smooth = 0; + rast.point_size = 1; + rast.offset_units = 1; + rast.offset_scale = 1; + /*rast.sprite_coord_mode[i] = ;*/ + context->raster = pipe->create_rasterizer_state(pipe, &rast); + pipe->bind_rasterizer_state(pipe, context->raster); + + blend.blend_enable = 0; + blend.rgb_func = PIPE_BLEND_ADD; + blend.rgb_src_factor = PIPE_BLENDFACTOR_ONE; + blend.rgb_dst_factor = PIPE_BLENDFACTOR_ONE; + blend.alpha_func = PIPE_BLEND_ADD; + blend.alpha_src_factor = PIPE_BLENDFACTOR_ONE; + blend.alpha_dst_factor = PIPE_BLENDFACTOR_ONE; + blend.logicop_enable = 0; + blend.logicop_func = PIPE_LOGICOP_CLEAR; + /* Needed to allow color writes to FB, even if blending disabled */ + blend.colormask = PIPE_MASK_RGBA; + blend.dither = 0; + context->blend = pipe->create_blend_state(pipe, &blend); + pipe->bind_blend_state(pipe, context->blend); + + dsa.depth.enabled = 0; + dsa.depth.writemask = 0; + dsa.depth.func = PIPE_FUNC_ALWAYS; + dsa.depth.occlusion_count = 0; + for (i = 0; i < 2; ++i) + { + dsa.stencil[i].enabled = 0; + dsa.stencil[i].func = PIPE_FUNC_ALWAYS; + dsa.stencil[i].fail_op = PIPE_STENCIL_OP_KEEP; + dsa.stencil[i].zpass_op = PIPE_STENCIL_OP_KEEP; + dsa.stencil[i].zfail_op = PIPE_STENCIL_OP_KEEP; + dsa.stencil[i].ref_value = 0; + dsa.stencil[i].value_mask = 0; + dsa.stencil[i].write_mask = 0; + } + dsa.alpha.enabled = 0; + dsa.alpha.func = PIPE_FUNC_ALWAYS; + dsa.alpha.ref = 0; + context->dsa = pipe->create_depth_stencil_alpha_state(pipe, &dsa); + pipe->bind_depth_stencil_alpha_state(pipe, context->dsa); + + return 0; +} + +int vlCreateContext +( + struct vlScreen *screen, + struct pipe_context *pipe, + unsigned int picture_width, + unsigned int picture_height, + enum vlFormat picture_format, + enum vlProfile profile, + enum vlEntryPoint entry_point, + struct vlContext **context +) +{ + struct vlContext *ctx; + + assert(screen); + assert(context); + assert(pipe); + + ctx = calloc(1, 
sizeof(struct vlContext)); + + if (!ctx) + return 1; + + ctx->screen = screen; + ctx->pipe = pipe; + ctx->picture_width = picture_width; + ctx->picture_height = picture_height; + ctx->picture_format = picture_format; + ctx->profile = profile; + ctx->entry_point = entry_point; + + vlInitCommon(ctx); + + /*vlCreateR16SNormMC(pipe, picture_width, picture_height, picture_format, &ctx->render);*/ + vlCreateR16SNormBufferedMC(pipe, picture_width, picture_height, picture_format, &ctx->render); + vlCreateBasicCSC(pipe, &ctx->csc); + + *context = ctx; + + return 0; +} + +int vlDestroyContext +( + struct vlContext *context +) +{ + assert(context); + + /* XXX: Must unbind shaders before we can delete them for some reason */ + context->pipe->bind_vs_state(context->pipe, NULL); + context->pipe->bind_fs_state(context->pipe, NULL); + + context->render->vlDestroy(context->render); + context->csc->vlDestroy(context->csc); + + context->pipe->delete_blend_state(context->pipe, context->blend); + context->pipe->delete_rasterizer_state(context->pipe, context->raster); + context->pipe->delete_depth_stencil_alpha_state(context->pipe, context->dsa); + + free(context); + + return 0; +} + +struct vlScreen* vlContextGetScreen +( + struct vlContext *context +) +{ + assert(context); + + return context->screen; +} + +struct pipe_context* vlGetPipeContext +( + struct vlContext *context +) +{ + assert(context); + + return context->pipe; +} + +unsigned int vlGetPictureWidth +( + struct vlContext *context +) +{ + assert(context); + + return context->picture_width; +} + +unsigned int vlGetPictureHeight +( + struct vlContext *context +) +{ + assert(context); + + return context->picture_height; +} + +enum vlFormat vlGetPictureFormat +( + struct vlContext *context +) +{ + assert(context); + + return context->picture_format; +} diff --git a/src/gallium/state_trackers/g3dvl/vl_context.h b/src/gallium/state_trackers/g3dvl/vl_context.h new file mode 100644 index 0000000000..3d14634c44 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_context.h @@ -0,0 +1,73 @@ +#ifndef vl_context_h +#define vl_context_h + +#include "vl_types.h" + +struct pipe_context; + +#ifdef VL_INTERNAL +struct vlRender; +struct vlCSC; + +struct vlContext +{ + struct vlScreen *screen; + struct pipe_context *pipe; + unsigned int picture_width; + unsigned int picture_height; + enum vlFormat picture_format; + enum vlProfile profile; + enum vlEntryPoint entry_point; + + void *raster; + void *dsa; + void *blend; + + struct vlRender *render; + struct vlCSC *csc; +}; +#endif + +int vlCreateContext +( + struct vlScreen *screen, + struct pipe_context *pipe, + unsigned int picture_width, + unsigned int picture_height, + enum vlFormat picture_format, + enum vlProfile profile, + enum vlEntryPoint entry_point, + struct vlContext **context +); + +int vlDestroyContext +( + struct vlContext *context +); + +struct vlScreen* vlContextGetScreen +( + struct vlContext *context +); + +struct pipe_context* vlGetPipeContext +( + struct vlContext *context +); + +unsigned int vlGetPictureWidth +( + struct vlContext *context +); + +unsigned int vlGetPictureHeight +( + struct vlContext *context +); + +enum vlFormat vlGetPictureFormat +( + struct vlContext *context +); + +#endif diff --git a/src/gallium/state_trackers/g3dvl/vl_csc.h b/src/gallium/state_trackers/g3dvl/vl_csc.h new file mode 100644 index 0000000000..36417a2792 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_csc.h @@ -0,0 +1,53 @@ +#ifndef vl_csc_h +#define vl_csc_h + +#include "vl_types.h" + +struct 
pipe_surface; + +struct vlCSC +{ + int (*vlResizeFrameBuffer) + ( + struct vlCSC *csc, + unsigned int width, + unsigned int height + ); + + int (*vlBegin) + ( + struct vlCSC *csc + ); + + int (*vlPutPicture) + ( + struct vlCSC *csc, + struct vlSurface *surface, + int srcx, + int srcy, + int srcw, + int srch, + int destx, + int desty, + int destw, + int desth, + enum vlPictureType picture_type + ); + + int (*vlEnd) + ( + struct vlCSC *csc + ); + + struct pipe_surface* (*vlGetFrameBuffer) + ( + struct vlCSC *csc + ); + + int (*vlDestroy) + ( + struct vlCSC *csc + ); +}; + +#endif diff --git a/src/gallium/state_trackers/g3dvl/vl_defs.h b/src/gallium/state_trackers/g3dvl/vl_defs.h new file mode 100644 index 0000000000..d612d02502 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_defs.h @@ -0,0 +1,11 @@ +#ifndef vl_defs_h +#define vl_defs_h + +#define VL_BLOCK_WIDTH 8 +#define VL_BLOCK_HEIGHT 8 +#define VL_BLOCK_SIZE (VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT) +#define VL_MACROBLOCK_WIDTH 16 +#define VL_MACROBLOCK_HEIGHT 16 +#define VL_MACROBLOCK_SIZE (VL_MACROBLOCK_WIDTH * VL_MACROBLOCK_HEIGHT) + +#endif diff --git a/src/gallium/state_trackers/g3dvl/vl_display.c b/src/gallium/state_trackers/g3dvl/vl_display.c new file mode 100644 index 0000000000..af80faa7f5 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_display.c @@ -0,0 +1,48 @@ +#define VL_INTERNAL +#include "vl_display.h" +#include <assert.h> +#include <stdlib.h> + +int vlCreateDisplay +( + vlNativeDisplay native_display, + struct vlDisplay **display +) +{ + struct vlDisplay *dpy; + + assert(native_display); + assert(display); + + dpy = calloc(1, sizeof(struct vlDisplay)); + + if (!dpy) + return 1; + + dpy->native = native_display; + *display = dpy; + + return 0; +} + +int vlDestroyDisplay +( + struct vlDisplay *display +) +{ + assert(display); + + free(display); + + return 0; +} + +vlNativeDisplay vlGetNativeDisplay +( + struct vlDisplay *display +) +{ + assert(display); + + return display->native; +} diff --git a/src/gallium/state_trackers/g3dvl/vl_display.h b/src/gallium/state_trackers/g3dvl/vl_display.h new file mode 100644 index 0000000000..e11fd40799 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_display.h @@ -0,0 +1,29 @@ +#ifndef vl_display_h +#define vl_display_h + +#include "vl_types.h" + +#ifdef VL_INTERNAL +struct vlDisplay +{ + vlNativeDisplay native; +}; +#endif + +int vlCreateDisplay +( + vlNativeDisplay native_display, + struct vlDisplay **display +); + +int vlDestroyDisplay +( + struct vlDisplay *display +); + +vlNativeDisplay vlGetNativeDisplay +( + struct vlDisplay *display +); + +#endif diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c new file mode 100644 index 0000000000..3272220ef8 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.c @@ -0,0 +1,2344 @@ +#define VL_INTERNAL +#include "vl_r16snorm_mc.h" +#include <assert.h> +#include <stdlib.h> +#include <pipe/p_context.h> +#include <pipe/p_winsys.h> +#include <pipe/p_state.h> +#include <pipe/p_inlines.h> +#include <tgsi/tgsi_parse.h> +#include <tgsi/tgsi_build.h> +#include "vl_render.h" +#include "vl_shader_build.h" +#include "vl_surface.h" +#include "vl_util.h" +#include "vl_types.h" +#include "vl_defs.h" + +#define NUM_BUFS 4 /* Number of rotating buffers to use */ + +struct vlVertexShaderConsts +{ + /*struct vlVertex4f scale; + struct vlVertex4f denorm;*/ + struct vlVertex4f scale; + struct vlVertex4f mb_pos_trans; + struct vlVertex4f denorm; + struct + { + struct 
vlVertex4f top_field; + struct vlVertex4f bottom_field; + } mb_tc_trans[2]; +}; + +struct vlFragmentShaderConsts +{ + struct vlVertex4f multiplier; + struct vlVertex4f div; +}; + +struct vlR16SnormMC +{ + struct vlRender base; + + unsigned int video_width, video_height; + enum vlFormat video_format; + unsigned int cur_buf; + + struct pipe_context *pipe; + struct pipe_viewport_state viewport; + struct pipe_framebuffer_state render_target; + struct pipe_sampler_state *samplers[5]; + struct pipe_texture *textures[NUM_BUFS][5]; + void *i_vs, *p_vs[2], *b_vs[2]; + void *i_fs, *p_fs[2], *b_fs[2]; + struct pipe_vertex_buffer vertex_bufs[3]; + struct pipe_vertex_element vertex_elems[3]; + struct pipe_constant_buffer vs_const_buf, fs_const_buf; +}; + +static int vlBegin +( + struct vlRender *render +) +{ + struct vlR16SnormMC *mc; + struct pipe_context *pipe; + + assert(render); + + mc = (struct vlR16SnormMC*)render; + pipe = mc->pipe; + + /* Frame buffer set in vlRender*Macroblock() */ + /* Shaders, samplers, textures set in vlRender*Macroblock() */ + pipe->set_vertex_buffers(pipe, 3, mc->vertex_bufs); + pipe->set_vertex_elements(pipe, 3, mc->vertex_elems); + pipe->set_viewport_state(pipe, &mc->viewport); + pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &mc->vs_const_buf); + pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &mc->fs_const_buf); + + return 0; +} + +/*static int vlGrabMacroBlock +( + struct vlR16SnormMC *mc, + struct vlMpeg2MacroBlock *macroblock +) +{ + assert(mc); + assert(macroblock); + + + + return 0; +}*/ + +/*#define DO_IDCT*/ + +#ifdef DO_IDCT +static int vlTransformBlock(short *src, short *dst, short bias) +{ + static const float basis[8][8] = + { + {0.3536, 0.4904, 0.4619, 0.4157, 0.3536, 0.2778, 0.1913, 0.0975}, + {0.3536, 0.4157, 0.1913, -0.0975, -0.3536, -0.4904, -0.4619, -0.2778}, + {0.3536, 0.2778, -0.1913, -0.4904, -0.3536, 0.0975, 0.4619, 0.4157}, + {0.3536, 0.0975, -0.4619, -0.2778, 0.3536, 0.4157, -0.1913, -0.4904}, + {0.3536, -0.0975, -0.4619, 0.2778, 0.3536, -0.4157, -0.1913, 0.4904}, + {0.3536, -0.2778, -0.1913, 0.4904, -0.3536, -0.0975, 0.4619, -0.4157}, + {0.3536, -0.4157, 0.1913, 0.0975, -0.3536, 0.4904, -0.4619, 0.2778}, + {0.3536, -0.4904, 0.4619, -0.4157, 0.3536, -0.2778, 0.1913, -0.0975} + }; + + unsigned int x, y; + short tmp[64]; + + for (y = 0; y < VL_BLOCK_HEIGHT; ++y) + for (x = 0; x < VL_BLOCK_WIDTH; ++x) + tmp[y * VL_BLOCK_WIDTH + x] = (short) + ( + src[y * VL_BLOCK_WIDTH + 0] * basis[x][0] + + src[y * VL_BLOCK_WIDTH + 1] * basis[x][1] + + src[y * VL_BLOCK_WIDTH + 2] * basis[x][2] + + src[y * VL_BLOCK_WIDTH + 3] * basis[x][3] + + src[y * VL_BLOCK_WIDTH + 4] * basis[x][4] + + src[y * VL_BLOCK_WIDTH + 5] * basis[x][5] + + src[y * VL_BLOCK_WIDTH + 6] * basis[x][6] + + src[y * VL_BLOCK_WIDTH + 7] * basis[x][7] + ); + + for (x = 0; x < VL_BLOCK_WIDTH; ++x) + for (y = 0; y < VL_BLOCK_HEIGHT; ++y) + { + dst[y * VL_BLOCK_WIDTH + x] = bias + (short) + ( + tmp[0 * VL_BLOCK_WIDTH + x] * basis[y][0] + + tmp[1 * VL_BLOCK_WIDTH + x] * basis[y][1] + + tmp[2 * VL_BLOCK_WIDTH + x] * basis[y][2] + + tmp[3 * VL_BLOCK_WIDTH + x] * basis[y][3] + + tmp[4 * VL_BLOCK_WIDTH + x] * basis[y][4] + + tmp[5 * VL_BLOCK_WIDTH + x] * basis[y][5] + + tmp[6 * VL_BLOCK_WIDTH + x] * basis[y][6] + + tmp[7 * VL_BLOCK_WIDTH + x] * basis[y][7] + ); + if (dst[y * VL_BLOCK_WIDTH + x] > 255) + dst[y * VL_BLOCK_WIDTH + x] = 255; + else if (bias > 0 && dst[y * VL_BLOCK_WIDTH + x] < 0) + dst[y * VL_BLOCK_WIDTH + x] = 0; + } + return 0; +} +#endif + +static int 
vlGrabFrameCodedBlock(short *src, short *dst, unsigned int dst_pitch) +{ + unsigned int y; + + for (y = 0; y < VL_BLOCK_HEIGHT; ++y) + memcpy + ( + dst + y * dst_pitch, + src + y * VL_BLOCK_WIDTH, + VL_BLOCK_WIDTH * 2 + ); + + return 0; +} + +static int vlGrabFieldCodedBlock(short *src, short *dst, unsigned int dst_pitch) +{ + unsigned int y; + + for (y = 0; y < VL_BLOCK_HEIGHT / 2; ++y) + memcpy + ( + dst + y * dst_pitch * 2, + src + y * VL_BLOCK_WIDTH, + VL_BLOCK_WIDTH * 2 + ); + + dst += VL_BLOCK_HEIGHT * dst_pitch; + + for (; y < VL_BLOCK_HEIGHT; ++y) + memcpy + ( + dst + y * dst_pitch * 2, + src + y * VL_BLOCK_WIDTH, + VL_BLOCK_WIDTH * 2 + ); + + return 0; +} + +static int vlGrabNoBlock(short *dst, unsigned int dst_pitch) +{ + unsigned int y; + + for (y = 0; y < VL_BLOCK_HEIGHT; ++y) + memset + ( + dst + y * dst_pitch, + 0, + VL_BLOCK_WIDTH * 2 + ); + + return 0; +} + +enum vlSampleType +{ + vlSampleTypeFull, + vlSampleTypeDiff +}; + +static int vlGrabBlocks +( + struct vlR16SnormMC *mc, + unsigned int coded_block_pattern, + enum vlDCTType dct_type, + enum vlSampleType sample_type, + short *blocks +) +{ + struct pipe_surface *tex_surface; + short *texels; + unsigned int tex_pitch; + unsigned int tb, sb = 0; + + assert(mc); + assert(blocks); + + tex_surface = mc->pipe->screen->get_tex_surface + ( + mc->pipe->screen, + mc->textures[mc->cur_buf % NUM_BUFS][0], + 0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE + ); + + texels = pipe_surface_map(tex_surface, PIPE_BUFFER_USAGE_CPU_WRITE); + tex_pitch = tex_surface->stride / tex_surface->block.size; + + for (tb = 0; tb < 4; ++tb) + { + if ((coded_block_pattern >> (5 - tb)) & 1) + { + short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT; + +#ifdef DO_IDCT + vlTransformBlock(cur_block, cur_block, sample_type == vlSampleTypeFull ? 128 : 0); +#endif + + if (dct_type == vlDCTTypeFrameCoded) + vlGrabFrameCodedBlock + ( + cur_block, + texels + tb * tex_pitch * VL_BLOCK_HEIGHT, + tex_pitch + ); + else + vlGrabFieldCodedBlock + ( + cur_block, + texels + (tb % 2) * tex_pitch * VL_BLOCK_HEIGHT + (tb / 2) * tex_pitch, + tex_pitch + ); + + ++sb; + } + else + vlGrabNoBlock(texels + tb * tex_pitch * VL_BLOCK_HEIGHT, tex_pitch); + } + + pipe_surface_unmap(tex_surface); + + /* TODO: Implement 422, 444 */ + for (tb = 0; tb < 2; ++tb) + { + tex_surface = mc->pipe->screen->get_tex_surface + ( + mc->pipe->screen, + mc->textures[mc->cur_buf % NUM_BUFS][tb + 1], + 0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE + ); + + texels = pipe_surface_map(tex_surface, PIPE_BUFFER_USAGE_CPU_WRITE); + tex_pitch = tex_surface->stride / tex_surface->block.size; + + if ((coded_block_pattern >> (1 - tb)) & 1) + { + short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT; + +#ifdef DO_IDCT + vlTransformBlock(cur_block, cur_block, sample_type == vlSampleTypeFull ? 
128 : 0); +#endif + + vlGrabFrameCodedBlock + ( + cur_block, + texels, + tex_pitch + ); + + ++sb; + } + else + vlGrabNoBlock(texels, tex_pitch); + + pipe_surface_unmap(tex_surface); + } + + return 0; +} + +static int vlRenderIMacroBlock +( + struct vlR16SnormMC *mc, + enum vlPictureType picture_type, + enum vlFieldOrder field_order, + unsigned int mbx, + unsigned int mby, + unsigned int coded_block_pattern, + enum vlDCTType dct_type, + short *blocks, + struct vlSurface *surface +) +{ + struct pipe_context *pipe; + struct vlVertexShaderConsts *vs_consts; + + assert(blocks); + assert(surface); + + /* TODO: Implement interlaced rendering */ + if (picture_type != vlPictureTypeFrame) + return 0; + + vlGrabBlocks(mc, coded_block_pattern, dct_type, vlSampleTypeFull, blocks); + + pipe = mc->pipe; + + vs_consts = pipe->winsys->buffer_map + ( + pipe->winsys, + mc->vs_const_buf.buffer, + PIPE_BUFFER_USAGE_CPU_WRITE + ); + + vs_consts->scale.x = VL_MACROBLOCK_WIDTH / (float)surface->texture->width[0]; + vs_consts->scale.y = VL_MACROBLOCK_HEIGHT / (float)surface->texture->height[0]; + vs_consts->scale.z = 1.0f; + vs_consts->scale.w = 1.0f; + vs_consts->mb_pos_trans.x = (mbx * VL_MACROBLOCK_WIDTH) / (float)surface->texture->width[0]; + vs_consts->mb_pos_trans.y = (mby * VL_MACROBLOCK_HEIGHT) / (float)surface->texture->height[0]; + vs_consts->mb_pos_trans.z = 0.0f; + vs_consts->mb_pos_trans.w = 0.0f; + + pipe->winsys->buffer_unmap(pipe->winsys, mc->vs_const_buf.buffer); + + mc->render_target.cbufs[0] = pipe->screen->get_tex_surface + ( + pipe->screen, + surface->texture, + 0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE + ); + pipe->set_framebuffer_state(pipe, &mc->render_target); + pipe->set_sampler_textures(pipe, 3, mc->textures[mc->cur_buf % NUM_BUFS]); + pipe->bind_sampler_states(pipe, 3, (void**)mc->samplers); + pipe->bind_vs_state(pipe, mc->i_vs); + pipe->bind_fs_state(pipe, mc->i_fs); + + pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 24); + + mc->cur_buf++; + + return 0; +} + +static int vlRenderPMacroBlock +( + struct vlR16SnormMC *mc, + enum vlPictureType picture_type, + enum vlFieldOrder field_order, + unsigned int mbx, + unsigned int mby, + enum vlMotionType mc_type, + short top_x, + short top_y, + short bottom_x, + short bottom_y, + unsigned int coded_block_pattern, + enum vlDCTType dct_type, + short *blocks, + struct vlSurface *ref_surface, + struct vlSurface *surface +) +{ + struct pipe_context *pipe; + struct vlVertexShaderConsts *vs_consts; + + assert(blocks); + assert(ref_surface); + assert(surface); + + /* TODO: Implement interlaced rendering */ + if (picture_type != vlPictureTypeFrame) + return 0; + /* TODO: Implement other MC types */ + if (mc_type != vlMotionTypeFrame && mc_type != vlMotionTypeField) + return 0; + + vlGrabBlocks(mc, coded_block_pattern, dct_type, vlSampleTypeDiff, blocks); + + pipe = mc->pipe; + + vs_consts = pipe->winsys->buffer_map + ( + pipe->winsys, + mc->vs_const_buf.buffer, + PIPE_BUFFER_USAGE_CPU_WRITE + ); + + vs_consts->scale.x = VL_MACROBLOCK_WIDTH / (float)surface->texture->width[0]; + vs_consts->scale.y = VL_MACROBLOCK_HEIGHT / (float)surface->texture->height[0]; + vs_consts->scale.z = 1.0f; + vs_consts->scale.w = 1.0f; + vs_consts->mb_pos_trans.x = (mbx * VL_MACROBLOCK_WIDTH) / (float)surface->texture->width[0]; + vs_consts->mb_pos_trans.y = (mby * VL_MACROBLOCK_HEIGHT) / (float)surface->texture->height[0]; + vs_consts->mb_pos_trans.z = 0.0f; + vs_consts->mb_pos_trans.w = 0.0f; + 
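+ /* Note: the 0.5 factor below suggests top_x/top_y (and bottom_x/bottom_y) are half-pel motion vector components; they are halved to pel units and normalized by the surface dimensions, giving the texcoord translation onto the reference surface. */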
vs_consts->mb_tc_trans[0].top_field.x = (mbx * VL_MACROBLOCK_WIDTH + top_x * 0.5f) / (float)surface->texture->width[0]; + vs_consts->mb_tc_trans[0].top_field.y = (mby * VL_MACROBLOCK_HEIGHT + top_y * 0.5f) / (float)surface->texture->height[0]; + vs_consts->mb_tc_trans[0].top_field.z = 0.0f; + vs_consts->mb_tc_trans[0].top_field.w = 0.0f; + + if (mc_type == vlMotionTypeField) + { + vs_consts->denorm.x = (float)surface->texture->width[0]; + vs_consts->denorm.y = (float)surface->texture->height[0]; + + vs_consts->mb_tc_trans[0].bottom_field.x = (mbx * VL_MACROBLOCK_WIDTH + bottom_x * 0.5f) / (float)surface->texture->width[0]; + vs_consts->mb_tc_trans[0].bottom_field.y = (mby * VL_MACROBLOCK_HEIGHT + bottom_y * 0.5f) / (float)surface->texture->height[0]; + vs_consts->mb_tc_trans[0].bottom_field.z = 0.0f; + vs_consts->mb_tc_trans[0].bottom_field.w = 0.0f; + + pipe->bind_vs_state(pipe, mc->p_vs[1]); + pipe->bind_fs_state(pipe, mc->p_fs[1]); + } + else + { + pipe->bind_vs_state(pipe, mc->p_vs[0]); + pipe->bind_fs_state(pipe, mc->p_fs[0]); + } + + pipe->winsys->buffer_unmap(pipe->winsys, mc->vs_const_buf.buffer); + + mc->render_target.cbufs[0] = pipe->screen->get_tex_surface + ( + pipe->screen, + surface->texture, + 0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE + ); + pipe->set_framebuffer_state(pipe, &mc->render_target); + + mc->textures[mc->cur_buf % NUM_BUFS][3] = ref_surface->texture; + pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUFS]); + pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers); + + pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 24); + + mc->cur_buf++; + + return 0; +} + +static int vlRenderBMacroBlock +( + struct vlR16SnormMC *mc, + enum vlPictureType picture_type, + enum vlFieldOrder field_order, + unsigned int mbx, + unsigned int mby, + enum vlMotionType mc_type, + short top_past_x, + short top_past_y, + short bottom_past_x, + short bottom_past_y, + short top_future_x, + short top_future_y, + short bottom_future_x, + short bottom_future_y, + unsigned int coded_block_pattern, + enum vlDCTType dct_type, + short *blocks, + struct vlSurface *past_surface, + struct vlSurface *future_surface, + struct vlSurface *surface +) +{ + struct pipe_context *pipe; + struct vlVertexShaderConsts *vs_consts; + + assert(blocks); + assert(past_surface); + assert(future_surface); + assert(surface); + + /* TODO: Implement interlaced rendering */ + if (picture_type != vlPictureTypeFrame) + return 0; + /* TODO: Implement other MC types */ + if (mc_type != vlMotionTypeFrame && mc_type != vlMotionTypeField) + return 0; + + vlGrabBlocks(mc, coded_block_pattern, dct_type, vlSampleTypeDiff, blocks); + + pipe = mc->pipe; + + vs_consts = pipe->winsys->buffer_map + ( + pipe->winsys, + mc->vs_const_buf.buffer, + PIPE_BUFFER_USAGE_CPU_WRITE + ); + + vs_consts->scale.x = VL_MACROBLOCK_WIDTH / (float)surface->texture->width[0]; + vs_consts->scale.y = VL_MACROBLOCK_HEIGHT / (float)surface->texture->height[0]; + vs_consts->scale.z = 1.0f; + vs_consts->scale.w = 1.0f; + vs_consts->mb_pos_trans.x = (mbx * VL_MACROBLOCK_WIDTH) / (float)surface->texture->width[0]; + vs_consts->mb_pos_trans.y = (mby * VL_MACROBLOCK_HEIGHT) / (float)surface->texture->height[0]; + vs_consts->mb_pos_trans.z = 0.0f; + vs_consts->mb_pos_trans.w = 0.0f; + vs_consts->mb_tc_trans[0].top_field.x = (mbx * VL_MACROBLOCK_WIDTH + top_past_x * 0.5f) / (float)surface->texture->width[0]; + vs_consts->mb_tc_trans[0].top_field.y = (mby * VL_MACROBLOCK_HEIGHT + top_past_y * 0.5f) / 
(float)surface->texture->height[0]; + vs_consts->mb_tc_trans[0].top_field.z = 0.0f; + vs_consts->mb_tc_trans[0].top_field.w = 0.0f; + vs_consts->mb_tc_trans[1].top_field.x = (mbx * VL_MACROBLOCK_WIDTH + top_future_x * 0.5f) / (float)surface->texture->width[0]; + vs_consts->mb_tc_trans[1].top_field.y = (mby * VL_MACROBLOCK_HEIGHT + top_future_y * 0.5f) / (float)surface->texture->height[0]; + vs_consts->mb_tc_trans[1].top_field.z = 0.0f; + vs_consts->mb_tc_trans[1].top_field.w = 0.0f; + + if (mc_type == vlMotionTypeField) + { + vs_consts->denorm.x = (float)surface->texture->width[0]; + vs_consts->denorm.y = (float)surface->texture->height[0]; + + vs_consts->mb_tc_trans[0].bottom_field.x = (mbx * VL_MACROBLOCK_WIDTH + bottom_past_x * 0.5f) / (float)surface->texture->width[0]; + vs_consts->mb_tc_trans[0].bottom_field.y = (mby * VL_MACROBLOCK_HEIGHT + bottom_past_y * 0.5f) / (float)surface->texture->height[0]; + vs_consts->mb_tc_trans[0].bottom_field.z = 0.0f; + vs_consts->mb_tc_trans[0].bottom_field.w = 0.0f; + vs_consts->mb_tc_trans[1].bottom_field.x = (mbx * VL_MACROBLOCK_WIDTH + bottom_future_x * 0.5f) / (float)surface->texture->width[0]; + vs_consts->mb_tc_trans[1].bottom_field.y = (mby * VL_MACROBLOCK_HEIGHT + bottom_future_y * 0.5f) / (float)surface->texture->height[0]; + vs_consts->mb_tc_trans[1].bottom_field.z = 0.0f; + vs_consts->mb_tc_trans[1].bottom_field.w = 0.0f; + + pipe->bind_vs_state(pipe, mc->b_vs[1]); + pipe->bind_fs_state(pipe, mc->b_fs[1]); + } + else + { + pipe->bind_vs_state(pipe, mc->b_vs[0]); + pipe->bind_fs_state(pipe, mc->b_fs[0]); + } + + pipe->winsys->buffer_unmap(pipe->winsys, mc->vs_const_buf.buffer); + + mc->render_target.cbufs[0] = pipe->screen->get_tex_surface + ( + pipe->screen, + surface->texture, + 0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE + ); + pipe->set_framebuffer_state(pipe, &mc->render_target); + + mc->textures[mc->cur_buf % NUM_BUFS][3] = past_surface->texture; + mc->textures[mc->cur_buf % NUM_BUFS][4] = future_surface->texture; + pipe->set_sampler_textures(pipe, 5, mc->textures[mc->cur_buf % NUM_BUFS]); + pipe->bind_sampler_states(pipe, 5, (void**)mc->samplers); + + pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, 0, 24); + + mc->cur_buf++; + + return 0; +} + +static int vlRenderMacroBlocksMpeg2R16Snorm +( + struct vlRender *render, + struct vlMpeg2MacroBlockBatch *batch, + struct vlSurface *surface +) +{ + struct vlR16SnormMC *mc; + unsigned int i; + + assert(render); + + mc = (struct vlR16SnormMC*)render; + + /*for (i = 0; i < batch->num_macroblocks; ++i) + vlGrabMacroBlock(batch->macroblocks[i]);*/ + + for (i = 0; i < batch->num_macroblocks; ++i) + { + switch (batch->macroblocks[i].mb_type) + { + case vlMacroBlockTypeIntra: + { + vlRenderIMacroBlock + ( + mc, + batch->picture_type, + batch->field_order, + batch->macroblocks[i].mbx, + batch->macroblocks[i].mby, + batch->macroblocks[i].cbp, + batch->macroblocks[i].dct_type, + batch->macroblocks[i].blocks, + surface + ); + break; + } + case vlMacroBlockTypeFwdPredicted: + { + vlRenderPMacroBlock + ( + mc, + batch->picture_type, + batch->field_order, + batch->macroblocks[i].mbx, + batch->macroblocks[i].mby, + batch->macroblocks[i].mo_type, + batch->macroblocks[i].PMV[0][0][0], + batch->macroblocks[i].PMV[0][0][1], + batch->macroblocks[i].PMV[1][0][0], + batch->macroblocks[i].PMV[1][0][1], + batch->macroblocks[i].cbp, + batch->macroblocks[i].dct_type, + batch->macroblocks[i].blocks, + batch->past_surface, + surface + ); + break; + } + case vlMacroBlockTypeBkwdPredicted: + { + 
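+ /* Backward prediction reuses the forward (P) path below: the backward motion vectors PMV[*][1][*] stand in for the forward ones and the future surface is used as the single reference. */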
vlRenderPMacroBlock + ( + mc, + batch->picture_type, + batch->field_order, + batch->macroblocks[i].mbx, + batch->macroblocks[i].mby, + batch->macroblocks[i].mo_type, + batch->macroblocks[i].PMV[0][1][0], + batch->macroblocks[i].PMV[0][1][1], + batch->macroblocks[i].PMV[1][1][0], + batch->macroblocks[i].PMV[1][1][1], + batch->macroblocks[i].cbp, + batch->macroblocks[i].dct_type, + batch->macroblocks[i].blocks, + batch->future_surface, + surface + ); + break; + } + case vlMacroBlockTypeBiPredicted: + { + vlRenderBMacroBlock + ( + mc, + batch->picture_type, + batch->field_order, + batch->macroblocks[i].mbx, + batch->macroblocks[i].mby, + batch->macroblocks[i].mo_type, + batch->macroblocks[i].PMV[0][0][0], + batch->macroblocks[i].PMV[0][0][1], + batch->macroblocks[i].PMV[1][0][0], + batch->macroblocks[i].PMV[1][0][1], + batch->macroblocks[i].PMV[0][1][0], + batch->macroblocks[i].PMV[0][1][1], + batch->macroblocks[i].PMV[1][1][0], + batch->macroblocks[i].PMV[1][1][1], + batch->macroblocks[i].cbp, + batch->macroblocks[i].dct_type, + batch->macroblocks[i].blocks, + batch->past_surface, + batch->future_surface, + surface + ); + break; + } + default: + assert(0); + } + } + + return 0; +} + +static int vlEnd +( + struct vlRender *render +) +{ + assert(render); + + return 0; +} + +static int vlFlush +( + struct vlRender *render +) +{ + assert(render); + + return 0; +} + +static int vlDestroy +( + struct vlRender *render +) +{ + struct vlR16SnormMC *mc; + struct pipe_context *pipe; + unsigned int i; + + assert(render); + + mc = (struct vlR16SnormMC*)render; + pipe = mc->pipe; + + for (i = 0; i < 5; ++i) + pipe->delete_sampler_state(pipe, mc->samplers[i]); + + for (i = 0; i < 3; ++i) + pipe->winsys->buffer_destroy(pipe->winsys, mc->vertex_bufs[i].buffer); + + /* Textures 3 & 4 are not created directly, no need to release them here */ + for (i = 0; i < NUM_BUFS; ++i) + { + pipe_texture_release(&mc->textures[i][0]); + pipe_texture_release(&mc->textures[i][1]); + pipe_texture_release(&mc->textures[i][2]); + } + + pipe->delete_vs_state(pipe, mc->i_vs); + pipe->delete_fs_state(pipe, mc->i_fs); + + for (i = 0; i < 2; ++i) + { + pipe->delete_vs_state(pipe, mc->p_vs[i]); + pipe->delete_fs_state(pipe, mc->p_fs[i]); + pipe->delete_vs_state(pipe, mc->b_vs[i]); + pipe->delete_fs_state(pipe, mc->b_fs[i]); + } + + pipe->winsys->buffer_destroy(pipe->winsys, mc->vs_const_buf.buffer); + pipe->winsys->buffer_destroy(pipe->winsys, mc->fs_const_buf.buffer); + + free(mc); + + return 0; +} + +/* + * Represents 8 triangles (4 quads, 1 per block) in noormalized coords + * that render a macroblock. + * Need to be scaled to cover mbW*mbH macroblock pixels and translated into + * position on target surface. + */ +static const struct vlVertex2f macroblock_verts[24] = +{ + {0.0f, 0.0f}, {0.0f, 0.5f}, {0.5f, 0.0f}, + {0.5f, 0.0f}, {0.0f, 0.5f}, {0.5f, 0.5f}, + + {0.5f, 0.0f}, {0.5f, 0.5f}, {1.0f, 0.0f}, + {1.0f, 0.0f}, {0.5f, 0.5f}, {1.0f, 0.5f}, + + {0.0f, 0.5f}, {0.0f, 1.0f}, {0.5f, 0.5f}, + {0.5f, 0.5f}, {0.0f, 1.0f}, {0.5f, 1.0f}, + + {0.5f, 0.5f}, {0.5f, 1.0f}, {1.0f, 0.5f}, + {1.0f, 0.5f}, {0.5f, 1.0f}, {1.0f, 1.0f} +}; + +/* + * Represents texcoords for the above for rendering 4 luma blocks arranged + * in a bW*(bH*4) texture. First luma block located at 0,0->bW,bH; second at + * 0,bH->bW,2bH; third at 0,2bH->bW,3bH; fourth at 0,3bH->bW,4bH. 
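+ * Each block therefore spans the full 0..1 range in X and a 0.25-high band in Y, mirroring the quad layout of macroblock_verts above.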
+ */ +static const struct vlVertex2f macroblock_luma_texcoords[24] = +{ + {0.0f, 0.0f}, {0.0f, 0.25f}, {1.0f, 0.0f}, + {1.0f, 0.0f}, {0.0f, 0.25f}, {1.0f, 0.25f}, + + {0.0f, 0.25f}, {0.0f, 0.5f}, {1.0f, 0.25f}, + {1.0f, 0.25f}, {0.0f, 0.5f}, {1.0f, 0.5f}, + + {0.0f, 0.5f}, {0.0f, 0.75f}, {1.0f, 0.5f}, + {1.0f, 0.5f}, {0.0f, 0.75f}, {1.0f, 0.75f}, + + {0.0f, 0.75f}, {0.0f, 1.0f}, {1.0f, 0.75f}, + {1.0f, 0.75f}, {0.0f, 1.0f}, {1.0f, 1.0f} +}; + +/* + * Represents texcoords for the above for rendering 1 chroma block. + * Straight forward 0,0->1,1 mapping so we can reuse the MB pos vectors. + */ +static const struct vlVertex2f *macroblock_chroma_420_texcoords = macroblock_verts; + +/* + * Represents texcoords for the above for rendering 2 chroma blocks arranged + * in a bW*(bH*2) texture. First chroma block located at 0,0->bW,bH; second at + * 0,bH->bW,2bH. We can render this with 0,0->1,1 mapping. + * Straight forward 0,0->1,1 mapping so we can reuse MB pos vectors. + */ +static const struct vlVertex2f *macroblock_chroma_422_texcoords = macroblock_verts; + +/* + * Represents texcoords for the above for rendering 4 chroma blocks. + * Same case as 4 luma blocks. + */ +static const struct vlVertex2f *macroblock_chroma_444_texcoords = macroblock_luma_texcoords; + +/* + * Used when rendering P and B macroblocks, multiplier is applied to the A channel, + * which is then added to the L channel, then the bias is subtracted from that to + * get back the differential. The differential is then added to the samples from the + * reference surface(s). + */ +static const struct vlFragmentShaderConsts fs_consts = +{ + {32767.0f / 255.0f, 32767.0f / 255.0f, 32767.0f / 255.0f, 0.0f}, + {0.5f, 2.0f, 0.0f, 0.0f} +}; + +static int vlCreateVertexShaderIMB +( + struct vlR16SnormMC *mc +) +{ + const unsigned int max_tokens = 50; + + struct pipe_context *pipe; + struct pipe_shader_state vs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + + ti = 3; + + /* + * decl i0 ; Vertex pos + * decl i1 ; Luma texcoords + * decl i2 ; Chroma texcoords + */ + for (i = 0; i < 3; i++) + { + decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * decl c0 ; Scaling vector to scale unit rect to macroblock size + * decl c1 ; Translation vector to move macroblock into position + */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl o0 ; Vertex pos + * decl o1 ; Luma texcoords + * decl o2 ; Chroma texcoords + */ + for (i = 0; i < 3; i++) + { + decl = vl_decl_output(i == 0 ? 
TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* decl t0 */ + decl = vl_decl_temps(0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* mul t0, i0, c0 ; Scale unit rect to normalized MB size */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* add o0, t0, c1 ; Translate rect into position */ + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* + * mov o1, i1 ; Move input luma texcoords to output + * mov o2, i2 ; Move input chroma texcoords to output + */ + for (i = 1; i < 3; ++i) + { + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + vs.tokens = tokens; + mc->i_vs = pipe->create_vs_state(pipe, &vs); + free(tokens); + + return 0; +} + +static int vlCreateFragmentShaderIMB +( + struct vlR16SnormMC *mc +) +{ + const unsigned int max_tokens = 100; + + struct pipe_context *pipe; + struct pipe_shader_state fs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + + ti = 3; + + /* + * decl i0 ; Texcoords for s0 + * decl i1 ; Texcoords for s1, s2 + */ + for (i = 0; i < 2; ++i) + { + decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* decl c0 ; Scaling factor, rescales 16-bit snorm to 9-bit snorm */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl o0 ; Fragment color */ + decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl t0, t1 */ + decl = vl_decl_temps(0, 1); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl s0 ; Sampler for luma texture + * decl s1 ; Sampler for chroma Cb texture + * decl s2 ; Sampler for chroma Cr texture + */ + for (i = 0; i < 3; ++i) + { + decl = vl_decl_samplers(i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header,max_tokens - ti); + } + + /* + * tex2d t1, i0, s0 ; Read texel from luma texture + * mov t0.x, t1.x ; Move luma sample into .x component + * tex2d t1, i1, s1 ; Read texel from chroma Cb texture + * mov t0.y, t1.x ; Move Cb sample into .y component + * tex2d t1, i1, s2 ; Read texel from chroma Cr texture + * mov t0.z, t1.x ; Move Cr sample into .z component + */ + for (i = 0; i < 3; ++i) + { + inst = 
vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + } + + /* mul o0, t0, c0 ; Rescale texel to correct range */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + fs.tokens = tokens; + mc->i_fs = pipe->create_fs_state(pipe, &fs); + free(tokens); + + return 0; +} + +static int vlCreateVertexShaderFramePMB +( + struct vlR16SnormMC *mc +) +{ + const unsigned int max_tokens = 100; + + struct pipe_context *pipe; + struct pipe_shader_state vs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + + ti = 3; + + /* + * decl i0 ; Vertex pos + * decl i1 ; Luma texcoords + * decl i2 ; Chroma texcoords + */ + for (i = 0; i < 3; i++) + { + decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * decl c0 ; Scaling vector to scale unit rect to macroblock size + * decl c1 ; Translation vector to move macroblock into position + * decl c2 ; Unused + * decl c3 ; Translation vector to move ref macroblock texcoords into position + */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 3); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl o0 ; Vertex pos + * decl o1 ; Luma texcoords + * decl o2 ; Chroma texcoords + * decl o3 ; Ref macroblock texcoords + */ + for (i = 0; i < 4; i++) + { + decl = vl_decl_output(i == 0 ? 
TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* decl t0 */ + decl = vl_decl_temps(0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* mul t0, i0, c0 ; Scale unit rect to normalized MB size */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* add o0, t0, c1 ; Translate rect into position */ + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* + * mov o1, i1 ; Move input luma texcoords to output + * mov o2, i2 ; Move input chroma texcoords to output + */ + for (i = 1; i < 3; ++i) + { + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* add o3, t0, c3 ; Translate rect into position on ref macroblock */ + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + vs.tokens = tokens; + mc->p_vs[0] = pipe->create_vs_state(pipe, &vs); + free(tokens); + + return 0; +} + +static int vlCreateVertexShaderFieldPMB +( + struct vlR16SnormMC *mc +) +{ + const unsigned int max_tokens = 100; + + struct pipe_context *pipe; + struct pipe_shader_state vs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + + ti = 3; + + /* + * decl i0 ; Vertex pos + * decl i1 ; Luma texcoords + * decl i2 ; Chroma texcoords + */ + for (i = 0; i < 3; i++) + { + decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration + ( + &decl, + &tokens[ti], + header, + max_tokens - ti + ); + } + + /* + * decl c0 ; Scaling vector to scale unit rect to macroblock size + * decl c1 ; Translation vector to move macroblock into position + * decl c2 ; Denorm coefficients + * decl c3 ; Translation vector to move top field ref macroblock texcoords into position + * decl c4 ; Translation vector to move bottom field ref macroblock texcoords into position + */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 4); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl o0 ; Vertex pos + * decl o1 ; Luma texcoords + * decl o2 ; Chroma texcoords + * decl o3 ; Top field ref macroblock texcoords + * decl o4 ; Bottom field ref macroblock texcoords + * decl o5 ; Denormalized vertex pos + */ + for (i = 0; i < 6; i++) + { + decl = vl_decl_output((i == 0 || i == 5) ? 
TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* decl t0, t1 */ + decl = vl_decl_temps(0, 1); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* mul t0, i0, c0 ; Scale unit rect to normalized MB size */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* add t1, t0, c1 ; Translate rect into position */ + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* mov o0, t1 ; Move vertex pos to output */ + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* + mov o1, i1 ; Move input luma texcoords to output + mov o2, i2 ; Move input chroma texcoords to output + */ + for (i = 1; i < 3; ++i) + { + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* add o3, t0, c3 ; Translate top field rect into position on ref macroblock + add o4, t0, c4 ; Translate bottom field rect into position on ref macroblock */ + for (i = 0; i < 2; ++i) + { + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i + 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* mul o5, t1, c2 ; Denorm vertex pos */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 5, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 2); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + vs.tokens = tokens; + mc->p_vs[1] = pipe->create_vs_state(pipe, &vs); + free(tokens); + + return 0; +} + +static int vlCreateFragmentShaderFramePMB +( + struct vlR16SnormMC *mc +) +{ + const unsigned int max_tokens = 100; + + struct pipe_context *pipe; + struct pipe_shader_state fs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + + ti = 3; + + /* + * decl i0 ; Texcoords for s0 + * decl i1 ; Texcoords for s1, s2 + * decl i2 ; Texcoords for s3 + */ + for (i = 0; i < 3; ++i) + { + decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* decl c0 ; Scaling factor, rescales 16-bit snorm to 9-bit snorm */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl o0 ; Fragment color */ + decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); + ti += 
tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl t0, t1 */ + decl = vl_decl_temps(0, 1); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl s0 ; Sampler for luma texture + * decl s1 ; Sampler for chroma Cb texture + * decl s2 ; Sampler for chroma Cr texture + * decl s3 ; Sampler for ref surface texture + */ + for (i = 0; i < 4; ++i) + { + decl = vl_decl_samplers(i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * tex2d t1, i0, s0 ; Read texel from luma texture + * mov t0.x, t1.x ; Move luma sample into .x component + * tex2d t1, i1, s1 ; Read texel from chroma Cb texture + * mov t0.y, t1.x ; Move Cb sample into .y component + * tex2d t1, i1, s2 ; Read texel from chroma Cr texture + * mov t0.z, t1.x ; Move Cr sample into .z component + */ + for (i = 0; i < 3; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + } + + /* mul t0, t0, c0 ; Rescale texel to correct range */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* tex2d t1, i2, s3 ; Read texel from ref macroblock */ + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 2, TGSI_FILE_SAMPLER, 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* add o0, t0, t1 ; Add ref and differential to form final output */ + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + fs.tokens = tokens; + mc->p_fs[0] = pipe->create_fs_state(pipe, &fs); + free(tokens); + + return 0; +} + +static int vlCreateFragmentShaderFieldPMB +( + struct vlR16SnormMC *mc +) +{ + const unsigned int max_tokens = 200; + + struct pipe_context *pipe; + struct pipe_shader_state fs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + + ti = 3; + + /* + * decl i0 ; Texcoords for s0 + * decl i1 ; Texcoords for s1, s2 + * decl i2 ; Texcoords for s3 + * decl i3 ; Texcoords for s3 + * decl i4 ; Denormalized vertex pos + */ + for (i = 0; i < 5; ++i) + { + decl = 
vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * decl c0 ; Scaling factor, rescales 16-bit snorm to 9-bit snorm + * decl c1 ; Constants 1/2 & 2 in .x, .y channels for Y-mod-2 top/bottom field selection + */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl o0 ; Fragment color */ + decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl t0-t4 */ + decl = vl_decl_temps(0, 4); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl s0 ; Sampler for luma texture + * decl s1 ; Sampler for chroma Cb texture + * decl s2 ; Sampler for chroma Cr texture + * decl s3 ; Sampler for ref surface texture + */ + for (i = 0; i < 4; ++i) + { + decl = vl_decl_samplers(i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * tex2d t1, i0, s0 ; Read texel from luma texture + * mov t0.x, t1.x ; Move luma sample into .x component + * tex2d t1, i1, s1 ; Read texel from chroma Cb texture + * mov t0.y, t1.x ; Move Cb sample into .y component + * tex2d t1, i1, s2 ; Read texel from chroma Cr texture + * mov t0.z, t1.x ; Move Cr sample into .z component + */ + for (i = 0; i < 3; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + } + + /* mul t0, t0, c0 ; Rescale texel to correct range */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* + * tex2d t1, i2, s3 ; Read texel from ref macroblock top field + * tex2d t2, i3, s3 ; Read texel from ref macroblock bottom field + */ + for (i = 0; i < 2; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 2, TGSI_FILE_SAMPLER, 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* XXX: Pos values off by 0.5? 
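+ * The instructions that follow derive Y mod 2 from the denormalized position (y - floor(y * 0.5) * 2) and use it to choose between the top- and bottom-field reference samples via lerp.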
*/ + /* sub t4, i4.y, c1.x ; Sub 0.5 from denormalized pos */ + inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 4, TGSI_FILE_CONSTANT, 1); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* mul t3, t4, c1.x ; Multiply pos Y-coord by 1/2 */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1); + inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* floor t3, t3 ; Get rid of fractional part */ + inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* mul t3, t3, c1.y ; Multiply by 2 */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1); + inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* sub t3, t4, t3 ; Subtract from original Y to get Y % 2 */ + inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* TODO: Move to conditional tex fetch on t3 instead of lerp */ + /* lerp t1, t3, t1, t2 ; Choose between top and bottom fields based on Y % 2 */ + inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* add o0, t0, t1 ; Add ref and differential to form final output */ + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + fs.tokens = tokens; + mc->p_fs[1] = pipe->create_fs_state(pipe, &fs); + free(tokens); + + return 0; +} + +static int vlCreateVertexShaderFrameBMB +( + struct vlR16SnormMC *mc +) +{ + const unsigned int max_tokens = 100; + + struct pipe_context *pipe; + struct pipe_shader_state vs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * 
sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + + ti = 3; + + /* + * decl i0 ; Vertex pos + * decl i1 ; Luma texcoords + * decl i2 ; Chroma texcoords + */ + for (i = 0; i < 3; i++) + { + decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * decl c0 ; Scaling vector to scale unit rect to macroblock size + * decl c1 ; Translation vector to move macroblock into position + * decl c2 ; Unused + * decl c3 ; Translation vector to move past ref macroblock texcoords into position + * decl c4 ; Unused + * decl c5 ; Translation vector to move future ref macroblock texcoords into position + */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 5); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl o0 ; Vertex pos + * decl o1 ; Luma texcoords + * decl o2 ; Chroma texcoords + * decl o3 ; Past ref macroblock texcoords + * decl o4 ; Future ref macroblock texcoords + */ + for (i = 0; i < 5; i++) + { + decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* decl t0 */ + decl = vl_decl_temps(0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* mul t0, i0, c0 ; Scale unit rect to normalized MB size */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* add o0, t0, c1 ; Translate rect into position */ + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* + * mov o1, i1 ; Move input luma texcoords to output + * mov o2, i2 ; Move input chroma texcoords to output + */ + for (i = 1; i < 3; ++i) + { + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* add o3, t0, c3 ; Translate rect into position on past ref macroblock + add o4, t0, c5 ; Translate rect into position on future ref macroblock */ + for (i = 0; i < 2; ++i) + { + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i * 2 + 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + vs.tokens = tokens; + mc->b_vs[0] = pipe->create_vs_state(pipe, &vs); + free(tokens); + + return 0; +} + +static int vlCreateVertexShaderFieldBMB +( + struct vlR16SnormMC *mc +) +{ + const unsigned int max_tokens = 100; + + struct pipe_context *pipe; + struct pipe_shader_state vs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + 
*(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + + ti = 3; + + /* + * decl i0 ; Vertex pos + * decl i1 ; Luma texcoords + * decl i2 ; Chroma texcoords + */ + for (i = 0; i < 3; i++) + { + decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * decl c0 ; Scaling vector to scale unit rect to macroblock size + * decl c1 ; Translation vector to move macroblock into position + * decl c2 ; Denorm coefficients + * decl c3 ; Translation vector to move top field past ref macroblock texcoords into position + * decl c4 ; Translation vector to move bottom field past ref macroblock texcoords into position + * decl c5 ; Translation vector to move top field future ref macroblock texcoords into position + * decl c6 ; Translation vector to move bottom field future ref macroblock texcoords into position + */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 6); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl o0 ; Vertex pos + * decl o1 ; Luma texcoords + * decl o2 ; Chroma texcoords + * decl o3 ; Top field past ref macroblock texcoords + * decl o4 ; Bottom field past ref macroblock texcoords + * decl o5 ; Top field future ref macroblock texcoords + * decl o6 ; Bottom field future ref macroblock texcoords + * decl o7 ; Denormalized vertex pos + */ + for (i = 0; i < 8; i++) + { + decl = vl_decl_output((i == 0 || i == 7) ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* decl t0, t1 */ + decl = vl_decl_temps(0, 1); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* mul t0, i0, c0 ; Scale unit rect to normalized MB size */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* add t1, t0, c1 ; Translate rect into position */ + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* mov o0, t1 ; Move vertex pos to output */ + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* + * mov o1, i1 ; Move input luma texcoords to output + * mov o2, i2 ; Move input chroma texcoords to output + */ + for (i = 1; i < 3; ++i) + { + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* + * add o3, t0, c3 ; Translate top field rect into position on past ref macroblock + * add o4, t0, c4 ; Translate bottom field rect into position on past ref macroblock + * add o5, t0, c5 ; Translate top field rect into position on future ref macroblock + * add o6, t0, c6 ; Translate bottom field rect into position on future ref macroblock + */ + for (i = 0; i < 4; ++i) + { + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 3, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, i + 3); + ti += 
tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* mul o7, t1, c2 ; Denorm vertex pos */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 7, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 2); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + vs.tokens = tokens; + mc->b_vs[1] = pipe->create_vs_state(pipe, &vs); + free(tokens); + + return 0; +} + +static int vlCreateFragmentShaderFrameBMB +( + struct vlR16SnormMC *mc +) +{ + const unsigned int max_tokens = 100; + + struct pipe_context *pipe; + struct pipe_shader_state fs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + + ti = 3; + + /* + * decl i0 ; Texcoords for s0 + * decl i1 ; Texcoords for s1, s2 + * decl i2 ; Texcoords for s3 + * decl i3 ; Texcoords for s4 + */ + for (i = 0; i < 4; ++i) + { + decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * decl c0 ; Scaling factor, rescales 16-bit snorm to 9-bit snorm + * decl c1 ; Constant 1/2 in .x channel to use as weight to blend past and future texels + */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl o0 ; Fragment color */ + decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl t0-t2 */ + decl = vl_decl_temps(0, 2); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl s0 ; Sampler for luma texture + * decl s1 ; Sampler for chroma Cb texture + * decl s2 ; Sampler for chroma Cr texture + * decl s3 ; Sampler for past ref surface texture + * decl s4 ; Sampler for future ref surface texture + */ + for (i = 0; i < 5; ++i) + { + decl = vl_decl_samplers(i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * tex2d t1, i0, s0 ; Read texel from luma texture + * mov t0.x, t1.x ; Move luma sample into .x component + * tex2d t1, i1, s1 ; Read texel from chroma Cb texture + * mov t0.y, t1.x ; Move Cb sample into .y component + * tex2d t1, i1, s2 ; Read texel from chroma Cr texture + * mov t0.z, t1.x ; Move Cr sample into .z component + */ + for (i = 0; i < 3; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 
1 : 0, TGSI_FILE_SAMPLER, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + } + + /* mul t0, t0, c0 ; Rescale texel to correct range */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* + * tex2d t1, i2, s3 ; Read texel from past ref macroblock + * tex2d t2, i3, s4 ; Read texel from future ref macroblock + */ + for (i = 0; i < 2; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 2, TGSI_FILE_SAMPLER, i + 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* lerp t1, c1.x, t1, t2 ; Blend past and future texels */ + inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* add o0, t0, t1 ; Add past/future ref and differential to form final output */ + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + fs.tokens = tokens; + mc->b_fs[0] = pipe->create_fs_state(pipe, &fs); + free(tokens); + + return 0; +} + +static int vlCreateFragmentShaderFieldBMB +( + struct vlR16SnormMC *mc +) +{ + const unsigned int max_tokens = 200; + + struct pipe_context *pipe; + struct pipe_shader_state fs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + + ti = 3; + + /* + * decl i0 ; Texcoords for s0 + * decl i1 ; Texcoords for s1, s2 + * decl i2 ; Texcoords for s3 + * decl i3 ; Texcoords for s3 + * decl i4 ; Texcoords for s4 + * decl i5 ; Texcoords for s4 + * decl i6 ; Denormalized vertex pos + */ + for (i = 0; i < 7; ++i) + { + decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * decl c0 ; Scaling factor, rescales 16-bit snorm to 9-bit snorm + * decl c1 ; Constants 1/2 & 2 in .x, .y channels to use as 
weight to blend past and future texels + * ; and for Y-mod-2 top/bottom field selection + */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl o0 ; Fragment color */ + decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl t0-t5 */ + decl = vl_decl_temps(0, 5); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl s0 ; Sampler for luma texture + * decl s1 ; Sampler for chroma Cb texture + * decl s2 ; Sampler for chroma Cr texture + * decl s3 ; Sampler for past ref surface texture + * decl s4 ; Sampler for future ref surface texture + */ + for (i = 0; i < 5; ++i) + { + decl = vl_decl_samplers(i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * tex2d t1, i0, s0 ; Read texel from luma texture + * mov t0.x, t1.x ; Move luma sample into .x component + * tex2d t1, i1, s1 ; Read texel from chroma Cb texture + * mov t0.y, t1.x ; Move Cb sample into .y component + * tex2d t1, i1, s2 ; Read texel from chroma Cr texture + * mov t0.z, t1.x ; Move Cr sample into .z component + */ + for (i = 0; i < 3; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i > 0 ? 1 : 0, TGSI_FILE_SAMPLER, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + } + + /* mul t0, t0, c0 ; Rescale texel to correct range */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* XXX: Pos values off by 0.5? 
*/ + /* sub t4, i6.y, c1.x ; Sub 0.5 from denormalized pos */ + inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 6, TGSI_FILE_CONSTANT, 1); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* mul t3, t4, c1.x ; Multiply pos Y-coord by 1/2 */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1); + inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* floor t3, t3 ; Get rid of fractional part */ + inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* mul t3, t3, c1.y ; Multiply by 2 */ + inst = vl_inst3( TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1); + inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* sub t3, t4, t3 ; Subtract from original Y to get Y % 2 */ + inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* + * tex2d t1, i2, s3 ; Read texel from past ref macroblock top field + * tex2d t2, i3, s3 ; Read texel from past ref macroblock bottom field + */ + for (i = 0; i < 2; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 2, TGSI_FILE_SAMPLER, 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* TODO: Move to conditional tex fetch on t3 instead of lerp */ + /* lerp t1, t3, t1, t2 ; Choose between top and bottom fields based on Y % 2 */ + inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* + * tex2d t4, i4, s4 ; Read texel from future ref macroblock top field + * tex2d t5, i5, s4 ; Read texel from future ref macroblock bottom field + */ + for (i = 0; i < 2; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 4, TGSI_FILE_INPUT, i + 4, TGSI_FILE_SAMPLER, 4); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* TODO: Move to conditional tex fetch on t3 instead of lerp */ + /* lerp t2, t3, t4, t5 ; Choose between top and bottom fields based on Y % 2 */ + inst = 
vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 2, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 5); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* lerp t1, c1.x, t1, t2 ; Blend past and future texels */ + inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* add o0, t0, t1 ; Add past/future ref and differential to form final output */ + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + fs.tokens = tokens; + mc->b_fs[1] = pipe->create_fs_state(pipe, &fs); + free(tokens); + + return 0; +} + +static int vlCreateDataBufs +( + struct vlR16SnormMC *mc +) +{ + struct pipe_context *pipe; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + + /* Create our vertex buffer and vertex buffer element */ + mc->vertex_bufs[0].pitch = sizeof(struct vlVertex2f); + mc->vertex_bufs[0].max_index = 23; + mc->vertex_bufs[0].buffer_offset = 0; + mc->vertex_bufs[0].buffer = pipe->winsys->buffer_create + ( + pipe->winsys, + 1, + PIPE_BUFFER_USAGE_VERTEX, + sizeof(struct vlVertex2f) * 24 + ); + + mc->vertex_elems[0].src_offset = 0; + mc->vertex_elems[0].vertex_buffer_index = 0; + mc->vertex_elems[0].nr_components = 2; + mc->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT; + + /* Create our texcoord buffers and texcoord buffer elements */ + for (i = 1; i < 3; ++i) + { + mc->vertex_bufs[i].pitch = sizeof(struct vlVertex2f); + mc->vertex_bufs[i].max_index = 23; + mc->vertex_bufs[i].buffer_offset = 0; + mc->vertex_bufs[i].buffer = pipe->winsys->buffer_create + ( + pipe->winsys, + 1, + PIPE_BUFFER_USAGE_VERTEX, + sizeof(struct vlVertex2f) * 24 + ); + + mc->vertex_elems[i].src_offset = 0; + mc->vertex_elems[i].vertex_buffer_index = i; + mc->vertex_elems[i].nr_components = 2; + mc->vertex_elems[i].src_format = PIPE_FORMAT_R32G32_FLOAT; + } + + /* Fill buffers */ + memcpy + ( + pipe->winsys->buffer_map(pipe->winsys, mc->vertex_bufs[0].buffer, PIPE_BUFFER_USAGE_CPU_WRITE), + macroblock_verts, + sizeof(struct vlVertex2f) * 24 + ); + memcpy + ( + pipe->winsys->buffer_map(pipe->winsys, mc->vertex_bufs[1].buffer, PIPE_BUFFER_USAGE_CPU_WRITE), + macroblock_luma_texcoords, + sizeof(struct vlVertex2f) * 24 + ); + /* TODO: Accomodate 422, 444 */ + memcpy + ( + pipe->winsys->buffer_map(pipe->winsys, mc->vertex_bufs[2].buffer, PIPE_BUFFER_USAGE_CPU_WRITE), + macroblock_chroma_420_texcoords, + sizeof(struct vlVertex2f) * 24 + ); + + for (i = 0; i < 3; ++i) + pipe->winsys->buffer_unmap(pipe->winsys, mc->vertex_bufs[i].buffer); + + /* Create our constant buffer */ + mc->vs_const_buf.size = sizeof(struct vlVertexShaderConsts); + mc->vs_const_buf.buffer = pipe->winsys->buffer_create + ( + pipe->winsys, + 1, + PIPE_BUFFER_USAGE_CONSTANT, + mc->vs_const_buf.size + ); + + mc->fs_const_buf.size = sizeof(struct vlFragmentShaderConsts); + mc->fs_const_buf.buffer = pipe->winsys->buffer_create + ( + 
pipe->winsys, + 1, + PIPE_BUFFER_USAGE_CONSTANT, + mc->fs_const_buf.size + ); + + memcpy + ( + pipe->winsys->buffer_map(pipe->winsys, mc->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE), + &fs_consts, + sizeof(struct vlFragmentShaderConsts) + ); + + pipe->winsys->buffer_unmap(pipe->winsys, mc->fs_const_buf.buffer); + + return 0; +} + +static int vlInit +( + struct vlR16SnormMC *mc +) +{ + struct pipe_context *pipe; + struct pipe_sampler_state sampler; + struct pipe_texture template; + unsigned int filters[5]; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + + /* For MC we render to textures, which are rounded up to nearest POT */ + mc->viewport.scale[0] = vlRoundUpPOT(mc->video_width); + mc->viewport.scale[1] = vlRoundUpPOT(mc->video_height); + mc->viewport.scale[2] = 1; + mc->viewport.scale[3] = 1; + mc->viewport.translate[0] = 0; + mc->viewport.translate[1] = 0; + mc->viewport.translate[2] = 0; + mc->viewport.translate[3] = 0; + + mc->render_target.width = vlRoundUpPOT(mc->video_width); + mc->render_target.height = vlRoundUpPOT(mc->video_height); + mc->render_target.num_cbufs = 1; + /* FB for MC stage is a vlSurface, set in vlSetRenderSurface() */ + mc->render_target.zsbuf = NULL; + + filters[0] = PIPE_TEX_FILTER_NEAREST; + filters[1] = mc->video_format == vlFormatYCbCr444 ? PIPE_TEX_FILTER_NEAREST : PIPE_TEX_FILTER_LINEAR; + filters[2] = mc->video_format == vlFormatYCbCr444 ? PIPE_TEX_FILTER_NEAREST : PIPE_TEX_FILTER_LINEAR; + filters[3] = PIPE_TEX_FILTER_LINEAR; + filters[4] = PIPE_TEX_FILTER_LINEAR; + + for (i = 0; i < 5; ++i) + { + sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + sampler.min_img_filter = filters[i]; + sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE; + sampler.mag_img_filter = filters[i]; + sampler.compare_mode = PIPE_TEX_COMPARE_NONE; + sampler.compare_func = PIPE_FUNC_ALWAYS; + sampler.normalized_coords = 1; + /*sampler.prefilter = ;*/ + /*sampler.shadow_ambient = ;*/ + /*sampler.lod_bias = ;*/ + sampler.min_lod = 0; + /*sampler.max_lod = ;*/ + /*sampler.border_color[i] = ;*/ + /*sampler.max_anisotropy = ;*/ + mc->samplers[i] = pipe->create_sampler_state(pipe, &sampler); + } + + memset(&template, 0, sizeof(struct pipe_texture)); + template.target = PIPE_TEXTURE_2D; + template.format = PIPE_FORMAT_R16_SNORM; + template.last_level = 0; + template.width[0] = 8; + template.height[0] = 8 * 4; + template.depth[0] = 1; + template.compressed = 0; + pf_get_block(template.format, &template.block); + + for (i = 0; i < NUM_BUFS; ++i) + mc->textures[i][0] = pipe->screen->texture_create(pipe->screen, &template); + + if (mc->video_format == vlFormatYCbCr420) + template.height[0] = 8; + else if (mc->video_format == vlFormatYCbCr422) + template.height[0] = 8 * 2; + else if (mc->video_format == vlFormatYCbCr444) + template.height[0] = 8 * 4; + else + assert(0); + + for (i = 0; i < NUM_BUFS; ++i) + { + mc->textures[i][1] = pipe->screen->texture_create(pipe->screen, &template); + mc->textures[i][2] = pipe->screen->texture_create(pipe->screen, &template); + } + + /* textures[3] & textures[4] are assigned from vlSurfaces for P and B macroblocks at render time */ + + vlCreateVertexShaderIMB(mc); + vlCreateFragmentShaderIMB(mc); + vlCreateVertexShaderFramePMB(mc); + vlCreateVertexShaderFieldPMB(mc); + vlCreateFragmentShaderFramePMB(mc); + vlCreateFragmentShaderFieldPMB(mc); + vlCreateVertexShaderFrameBMB(mc); + vlCreateVertexShaderFieldBMB(mc); + vlCreateFragmentShaderFrameBMB(mc); + 
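+ /*
+  * One vertex/fragment shader pair is built per macroblock class
+  * (intra, frame/field forward-predicted, frame/field bi-predicted);
+  * the renderer binds whichever pair matches a macroblock's coding type.
+  */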
vlCreateFragmentShaderFieldBMB(mc); + vlCreateDataBufs(mc); + + return 0; +} + +int vlCreateR16SNormMC +( + struct pipe_context *pipe, + unsigned int video_width, + unsigned int video_height, + enum vlFormat video_format, + struct vlRender **render +) +{ + struct vlR16SnormMC *mc; + + assert(pipe); + assert(render); + + mc = calloc(1, sizeof(struct vlR16SnormMC)); + + mc->base.vlBegin = &vlBegin; + mc->base.vlRenderMacroBlocksMpeg2 = &vlRenderMacroBlocksMpeg2R16Snorm; + mc->base.vlEnd = &vlEnd; + mc->base.vlFlush = &vlFlush; + mc->base.vlDestroy = &vlDestroy; + mc->pipe = pipe; + mc->video_width = video_width; + mc->video_height = video_height; + mc->cur_buf = 0; + + vlInit(mc); + + *render = &mc->base; + + return 0; +} diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h new file mode 100644 index 0000000000..9842926bf7 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc.h @@ -0,0 +1,18 @@ +#ifndef vl_r16snorm_mc_h +#define vl_r16snorm_mc_h + +#include "vl_types.h" + +struct pipe_context; +struct vlRender; + +int vlCreateR16SNormMC +( + struct pipe_context *pipe, + unsigned int video_width, + unsigned int video_height, + enum vlFormat video_format, + struct vlRender **render +); + +#endif diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.c b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.c new file mode 100644 index 0000000000..650528ed8f --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.c @@ -0,0 +1,1147 @@ +#define VL_INTERNAL +#include "vl_r16snorm_mc_buf.h" +#include <assert.h> +#include <stdlib.h> +#include <pipe/p_context.h> +#include <pipe/p_winsys.h> +#include <pipe/p_screen.h> +#include <pipe/p_state.h> +#include <pipe/p_inlines.h> +#include <tgsi/tgsi_parse.h> +#include <tgsi/tgsi_build.h> +#include <util/u_math.h> +#include "vl_render.h" +#include "vl_shader_build.h" +#include "vl_surface.h" +#include "vl_util.h" +#include "vl_types.h" +#include "vl_defs.h" + +/* + * TODO: Dynamically determine number of buf sets to use, based on + * video size and available mem, since we can easily run out of memory + * for high res videos. + * Note: Destroying previous frame's buffers and creating new ones + * doesn't work, since the buffer are not actually destroyed until their + * fence is signalled, and if we render fast enough we will create faster + * than we destroy. 
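+ * One option might be a fence-aware pool that waits on each surface's
+ * render_fence before reusing a buffer set, trading occasional stalls
+ * for bounded memory use.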
+ */ +#define NUM_BUF_SETS 4 /* Number of rotating buffer sets to use */ + +enum vlMacroBlockTypeEx +{ + vlMacroBlockExTypeIntra, + vlMacroBlockExTypeFwdPredictedFrame, + vlMacroBlockExTypeFwdPredictedField, + vlMacroBlockExTypeBkwdPredictedFrame, + vlMacroBlockExTypeBkwdPredictedField, + vlMacroBlockExTypeBiPredictedFrame, + vlMacroBlockExTypeBiPredictedField, + + vlNumMacroBlockExTypes +}; + +struct vlVertexShaderConsts +{ + struct vlVertex4f denorm; +}; + +struct vlFragmentShaderConsts +{ + struct vlVertex4f multiplier; + struct vlVertex4f div; +}; + +struct vlR16SnormBufferedMC +{ + struct vlRender base; + + unsigned int picture_width, picture_height; + enum vlFormat picture_format; + + unsigned int cur_buf; + struct vlSurface *buffered_surface; + struct vlSurface *past_surface, *future_surface; + struct vlVertex2f surface_tex_inv_size; + struct vlVertex2f zero_block[3]; + unsigned int num_macroblocks; + struct vlMpeg2MacroBlock *macroblocks; + + struct pipe_context *pipe; + struct pipe_viewport_state viewport; + struct pipe_framebuffer_state render_target; + struct pipe_sampler_state *samplers[5]; + struct pipe_texture *textures[NUM_BUF_SETS][5]; + struct pipe_surface *tex_surface[3]; + short *texels[3]; + void *i_vs, *p_vs[2], *b_vs[2]; + void *i_fs, *p_fs[2], *b_fs[2]; + struct pipe_vertex_buffer vertex_bufs[NUM_BUF_SETS][3]; + struct pipe_vertex_element vertex_elems[8]; + struct pipe_constant_buffer vs_const_buf, fs_const_buf; +}; + +static int vlBegin +( + struct vlRender *render +) +{ + assert(render); + + return 0; +} + +static inline int vlGrabFrameCodedBlock(short *src, short *dst, unsigned int dst_pitch) +{ + unsigned int y; + + for (y = 0; y < VL_BLOCK_HEIGHT; ++y) + memcpy + ( + dst + y * dst_pitch, + src + y * VL_BLOCK_WIDTH, + VL_BLOCK_WIDTH * 2 + ); + + return 0; +} + +static inline int vlGrabFieldCodedBlock(short *src, short *dst, unsigned int dst_pitch) +{ + unsigned int y; + + for (y = 0; y < VL_BLOCK_HEIGHT; ++y) + memcpy + ( + dst + y * dst_pitch * 2, + src + y * VL_BLOCK_WIDTH, + VL_BLOCK_WIDTH * 2 + ); + + return 0; +} + +static inline int vlGrabNoBlock(short *dst, unsigned int dst_pitch) +{ + unsigned int y; + + for (y = 0; y < VL_BLOCK_HEIGHT; ++y) + memset + ( + dst + y * dst_pitch, + 0, + VL_BLOCK_WIDTH * 2 + ); + + return 0; +} + +static inline int vlGrabBlocks +( + struct vlR16SnormBufferedMC *mc, + unsigned int mbx, + unsigned int mby, + enum vlDCTType dct_type, + unsigned int coded_block_pattern, + short *blocks +) +{ + short *texels; + unsigned int tex_pitch; + unsigned int x, y, tb = 0, sb = 0; + unsigned int mbpx = mbx * VL_MACROBLOCK_WIDTH, mbpy = mby * VL_MACROBLOCK_HEIGHT; + + assert(mc); + assert(blocks); + + tex_pitch = mc->tex_surface[0]->stride / mc->tex_surface[0]->block.size; + texels = mc->texels[0] + mbpy * tex_pitch + mbpx; + + for (y = 0; y < 2; ++y) + { + for (x = 0; x < 2; ++x, ++tb) + { + if ((coded_block_pattern >> (5 - tb)) & 1) + { + short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT; + + if (dct_type == vlDCTTypeFrameCoded) + { + vlGrabFrameCodedBlock + ( + cur_block, + texels + y * tex_pitch * VL_BLOCK_HEIGHT + x * VL_BLOCK_WIDTH, + tex_pitch + ); + } + else + { + vlGrabFieldCodedBlock + ( + cur_block, + texels + y * tex_pitch + x * VL_BLOCK_WIDTH, + tex_pitch + ); + } + + ++sb; + } + else if (mc->zero_block[0].x < 0.0f) + { + vlGrabNoBlock(texels + y * tex_pitch * VL_BLOCK_HEIGHT + x * VL_BLOCK_WIDTH, tex_pitch); + + mc->zero_block[0].x = (mbpx + x * 8) * mc->surface_tex_inv_size.x; + mc->zero_block[0].y = (mbpy + y 
* 8) * mc->surface_tex_inv_size.y; + } + } + } + + /* TODO: Implement 422, 444 */ + mbpx >>= 1; + mbpy >>= 1; + + for (tb = 0; tb < 2; ++tb) + { + tex_pitch = mc->tex_surface[tb + 1]->stride / mc->tex_surface[tb + 1]->block.size; + texels = mc->texels[tb + 1] + mbpy * tex_pitch + mbpx; + + if ((coded_block_pattern >> (1 - tb)) & 1) + { + short *cur_block = blocks + sb * VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT; + + vlGrabFrameCodedBlock + ( + cur_block, + texels, + tex_pitch + ); + + ++sb; + } + else if (mc->zero_block[tb + 1].x < 0.0f) + { + vlGrabNoBlock(texels, tex_pitch); + + mc->zero_block[tb + 1].x = (mbpx << 1) * mc->surface_tex_inv_size.x; + mc->zero_block[tb + 1].y = (mbpy << 1) * mc->surface_tex_inv_size.y; + } + } + + return 0; +} + +static inline enum vlMacroBlockTypeEx vlGetMacroBlockTypeEx(struct vlMpeg2MacroBlock *mb) +{ + assert(mb); + + switch (mb->mb_type) + { + case vlMacroBlockTypeIntra: + return vlMacroBlockExTypeIntra; + case vlMacroBlockTypeFwdPredicted: + return mb->mo_type == vlMotionTypeFrame ? + vlMacroBlockExTypeFwdPredictedFrame : vlMacroBlockExTypeFwdPredictedField; + case vlMacroBlockTypeBkwdPredicted: + return mb->mo_type == vlMotionTypeFrame ? + vlMacroBlockExTypeBkwdPredictedFrame : vlMacroBlockExTypeBkwdPredictedField; + case vlMacroBlockTypeBiPredicted: + return mb->mo_type == vlMotionTypeFrame ? + vlMacroBlockExTypeBiPredictedFrame : vlMacroBlockExTypeBiPredictedField; + default: + assert(0); + } + + /* Unreachable */ + return -1; +} + +static inline int vlGrabMacroBlock +( + struct vlR16SnormBufferedMC *mc, + struct vlMpeg2MacroBlock *macroblock +) +{ + assert(mc); + assert(macroblock); + + mc->macroblocks[mc->num_macroblocks].mbx = macroblock->mbx; + mc->macroblocks[mc->num_macroblocks].mby = macroblock->mby; + mc->macroblocks[mc->num_macroblocks].mb_type = macroblock->mb_type; + mc->macroblocks[mc->num_macroblocks].mo_type = macroblock->mo_type; + mc->macroblocks[mc->num_macroblocks].dct_type = macroblock->dct_type; + mc->macroblocks[mc->num_macroblocks].PMV[0][0][0] = macroblock->PMV[0][0][0]; + mc->macroblocks[mc->num_macroblocks].PMV[0][0][1] = macroblock->PMV[0][0][1]; + mc->macroblocks[mc->num_macroblocks].PMV[0][1][0] = macroblock->PMV[0][1][0]; + mc->macroblocks[mc->num_macroblocks].PMV[0][1][1] = macroblock->PMV[0][1][1]; + mc->macroblocks[mc->num_macroblocks].PMV[1][0][0] = macroblock->PMV[1][0][0]; + mc->macroblocks[mc->num_macroblocks].PMV[1][0][1] = macroblock->PMV[1][0][1]; + mc->macroblocks[mc->num_macroblocks].PMV[1][1][0] = macroblock->PMV[1][1][0]; + mc->macroblocks[mc->num_macroblocks].PMV[1][1][1] = macroblock->PMV[1][1][1]; + mc->macroblocks[mc->num_macroblocks].cbp = macroblock->cbp; + mc->macroblocks[mc->num_macroblocks].blocks = macroblock->blocks; + + vlGrabBlocks + ( + mc, + macroblock->mbx, + macroblock->mby, + macroblock->dct_type, + macroblock->cbp, + macroblock->blocks + ); + + mc->num_macroblocks++; + + return 0; +} + +#define SET_BLOCK(vb, cbp, mbx, mby, unitx, unity, ofsx, ofsy, hx, hy, lm, cbm, crm, zb) \ + (vb)[0].pos.x = (mbx) * (unitx) + (ofsx); (vb)[0].pos.y = (mby) * (unity) + (ofsy); \ + (vb)[1].pos.x = (mbx) * (unitx) + (ofsx); (vb)[1].pos.y = (mby) * (unity) + (ofsy) + (hy); \ + (vb)[2].pos.x = (mbx) * (unitx) + (ofsx) + (hx); (vb)[2].pos.y = (mby) * (unity) + (ofsy); \ + (vb)[3].pos.x = (mbx) * (unitx) + (ofsx) + (hx); (vb)[3].pos.y = (mby) * (unity) + (ofsy); \ + (vb)[4].pos.x = (mbx) * (unitx) + (ofsx); (vb)[4].pos.y = (mby) * (unity) + (ofsy) + (hy); \ + (vb)[5].pos.x = (mbx) * (unitx) + (ofsx) + (hx); 
(vb)[5].pos.y = (mby) * (unity) + (ofsy) + (hy); \ + \ + if ((cbp) & (lm)) \ + { \ + (vb)[0].luma_tc.x = (mbx) * (unitx) + (ofsx); (vb)[0].luma_tc.y = (mby) * (unity) + (ofsy); \ + (vb)[1].luma_tc.x = (mbx) * (unitx) + (ofsx); (vb)[1].luma_tc.y = (mby) * (unity) + (ofsy) + (hy); \ + (vb)[2].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx); (vb)[2].luma_tc.y = (mby) * (unity) + (ofsy); \ + (vb)[3].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx); (vb)[3].luma_tc.y = (mby) * (unity) + (ofsy); \ + (vb)[4].luma_tc.x = (mbx) * (unitx) + (ofsx); (vb)[4].luma_tc.y = (mby) * (unity) + (ofsy) + (hy); \ + (vb)[5].luma_tc.x = (mbx) * (unitx) + (ofsx) + (hx); (vb)[5].luma_tc.y = (mby) * (unity) + (ofsy) + (hy); \ + } \ + else \ + { \ + (vb)[0].luma_tc.x = (zb)[0].x; (vb)[0].luma_tc.y = (zb)[0].y; \ + (vb)[1].luma_tc.x = (zb)[0].x; (vb)[1].luma_tc.y = (zb)[0].y + (hy); \ + (vb)[2].luma_tc.x = (zb)[0].x + (hx); (vb)[2].luma_tc.y = (zb)[0].y; \ + (vb)[3].luma_tc.x = (zb)[0].x + (hx); (vb)[3].luma_tc.y = (zb)[0].y; \ + (vb)[4].luma_tc.x = (zb)[0].x; (vb)[4].luma_tc.y = (zb)[0].y + (hy); \ + (vb)[5].luma_tc.x = (zb)[0].x + (hx); (vb)[5].luma_tc.y = (zb)[0].y + (hy); \ + } \ + \ + if ((cbp) & (cbm)) \ + { \ + (vb)[0].cb_tc.x = (mbx) * (unitx) + (ofsx); (vb)[0].cb_tc.y = (mby) * (unity) + (ofsy); \ + (vb)[1].cb_tc.x = (mbx) * (unitx) + (ofsx); (vb)[1].cb_tc.y = (mby) * (unity) + (ofsy) + (hy); \ + (vb)[2].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx); (vb)[2].cb_tc.y = (mby) * (unity) + (ofsy); \ + (vb)[3].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx); (vb)[3].cb_tc.y = (mby) * (unity) + (ofsy); \ + (vb)[4].cb_tc.x = (mbx) * (unitx) + (ofsx); (vb)[4].cb_tc.y = (mby) * (unity) + (ofsy) + (hy); \ + (vb)[5].cb_tc.x = (mbx) * (unitx) + (ofsx) + (hx); (vb)[5].cb_tc.y = (mby) * (unity) + (ofsy) + (hy); \ + } \ + else \ + { \ + (vb)[0].cb_tc.x = (zb)[1].x; (vb)[0].cb_tc.y = (zb)[1].y; \ + (vb)[1].cb_tc.x = (zb)[1].x; (vb)[1].cb_tc.y = (zb)[1].y + (hy); \ + (vb)[2].cb_tc.x = (zb)[1].x + (hx); (vb)[2].cb_tc.y = (zb)[1].y; \ + (vb)[3].cb_tc.x = (zb)[1].x + (hx); (vb)[3].cb_tc.y = (zb)[1].y; \ + (vb)[4].cb_tc.x = (zb)[1].x; (vb)[4].cb_tc.y = (zb)[1].y + (hy); \ + (vb)[5].cb_tc.x = (zb)[1].x + (hx); (vb)[5].cb_tc.y = (zb)[1].y + (hy); \ + } \ + \ + if ((cbp) & (crm)) \ + { \ + (vb)[0].cr_tc.x = (mbx) * (unitx) + (ofsx); (vb)[0].cr_tc.y = (mby) * (unity) + (ofsy); \ + (vb)[1].cr_tc.x = (mbx) * (unitx) + (ofsx); (vb)[1].cr_tc.y = (mby) * (unity) + (ofsy) + (hy); \ + (vb)[2].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx); (vb)[2].cr_tc.y = (mby) * (unity) + (ofsy); \ + (vb)[3].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx); (vb)[3].cr_tc.y = (mby) * (unity) + (ofsy); \ + (vb)[4].cr_tc.x = (mbx) * (unitx) + (ofsx); (vb)[4].cr_tc.y = (mby) * (unity) + (ofsy) + (hy); \ + (vb)[5].cr_tc.x = (mbx) * (unitx) + (ofsx) + (hx); (vb)[5].cr_tc.y = (mby) * (unity) + (ofsy) + (hy); \ + } \ + else \ + { \ + (vb)[0].cr_tc.x = (zb)[2].x; (vb)[0].cr_tc.y = (zb)[2].y; \ + (vb)[1].cr_tc.x = (zb)[2].x; (vb)[1].cr_tc.y = (zb)[2].y + (hy); \ + (vb)[2].cr_tc.x = (zb)[2].x + (hx); (vb)[2].cr_tc.y = (zb)[2].y; \ + (vb)[3].cr_tc.x = (zb)[2].x + (hx); (vb)[3].cr_tc.y = (zb)[2].y; \ + (vb)[4].cr_tc.x = (zb)[2].x; (vb)[4].cr_tc.y = (zb)[2].y + (hy); \ + (vb)[5].cr_tc.x = (zb)[2].x + (hx); (vb)[5].cr_tc.y = (zb)[2].y + (hy); \ + } + +static inline int vlGrabMacroBlockVB +( + struct vlR16SnormBufferedMC *mc, + struct vlMpeg2MacroBlock *macroblock, + unsigned int pos +) +{ + struct vlVertex2f mo_vec[2]; + unsigned int i; + + assert(mc); + assert(macroblock); + + switch 
(macroblock->mb_type) + { + case vlMacroBlockTypeBiPredicted: + { + struct vlVertex2f *vb; + + vb = (struct vlVertex2f*)mc->pipe->winsys->buffer_map + ( + mc->pipe->winsys, + mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][2].buffer, + PIPE_BUFFER_USAGE_CPU_WRITE + ) + pos * 2 * 24; + + mo_vec[0].x = macroblock->PMV[0][1][0] * 0.5f * mc->surface_tex_inv_size.x; + mo_vec[0].y = macroblock->PMV[0][1][1] * 0.5f * mc->surface_tex_inv_size.y; + + if (macroblock->mo_type == vlMotionTypeFrame) + { + for (i = 0; i < 24 * 2; i += 2) + { + vb[i].x = mo_vec[0].x; + vb[i].y = mo_vec[0].y; + } + } + else + { + mo_vec[1].x = macroblock->PMV[1][1][0] * 0.5f * mc->surface_tex_inv_size.x; + mo_vec[1].y = macroblock->PMV[1][1][1] * 0.5f * mc->surface_tex_inv_size.y; + + for (i = 0; i < 24 * 2; i += 2) + { + vb[i].x = mo_vec[0].x; + vb[i].y = mo_vec[0].y; + vb[i + 1].x = mo_vec[1].x; + vb[i + 1].y = mo_vec[1].y; + } + } + + mc->pipe->winsys->buffer_unmap(mc->pipe->winsys, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][2].buffer); + + /* fall-through */ + } + case vlMacroBlockTypeFwdPredicted: + case vlMacroBlockTypeBkwdPredicted: + { + struct vlVertex2f *vb; + + vb = (struct vlVertex2f*)mc->pipe->winsys->buffer_map + ( + mc->pipe->winsys, + mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][1].buffer, + PIPE_BUFFER_USAGE_CPU_WRITE + ) + pos * 2 * 24; + + if (macroblock->mb_type == vlMacroBlockTypeBkwdPredicted) + { + mo_vec[0].x = macroblock->PMV[0][1][0] * 0.5f * mc->surface_tex_inv_size.x; + mo_vec[0].y = macroblock->PMV[0][1][1] * 0.5f * mc->surface_tex_inv_size.y; + + if (macroblock->mo_type == vlMotionTypeField) + { + mo_vec[1].x = macroblock->PMV[1][1][0] * 0.5f * mc->surface_tex_inv_size.x; + mo_vec[1].y = macroblock->PMV[1][1][1] * 0.5f * mc->surface_tex_inv_size.y; + } + } + else + { + mo_vec[0].x = macroblock->PMV[0][0][0] * 0.5f * mc->surface_tex_inv_size.x; + mo_vec[0].y = macroblock->PMV[0][0][1] * 0.5f * mc->surface_tex_inv_size.y; + + if (macroblock->mo_type == vlMotionTypeField) + { + mo_vec[1].x = macroblock->PMV[1][0][0] * 0.5f * mc->surface_tex_inv_size.x; + mo_vec[1].y = macroblock->PMV[1][0][1] * 0.5f * mc->surface_tex_inv_size.y; + } + } + + if (macroblock->mo_type == vlMotionTypeFrame) + { + for (i = 0; i < 24 * 2; i += 2) + { + vb[i].x = mo_vec[0].x; + vb[i].y = mo_vec[0].y; + } + } + else + { + for (i = 0; i < 24 * 2; i += 2) + { + vb[i].x = mo_vec[0].x; + vb[i].y = mo_vec[0].y; + vb[i + 1].x = mo_vec[1].x; + vb[i + 1].y = mo_vec[1].y; + } + } + + mc->pipe->winsys->buffer_unmap(mc->pipe->winsys, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][1].buffer); + + /* fall-through */ + } + case vlMacroBlockTypeIntra: + { + const struct vlVertex2f unit = + { + mc->surface_tex_inv_size.x * VL_MACROBLOCK_WIDTH, + mc->surface_tex_inv_size.y * VL_MACROBLOCK_HEIGHT + }; + const struct vlVertex2f half = + { + mc->surface_tex_inv_size.x * (VL_MACROBLOCK_WIDTH / 2), + mc->surface_tex_inv_size.y * (VL_MACROBLOCK_HEIGHT / 2) + }; + + struct vlMacroBlockVertexStream0 + { + struct vlVertex2f pos; + struct vlVertex2f luma_tc; + struct vlVertex2f cb_tc; + struct vlVertex2f cr_tc; + } *vb; + + vb = (struct vlMacroBlockVertexStream0*)mc->pipe->winsys->buffer_map + ( + mc->pipe->winsys, + mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][0].buffer, + PIPE_BUFFER_USAGE_CPU_WRITE + ) + pos * 24; + + SET_BLOCK + ( + vb, + macroblock->cbp, macroblock->mbx, macroblock->mby, + unit.x, unit.y, 0, 0, half.x, half.y, + 32, 2, 1, mc->zero_block + ); + + SET_BLOCK + ( + vb + 6, + macroblock->cbp, macroblock->mbx, macroblock->mby, + unit.x, 
unit.y, half.x, 0, half.x, half.y, + 16, 2, 1, mc->zero_block + ); + + SET_BLOCK + ( + vb + 12, + macroblock->cbp, macroblock->mbx, macroblock->mby, + unit.x, unit.y, 0, half.y, half.x, half.y, + 8, 2, 1, mc->zero_block + ); + + SET_BLOCK + ( + vb + 18, + macroblock->cbp, macroblock->mbx, macroblock->mby, + unit.x, unit.y, half.x, half.y, half.x, half.y, + 4, 2, 1, mc->zero_block + ); + + mc->pipe->winsys->buffer_unmap(mc->pipe->winsys, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS][0].buffer); + + break; + } + default: + assert(0); + } + + return 0; +} + +static int vlFlush +( + struct vlRender *render +) +{ + struct vlR16SnormBufferedMC *mc; + struct pipe_context *pipe; + struct vlVertexShaderConsts *vs_consts; + unsigned int num_macroblocks[vlNumMacroBlockExTypes] = {0}; + unsigned int offset[vlNumMacroBlockExTypes]; + unsigned int vb_start = 0; + unsigned int mbw; + unsigned int mbh; + unsigned int num_mb_per_frame; + unsigned int i; + + assert(render); + + mc = (struct vlR16SnormBufferedMC*)render; + + if (!mc->buffered_surface) + return 0; + + mbw = align(mc->picture_width, VL_MACROBLOCK_WIDTH) / VL_MACROBLOCK_WIDTH; + mbh = align(mc->picture_height, VL_MACROBLOCK_HEIGHT) / VL_MACROBLOCK_HEIGHT; + num_mb_per_frame = mbw * mbh; + + if (mc->num_macroblocks < num_mb_per_frame) + return 0; + + pipe = mc->pipe; + + for (i = 0; i < mc->num_macroblocks; ++i) + { + enum vlMacroBlockTypeEx mb_type_ex = vlGetMacroBlockTypeEx(&mc->macroblocks[i]); + + num_macroblocks[mb_type_ex]++; + } + + offset[0] = 0; + + for (i = 1; i < vlNumMacroBlockExTypes; ++i) + offset[i] = offset[i - 1] + num_macroblocks[i - 1]; + + for (i = 0; i < mc->num_macroblocks; ++i) + { + enum vlMacroBlockTypeEx mb_type_ex = vlGetMacroBlockTypeEx(&mc->macroblocks[i]); + + vlGrabMacroBlockVB(mc, &mc->macroblocks[i], offset[mb_type_ex]); + + offset[mb_type_ex]++; + } + + for (i = 0; i < 3; ++i) + { + pipe_surface_unmap(mc->tex_surface[i]); + mc->pipe->screen->tex_surface_release(mc->pipe->screen, &mc->tex_surface[i]); + } + + mc->render_target.cbufs[0] = pipe->screen->get_tex_surface + ( + pipe->screen, + mc->buffered_surface->texture, + 0, 0, 0, PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE + ); + + pipe->set_framebuffer_state(pipe, &mc->render_target); + pipe->set_viewport_state(pipe, &mc->viewport); + vs_consts = pipe->winsys->buffer_map + ( + pipe->winsys, + mc->vs_const_buf.buffer, + PIPE_BUFFER_USAGE_CPU_WRITE + ); + + vs_consts->denorm.x = mc->buffered_surface->texture->width[0]; + vs_consts->denorm.y = mc->buffered_surface->texture->height[0]; + + pipe->winsys->buffer_unmap(pipe->winsys, mc->vs_const_buf.buffer); + pipe->set_constant_buffer(pipe, PIPE_SHADER_VERTEX, 0, &mc->vs_const_buf); + pipe->set_constant_buffer(pipe, PIPE_SHADER_FRAGMENT, 0, &mc->fs_const_buf); + + if (num_macroblocks[vlMacroBlockExTypeIntra] > 0) + { + pipe->set_vertex_buffers(pipe, 1, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS]); + pipe->set_vertex_elements(pipe, 4, mc->vertex_elems); + pipe->set_sampler_textures(pipe, 3, mc->textures[mc->cur_buf % NUM_BUF_SETS]); + pipe->bind_sampler_states(pipe, 3, (void**)mc->samplers); + pipe->bind_vs_state(pipe, mc->i_vs); + pipe->bind_fs_state(pipe, mc->i_fs); + + pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeIntra] * 24); + vb_start += num_macroblocks[vlMacroBlockExTypeIntra] * 24; + } + + if (num_macroblocks[vlMacroBlockExTypeFwdPredictedFrame] > 0) + { + pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS]); + 
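+ /*
+  * Macroblocks were bucketed by type above, so each branch draws one
+  * contiguous range of the vertex buffers: 24 vertices per macroblock
+  * (four quads, one per 8x8 luma block, two triangles each).
+  * Forward-predicted frame macroblocks also read the motion-vector
+  * stream in buffer 1, hence two vertex buffers and six elements here
+  * versus one buffer and four elements for intra blocks.
+  */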
pipe->set_vertex_elements(pipe, 6, mc->vertex_elems); + mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture; + pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]); + pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers); + pipe->bind_vs_state(pipe, mc->p_vs[0]); + pipe->bind_fs_state(pipe, mc->p_fs[0]); + + pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeFwdPredictedFrame] * 24); + vb_start += num_macroblocks[vlMacroBlockExTypeFwdPredictedFrame] * 24; + } + + if (num_macroblocks[vlMacroBlockExTypeFwdPredictedField] > 0) + { + pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS]); + pipe->set_vertex_elements(pipe, 6, mc->vertex_elems); + mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture; + pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]); + pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers); + pipe->bind_vs_state(pipe, mc->p_vs[1]); + pipe->bind_fs_state(pipe, mc->p_fs[1]); + + pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeFwdPredictedField] * 24); + vb_start += num_macroblocks[vlMacroBlockExTypeFwdPredictedField] * 24; + } + + if (num_macroblocks[vlMacroBlockExTypeBkwdPredictedFrame] > 0) + { + pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS]); + pipe->set_vertex_elements(pipe, 6, mc->vertex_elems); + mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->future_surface->texture; + pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]); + pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers); + pipe->bind_vs_state(pipe, mc->p_vs[0]); + pipe->bind_fs_state(pipe, mc->p_fs[0]); + + pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeBkwdPredictedFrame] * 24); + vb_start += num_macroblocks[vlMacroBlockExTypeBkwdPredictedFrame] * 24; + } + + if (num_macroblocks[vlMacroBlockExTypeBkwdPredictedField] > 0) + { + pipe->set_vertex_buffers(pipe, 2, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS]); + pipe->set_vertex_elements(pipe, 6, mc->vertex_elems); + mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->future_surface->texture; + pipe->set_sampler_textures(pipe, 4, mc->textures[mc->cur_buf % NUM_BUF_SETS]); + pipe->bind_sampler_states(pipe, 4, (void**)mc->samplers); + pipe->bind_vs_state(pipe, mc->p_vs[1]); + pipe->bind_fs_state(pipe, mc->p_fs[1]); + + pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeBkwdPredictedField] * 24); + vb_start += num_macroblocks[vlMacroBlockExTypeBkwdPredictedField] * 24; + } + + if (num_macroblocks[vlMacroBlockExTypeBiPredictedFrame] > 0) + { + pipe->set_vertex_buffers(pipe, 3, mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS]); + pipe->set_vertex_elements(pipe, 8, mc->vertex_elems); + mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture; + mc->textures[mc->cur_buf % NUM_BUF_SETS][4] = mc->future_surface->texture; + pipe->set_sampler_textures(pipe, 5, mc->textures[mc->cur_buf % NUM_BUF_SETS]); + pipe->bind_sampler_states(pipe, 5, (void**)mc->samplers); + pipe->bind_vs_state(pipe, mc->b_vs[0]); + pipe->bind_fs_state(pipe, mc->b_fs[0]); + + pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeBiPredictedFrame] * 24); + vb_start += num_macroblocks[vlMacroBlockExTypeBiPredictedFrame] * 24; + } + + if (num_macroblocks[vlMacroBlockExTypeBiPredictedField] > 0) + { + pipe->set_vertex_buffers(pipe, 3, 
mc->vertex_bufs[mc->cur_buf % NUM_BUF_SETS]); + pipe->set_vertex_elements(pipe, 8, mc->vertex_elems); + mc->textures[mc->cur_buf % NUM_BUF_SETS][3] = mc->past_surface->texture; + mc->textures[mc->cur_buf % NUM_BUF_SETS][4] = mc->future_surface->texture; + pipe->set_sampler_textures(pipe, 5, mc->textures[mc->cur_buf % NUM_BUF_SETS]); + pipe->bind_sampler_states(pipe, 5, (void**)mc->samplers); + pipe->bind_vs_state(pipe, mc->b_vs[1]); + pipe->bind_fs_state(pipe, mc->b_fs[1]); + + pipe->draw_arrays(pipe, PIPE_PRIM_TRIANGLES, vb_start, num_macroblocks[vlMacroBlockExTypeBiPredictedField] * 24); + vb_start += num_macroblocks[vlMacroBlockExTypeBiPredictedField] * 24; + } + + pipe->flush(pipe, PIPE_FLUSH_RENDER_CACHE, &mc->buffered_surface->render_fence); + + for (i = 0; i < 3; ++i) + mc->zero_block[i].x = -1.0f; + + mc->buffered_surface = NULL; + mc->num_macroblocks = 0; + mc->cur_buf++; + + return 0; +} + +static int vlRenderMacroBlocksMpeg2R16SnormBuffered +( + struct vlRender *render, + struct vlMpeg2MacroBlockBatch *batch, + struct vlSurface *surface +) +{ + struct vlR16SnormBufferedMC *mc; + unsigned int i; + + assert(render); + + mc = (struct vlR16SnormBufferedMC*)render; + + if (mc->buffered_surface) + { + if (mc->buffered_surface != surface) + { + vlFlush(&mc->base); + mc->buffered_surface = surface; + mc->past_surface = batch->past_surface; + mc->future_surface = batch->future_surface; + mc->surface_tex_inv_size.x = 1.0f / surface->texture->width[0]; + mc->surface_tex_inv_size.y = 1.0f / surface->texture->height[0]; + + for (i = 0; i < 3; ++i) + { + mc->tex_surface[i] = mc->pipe->screen->get_tex_surface + ( + mc->pipe->screen, + mc->textures[mc->cur_buf % NUM_BUF_SETS][i], + 0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE + ); + + mc->texels[i] = pipe_surface_map(mc->tex_surface[i], PIPE_BUFFER_USAGE_CPU_WRITE); + } + } + } + else + { + mc->buffered_surface = surface; + mc->past_surface = batch->past_surface; + mc->future_surface = batch->future_surface; + mc->surface_tex_inv_size.x = 1.0f / surface->texture->width[0]; + mc->surface_tex_inv_size.y = 1.0f / surface->texture->height[0]; + + for (i = 0; i < 3; ++i) + { + mc->tex_surface[i] = mc->pipe->screen->get_tex_surface + ( + mc->pipe->screen, + mc->textures[mc->cur_buf % NUM_BUF_SETS][i], + 0, 0, 0, PIPE_BUFFER_USAGE_CPU_WRITE + ); + + mc->texels[i] = pipe_surface_map(mc->tex_surface[i], PIPE_BUFFER_USAGE_CPU_WRITE); + } + } + + for (i = 0; i < batch->num_macroblocks; ++i) + vlGrabMacroBlock(mc, &batch->macroblocks[i]); + + return 0; +} + +static int vlEnd +( + struct vlRender *render +) +{ + assert(render); + + return 0; +} + +static int vlDestroy +( + struct vlRender *render +) +{ + struct vlR16SnormBufferedMC *mc; + struct pipe_context *pipe; + unsigned int h, i; + + assert(render); + + mc = (struct vlR16SnormBufferedMC*)render; + pipe = mc->pipe; + + for (i = 0; i < 5; ++i) + pipe->delete_sampler_state(pipe, mc->samplers[i]); + + for (h = 0; h < NUM_BUF_SETS; ++h) + for (i = 0; i < 3; ++i) + pipe->winsys->buffer_destroy(pipe->winsys, mc->vertex_bufs[h][i].buffer); + + /* Textures 3 & 4 are not created directly, no need to release them here */ + for (i = 0; i < NUM_BUF_SETS; ++i) + { + pipe_texture_release(&mc->textures[i][0]); + pipe_texture_release(&mc->textures[i][1]); + pipe_texture_release(&mc->textures[i][2]); + } + + pipe->delete_vs_state(pipe, mc->i_vs); + pipe->delete_fs_state(pipe, mc->i_fs); + + for (i = 0; i < 2; ++i) + { + pipe->delete_vs_state(pipe, mc->p_vs[i]); + pipe->delete_fs_state(pipe, mc->p_fs[i]); + 
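+ /* b_vs/b_fs use the same [0] = frame, [1] = field layout as p_vs/p_fs */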
pipe->delete_vs_state(pipe, mc->b_vs[i]); + pipe->delete_fs_state(pipe, mc->b_fs[i]); + } + + pipe->winsys->buffer_destroy(pipe->winsys, mc->vs_const_buf.buffer); + pipe->winsys->buffer_destroy(pipe->winsys, mc->fs_const_buf.buffer); + + free(mc->macroblocks); + free(mc); + + return 0; +} + +/* + * Muliplier renormalizes block samples from 16 bits to 12 bits. + * Divider is used when calculating Y % 2 for choosing top or bottom + * field for P or B macroblocks. + * TODO: Use immediates. + */ +static const struct vlFragmentShaderConsts fs_consts = +{ + {32767.0f / 255.0f, 32767.0f / 255.0f, 32767.0f / 255.0f, 0.0f}, + {0.5f, 2.0f, 0.0f, 0.0f} +}; + +#include "vl_r16snorm_mc_buf_shaders.inc" + +static int vlCreateDataBufs +( + struct vlR16SnormBufferedMC *mc +) +{ + const unsigned int mbw = align(mc->picture_width, VL_MACROBLOCK_WIDTH) / VL_MACROBLOCK_WIDTH; + const unsigned int mbh = align(mc->picture_height, VL_MACROBLOCK_HEIGHT) / VL_MACROBLOCK_HEIGHT; + const unsigned int num_mb_per_frame = mbw * mbh; + + struct pipe_context *pipe; + unsigned int h, i; + + assert(mc); + + pipe = mc->pipe; + + /* Create our vertex buffers */ + for (h = 0; h < NUM_BUF_SETS; ++h) + { + mc->vertex_bufs[h][0].pitch = sizeof(struct vlVertex2f) * 4; + mc->vertex_bufs[h][0].max_index = 24 * num_mb_per_frame - 1; + mc->vertex_bufs[h][0].buffer_offset = 0; + mc->vertex_bufs[h][0].buffer = pipe->winsys->buffer_create + ( + pipe->winsys, + 1, + PIPE_BUFFER_USAGE_VERTEX, + sizeof(struct vlVertex2f) * 4 * 24 * num_mb_per_frame + ); + + for (i = 1; i < 3; ++i) + { + mc->vertex_bufs[h][i].pitch = sizeof(struct vlVertex2f) * 2; + mc->vertex_bufs[h][i].max_index = 24 * num_mb_per_frame - 1; + mc->vertex_bufs[h][i].buffer_offset = 0; + mc->vertex_bufs[h][i].buffer = pipe->winsys->buffer_create + ( + pipe->winsys, + 1, + PIPE_BUFFER_USAGE_VERTEX, + sizeof(struct vlVertex2f) * 2 * 24 * num_mb_per_frame + ); + } + } + + /* Position element */ + mc->vertex_elems[0].src_offset = 0; + mc->vertex_elems[0].vertex_buffer_index = 0; + mc->vertex_elems[0].nr_components = 2; + mc->vertex_elems[0].src_format = PIPE_FORMAT_R32G32_FLOAT; + + /* Luma, texcoord element */ + mc->vertex_elems[1].src_offset = sizeof(struct vlVertex2f); + mc->vertex_elems[1].vertex_buffer_index = 0; + mc->vertex_elems[1].nr_components = 2; + mc->vertex_elems[1].src_format = PIPE_FORMAT_R32G32_FLOAT; + + /* Chroma Cr texcoord element */ + mc->vertex_elems[2].src_offset = sizeof(struct vlVertex2f) * 2; + mc->vertex_elems[2].vertex_buffer_index = 0; + mc->vertex_elems[2].nr_components = 2; + mc->vertex_elems[2].src_format = PIPE_FORMAT_R32G32_FLOAT; + + /* Chroma Cb texcoord element */ + mc->vertex_elems[3].src_offset = sizeof(struct vlVertex2f) * 3; + mc->vertex_elems[3].vertex_buffer_index = 0; + mc->vertex_elems[3].nr_components = 2; + mc->vertex_elems[3].src_format = PIPE_FORMAT_R32G32_FLOAT; + + /* First ref surface top field texcoord element */ + mc->vertex_elems[4].src_offset = 0; + mc->vertex_elems[4].vertex_buffer_index = 1; + mc->vertex_elems[4].nr_components = 2; + mc->vertex_elems[4].src_format = PIPE_FORMAT_R32G32_FLOAT; + + /* First ref surface bottom field texcoord element */ + mc->vertex_elems[5].src_offset = sizeof(struct vlVertex2f); + mc->vertex_elems[5].vertex_buffer_index = 1; + mc->vertex_elems[5].nr_components = 2; + mc->vertex_elems[5].src_format = PIPE_FORMAT_R32G32_FLOAT; + + /* Second ref surface top field texcoord element */ + mc->vertex_elems[6].src_offset = 0; + mc->vertex_elems[6].vertex_buffer_index = 2; + 
mc->vertex_elems[6].nr_components = 2; + mc->vertex_elems[6].src_format = PIPE_FORMAT_R32G32_FLOAT; + + /* Second ref surface bottom field texcoord element */ + mc->vertex_elems[7].src_offset = sizeof(struct vlVertex2f); + mc->vertex_elems[7].vertex_buffer_index = 2; + mc->vertex_elems[7].nr_components = 2; + mc->vertex_elems[7].src_format = PIPE_FORMAT_R32G32_FLOAT; + + /* Create our constant buffer */ + mc->vs_const_buf.size = sizeof(struct vlVertexShaderConsts); + mc->vs_const_buf.buffer = pipe->winsys->buffer_create + ( + pipe->winsys, + 1, + PIPE_BUFFER_USAGE_CONSTANT, + mc->vs_const_buf.size + ); + + mc->fs_const_buf.size = sizeof(struct vlFragmentShaderConsts); + mc->fs_const_buf.buffer = pipe->winsys->buffer_create + ( + pipe->winsys, + 1, + PIPE_BUFFER_USAGE_CONSTANT, + mc->fs_const_buf.size + ); + + memcpy + ( + pipe->winsys->buffer_map(pipe->winsys, mc->fs_const_buf.buffer, PIPE_BUFFER_USAGE_CPU_WRITE), + &fs_consts, + sizeof(struct vlFragmentShaderConsts) + ); + + pipe->winsys->buffer_unmap(pipe->winsys, mc->fs_const_buf.buffer); + + mc->macroblocks = malloc(sizeof(struct vlMpeg2MacroBlock) * num_mb_per_frame); + + return 0; +} + +static int vlInit +( + struct vlR16SnormBufferedMC *mc +) +{ + struct pipe_context *pipe; + struct pipe_sampler_state sampler; + struct pipe_texture template; + unsigned int filters[5]; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + + /* For MC we render to textures, which are rounded up to nearest POT */ + mc->viewport.scale[0] = vlRoundUpPOT(mc->picture_width); + mc->viewport.scale[1] = vlRoundUpPOT(mc->picture_height); + mc->viewport.scale[2] = 1; + mc->viewport.scale[3] = 1; + mc->viewport.translate[0] = 0; + mc->viewport.translate[1] = 0; + mc->viewport.translate[2] = 0; + mc->viewport.translate[3] = 0; + + mc->render_target.width = vlRoundUpPOT(mc->picture_width); + mc->render_target.height = vlRoundUpPOT(mc->picture_height); + mc->render_target.num_cbufs = 1; + /* FB for MC stage is a vlSurface created by the user, set at render time */ + mc->render_target.zsbuf = NULL; + + filters[0] = PIPE_TEX_FILTER_NEAREST; + /* FIXME: Linear causes discoloration around block edges */ + filters[1] = /*mc->picture_format == vlFormatYCbCr444 ?*/ PIPE_TEX_FILTER_NEAREST /*: PIPE_TEX_FILTER_LINEAR*/; + filters[2] = /*mc->picture_format == vlFormatYCbCr444 ?*/ PIPE_TEX_FILTER_NEAREST /*: PIPE_TEX_FILTER_LINEAR*/; + filters[3] = PIPE_TEX_FILTER_LINEAR; + filters[4] = PIPE_TEX_FILTER_LINEAR; + + for (i = 0; i < 5; ++i) + { + sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + sampler.min_img_filter = filters[i]; + sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE; + sampler.mag_img_filter = filters[i]; + sampler.compare_mode = PIPE_TEX_COMPARE_NONE; + sampler.compare_func = PIPE_FUNC_ALWAYS; + sampler.normalized_coords = 1; + /*sampler.prefilter = ;*/ + /*sampler.shadow_ambient = ;*/ + /*sampler.lod_bias = ;*/ + sampler.min_lod = 0; + /*sampler.max_lod = ;*/ + /*sampler.border_color[i] = ;*/ + /*sampler.max_anisotropy = ;*/ + mc->samplers[i] = pipe->create_sampler_state(pipe, &sampler); + } + + memset(&template, 0, sizeof(struct pipe_texture)); + template.target = PIPE_TEXTURE_2D; + template.format = PIPE_FORMAT_R16_SNORM; + template.last_level = 0; + template.width[0] = vlRoundUpPOT(mc->picture_width); + template.height[0] = vlRoundUpPOT(mc->picture_height); + template.depth[0] = 1; + template.compressed = 0; + pf_get_block(template.format, &template.block); + 
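+ /*
+  * NUM_BUF_SETS copies of each source texture are created below so a
+  * fresh set can be filled by the CPU while earlier sets are still
+  * being read by the GPU; the chroma templates are resized afterwards
+  * according to the picture format.
+  */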
template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER; + + for (i = 0; i < NUM_BUF_SETS; ++i) + mc->textures[i][0] = pipe->screen->texture_create(pipe->screen, &template); + + if (mc->picture_format == vlFormatYCbCr420) + { + template.width[0] = vlRoundUpPOT(mc->picture_width / 2); + template.height[0] = vlRoundUpPOT(mc->picture_height / 2); + } + else if (mc->picture_format == vlFormatYCbCr422) + template.height[0] = vlRoundUpPOT(mc->picture_height / 2); + + for (i = 0; i < NUM_BUF_SETS; ++i) + { + mc->textures[i][1] = pipe->screen->texture_create(pipe->screen, &template); + mc->textures[i][2] = pipe->screen->texture_create(pipe->screen, &template); + } + + /* textures[3] & textures[4] are assigned from vlSurfaces for P and B macroblocks at render time */ + + vlCreateVertexShaderIMB(mc); + vlCreateFragmentShaderIMB(mc); + vlCreateVertexShaderFramePMB(mc); + vlCreateVertexShaderFieldPMB(mc); + vlCreateFragmentShaderFramePMB(mc); + vlCreateFragmentShaderFieldPMB(mc); + vlCreateVertexShaderFrameBMB(mc); + vlCreateVertexShaderFieldBMB(mc); + vlCreateFragmentShaderFrameBMB(mc); + vlCreateFragmentShaderFieldBMB(mc); + vlCreateDataBufs(mc); + + return 0; +} + +int vlCreateR16SNormBufferedMC +( + struct pipe_context *pipe, + unsigned int picture_width, + unsigned int picture_height, + enum vlFormat picture_format, + struct vlRender **render +) +{ + struct vlR16SnormBufferedMC *mc; + unsigned int i; + + assert(pipe); + assert(render); + + mc = calloc(1, sizeof(struct vlR16SnormBufferedMC)); + + mc->base.vlBegin = &vlBegin; + mc->base.vlRenderMacroBlocksMpeg2 = &vlRenderMacroBlocksMpeg2R16SnormBuffered; + mc->base.vlEnd = &vlEnd; + mc->base.vlFlush = &vlFlush; + mc->base.vlDestroy = &vlDestroy; + mc->pipe = pipe; + mc->picture_width = picture_width; + mc->picture_height = picture_height; + + mc->cur_buf = 0; + mc->buffered_surface = NULL; + mc->past_surface = NULL; + mc->future_surface = NULL; + for (i = 0; i < 3; ++i) + mc->zero_block[i].x = -1.0f; + mc->num_macroblocks = 0; + + vlInit(mc); + + *render = &mc->base; + + return 0; +} diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.h b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.h new file mode 100644 index 0000000000..27177d64ca --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf.h @@ -0,0 +1,18 @@ +#ifndef vl_r16snorm_mc_buf_h +#define vl_r16snorm_mc_buf_h + +#include "vl_types.h" + +struct pipe_context; +struct vlRender; + +int vlCreateR16SNormBufferedMC +( + struct pipe_context *pipe, + unsigned int picture_width, + unsigned int picture_height, + enum vlFormat picture_format, + struct vlRender **render +); + +#endif diff --git a/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf_shaders.inc b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf_shaders.inc new file mode 100644 index 0000000000..ef4a4b2add --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_r16snorm_mc_buf_shaders.inc @@ -0,0 +1,1185 @@ +static int vlCreateVertexShaderIMB +( + struct vlR16SnormBufferedMC *mc +) +{ + const unsigned int max_tokens = 50; + + struct pipe_context *pipe; + struct pipe_shader_state vs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + 
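+ /*
+  * Each shader hand-assembles its TGSI token stream: tokens[0] holds
+  * the version, tokens[1] the header, tokens[2] the processor type,
+  * and declarations/instructions are appended from ti = 3 onwards,
+  * with every tgsi_build_full_* call returning how many tokens it wrote.
+  */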
*header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + + ti = 3; + + /* + * decl i0 ; Vertex pos + * decl i1 ; Luma texcoords + * decl i2 ; Chroma Cb texcoords + * decl i3 ; Chroma Cr texcoords + */ + for (i = 0; i < 4; i++) + { + decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * decl o0 ; Vertex pos + * decl o1 ; Luma texcoords + * decl o2 ; Chroma Cb texcoords + * decl o3 ; Chroma Cr texcoords + */ + for (i = 0; i < 4; i++) + { + decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * mov o0, i0 ; Move input vertex pos to output + * mov o1, i1 ; Move input luma texcoords to output + * mov o2, i2 ; Move input chroma Cb texcoords to output + * mov o3, i3 ; Move input chroma Cr texcoords to output + */ + for (i = 0; i < 4; ++i) + { + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + vs.tokens = tokens; + mc->i_vs = pipe->create_vs_state(pipe, &vs); + free(tokens); + + return 0; +} + +static int vlCreateFragmentShaderIMB +( + struct vlR16SnormBufferedMC *mc +) +{ + const unsigned int max_tokens = 100; + + struct pipe_context *pipe; + struct pipe_shader_state fs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + + ti = 3; + + /* + * decl i0 ; Luma texcoords + * decl i1 ; Chroma Cb texcoords + * decl i2 ; Chroma Cr texcoords + */ + for (i = 0; i < 3; ++i) + { + decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* decl c0 ; Scaling factor, rescales 16-bit snorm to 9-bit snorm */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl o0 ; Fragment color */ + decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl t0, t1 */ + decl = vl_decl_temps(0, 1); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl s0 ; Sampler for luma texture + * decl s1 ; Sampler for chroma Cb texture + * decl s2 ; Sampler for chroma Cr texture + */ + for (i = 0; i < 3; ++i) + { + decl = vl_decl_samplers(i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header,max_tokens - ti); + } + + /* + * tex2d t1, i0, s0 ; Read texel from luma texture + * mov t0.x, t1.x ; Move luma sample into .x component + * tex2d t1, i1, s1 ; Read texel from chroma Cb texture + * 
mov t0.y, t1.x ; Move Cb sample into .y component + * tex2d t1, i2, s2 ; Read texel from chroma Cr texture + * mov t0.z, t1.x ; Move Cr sample into .z component + */ + for (i = 0; i < 3; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* mul o0, t0, c0 ; Rescale texel to correct range */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + fs.tokens = tokens; + mc->i_fs = pipe->create_fs_state(pipe, &fs); + free(tokens); + + return 0; +} + +static int vlCreateVertexShaderFramePMB +( + struct vlR16SnormBufferedMC *mc +) +{ + const unsigned int max_tokens = 100; + + struct pipe_context *pipe; + struct pipe_shader_state vs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + + ti = 3; + + /* + * decl i0 ; Vertex pos + * decl i1 ; Luma texcoords + * decl i2 ; Chroma Cb texcoords + * decl i3 ; Chroma Cr texcoords + * decl i4 ; Ref surface top field texcoords + * decl i5 ; Ref surface bottom field texcoords (unused, packed in the same stream) + */ + for (i = 0; i < 6; i++) + { + decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * decl o0 ; Vertex pos + * decl o1 ; Luma texcoords + * decl o2 ; Chroma Cb texcoords + * decl o3 ; Chroma Cr texcoords + * decl o4 ; Ref macroblock texcoords + */ + for (i = 0; i < 5; i++) + { + decl = vl_decl_output(i == 0 ? 
TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * mov o0, i0 ; Move input vertex pos to output + * mov o1, i1 ; Move input luma texcoords to output + * mov o2, i2 ; Move input chroma Cb texcoords to output + * mov o3, i3 ; Move input chroma Cr texcoords to output + */ + for (i = 0; i < 4; ++i) + { + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* add o4, i0, i4 ; Translate vertex pos by motion vec to form ref macroblock texcoords */ + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, 4); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + vs.tokens = tokens; + mc->p_vs[0] = pipe->create_vs_state(pipe, &vs); + free(tokens); + + return 0; +} + +static int vlCreateVertexShaderFieldPMB +( + struct vlR16SnormBufferedMC *mc +) +{ + const unsigned int max_tokens = 100; + + struct pipe_context *pipe; + struct pipe_shader_state vs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + + ti = 3; + + /* + * decl i0 ; Vertex pos + * decl i1 ; Luma texcoords + * decl i2 ; Chroma Cb texcoords + * decl i3 ; Chroma Cr texcoords + * decl i4 ; Ref macroblock top field texcoords + * decl i5 ; Ref macroblock bottom field texcoords + */ + for (i = 0; i < 6; i++) + { + decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* decl c0 ; Render target dimensions */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl o0 ; Vertex pos + * decl o1 ; Luma texcoords + * decl o2 ; Chroma Cb texcoords + * decl o3 ; Chroma Cr texcoords + * decl o4 ; Ref macroblock top field texcoords + * decl o5 ; Ref macroblock bottom field texcoords + * decl o6 ; Denormalized vertex pos + */ + for (i = 0; i < 7; i++) + { + decl = vl_decl_output(i == 0 ? 
TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * mov o0, i0 ; Move input vertex pos to output + * mov o1, i1 ; Move input luma texcoords to output + * mov o2, i2 ; Move input chroma Cb texcoords to output + * mov o3, i3 ; Move input chroma Cr texcoords to output + */ + for (i = 0; i < 4; ++i) + { + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* + * add o4, i0, i4 ; Translate vertex pos by motion vec to form top field macroblock texcoords + * add o5, i0, i5 ; Translate vertex pos by motion vec to form bottom field macroblock texcoords + */ + for (i = 0; i < 2; ++i) + { + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, i + 4); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* mul o6, i0, c0 ; Denorm vertex pos */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 6, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + vs.tokens = tokens; + mc->p_vs[1] = pipe->create_vs_state(pipe, &vs); + free(tokens); + + return 0; +} + +static int vlCreateFragmentShaderFramePMB +( + struct vlR16SnormBufferedMC *mc +) +{ + const unsigned int max_tokens = 100; + + struct pipe_context *pipe; + struct pipe_shader_state fs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + + ti = 3; + + /* + * decl i0 ; Luma texcoords + * decl i1 ; Chroma Cb texcoords + * decl i2 ; Chroma Cr texcoords + * decl i3 ; Ref macroblock texcoords + */ + for (i = 0; i < 4; ++i) + { + decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* decl c0 ; Scaling factor, rescales 16-bit snorm to 9-bit snorm */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl o0 ; Fragment color */ + decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl t0, t1 */ + decl = vl_decl_temps(0, 1); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl s0 ; Sampler for luma texture + * decl s1 ; Sampler for chroma Cb texture + * decl s2 ; Sampler for chroma Cr texture + * decl s3 ; Sampler for ref surface texture + */ + for (i = 0; i < 4; ++i) + { + decl = vl_decl_samplers(i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * tex2d t1, i0, s0 ; Read texel from luma texture + * mov t0.x, t1.x ; Move luma sample into .x component + * 
tex2d t1, i1, s1 ; Read texel from chroma Cb texture + * mov t0.y, t1.x ; Move Cb sample into .y component + * tex2d t1, i2, s2 ; Read texel from chroma Cr texture + * mov t0.z, t1.x ; Move Cr sample into .z component + */ + for (i = 0; i < 3; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* mul t0, t0, c0 ; Rescale texel to correct range */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* tex2d t1, i3, s3 ; Read texel from ref macroblock */ + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, 3, TGSI_FILE_SAMPLER, 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* add o0, t0, t1 ; Add ref and differential to form final output */ + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + fs.tokens = tokens; + mc->p_fs[0] = pipe->create_fs_state(pipe, &fs); + free(tokens); + + return 0; +} + +static int vlCreateFragmentShaderFieldPMB +( + struct vlR16SnormBufferedMC *mc +) +{ + const unsigned int max_tokens = 200; + + struct pipe_context *pipe; + struct pipe_shader_state fs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + + ti = 3; + + /* + * decl i0 ; Luma texcoords + * decl i1 ; Chroma Cb texcoords + * decl i2 ; Chroma Cr texcoords + * decl i3 ; Ref macroblock top field texcoords + * decl i4 ; Ref macroblock bottom field texcoords + * decl i5 ; Denormalized vertex pos + */ + for (i = 0; i < 6; ++i) + { + decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * decl c0 ; Scaling factor, rescales 16-bit snorm to 9-bit snorm + * decl c1 ; Constants 1/2 & 2 in .x, .y channels for Y-mod-2 top/bottom field selection + */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl o0 ; Fragment color */ + decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, 
max_tokens - ti); + + /* decl t0-t4 */ + decl = vl_decl_temps(0, 4); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl s0 ; Sampler for luma texture + * decl s1 ; Sampler for chroma Cb texture + * decl s2 ; Sampler for chroma Cr texture + * decl s3 ; Sampler for ref surface texture + */ + for (i = 0; i < 4; ++i) + { + decl = vl_decl_samplers(i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * tex2d t1, i0, s0 ; Read texel from luma texture + * mov t0.x, t1.x ; Move luma sample into .x component + * tex2d t1, i1, s1 ; Read texel from chroma Cb texture + * mov t0.y, t1.x ; Move Cb sample into .y component + * tex2d t1, i2, s2 ; Read texel from chroma Cr texture + * mov t0.z, t1.x ; Move Cr sample into .z component + */ + for (i = 0; i < 3; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* mul t0, t0, c0 ; Rescale texel to correct range */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* + * tex2d t1, i3, s3 ; Read texel from ref macroblock top field + * tex2d t2, i4, s3 ; Read texel from ref macroblock bottom field + */ + for (i = 0; i < 2; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 3, TGSI_FILE_SAMPLER, 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* XXX: Pos values off by 0.5? 
*/ + /* sub t4, i5.y, c1.x ; Sub 0.5 from denormalized pos */ + inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 5, TGSI_FILE_CONSTANT, 1); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* mul t3, t4, c1.x ; Multiply pos Y-coord by 1/2 */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1); + inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* floor t3, t3 ; Get rid of fractional part */ + inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* mul t3, t3, c1.y ; Multiply by 2 */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1); + inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* sub t3, t4, t3 ; Subtract from original Y to get Y % 2 */ + inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* TODO: Move to conditional tex fetch on t3 instead of lerp */ + /* lerp t1, t3, t1, t2 ; Choose between top and bottom fields based on Y % 2 */ + inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* add o0, t0, t1 ; Add ref and differential to form final output */ + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + fs.tokens = tokens; + mc->p_fs[1] = pipe->create_fs_state(pipe, &fs); + free(tokens); + + return 0; +} + +static int vlCreateVertexShaderFrameBMB +( + struct vlR16SnormBufferedMC *mc +) +{ + const unsigned int max_tokens = 100; + + struct pipe_context *pipe; + struct pipe_shader_state vs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * 
sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + + ti = 3; + + /* + * decl i0 ; Vertex pos + * decl i1 ; Luma texcoords + * decl i2 ; Chroma Cb texcoords + * decl i3 ; Chroma Cr texcoords + * decl i4 ; First ref macroblock top field texcoords + * decl i5 ; First ref macroblock bottom field texcoords (unused, packed in the same stream) + * decl i6 ; Second ref macroblock top field texcoords + * decl i7 ; Second ref macroblock bottom field texcoords (unused, packed in the same stream) + */ + for (i = 0; i < 8; i++) + { + decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * decl o0 ; Vertex pos + * decl o1 ; Luma texcoords + * decl o2 ; Chroma Cb texcoords + * decl o3 ; Chroma Cr texcoords + * decl o4 ; First ref macroblock texcoords + * decl o5 ; Second ref macroblock texcoords + */ + for (i = 0; i < 6; i++) + { + decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * mov o0, i0 ; Move input vertex pos to output + * mov o1, i1 ; Move input luma texcoords to output + * mov o2, i2 ; Move input chroma Cb texcoords to output + * mov o3, i3 ; Move input chroma Cr texcoords to output + */ + for (i = 0; i < 4; ++i) + { + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* + * add o4, i0, i4 ; Translate vertex pos by motion vec to form first ref macroblock texcoords + * add o5, i0, i6 ; Translate vertex pos by motion vec to form second ref macroblock texcoords + */ + for (i = 0; i < 2; ++i) + { + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, (i + 2) * 2); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + vs.tokens = tokens; + mc->b_vs[0] = pipe->create_vs_state(pipe, &vs); + free(tokens); + + return 0; +} + +static int vlCreateVertexShaderFieldBMB +( + struct vlR16SnormBufferedMC *mc +) +{ + const unsigned int max_tokens = 100; + + struct pipe_context *pipe; + struct pipe_shader_state vs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_VERTEX, header); + + ti = 3; + + /* + * decl i0 ; Vertex pos + * decl i1 ; Luma texcoords + * decl i2 ; Chroma Cb texcoords + * decl i3 ; Chroma Cr texcoords + * decl i4 ; First ref macroblock top field texcoords + * decl i5 ; First ref macroblock bottom field texcoords + * decl i6 ; Second ref macroblock top field texcoords + * decl i7 ; 
Second ref macroblock bottom field texcoords + */ + for (i = 0; i < 8; i++) + { + decl = vl_decl_input(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* decl c0 ; Render target dimensions */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl o0 ; Vertex pos + * decl o1 ; Luma texcoords + * decl o2 ; Chroma Cb texcoords + * decl o3 ; Chroma Cr texcoords + * decl o4 ; First ref macroblock top field texcoords + * decl o5 ; First ref macroblock Bottom field texcoords + * decl o6 ; Second ref macroblock top field texcoords + * decl o7 ; Second ref macroblock Bottom field texcoords + * decl o8 ; Denormalized vertex pos + */ + for (i = 0; i < 9; i++) + { + decl = vl_decl_output(i == 0 ? TGSI_SEMANTIC_POSITION : TGSI_SEMANTIC_GENERIC, i, i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* decl t0, t1 */ + decl = vl_decl_temps(0, 1); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * mov o0, i0 ; Move input vertex pos to output + * mov o1, i1 ; Move input luma texcoords to output + * mov o2, i2 ; Move input chroma Cb texcoords to output + * mov o3, i3 ; Move input chroma Cr texcoords to output + */ + for (i = 0; i < 4; ++i) + { + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_OUTPUT, i, TGSI_FILE_INPUT, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* + * add o4, i0, i4 ; Translate vertex pos by motion vec to form first top field macroblock texcoords + * add o5, i0, i5 ; Translate vertex pos by motion vec to form first bottom field macroblock texcoords + * add o6, i0, i6 ; Translate vertex pos by motion vec to form second top field macroblock texcoords + * add o7, i0, i7 ; Translate vertex pos by motion vec to form second bottom field macroblock texcoords + */ + for (i = 0; i < 4; ++i) + { + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, i + 4, TGSI_FILE_INPUT, 0, TGSI_FILE_INPUT, i + 4); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* mul o8, i0, c0 ; Denorm vertex pos */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_OUTPUT, 8, TGSI_FILE_INPUT, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + vs.tokens = tokens; + mc->b_vs[1] = pipe->create_vs_state(pipe, &vs); + free(tokens); + + return 0; +} + +static int vlCreateFragmentShaderFrameBMB +( + struct vlR16SnormBufferedMC *mc +) +{ + const unsigned int max_tokens = 100; + + struct pipe_context *pipe; + struct pipe_shader_state fs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + + ti = 3; + + /* + * decl i0 ; Luma texcoords + * decl i1 ; Chroma Cb texcoords + * decl 
i2 ; Chroma Cr texcoords + * decl i3 ; First ref macroblock texcoords + * decl i4 ; Second ref macroblock texcoords + */ + for (i = 0; i < 5; ++i) + { + decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * decl c0 ; Scaling factor, rescales 16-bit snorm to 9-bit snorm + * decl c1 ; Constant 1/2 in .x channel to use as weight to blend past and future texels + */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl o0 ; Fragment color */ + decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl t0-t2 */ + decl = vl_decl_temps(0, 2); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl s0 ; Sampler for luma texture + * decl s1 ; Sampler for chroma Cb texture + * decl s2 ; Sampler for chroma Cr texture + * decl s3 ; Sampler for first ref surface texture + * decl s4 ; Sampler for second ref surface texture + */ + for (i = 0; i < 5; ++i) + { + decl = vl_decl_samplers(i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * tex2d t1, i0, s0 ; Read texel from luma texture + * mov t0.x, t1.x ; Move luma sample into .x component + * tex2d t1, i1, s1 ; Read texel from chroma Cb texture + * mov t0.y, t1.x ; Move Cb sample into .y component + * tex2d t1, i2, s2 ; Read texel from chroma Cr texture + * mov t0.z, t1.x ; Move Cr sample into .z component + */ + for (i = 0; i < 3; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* mul t0, t0, c0 ; Rescale texel to correct range */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* + * tex2d t1, i3, s3 ; Read texel from first ref macroblock + * tex2d t2, i4, s4 ; Read texel from second ref macroblock + */ + for (i = 0; i < 2; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 3, TGSI_FILE_SAMPLER, i + 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* lerp t1, c1.x, t1, t2 ; Blend past and future texels */ + inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* add o0, t0, t1 ; Add past/future ref and differential to 
form final output */ + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + fs.tokens = tokens; + mc->b_fs[0] = pipe->create_fs_state(pipe, &fs); + free(tokens); + + return 0; +} + +static int vlCreateFragmentShaderFieldBMB +( + struct vlR16SnormBufferedMC *mc +) +{ + const unsigned int max_tokens = 200; + + struct pipe_context *pipe; + struct pipe_shader_state fs; + struct tgsi_token *tokens; + struct tgsi_header *header; + + struct tgsi_full_declaration decl; + struct tgsi_full_instruction inst; + + unsigned int ti; + unsigned int i; + + assert(mc); + + pipe = mc->pipe; + tokens = (struct tgsi_token*)malloc(max_tokens * sizeof(struct tgsi_token)); + + /* Version */ + *(struct tgsi_version*)&tokens[0] = tgsi_build_version(); + /* Header */ + header = (struct tgsi_header*)&tokens[1]; + *header = tgsi_build_header(); + /* Processor */ + *(struct tgsi_processor*)&tokens[2] = tgsi_build_processor(TGSI_PROCESSOR_FRAGMENT, header); + + ti = 3; + + /* + * decl i0 ; Luma texcoords + * decl i1 ; Chroma Cb texcoords + * decl i2 ; Chroma Cr texcoords + * decl i3 ; First ref macroblock top field texcoords + * decl i4 ; First ref macroblock bottom field texcoords + * decl i5 ; Second ref macroblock top field texcoords + * decl i6 ; Second ref macroblock bottom field texcoords + * decl i7 ; Denormalized vertex pos + */ + for (i = 0; i < 8; ++i) + { + decl = vl_decl_interpolated_input(TGSI_SEMANTIC_GENERIC, i + 1, i, i, TGSI_INTERPOLATE_LINEAR); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * decl c0 ; Scaling factor, rescales 16-bit snorm to 9-bit snorm + * decl c1 ; Constants 1/2 & 2 in .x, .y channels to use as weight to blend past and future texels + * ; and for Y-mod-2 top/bottom field selection + */ + decl = vl_decl_constants(TGSI_SEMANTIC_GENERIC, 0, 0, 1); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl o0 ; Fragment color */ + decl = vl_decl_output(TGSI_SEMANTIC_COLOR, 0, 0, 0); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* decl t0-t5 */ + decl = vl_decl_temps(0, 5); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + + /* + * decl s0 ; Sampler for luma texture + * decl s1 ; Sampler for chroma Cb texture + * decl s2 ; Sampler for chroma Cr texture + * decl s3 ; Sampler for first ref surface texture + * decl s4 ; Sampler for second ref surface texture + */ + for (i = 0; i < 5; ++i) + { + decl = vl_decl_samplers(i, i); + ti += tgsi_build_full_declaration(&decl, &tokens[ti], header, max_tokens - ti); + } + + /* + * tex2d t1, i0, s0 ; Read texel from luma texture + * mov t0.x, t1.x ; Move luma sample into .x component + * tex2d t1, i1, s1 ; Read texel from chroma Cb texture + * mov t0.y, t1.x ; Move Cb sample into .y component + * tex2d t1, i2, s2 ; Read texel from chroma Cr texture + * mov t0.z, t1.x ; Move Cr sample into .z component + */ + for (i = 0; i < 3; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_INPUT, i, TGSI_FILE_SAMPLER, i); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + inst = vl_inst2(TGSI_OPCODE_MOV, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = 
TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullDstRegisters[0].DstRegister.WriteMask = TGSI_WRITEMASK_X << i; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* mul t0, t0, c0 ; Rescale texel to correct range */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_CONSTANT, 0); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* XXX: Pos values off by 0.5? */ + /* sub t4, i7.y, c1.x ; Sub 0.5 from denormalized pos */ + inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_INPUT, 7, TGSI_FILE_CONSTANT, 1); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* mul t3, t4, c1.x ; Multiply pos Y-coord by 1/2 */ + inst = vl_inst3(TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_CONSTANT, 1); + inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* floor t3, t3 ; Get rid of fractional part */ + inst = vl_inst2(TGSI_OPCODE_FLOOR, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* mul t3, t3, c1.y ; Multiply by 2 */ + inst = vl_inst3( TGSI_OPCODE_MUL, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_CONSTANT, 1); + inst.FullSrcRegisters[1].SrcRegister.SwizzleX = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleY = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleZ = TGSI_SWIZZLE_Y; + inst.FullSrcRegisters[1].SrcRegister.SwizzleW = TGSI_SWIZZLE_Y; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* sub t3, t4, t3 ; Subtract from original Y to get Y % 2 */ + inst = vl_inst3(TGSI_OPCODE_SUB, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* + * tex2d t1, i3, s3 ; Read texel from past ref macroblock top field + * tex2d t2, i4, s3 ; Read texel from past ref macroblock bottom field + */ + for (i = 0; i < 2; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 1, TGSI_FILE_INPUT, i + 3, TGSI_FILE_SAMPLER, 3); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* TODO: Move to conditional tex fetch on t3 instead of lerp */ + /* lerp t1, t3, t1, t2 ; Choose between top and bottom fields based on Y % 2 */ + inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2); + ti += tgsi_build_full_instruction(&inst, 
&tokens[ti], header, max_tokens - ti); + + /* + * tex2d t4, i5, s4 ; Read texel from future ref macroblock top field + * tex2d t5, i6, s4 ; Read texel from future ref macroblock bottom field + */ + for (i = 0; i < 2; ++i) + { + inst = vl_tex(TGSI_TEXTURE_2D, TGSI_FILE_TEMPORARY, i + 4, TGSI_FILE_INPUT, i + 5, TGSI_FILE_SAMPLER, 4); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + } + + /* TODO: Move to conditional tex fetch on t3 instead of lerp */ + /* lerp t2, t3, t4, t5 ; Choose between top and bottom fields based on Y % 2 */ + inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 2, TGSI_FILE_TEMPORARY, 3, TGSI_FILE_TEMPORARY, 4, TGSI_FILE_TEMPORARY, 5); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* lerp t1, c1.x, t1, t2 ; Blend past and future texels */ + inst = vl_inst4(TGSI_OPCODE_LERP, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_CONSTANT, 1, TGSI_FILE_TEMPORARY, 1, TGSI_FILE_TEMPORARY, 2); + inst.FullSrcRegisters[0].SrcRegister.SwizzleX = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleY = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleZ = TGSI_SWIZZLE_X; + inst.FullSrcRegisters[0].SrcRegister.SwizzleW = TGSI_SWIZZLE_X; + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* add o0, t0, t1 ; Add past/future ref and differential to form final output */ + inst = vl_inst3(TGSI_OPCODE_ADD, TGSI_FILE_OUTPUT, 0, TGSI_FILE_TEMPORARY, 0, TGSI_FILE_TEMPORARY, 1); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + /* end */ + inst = vl_end(); + ti += tgsi_build_full_instruction(&inst, &tokens[ti], header, max_tokens - ti); + + fs.tokens = tokens; + mc->b_fs[1] = pipe->create_fs_state(pipe, &fs); + free(tokens); + + return 0; +} diff --git a/src/gallium/state_trackers/g3dvl/vl_render.h b/src/gallium/state_trackers/g3dvl/vl_render.h new file mode 100644 index 0000000000..166030b498 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_render.h @@ -0,0 +1,38 @@ +#ifndef vl_render_h +#define vl_render_h + +#include "vl_types.h" + +struct pipe_surface; + +struct vlRender +{ + int (*vlBegin) + ( + struct vlRender *render + ); + + int (*vlRenderMacroBlocksMpeg2) + ( + struct vlRender *render, + struct vlMpeg2MacroBlockBatch *batch, + struct vlSurface *surface + ); + + int (*vlEnd) + ( + struct vlRender *render + ); + + int (*vlFlush) + ( + struct vlRender *render + ); + + int (*vlDestroy) + ( + struct vlRender *render + ); +}; + +#endif diff --git a/src/gallium/state_trackers/g3dvl/vl_screen.c b/src/gallium/state_trackers/g3dvl/vl_screen.c new file mode 100644 index 0000000000..484f63b0d4 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_screen.c @@ -0,0 +1,115 @@ +#define VL_INTERNAL +#include "vl_screen.h" +#include <assert.h> +#include <stdlib.h> + +int vlCreateScreen +( + struct vlDisplay *display, + int screen, + struct pipe_screen *pscreen, + struct vlScreen **vl_screen +) +{ + struct vlScreen *scrn; + + assert(display); + assert(pscreen); + assert(vl_screen); + + scrn = calloc(1, sizeof(struct vlScreen)); + + if (!scrn) + return 1; + + scrn->display = display; + scrn->ordinal = screen; + scrn->pscreen = pscreen; + *vl_screen = scrn; + + return 0; +} + +int vlDestroyScreen +( + struct vlScreen *screen +) +{ + assert(screen); + + free(screen); + + return 0; +} + +struct vlDisplay* vlGetDisplay +( + struct vlScreen *screen +) +{ + assert(screen); + + return screen->display; +} + +struct pipe_screen* vlGetPipeScreen +( + 
struct vlScreen *screen +) +{ + assert(screen); + + return screen->pscreen; +} + +unsigned int vlGetMaxProfiles +( + struct vlScreen *screen +) +{ + assert(screen); + + return vlProfileCount; +} + +int vlQueryProfiles +( + struct vlScreen *screen, + enum vlProfile *profiles +) +{ + assert(screen); + assert(profiles); + + profiles[0] = vlProfileMpeg2Simple; + profiles[1] = vlProfileMpeg2Main; + + return 0; +} + +unsigned int vlGetMaxEntryPoints +( + struct vlScreen *screen +) +{ + assert(screen); + + return vlEntryPointCount; +} + +int vlQueryEntryPoints +( + struct vlScreen *screen, + enum vlProfile profile, + enum vlEntryPoint *entry_points +) +{ + assert(screen); + assert(entry_points); + + entry_points[0] = vlEntryPointIDCT; + entry_points[1] = vlEntryPointMC; + entry_points[2] = vlEntryPointCSC; + + return 0; +} diff --git a/src/gallium/state_trackers/g3dvl/vl_screen.h b/src/gallium/state_trackers/g3dvl/vl_screen.h new file mode 100644 index 0000000000..98f3d429b6 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_screen.h @@ -0,0 +1,63 @@ +#ifndef vl_screen_h +#define vl_screen_h + +#include "vl_types.h" + +struct pipe_screen; + +#ifdef VL_INTERNAL +struct vlScreen +{ + struct vlDisplay *display; + unsigned int ordinal; + struct pipe_screen *pscreen; +}; +#endif + +int vlCreateScreen +( + struct vlDisplay *display, + int screen, + struct pipe_screen *pscreen, + struct vlScreen **vl_screen +); + +int vlDestroyScreen +( + struct vlScreen *screen +); + +struct vlDisplay* vlGetDisplay +( + struct vlScreen *screen +); + +struct pipe_screen* vlGetPipeScreen +( + struct vlScreen *screen +); + +unsigned int vlGetMaxProfiles +( + struct vlScreen *screen +); + +int vlQueryProfiles +( + struct vlScreen *screen, + enum vlProfile *profiles +); + +unsigned int vlGetMaxEntryPoints +( + struct vlScreen *screen +); + +int vlQueryEntryPoints +( + struct vlScreen *screen, + enum vlProfile profile, + enum vlEntryPoint *entry_points +); + +#endif diff --git a/src/gallium/state_trackers/g3dvl/vl_shader_build.c b/src/gallium/state_trackers/g3dvl/vl_shader_build.c new file mode 100644 index 0000000000..51f1721a33 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_shader_build.c @@ -0,0 +1,215 @@ +#include "vl_shader_build.h" +#include <assert.h> +#include <tgsi/tgsi_parse.h> +#include <tgsi/tgsi_build.h> + +struct tgsi_full_declaration vl_decl_input(unsigned int name, unsigned int index, unsigned int first, unsigned int last) +{ + struct tgsi_full_declaration decl = tgsi_default_full_declaration(); + + decl.Declaration.File = TGSI_FILE_INPUT; + decl.Declaration.Semantic = 1; + decl.Semantic.SemanticName = name; + decl.Semantic.SemanticIndex = index; + decl.DeclarationRange.First = first; + decl.DeclarationRange.Last = last; + + return decl; +} + +struct tgsi_full_declaration vl_decl_interpolated_input +( + unsigned int name, + unsigned int index, + unsigned int first, + unsigned int last, + int interpolation +) +{ + struct tgsi_full_declaration decl = tgsi_default_full_declaration(); + + assert + ( + interpolation == TGSI_INTERPOLATE_CONSTANT || + interpolation == TGSI_INTERPOLATE_LINEAR || + interpolation == TGSI_INTERPOLATE_PERSPECTIVE + ); + + decl.Declaration.File = TGSI_FILE_INPUT; + decl.Declaration.Semantic = 1; + decl.Semantic.SemanticName = name; + decl.Semantic.SemanticIndex = index; + decl.Declaration.Interpolate = interpolation;; + decl.DeclarationRange.First = first; + decl.DeclarationRange.Last = last; + + return decl; +} + +struct tgsi_full_declaration vl_decl_constants(unsigned int 
name, unsigned int index, unsigned int first, unsigned int last) +{ + struct tgsi_full_declaration decl = tgsi_default_full_declaration(); + + decl.Declaration.File = TGSI_FILE_CONSTANT; + decl.Declaration.Semantic = 1; + decl.Semantic.SemanticName = name; + decl.Semantic.SemanticIndex = index; + decl.DeclarationRange.First = first; + decl.DeclarationRange.Last = last; + + return decl; +} + +struct tgsi_full_declaration vl_decl_output(unsigned int name, unsigned int index, unsigned int first, unsigned int last) +{ + struct tgsi_full_declaration decl = tgsi_default_full_declaration(); + + decl.Declaration.File = TGSI_FILE_OUTPUT; + decl.Declaration.Semantic = 1; + decl.Semantic.SemanticName = name; + decl.Semantic.SemanticIndex = index; + decl.DeclarationRange.First = first; + decl.DeclarationRange.Last = last; + + return decl; +} + +struct tgsi_full_declaration vl_decl_temps(unsigned int first, unsigned int last) +{ + struct tgsi_full_declaration decl = tgsi_default_full_declaration(); + + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_TEMPORARY; + decl.DeclarationRange.First = first; + decl.DeclarationRange.Last = last; + + return decl; +} + +struct tgsi_full_declaration vl_decl_samplers(unsigned int first, unsigned int last) +{ + struct tgsi_full_declaration decl = tgsi_default_full_declaration(); + + decl = tgsi_default_full_declaration(); + decl.Declaration.File = TGSI_FILE_SAMPLER; + decl.DeclarationRange.First = first; + decl.DeclarationRange.Last = last; + + return decl; +} + +struct tgsi_full_instruction vl_inst2 +( + int opcode, + enum tgsi_file_type dst_file, + unsigned int dst_index, + enum tgsi_file_type src_file, + unsigned int src_index +) +{ + struct tgsi_full_instruction inst = tgsi_default_full_instruction(); + + inst.Instruction.Opcode = opcode; + inst.Instruction.NumDstRegs = 1; + inst.FullDstRegisters[0].DstRegister.File = dst_file; + inst.FullDstRegisters[0].DstRegister.Index = dst_index; + inst.Instruction.NumSrcRegs = 1; + inst.FullSrcRegisters[0].SrcRegister.File = src_file; + inst.FullSrcRegisters[0].SrcRegister.Index = src_index; + + return inst; +} + +struct tgsi_full_instruction vl_inst3 +( + int opcode, + enum tgsi_file_type dst_file, + unsigned int dst_index, + enum tgsi_file_type src1_file, + unsigned int src1_index, + enum tgsi_file_type src2_file, + unsigned int src2_index +) +{ + struct tgsi_full_instruction inst = tgsi_default_full_instruction(); + + inst.Instruction.Opcode = opcode; + inst.Instruction.NumDstRegs = 1; + inst.FullDstRegisters[0].DstRegister.File = dst_file; + inst.FullDstRegisters[0].DstRegister.Index = dst_index; + inst.Instruction.NumSrcRegs = 2; + inst.FullSrcRegisters[0].SrcRegister.File = src1_file; + inst.FullSrcRegisters[0].SrcRegister.Index = src1_index; + inst.FullSrcRegisters[1].SrcRegister.File = src2_file; + inst.FullSrcRegisters[1].SrcRegister.Index = src2_index; + + return inst; +} + +struct tgsi_full_instruction vl_tex +( + int tex, + enum tgsi_file_type dst_file, + unsigned int dst_index, + enum tgsi_file_type src1_file, + unsigned int src1_index, + enum tgsi_file_type src2_file, + unsigned int src2_index +) +{ + struct tgsi_full_instruction inst = tgsi_default_full_instruction(); + + inst.Instruction.Opcode = TGSI_OPCODE_TEX; + inst.Instruction.NumDstRegs = 1; + inst.FullDstRegisters[0].DstRegister.File = dst_file; + inst.FullDstRegisters[0].DstRegister.Index = dst_index; + inst.Instruction.NumSrcRegs = 2; + inst.InstructionExtTexture.Texture = tex; + inst.FullSrcRegisters[0].SrcRegister.File = 
src1_file; + inst.FullSrcRegisters[0].SrcRegister.Index = src1_index; + inst.FullSrcRegisters[1].SrcRegister.File = src2_file; + inst.FullSrcRegisters[1].SrcRegister.Index = src2_index; + + return inst; +} + +struct tgsi_full_instruction vl_inst4 +( + int opcode, + enum tgsi_file_type dst_file, + unsigned int dst_index, + enum tgsi_file_type src1_file, + unsigned int src1_index, + enum tgsi_file_type src2_file, + unsigned int src2_index, + enum tgsi_file_type src3_file, + unsigned int src3_index +) +{ + struct tgsi_full_instruction inst = tgsi_default_full_instruction(); + + inst.Instruction.Opcode = opcode; + inst.Instruction.NumDstRegs = 1; + inst.FullDstRegisters[0].DstRegister.File = dst_file; + inst.FullDstRegisters[0].DstRegister.Index = dst_index; + inst.Instruction.NumSrcRegs = 3; + inst.FullSrcRegisters[0].SrcRegister.File = src1_file; + inst.FullSrcRegisters[0].SrcRegister.Index = src1_index; + inst.FullSrcRegisters[1].SrcRegister.File = src2_file; + inst.FullSrcRegisters[1].SrcRegister.Index = src2_index; + inst.FullSrcRegisters[2].SrcRegister.File = src3_file; + inst.FullSrcRegisters[2].SrcRegister.Index = src3_index; + + return inst; +} + +struct tgsi_full_instruction vl_end(void) +{ + struct tgsi_full_instruction inst = tgsi_default_full_instruction(); + + inst.Instruction.Opcode = TGSI_OPCODE_END; + inst.Instruction.NumDstRegs = 0; + inst.Instruction.NumSrcRegs = 0; + + return inst; +} diff --git a/src/gallium/state_trackers/g3dvl/vl_shader_build.h b/src/gallium/state_trackers/g3dvl/vl_shader_build.h new file mode 100644 index 0000000000..dc615cb156 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_shader_build.h @@ -0,0 +1,61 @@ +#ifndef vl_shader_build_h +#define vl_shader_build_h + +#include <pipe/p_shader_tokens.h> + +struct tgsi_full_declaration vl_decl_input(unsigned int name, unsigned int index, unsigned int first, unsigned int last); +struct tgsi_full_declaration vl_decl_interpolated_input +( + unsigned int name, + unsigned int index, + unsigned int first, + unsigned int last, + int interpolation +); +struct tgsi_full_declaration vl_decl_constants(unsigned int name, unsigned int index, unsigned int first, unsigned int last); +struct tgsi_full_declaration vl_decl_output(unsigned int name, unsigned int index, unsigned int first, unsigned int last); +struct tgsi_full_declaration vl_decl_temps(unsigned int first, unsigned int last); +struct tgsi_full_declaration vl_decl_samplers(unsigned int first, unsigned int last); +struct tgsi_full_instruction vl_inst2 +( + int opcode, + enum tgsi_file_type dst_file, + unsigned int dst_index, + enum tgsi_file_type src_file, + unsigned int src_index +); +struct tgsi_full_instruction vl_inst3 +( + int opcode, + enum tgsi_file_type dst_file, + unsigned int dst_index, + enum tgsi_file_type src1_file, + unsigned int src1_index, + enum tgsi_file_type src2_file, + unsigned int src2_index +); +struct tgsi_full_instruction vl_tex +( + int tex, + enum tgsi_file_type dst_file, + unsigned int dst_index, + enum tgsi_file_type src1_file, + unsigned int src1_index, + enum tgsi_file_type src2_file, + unsigned int src2_index +); +struct tgsi_full_instruction vl_inst4 +( + int opcode, + enum tgsi_file_type dst_file, + unsigned int dst_index, + enum tgsi_file_type src1_file, + unsigned int src1_index, + enum tgsi_file_type src2_file, + unsigned int src2_index, + enum tgsi_file_type src3_file, + unsigned int src3_index +); +struct tgsi_full_instruction vl_end(void); + +#endif diff --git a/src/gallium/state_trackers/g3dvl/vl_surface.c 
b/src/gallium/state_trackers/g3dvl/vl_surface.c new file mode 100644 index 0000000000..076bd40d41 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_surface.c @@ -0,0 +1,237 @@ +#define VL_INTERNAL +#include "vl_surface.h" +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include <pipe/p_screen.h> +#include <pipe/p_state.h> +#include <pipe/p_inlines.h> +#include <vl_winsys.h> +#include "vl_screen.h" +#include "vl_context.h" +#include "vl_render.h" +#include "vl_csc.h" +#include "vl_util.h" + +int vlCreateSurface +( + struct vlScreen *screen, + unsigned int width, + unsigned int height, + enum vlFormat format, + struct vlSurface **surface +) +{ + struct vlSurface *sfc; + struct pipe_texture template; + + assert(screen); + assert(surface); + + sfc = calloc(1, sizeof(struct vlSurface)); + + if (!sfc) + return 1; + + sfc->screen = screen; + sfc->width = width; + sfc->height = height; + sfc->format = format; + + memset(&template, 0, sizeof(struct pipe_texture)); + template.target = PIPE_TEXTURE_2D; + template.format = PIPE_FORMAT_A8R8G8B8_UNORM; + template.last_level = 0; + template.width[0] = vlRoundUpPOT(sfc->width); + template.height[0] = vlRoundUpPOT(sfc->height); + template.depth[0] = 1; + template.compressed = 0; + pf_get_block(template.format, &template.block); + template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER | PIPE_TEXTURE_USAGE_RENDER_TARGET; + + sfc->texture = vlGetPipeScreen(screen)->texture_create(vlGetPipeScreen(screen), &template); + + *surface = sfc; + + return 0; +} + +int vlDestroySurface +( + struct vlSurface *surface +) +{ + assert(surface); + + pipe_texture_release(&surface->texture); + free(surface); + + return 0; +} + +int vlRenderMacroBlocksMpeg2 +( + struct vlMpeg2MacroBlockBatch *batch, + struct vlSurface *surface +) +{ + assert(batch); + assert(surface); + assert(surface->context); + + surface->context->render->vlBegin(surface->context->render); + + surface->context->render->vlRenderMacroBlocksMpeg2 + ( + surface->context->render, + batch, + surface + ); + + surface->context->render->vlEnd(surface->context->render); + + return 0; +} + +int vlPutPicture +( + struct vlSurface *surface, + vlNativeDrawable drawable, + int srcx, + int srcy, + int srcw, + int srch, + int destx, + int desty, + int destw, + int desth, + int drawable_w, + int drawable_h, + enum vlPictureType picture_type +) +{ + struct vlCSC *csc; + struct pipe_context *pipe; + + assert(surface); + assert(surface->context); + + surface->context->render->vlFlush(surface->context->render); + + csc = surface->context->csc; + pipe = surface->context->pipe; + + csc->vlResizeFrameBuffer(csc, drawable_w, drawable_h); + + csc->vlBegin(csc); + + csc->vlPutPicture + ( + csc, + surface, + srcx, + srcy, + srcw, + srch, + destx, + desty, + destw, + desth, + picture_type + ); + + csc->vlEnd(csc); + + pipe->flush(pipe, PIPE_FLUSH_RENDER_CACHE, &surface->disp_fence); + + bind_pipe_drawable(pipe, drawable); + + pipe->winsys->flush_frontbuffer + ( + pipe->winsys, + csc->vlGetFrameBuffer(csc), + pipe->priv + ); + + return 0; +} + +int vlSurfaceGetStatus +( + struct vlSurface *surface, + enum vlResourceStatus *status +) +{ + assert(surface); + assert(surface->context); + assert(status); + + if (surface->render_fence && !surface->context->pipe->winsys->fence_signalled(surface->context->pipe->winsys, surface->render_fence, 0)) + { + *status = vlResourceStatusRendering; + return 0; + } + + if (surface->disp_fence && !surface->context->pipe->winsys->fence_signalled(surface->context->pipe->winsys, 
surface->disp_fence, 0)) + { + *status = vlResourceStatusDisplaying; + return 0; + } + + *status = vlResourceStatusFree; + + return 0; +} + +int vlSurfaceFlush +( + struct vlSurface *surface +) +{ + assert(surface); + assert(surface->context); + + surface->context->render->vlFlush(surface->context->render); + + return 0; +} + +int vlSurfaceSync +( + struct vlSurface *surface +) +{ + assert(surface); + assert(surface->context); + assert(surface->render_fence); + + surface->context->pipe->winsys->fence_finish(surface->context->pipe->winsys, surface->render_fence, 0); + + return 0; +} + +struct vlScreen* vlSurfaceGetScreen +( + struct vlSurface *surface +) +{ + assert(surface); + + return surface->screen; +} + +struct vlContext* vlBindToContext +( + struct vlSurface *surface, + struct vlContext *context +) +{ + struct vlContext *old; + + assert(surface); + + old = surface->context; + surface->context = context; + + return old; +} diff --git a/src/gallium/state_trackers/g3dvl/vl_surface.h b/src/gallium/state_trackers/g3dvl/vl_surface.h new file mode 100644 index 0000000000..133e1515ef --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_surface.h @@ -0,0 +1,86 @@ +#ifndef vl_surface_h +#define vl_surface_h + +#include "vl_types.h" + +#ifdef VL_INTERNAL +struct pipe_texture; + +struct vlSurface +{ + struct vlScreen *screen; + struct vlContext *context; + unsigned int width; + unsigned int height; + enum vlFormat format; + struct pipe_texture *texture; + struct pipe_fence_handle *render_fence; + struct pipe_fence_handle *disp_fence; +}; +#endif + +int vlCreateSurface +( + struct vlScreen *screen, + unsigned int width, + unsigned int height, + enum vlFormat format, + struct vlSurface **surface +); + +int vlDestroySurface +( + struct vlSurface *surface +); + +int vlRenderMacroBlocksMpeg2 +( + struct vlMpeg2MacroBlockBatch *batch, + struct vlSurface *surface +); + +int vlPutPicture +( + struct vlSurface *surface, + vlNativeDrawable drawable, + int srcx, + int srcy, + int srcw, + int srch, + int destx, + int desty, + int destw, + int desth, + int drawable_w, + int drawable_h, + enum vlPictureType picture_type +); + +int vlSurfaceGetStatus +( + struct vlSurface *surface, + enum vlResourceStatus *status +); + +int vlSurfaceFlush +( + struct vlSurface *surface +); + +int vlSurfaceSync +( + struct vlSurface *surface +); + +struct vlScreen* vlSurfaceGetScreen +( + struct vlSurface *surface +); + +struct vlContext* vlBindToContext +( + struct vlSurface *surface, + struct vlContext *context +); + +#endif diff --git a/src/gallium/state_trackers/g3dvl/vl_types.h b/src/gallium/state_trackers/g3dvl/vl_types.h new file mode 100644 index 0000000000..274e1f7437 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_types.h @@ -0,0 +1,115 @@ +#ifndef vl_types_h +#define vl_types_h + +#if 1 /*#ifdef X11*/ +#include <X11/Xlib.h> + +typedef Display* vlNativeDisplay; +typedef Drawable vlNativeDrawable; +#endif + +struct vlDisplay; +struct vlScreen; +struct vlContext; +struct vlSurface; + +enum vlResourceStatus +{ + vlResourceStatusFree, + vlResourceStatusRendering, + vlResourceStatusDisplaying +}; + +enum vlProfile +{ + vlProfileMpeg2Simple, + vlProfileMpeg2Main, + + vlProfileCount +}; + +enum vlEntryPoint +{ + vlEntryPointIDCT, + vlEntryPointMC, + vlEntryPointCSC, + + vlEntryPointCount +}; + +enum vlFormat +{ + vlFormatYCbCr420, + vlFormatYCbCr422, + vlFormatYCbCr444 +}; + +enum vlPictureType +{ + vlPictureTypeTopField, + vlPictureTypeBottomField, + vlPictureTypeFrame +}; + +enum vlMotionType +{ + 
vlMotionTypeField, + vlMotionTypeFrame, + vlMotionTypeDualPrime, + vlMotionType16x8 +}; + +enum vlFieldOrder +{ + vlFieldOrderFirst, + vlFieldOrderSecond +}; + +enum vlDCTType +{ + vlDCTTypeFrameCoded, + vlDCTTypeFieldCoded +}; + +struct vlVertex2f +{ + float x, y; +}; + +struct vlVertex4f +{ + float x, y, z, w; +}; + +enum vlMacroBlockType +{ + vlMacroBlockTypeIntra, + vlMacroBlockTypeFwdPredicted, + vlMacroBlockTypeBkwdPredicted, + vlMacroBlockTypeBiPredicted, + + vlNumMacroBlockTypes +}; + +struct vlMpeg2MacroBlock +{ + unsigned int mbx, mby; + enum vlMacroBlockType mb_type; + enum vlMotionType mo_type; + enum vlDCTType dct_type; + int PMV[2][2][2]; + unsigned int cbp; + short *blocks; +}; + +struct vlMpeg2MacroBlockBatch +{ + struct vlSurface *past_surface; + struct vlSurface *future_surface; + enum vlPictureType picture_type; + enum vlFieldOrder field_order; + unsigned int num_macroblocks; + struct vlMpeg2MacroBlock *macroblocks; +}; + +#endif diff --git a/src/gallium/state_trackers/g3dvl/vl_util.c b/src/gallium/state_trackers/g3dvl/vl_util.c new file mode 100644 index 0000000000..50aa9af66f --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_util.c @@ -0,0 +1,16 @@ +#include "vl_util.h" +#include <assert.h> + +unsigned int vlRoundUpPOT(unsigned int x) +{ + unsigned int i; + + assert(x > 0); + + --x; + + for (i = 1; i < sizeof(unsigned int) * 8; i <<= 1) + x |= x >> i; + + return x + 1; +} diff --git a/src/gallium/state_trackers/g3dvl/vl_util.h b/src/gallium/state_trackers/g3dvl/vl_util.h new file mode 100644 index 0000000000..bc98e79df4 --- /dev/null +++ b/src/gallium/state_trackers/g3dvl/vl_util.h @@ -0,0 +1,6 @@ +#ifndef vl_util_h +#define vl_util_h + +unsigned int vlRoundUpPOT(unsigned int x); + +#endif diff --git a/src/gallium/state_trackers/python/README b/src/gallium/state_trackers/python/README new file mode 100644 index 0000000000..8f45fb6d1b --- /dev/null +++ b/src/gallium/state_trackers/python/README @@ -0,0 +1,33 @@ +This directory contains Python bindings to Gallium3D. It looks like a state +tracker from the pipe driver perspective, and it looks like a pipe driver from +the python script perspective. + + +To build you'll need: +* Python (with development packages) +* SCons +* SWIG +* Python Imaging Library with TK support (for the samples) + +Invoke scons on the top dir as + + scons statetrackers=python + +To use do + + export PYTHONPATH=build/XXXX-XXXX-XXXX/gallium/state_trackers/python + +and then try running + + python src/gallium/state_trackers/python/samples/tri.py + +which should show a triangle. 
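+ +A possible end-to-end session, as a single sketch of the steps above (run from the top of the Mesa tree; the XXXX-XXXX-XXXX component of the build directory is platform-dependent, so substitute whatever directory SCons actually creates under build/ on your system): + +  scons statetrackers=python +  export PYTHONPATH=build/XXXX-XXXX-XXXX/gallium/state_trackers/python +  python src/gallium/state_trackers/python/samples/tri.py +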
+ + +This is still work in progress: +- errors are not handled properly and almost always result in a crash +- state atoms with array members are awkward to set +- there is no efficient way to view images + +-- +Jose Fonseca <jrfonseca@tungstengraphics.com> diff --git a/src/gallium/state_trackers/python/SConscript b/src/gallium/state_trackers/python/SConscript new file mode 100644 index 0000000000..1581182aec --- /dev/null +++ b/src/gallium/state_trackers/python/SConscript @@ -0,0 +1,34 @@ +import sys +import os.path + +Import('*') + +if 'python' in env['statetrackers']: + + env = env.Clone() + + env.Tool('python') + + env.Tool('swig') + env.Append(SWIGPATH = ['#src/gallium/include', '#src/gallium/include/pipe']) + env.Append(SWIGFLAGS = ['-python', '-keyword']) + + env.Append(CPPPATH = '.') + + pyst = env.ConvenienceLibrary( + target = 'pyst', + source = [ + 'gallium.i', + 'st_device.c', + 'st_sample.c', + 'st_softpipe_winsys.c', + ], + ) + + env.SharedLibrary( + target = '_gallium', + source = [ + 'st_hardpipe_winsys.c', + ], + LIBS = [pyst, softpipe, trace] + auxiliaries + env['LIBS'], + ) diff --git a/src/gallium/state_trackers/python/gallium.i b/src/gallium/state_trackers/python/gallium.i new file mode 100644 index 0000000000..f4c4b36ea7 --- /dev/null +++ b/src/gallium/state_trackers/python/gallium.i @@ -0,0 +1,103 @@ + /************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * SWIG interface definition for Gallium types. 
+ * + * @author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + +%module gallium; + +%{ + +#include <stdio.h> +#include <Python.h> + +#include "pipe/p_screen.h" +#include "pipe/p_context.h" +#include "pipe/p_inlines.h" +#include "util/u_memory.h" +#include "pipe/p_shader_tokens.h" +#include "cso_cache/cso_context.h" +#include "util/u_draw_quad.h" +#include "util/u_tile.h" +#include "tgsi/tgsi_text.h" +#include "tgsi/tgsi_dump.h" + +#include "st_device.h" +#include "st_sample.h" + +%} + +%include "typemaps.i" + +%include "cstring.i" + +%include "carrays.i" +%array_class(unsigned char, ByteArray); +%array_class(int, IntArray); +%array_class(unsigned, UnsignedArray); +%array_class(float, FloatArray); + + +%rename(Device) st_device; +%rename(Context) st_context; +%rename(Texture) pipe_texture; +%rename(Surface) pipe_surface; +%rename(Buffer) st_buffer; + +%rename(BlendColor) pipe_blend_color; +%rename(Blend) pipe_blend_state; +%rename(Clip) pipe_clip_state; +%rename(ConstantBuffer) pipe_constant_buffer; +%rename(Depth) pipe_depth_state; +%rename(Stencil) pipe_stencil_state; +%rename(Alpha) pipe_alpha_state; +%rename(DepthStencilAlpha) pipe_depth_stencil_alpha_state; +%rename(FormatBlock) pipe_format_block; +%rename(Framebuffer) pipe_framebuffer_state; +%rename(PolyStipple) pipe_poly_stipple; +%rename(Rasterizer) pipe_rasterizer_state; +%rename(Sampler) pipe_sampler_state; +%rename(Scissor) pipe_scissor_state; +%rename(Shader) pipe_shader_state; +%rename(VertexBuffer) pipe_vertex_buffer; +%rename(VertexElement) pipe_vertex_element; +%rename(Viewport) pipe_viewport_state; + + +%include "p_compiler.i" +%include "pipe/p_defines.h"; +%include "p_format.i" + +%include "p_device.i" +%include "p_context.i" +%include "p_texture.i" +%include "p_state.i" + diff --git a/src/gallium/state_trackers/python/p_compiler.i b/src/gallium/state_trackers/python/p_compiler.i new file mode 100644 index 0000000000..15f6ba5b9d --- /dev/null +++ b/src/gallium/state_trackers/python/p_compiler.i @@ -0,0 +1,29 @@ + /************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + + +typedef unsigned char ubyte; diff --git a/src/gallium/state_trackers/python/p_context.i b/src/gallium/state_trackers/python/p_context.i new file mode 100644 index 0000000000..1fdcec639f --- /dev/null +++ b/src/gallium/state_trackers/python/p_context.i @@ -0,0 +1,289 @@ + /************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * SWIG interface definion for Gallium types. 
+ * + * @author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + +%nodefaultctor st_context; +%nodefaultdtor st_context; + +struct st_context { +}; + +%extend st_context { + + ~st_context() { + st_context_destroy($self); + } + + /* + * State functions (create/bind/destroy state objects) + */ + + void set_blend( const struct pipe_blend_state *state ) { + cso_set_blend($self->cso, state); + } + + void set_sampler( unsigned index, const struct pipe_sampler_state *state ) { + cso_single_sampler($self->cso, index, state); + cso_single_sampler_done($self->cso); + } + + void set_rasterizer( const struct pipe_rasterizer_state *state ) { + cso_set_rasterizer($self->cso, state); + } + + void set_depth_stencil_alpha(const struct pipe_depth_stencil_alpha_state *state) { + cso_set_depth_stencil_alpha($self->cso, state); + } + + void set_fragment_shader( const struct pipe_shader_state *state ) { + void *fs; + + if(!state) { + cso_set_fragment_shader_handle($self->cso, NULL); + return; + } + + fs = $self->pipe->create_fs_state($self->pipe, state); + if(!fs) + return; + + if(cso_set_fragment_shader_handle($self->cso, fs) != PIPE_OK) + return; + + cso_delete_fragment_shader($self->cso, $self->fs); + $self->fs = fs; + } + + void set_vertex_shader( const struct pipe_shader_state *state ) { + void *vs; + + if(!state) { + cso_set_vertex_shader_handle($self->cso, NULL); + return; + } + + vs = $self->pipe->create_vs_state($self->pipe, state); + if(!vs) + return; + + if(cso_set_vertex_shader_handle($self->cso, vs) != PIPE_OK) + return; + + cso_delete_vertex_shader($self->cso, $self->vs); + $self->vs = vs; + } + + /* + * Parameter-like state (or properties) + */ + + void set_blend_color(const struct pipe_blend_color *state ) { + cso_set_blend_color($self->cso, state); + } + + void set_clip(const struct pipe_clip_state *state ) { + $self->pipe->set_clip_state($self->pipe, state); + } + + void set_constant_buffer(unsigned shader, unsigned index, + struct st_buffer *buffer ) + { + struct pipe_constant_buffer state; + memset(&state, 0, sizeof(state)); + state.buffer = buffer ? buffer->buffer : NULL; + state.size = buffer->buffer->size; + $self->pipe->set_constant_buffer($self->pipe, shader, index, &state); + } + + void set_framebuffer(const struct pipe_framebuffer_state *state ) { + cso_set_framebuffer($self->cso, state); + } + + void set_polygon_stipple(const struct pipe_poly_stipple *state ) { + $self->pipe->set_polygon_stipple($self->pipe, state); + } + + void set_scissor(const struct pipe_scissor_state *state ) { + $self->pipe->set_scissor_state($self->pipe, state); + } + + void set_viewport(const struct pipe_viewport_state *state) { + cso_set_viewport($self->cso, state); + } + + void set_sampler_texture(unsigned index, + struct pipe_texture *texture) { + if(!texture) + texture = $self->default_texture; + pipe_texture_reference(&$self->sampler_textures[index], texture); + $self->pipe->set_sampler_textures($self->pipe, + PIPE_MAX_SAMPLERS, + $self->sampler_textures); + } + + void set_vertex_buffer(unsigned index, + unsigned pitch, + unsigned max_index, + unsigned buffer_offset, + struct st_buffer *buffer) + { + unsigned i; + struct pipe_vertex_buffer state; + + memset(&state, 0, sizeof(state)); + state.pitch = pitch; + state.max_index = max_index; + state.buffer_offset = buffer_offset; + state.buffer = buffer ? 
buffer->buffer : NULL; + + memcpy(&$self->vertex_buffers[index], &state, sizeof(state)); + + for(i = 0; i < PIPE_MAX_ATTRIBS; ++i) + if(self->vertex_buffers[i].buffer) + $self->num_vertex_buffers = i + 1; + + $self->pipe->set_vertex_buffers($self->pipe, + $self->num_vertex_buffers, + $self->vertex_buffers); + } + + void set_vertex_element(unsigned index, + const struct pipe_vertex_element *element) + { + memcpy(&$self->vertex_elements[index], element, sizeof(*element)); + } + + void set_vertex_elements(unsigned num) + { + $self->num_vertex_elements = num; + $self->pipe->set_vertex_elements($self->pipe, + $self->num_vertex_elements, + $self->vertex_elements); + } + + /* + * Draw functions + */ + + void draw_arrays(unsigned mode, unsigned start, unsigned count) { + $self->pipe->draw_arrays($self->pipe, mode, start, count); + } + + void draw_elements( struct st_buffer *indexBuffer, + unsigned indexSize, + unsigned mode, unsigned start, unsigned count) + { + $self->pipe->draw_elements($self->pipe, + indexBuffer->buffer, + indexSize, + mode, start, count); + } + + void draw_range_elements( struct st_buffer *indexBuffer, + unsigned indexSize, unsigned minIndex, unsigned maxIndex, + unsigned mode, unsigned start, unsigned count) + { + $self->pipe->draw_range_elements($self->pipe, + indexBuffer->buffer, + indexSize, minIndex, maxIndex, + mode, start, count); + } + + void draw_vertices(unsigned prim, + unsigned num_verts, + unsigned num_attribs, + const float *vertices) + { + struct pipe_context *pipe = $self->pipe; + struct pipe_screen *screen = pipe->screen; + struct pipe_buffer *vbuf; + float *map; + unsigned size; + + size = num_verts * num_attribs * 4 * sizeof(float); + + vbuf = pipe_buffer_create(screen, + 32, + PIPE_BUFFER_USAGE_VERTEX, + size); + if(!vbuf) + goto error1; + + map = pipe_buffer_map(screen, vbuf, PIPE_BUFFER_USAGE_CPU_WRITE); + if (!map) + goto error2; + memcpy(map, vertices, size); + pipe_buffer_unmap(screen, vbuf); + + util_draw_vertex_buffer(pipe, vbuf, prim, num_verts, num_attribs); + +error2: + pipe_buffer_reference(screen, &vbuf, NULL); +error1: + ; + } + + void + flush(unsigned flags = 0) { + struct pipe_fence_handle *fence = NULL; + $self->pipe->flush($self->pipe, flags | PIPE_FLUSH_RENDER_CACHE, &fence); + /* TODO: allow asynchronous operation */ + $self->pipe->winsys->fence_finish( $self->pipe->winsys, fence, 0 ); + $self->pipe->winsys->fence_reference( $self->pipe->winsys, &fence, NULL ); + } + + /* + * Surface functions + */ + + void surface_copy(int do_flip, + struct pipe_surface *dest, + unsigned destx, unsigned desty, + struct pipe_surface *src, + unsigned srcx, unsigned srcy, + unsigned width, unsigned height) { + $self->pipe->surface_copy($self->pipe, do_flip, dest, destx, desty, src, srcx, srcy, width, height); + } + + void surface_fill(struct pipe_surface *dst, + unsigned x, unsigned y, + unsigned width, unsigned height, + unsigned value) { + $self->pipe->surface_fill($self->pipe, dst, x, y, width, height, value); + } + + void surface_clear(struct pipe_surface *surface, unsigned value = 0) { + $self->pipe->clear($self->pipe, surface, value); + } + +}; diff --git a/src/gallium/state_trackers/python/p_device.i b/src/gallium/state_trackers/python/p_device.i new file mode 100644 index 0000000000..84fd2e4349 --- /dev/null +++ b/src/gallium/state_trackers/python/p_device.i @@ -0,0 +1,130 @@ + /************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * SWIG interface definion for Gallium types. + * + * @author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + + +%nodefaultctor st_device; +%nodefaultdtor st_device; + + +struct st_device { +}; + +%newobject st_device::texture_create; +%newobject st_device::context_create; +%newobject st_device::buffer_create; + +%extend st_device { + + st_device(int hardware = 1) { + return st_device_create(hardware ? TRUE : FALSE); + } + + ~st_device() { + st_device_destroy($self); + } + + const char * get_name( void ) { + return $self->screen->get_name($self->screen); + } + + const char * get_vendor( void ) { + return $self->screen->get_vendor($self->screen); + } + + /** + * Query an integer-valued capability/parameter/limit + * \param param one of PIPE_CAP_x + */ + int get_param( int param ) { + return $self->screen->get_param($self->screen, param); + } + + /** + * Query a float-valued capability/parameter/limit + * \param param one of PIPE_CAP_x + */ + float get_paramf( int param ) { + return $self->screen->get_paramf($self->screen, param); + } + + /** + * Check if the given pipe_format is supported as a texture or + * drawing surface. 
+ * \param type one of PIPE_TEXTURE, PIPE_SURFACE + */ + int is_format_supported( enum pipe_format format, + enum pipe_texture_target target, + unsigned tex_usage, + unsigned geom_flags ) { + return $self->screen->is_format_supported( $self->screen, + format, + target, + tex_usage, + geom_flags ); + } + + struct st_context * + context_create(void) { + return st_context_create($self); + } + + struct pipe_texture * + texture_create( + enum pipe_format format, + unsigned width, + unsigned height, + unsigned depth = 1, + unsigned last_level = 0, + enum pipe_texture_target target = PIPE_TEXTURE_2D, + unsigned tex_usage = 0 + ) { + struct pipe_texture templat; + memset(&templat, 0, sizeof(templat)); + templat.format = format; + pf_get_block(templat.format, &templat.block); + templat.width[0] = width; + templat.height[0] = height; + templat.depth[0] = depth; + templat.last_level = last_level; + templat.target = target; + templat.tex_usage = tex_usage; + return $self->screen->texture_create($self->screen, &templat); + } + + struct st_buffer * + buffer_create(unsigned size, unsigned alignment = 0, unsigned usage = 0) { + return st_buffer_create($self, alignment, usage, size); + } + +}; diff --git a/src/gallium/state_trackers/python/p_format.i b/src/gallium/state_trackers/python/p_format.i new file mode 100644 index 0000000000..26fb12b387 --- /dev/null +++ b/src/gallium/state_trackers/python/p_format.i @@ -0,0 +1,162 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * Copyright (c) 2008 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* + * XXX: SWIG can't parse p_format.h, so we need to duplicate the relevant + * declarations here + */ + +%{ +#include "pipe/p_format.h" +%} + +enum pipe_format { + PIPE_FORMAT_NONE, + PIPE_FORMAT_A8R8G8B8_UNORM, + PIPE_FORMAT_X8R8G8B8_UNORM, + PIPE_FORMAT_B8G8R8A8_UNORM, + PIPE_FORMAT_B8G8R8X8_UNORM, + PIPE_FORMAT_A1R5G5B5_UNORM, + PIPE_FORMAT_A4R4G4B4_UNORM, + PIPE_FORMAT_R5G6B5_UNORM, + PIPE_FORMAT_A2B10G10R10_UNORM, + PIPE_FORMAT_L8_UNORM, + PIPE_FORMAT_A8_UNORM, + PIPE_FORMAT_I8_UNORM, + PIPE_FORMAT_A8L8_UNORM, + PIPE_FORMAT_L16_UNORM, + PIPE_FORMAT_YCBCR, + PIPE_FORMAT_YCBCR_REV, + PIPE_FORMAT_Z16_UNORM, + PIPE_FORMAT_Z32_UNORM, + PIPE_FORMAT_Z32_FLOAT, + PIPE_FORMAT_S8Z24_UNORM, + PIPE_FORMAT_Z24S8_UNORM, + PIPE_FORMAT_X8Z24_UNORM, + PIPE_FORMAT_Z24X8_UNORM, + PIPE_FORMAT_S8_UNORM, + PIPE_FORMAT_R64_FLOAT, + PIPE_FORMAT_R64G64_FLOAT, + PIPE_FORMAT_R64G64B64_FLOAT, + PIPE_FORMAT_R64G64B64A64_FLOAT, + PIPE_FORMAT_R32_FLOAT, + PIPE_FORMAT_R32G32_FLOAT, + PIPE_FORMAT_R32G32B32_FLOAT, + PIPE_FORMAT_R32G32B32A32_FLOAT, + PIPE_FORMAT_R32_UNORM, + PIPE_FORMAT_R32G32_UNORM, + PIPE_FORMAT_R32G32B32_UNORM, + PIPE_FORMAT_R32G32B32A32_UNORM, + PIPE_FORMAT_R32_USCALED, + PIPE_FORMAT_R32G32_USCALED, + PIPE_FORMAT_R32G32B32_USCALED, + PIPE_FORMAT_R32G32B32A32_USCALED, + PIPE_FORMAT_R32_SNORM, + PIPE_FORMAT_R32G32_SNORM, + PIPE_FORMAT_R32G32B32_SNORM, + PIPE_FORMAT_R32G32B32A32_SNORM, + PIPE_FORMAT_R32_SSCALED, + PIPE_FORMAT_R32G32_SSCALED, + PIPE_FORMAT_R32G32B32_SSCALED, + PIPE_FORMAT_R32G32B32A32_SSCALED, + PIPE_FORMAT_R16_UNORM, + PIPE_FORMAT_R16G16_UNORM, + PIPE_FORMAT_R16G16B16_UNORM, + PIPE_FORMAT_R16G16B16A16_UNORM, + PIPE_FORMAT_R16_USCALED, + PIPE_FORMAT_R16G16_USCALED, + PIPE_FORMAT_R16G16B16_USCALED, + PIPE_FORMAT_R16G16B16A16_USCALED, + PIPE_FORMAT_R16_SNORM, + PIPE_FORMAT_R16G16_SNORM, + PIPE_FORMAT_R16G16B16_SNORM, + PIPE_FORMAT_R16G16B16A16_SNORM, + PIPE_FORMAT_R16_SSCALED, + PIPE_FORMAT_R16G16_SSCALED, + PIPE_FORMAT_R16G16B16_SSCALED, + PIPE_FORMAT_R16G16B16A16_SSCALED, + PIPE_FORMAT_R8_UNORM, + PIPE_FORMAT_R8G8_UNORM, + PIPE_FORMAT_R8G8B8_UNORM, + PIPE_FORMAT_R8G8B8A8_UNORM, + PIPE_FORMAT_R8G8B8X8_UNORM, + PIPE_FORMAT_R8_USCALED, + PIPE_FORMAT_R8G8_USCALED, + PIPE_FORMAT_R8G8B8_USCALED, + PIPE_FORMAT_R8G8B8A8_USCALED, + PIPE_FORMAT_R8G8B8X8_USCALED, + PIPE_FORMAT_R8_SNORM, + PIPE_FORMAT_R8G8_SNORM, + PIPE_FORMAT_R8G8B8_SNORM, + PIPE_FORMAT_R8G8B8A8_SNORM, + PIPE_FORMAT_R8G8B8X8_SNORM, + PIPE_FORMAT_B6G5R5_SNORM, + PIPE_FORMAT_A8B8G8R8_SNORM, + PIPE_FORMAT_X8B8G8R8_SNORM, + PIPE_FORMAT_R8_SSCALED, + PIPE_FORMAT_R8G8_SSCALED, + PIPE_FORMAT_R8G8B8_SSCALED, + PIPE_FORMAT_R8G8B8A8_SSCALED, + PIPE_FORMAT_R8G8B8X8_SSCALED, + PIPE_FORMAT_R32_FIXED, + PIPE_FORMAT_R32G32_FIXED, + PIPE_FORMAT_R32G32B32_FIXED, + PIPE_FORMAT_R32G32B32A32_FIXED, + + PIPE_FORMAT_L8_SRGB, + PIPE_FORMAT_A8L8_SRGB, + PIPE_FORMAT_R8G8B8_SRGB, + PIPE_FORMAT_R8G8B8A8_SRGB, + PIPE_FORMAT_R8G8B8X8_SRGB, + PIPE_FORMAT_A8R8G8B8_SRGB, + PIPE_FORMAT_X8R8G8B8_SRGB, + PIPE_FORMAT_B8G8R8A8_SRGB, + PIPE_FORMAT_B8G8R8X8_SRGB, + + PIPE_FORMAT_X8UB8UG8SR8S_NORM, + PIPE_FORMAT_B6UG5SR5S_NORM, + + PIPE_FORMAT_DXT1_RGB, + PIPE_FORMAT_DXT1_RGBA, + PIPE_FORMAT_DXT3_RGBA, + PIPE_FORMAT_DXT5_RGBA, + + PIPE_FORMAT_DXT1_SRGB, + PIPE_FORMAT_DXT1_SRGBA, + PIPE_FORMAT_DXT3_SRGBA, + PIPE_FORMAT_DXT5_SRGBA, +}; + + +struct pipe_format_block +{ + unsigned size; + unsigned width; + unsigned height; +}; + diff --git a/src/gallium/state_trackers/python/p_state.i 
b/src/gallium/state_trackers/python/p_state.i new file mode 100644 index 0000000000..7f5760b3b6 --- /dev/null +++ b/src/gallium/state_trackers/python/p_state.i @@ -0,0 +1,109 @@ + /************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * SWIG interface definion for Gallium types. + * + * @author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + +%module gallium; + +%ignore winsys; +%ignore pipe_vertex_buffer::buffer; + +%include "pipe/p_state.h"; + + +%array_class(struct pipe_stencil_state, StencilArray); + + +%extend pipe_framebuffer_state { + + pipe_framebuffer_state(void) { + return CALLOC_STRUCT(pipe_framebuffer_state); + } + + ~pipe_framebuffer_state() { + unsigned index; + for(index = 0; index < PIPE_MAX_COLOR_BUFS; ++index) + pipe_surface_reference(&$self->cbufs[index], NULL); + pipe_surface_reference(&$self->zsbuf, NULL); + FREE($self); + } + + void + set_cbuf(unsigned index, struct pipe_surface *surface) { + pipe_surface_reference(&$self->cbufs[index], surface); + } + + void + set_zsbuf(struct pipe_surface *surface) { + pipe_surface_reference(&$self->zsbuf, surface); + } + +}; + + +%extend pipe_shader_state { + + pipe_shader_state(const char *text, unsigned num_tokens = 1024) { + struct tgsi_token *tokens; + struct pipe_shader_state *shader; + + tokens = MALLOC(num_tokens * sizeof(struct tgsi_token)); + if(!tokens) + goto error1; + + if(tgsi_text_translate(text, tokens, num_tokens ) != TRUE) + goto error2; + + shader = CALLOC_STRUCT(pipe_shader_state); + if(!shader) + goto error3; + + shader->tokens = tokens; + + return shader; + +error3: +error2: + FREE(tokens); +error1: + return NULL; + } + + ~pipe_shader_state() { + FREE((void*)$self->tokens); + FREE($self); + } + + void dump(unsigned flags = 0) { + tgsi_dump($self->tokens, flags); + } +} diff --git a/src/gallium/state_trackers/python/p_texture.i b/src/gallium/state_trackers/python/p_texture.i new file mode 100644 index 0000000000..08ba0ebe4d --- /dev/null +++ b/src/gallium/state_trackers/python/p_texture.i @@ -0,0 +1,232 @@ + /************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * @file + * SWIG interface definion for Gallium types. + * + * @author Jose Fonseca <jrfonseca@tungstengraphics.com> + */ + + +%nodefaultctor pipe_texture; +%nodefaultctor pipe_surface; +%nodefaultctor st_buffer; + +%nodefaultdtor pipe_texture; +%nodefaultdtor pipe_surface; +%nodefaultdtor st_buffer; + +%ignore pipe_texture::screen; + +%ignore pipe_surface::winsys; +%immutable pipe_surface::texture; +%immutable pipe_surface::buffer; + +%newobject pipe_texture::get_surface; + + +%extend pipe_texture { + + ~pipe_texture() { + struct pipe_texture *ptr = $self; + pipe_texture_reference(&ptr, NULL); + } + + unsigned get_width(unsigned level=0) { + return $self->width[level]; + } + + unsigned get_height(unsigned level=0) { + return $self->height[level]; + } + + unsigned get_depth(unsigned level=0) { + return $self->depth[level]; + } + + unsigned get_nblocksx(unsigned level=0) { + return $self->nblocksx[level]; + } + + unsigned get_nblocksy(unsigned level=0) { + return $self->nblocksy[level]; + } + + /** Get a surface which is a "view" into a texture */ + struct pipe_surface * + get_surface(unsigned face=0, unsigned level=0, unsigned zslice=0, unsigned usage=0 ) + { + struct pipe_screen *screen = $self->screen; + return screen->get_tex_surface(screen, $self, face, level, zslice, usage); + } + +}; + + +%extend pipe_surface { + + ~pipe_surface() { + struct pipe_surface *ptr = $self; + pipe_surface_reference(&ptr, NULL); + } + + // gets mapped to pipe_surface_map automatically + void * map( unsigned flags ); + + // gets mapped to pipe_surface_unmap automatically + void unmap( void ); + + void + get_tile_raw(unsigned x, unsigned y, unsigned w, unsigned h, char *raw, unsigned stride) { + pipe_get_tile_raw($self, x, y, w, h, raw, stride); + } + + void + put_tile_raw(unsigned x, unsigned y, unsigned w, unsigned h, const char *raw, unsigned stride) { + pipe_put_tile_raw($self, x, y, w, h, raw, stride); + } + + void + get_tile_rgba(unsigned x, unsigned y, unsigned w, unsigned h, float *rgba) { + pipe_get_tile_rgba($self, x, y, w, h, rgba); + } + + void + put_tile_rgba(unsigned x, unsigned y, unsigned w, unsigned h, const float *rgba) { + pipe_put_tile_rgba($self, x, y, w, h, rgba); + } + + void + get_tile_z(unsigned x, unsigned y, unsigned w, unsigned h, unsigned *z) { + pipe_get_tile_z($self, x, 
y, w, h, z); + } + + void + put_tile_z(unsigned x, unsigned y, unsigned w, unsigned h, const unsigned *z) { + pipe_put_tile_z($self, x, y, w, h, z); + } + + void + sample_rgba(float *rgba) { + st_sample_surface($self, rgba); + } + + unsigned + compare_tile_rgba(unsigned x, unsigned y, unsigned w, unsigned h, const float *rgba, float tol = 0.0) + { + float *rgba2; + const float *p1; + const float *p2; + unsigned i, j, n; + + rgba2 = MALLOC(h*w*4*sizeof(float)); + if(!rgba2) + return ~0; + + pipe_get_tile_rgba($self, x, y, w, h, rgba2); + + p1 = rgba; + p2 = rgba2; + n = 0; + for(i = h*w; i; --i) { + unsigned differs = 0; + for(j = 4; j; --j) { + float delta = *p2++ - *p1++; + if (delta < -tol || delta > tol) + differs = 1; + } + n += differs; + } + + FREE(rgba2); + + return n; + } + +}; + +struct st_buffer { +}; + +%extend st_buffer { + + ~st_buffer() { + st_buffer_destroy($self); + } + + unsigned __len__(void) + { + assert($self->buffer->refcount); + return $self->buffer->size; + } + + %cstring_output_allocate_size(char **STRING, int *LENGTH, free(*$1)); + void read(char **STRING, int *LENGTH) + { + struct pipe_screen *screen = $self->st_dev->screen; + const char *map; + + assert($self->buffer->refcount); + + *LENGTH = $self->buffer->size; + *STRING = (char *) malloc($self->buffer->size); + if(!*STRING) + return; + + map = pipe_buffer_map(screen, $self->buffer, PIPE_BUFFER_USAGE_CPU_READ); + if(map) { + memcpy(*STRING, map, $self->buffer->size); + pipe_buffer_unmap(screen, $self->buffer); + } + } + + %cstring_input_binary(const char *STRING, unsigned LENGTH); + void write(const char *STRING, unsigned LENGTH, unsigned offset = 0) + { + struct pipe_screen *screen = $self->st_dev->screen; + char *map; + + assert($self->buffer->refcount); + + if(offset > $self->buffer->size) { + PyErr_SetString(PyExc_ValueError, "offset must be smaller than buffer size"); + return; + } + + if(offset + LENGTH > $self->buffer->size) { + PyErr_SetString(PyExc_ValueError, "data length must fit inside the buffer"); + return; + } + + map = pipe_buffer_map(screen, $self->buffer, PIPE_BUFFER_USAGE_CPU_WRITE); + if(map) { + memcpy(map + offset, STRING, LENGTH); + pipe_buffer_unmap(screen, $self->buffer); + } + } +}; diff --git a/src/gallium/state_trackers/python/retrace/README b/src/gallium/state_trackers/python/retrace/README new file mode 100644 index 0000000000..822cd11404 --- /dev/null +++ b/src/gallium/state_trackers/python/retrace/README @@ -0,0 +1,17 @@ +This is an application written in python to replay the traces captured by the + trace pipe driver. + + +To use it follow the instructions in src/gallium/drivers/trace/README and +src/gallium/state_trackers/python/README, and then do + + python src/gallium/state_trackers/python/samples/retrace/interpreter.py filename.trace + + +This is still work in progress: +- not everything is captured/replayed + - surface/textures contents +- any tiny error will result in a crash + +-- +Jose Fonseca <jrfonseca@tungstengraphics.com> diff --git a/src/gallium/state_trackers/python/retrace/format.py b/src/gallium/state_trackers/python/retrace/format.py new file mode 100755 index 0000000000..0bf6baf0b9 --- /dev/null +++ b/src/gallium/state_trackers/python/retrace/format.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +########################################################################## +# +# Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. +# All Rights Reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sub license, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice (including the +# next paragraph) shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +# IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR +# ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +########################################################################## + + +class Formatter: + '''Plain formatter''' + + def __init__(self, stream): + self.stream = stream + + def text(self, text): + self.stream.write(text) + + def newline(self): + self.text('\n') + + def function(self, name): + self.text(name) + + def variable(self, name): + self.text(name) + + def literal(self, value): + self.text(str(value)) + + def address(self, addr): + self.text(str(addr)) + + +class AnsiFormatter(Formatter): + '''Formatter for plain-text files which outputs ANSI escape codes. See + http://en.wikipedia.org/wiki/ANSI_escape_code for more information + concerning ANSI escape codes. + ''' + + _csi = '\33[' + + _normal = '0m' + _bold = '1m' + _italic = '3m' + _red = '31m' + _green = '32m' + _blue = '34m' + + def _escape(self, code): + self.text(self._csi + code) + + def function(self, name): + self._escape(self._bold) + Formatter.function(self, name) + self._escape(self._normal) + + def variable(self, name): + self._escape(self._italic) + Formatter.variable(self, name) + self._escape(self._normal) + + def literal(self, value): + self._escape(self._blue) + Formatter.literal(self, value) + self._escape(self._normal) + + def address(self, value): + self._escape(self._green) + Formatter.address(self, value) + self._escape(self._normal) + + + +def DefaultFormatter(stream): + if stream.isatty(): + return AnsiFormatter(stream) + else: + return Formatter(stream) + diff --git a/src/gallium/state_trackers/python/retrace/interpreter.py b/src/gallium/state_trackers/python/retrace/interpreter.py new file mode 100755 index 0000000000..f418f80d7b --- /dev/null +++ b/src/gallium/state_trackers/python/retrace/interpreter.py @@ -0,0 +1,578 @@ +#!/usr/bin/env python +########################################################################## +# +# Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. +# All Rights Reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sub license, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice (including the +# next paragraph) shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +# IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR +# ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +########################################################################## + + +import sys +import struct + +import gallium +import model +import parser + + +def make_image(surface): + pixels = gallium.FloatArray(surface.height*surface.width*4) + surface.get_tile_rgba(0, 0, surface.width, surface.height, pixels) + + import Image + outimage = Image.new( + mode='RGB', + size=(surface.width, surface.height), + color=(0,0,0)) + outpixels = outimage.load() + for y in range(0, surface.height): + for x in range(0, surface.width): + offset = (y*surface.width + x)*4 + r, g, b, a = [int(pixels[offset + ch]*255) for ch in range(4)] + outpixels[x, y] = r, g, b + return outimage + +def save_image(filename, surface): + outimage = make_image(surface) + outimage.save(filename, "PNG") + +def show_image(surface): + outimage = make_image(surface) + + import Tkinter as tk + from PIL import Image, ImageTk + root = tk.Tk() + + root.title('background image') + + image1 = ImageTk.PhotoImage(outimage) + w = image1.width() + h = image1.height() + x = 100 + y = 100 + root.geometry("%dx%d+%d+%d" % (w, h, x, y)) + panel1 = tk.Label(root, image=image1) + panel1.pack(side='top', fill='both', expand='yes') + panel1.image = image1 + root.mainloop() + + + + +class Struct: + """C-like struct""" + + # A basic Python class can pass as a C-like structure + pass + + +struct_factories = { + "pipe_blend_color": gallium.BlendColor, + "pipe_blend_state": gallium.Blend, + #"pipe_clip_state": gallium.Clip, + #"pipe_constant_buffer": gallium.ConstantBuffer, + "pipe_depth_state": gallium.Depth, + "pipe_stencil_state": gallium.Stencil, + "pipe_alpha_state": gallium.Alpha, + "pipe_depth_stencil_alpha_state": gallium.DepthStencilAlpha, + "pipe_format_block": gallium.FormatBlock, + #"pipe_framebuffer_state": gallium.Framebuffer, + "pipe_poly_stipple": gallium.PolyStipple, + "pipe_rasterizer_state": gallium.Rasterizer, + "pipe_sampler_state": gallium.Sampler, + "pipe_scissor_state": gallium.Scissor, + #"pipe_shader_state": gallium.Shader, + #"pipe_vertex_buffer": gallium.VertexBuffer, + "pipe_vertex_element": gallium.VertexElement, + "pipe_viewport_state": gallium.Viewport, + #"pipe_texture": gallium.Texture, +} + + +member_array_factories = { + "pipe_rasterizer_state": {"sprite_coord_mode": gallium.ByteArray}, + "pipe_poly_stipple": {"stipple": gallium.UnsignedArray}, + "pipe_viewport_state": {"scale": gallium.FloatArray, "translate": 
gallium.FloatArray}, + #"pipe_clip_state": {"ucp": gallium.FloatArray}, + "pipe_depth_stencil_alpha_state": {"stencil": gallium.StencilArray}, + "pipe_blend_color": {"color": gallium.FloatArray}, + "pipe_sampler_state": {"border_color": gallium.FloatArray}, +} + + +class Translator(model.Visitor): + """Translate model arguments into regular Python objects""" + + def __init__(self, interpreter): + self.interpreter = interpreter + self.result = None + + def visit(self, node): + self.result = None + node.visit(self) + return self.result + + def visit_literal(self, node): + self.result = node.value + + def visit_named_constant(self, node): + # lookup the named constant in the gallium module + self.result = getattr(gallium, node.name) + + def visit_array(self, node): + array = [] + for element in node.elements: + array.append(self.visit(element)) + self.result = array + + def visit_struct(self, node): + struct_factory = struct_factories.get(node.name, Struct) + struct = struct_factory() + for member_name, member_node in node.members: + member_value = self.visit(member_node) + try: + array_factory = member_array_factories[node.name][member_name] + except KeyError: + pass + else: + assert isinstance(member_value, list) + array = array_factory(len(member_value)) + for i in range(len(member_value)): + array[i] = member_value[i] + member_value = array + #print node.name, member_name, member_value + assert isinstance(struct, Struct) or hasattr(struct, member_name) + setattr(struct, member_name, member_value) + self.result = struct + + def visit_pointer(self, node): + self.result = self.interpreter.lookup_object(node.address) + + +class Object: + + def __init__(self, interpreter, real): + self.interpreter = interpreter + self.real = real + + +class Global(Object): + + def __init__(self, interpreter, real): + self.interpreter = interpreter + self.real = real + + def pipe_winsys_create(self): + return Winsys(self.interpreter, gallium.Device()) + + def pipe_screen_create(self, winsys): + return Screen(self.interpreter, winsys.real) + + def pipe_context_create(self, screen): + context = screen.real.context_create() + return Context(self.interpreter, context) + + +class Winsys(Object): + + def __init__(self, interpreter, real): + self.interpreter = interpreter + self.real = real + + def get_name(self): + pass + + def user_buffer_create(self, data, size): + # We don't really care to distinguish between user and regular buffers + buffer = self.real.buffer_create(size, + 4, + gallium.PIPE_BUFFER_USAGE_CPU_READ | + gallium.PIPE_BUFFER_USAGE_CPU_WRITE ) + assert size == len(data) + buffer.write(data) + return buffer + + def buffer_create(self, alignment, usage, size): + return self.real.buffer_create(size, alignment, usage) + + def buffer_destroy(self, buffer): + pass + + def buffer_write(self, buffer, data, size): + assert size == len(data) + buffer.write(data) + + def fence_finish(self, fence, flags): + pass + + def fence_reference(self, dst, src): + pass + + def flush_frontbuffer(self, surface): + pass + + def surface_alloc(self): + return None + + def surface_release(self, surface): + pass + + +class Screen(Object): + + def destroy(self): + pass + + def get_name(self): + pass + + def get_vendor(self): + pass + + def get_param(self, param): + pass + + def get_paramf(self, param): + pass + + def is_format_supported(self, format, target, tex_usage, geom_flags): + return self.real.is_format_supported(format, target, tex_usage, geom_flags) + + def texture_create(self, template): + return 
self.real.texture_create( + format = template.format, + width = template.width[0], + height = template.height[0], + depth = template.depth[0], + last_level = template.last_level, + target = template.target, + tex_usage = template.tex_usage, + ) + + def texture_destroy(self, texture): + self.interpreter.unregister_object(texture) + + def texture_release(self, surface): + pass + + def get_tex_surface(self, texture, face, level, zslice, usage): + return texture.get_surface(face, level, zslice, usage) + + def tex_surface_destroy(self, surface): + self.interpreter.unregister_object(surface) + + def tex_surface_release(self, surface): + pass + + def surface_write(self, surface, data, stride, size): + assert surface.nblocksy * stride == size + surface.put_tile_raw(0, 0, surface.width, surface.height, data, stride) + + +class Context(Object): + + def __init__(self, interpreter, real): + Object.__init__(self, interpreter, real) + self.cbufs = [] + self.zsbuf = None + self.vbufs = [] + self.velems = [] + + def destroy(self): + pass + + def create_blend_state(self, state): + return state + + def bind_blend_state(self, state): + if state is not None: + self.real.set_blend(state) + + def delete_blend_state(self, state): + pass + + def create_sampler_state(self, state): + return state + + def delete_sampler_state(self, state): + pass + + def bind_sampler_states(self, n, states): + for i in range(n): + self.real.set_sampler(i, states[i]) + + def create_rasterizer_state(self, state): + return state + + def bind_rasterizer_state(self, state): + if state is not None: + self.real.set_rasterizer(state) + + def delete_rasterizer_state(self, state): + pass + + def create_depth_stencil_alpha_state(self, state): + return state + + def bind_depth_stencil_alpha_state(self, state): + if state is not None: + self.real.set_depth_stencil_alpha(state) + + def delete_depth_stencil_alpha_state(self, state): + pass + + def create_fs_state(self, state): + tokens = str(state.tokens) + shader = gallium.Shader(tokens) + return shader + + create_vs_state = create_fs_state + + def bind_fs_state(self, state): + self.real.set_fragment_shader(state) + + def bind_vs_state(self, state): + self.real.set_vertex_shader(state) + + def delete_fs_state(self, state): + pass + + delete_vs_state = delete_fs_state + + def set_blend_color(self, state): + self.real.set_blend_color(state) + + def set_clip_state(self, state): + _state = gallium.Clip() + _state.nr = state.nr + if state.nr: + # FIXME + ucp = gallium.FloatArray(gallium.PIPE_MAX_CLIP_PLANES*4) + for i in range(len(state.ucp)): + for j in range(len(state.ucp[i])): + ucp[i*4 + j] = state.ucp[i][j] + _state.ucp = ucp + self.real.set_clip(_state) + + def set_constant_buffer(self, shader, index, state): + if state is not None: + self.real.set_constant_buffer(shader, index, state.buffer) + + if 1: + data = state.buffer.read() + format = '4f' + index = 0 + for offset in range(0, len(data), struct.calcsize(format)): + x, y, z, w = struct.unpack_from(format, data, offset) + sys.stdout.write('\tCONST[%2u] = {%10.4f, %10.4f, %10.4f, %10.4f}\n' % (index, x, y, z, w)) + index += 1 + + def set_framebuffer_state(self, state): + _state = gallium.Framebuffer() + _state.width = state.width + _state.height = state.height + _state.num_cbufs = state.num_cbufs + for i in range(len(state.cbufs)): + _state.set_cbuf(i, state.cbufs[i]) + _state.set_zsbuf(state.zsbuf) + self.real.set_framebuffer(_state) + + self.cbufs = state.cbufs + self.zsbuf = state.zsbuf + + def set_polygon_stipple(self, state): + 
self.real.set_polygon_stipple(state) + + def set_scissor_state(self, state): + self.real.set_scissor(state) + + def set_viewport_state(self, state): + self.real.set_viewport(state) + + def set_sampler_textures(self, n, textures): + for i in range(n): + self.real.set_sampler_texture(i, textures[i]) + + def set_vertex_buffers(self, n, vbufs): + self.vbufs = vbufs[0:n] + for i in range(n): + vbuf = vbufs[i] + self.real.set_vertex_buffer( + i, + pitch = vbuf.pitch, + max_index = vbuf.max_index, + buffer_offset = vbuf.buffer_offset, + buffer = vbuf.buffer, + ) + + def set_vertex_elements(self, n, elements): + self.velems = elements[0:n] + for i in range(n): + self.real.set_vertex_element(i, elements[i]) + self.real.set_vertex_elements(n) + + def set_edgeflags(self, bitfield): + # FIXME + pass + + def dump_vertices(self, start, count): + for index in range(start, start + count): + if index >= start + 16: + sys.stdout.write('\t...\n') + break + sys.stdout.write('\t{\n') + for velem in self.velems: + vbuf = self.vbufs[velem.vertex_buffer_index] + + offset = vbuf.buffer_offset + velem.src_offset + vbuf.pitch*index + format = { + gallium.PIPE_FORMAT_R32_FLOAT: 'f', + gallium.PIPE_FORMAT_R32G32_FLOAT: '2f', + gallium.PIPE_FORMAT_R32G32B32_FLOAT: '3f', + gallium.PIPE_FORMAT_R32G32B32A32_FLOAT: '4f', + gallium.PIPE_FORMAT_B8G8R8A8_UNORM: '4B', + }[velem.src_format] + + data = vbuf.buffer.read() + values = struct.unpack_from(format, data, offset) + sys.stdout.write('\t\t{' + ', '.join(map(str, values)) + '},\n') + assert len(values) == velem.nr_components + sys.stdout.write('\t},\n') + + def dump_indices(self, ibuf, isize, start, count): + format = { + 1: 'B', + 2: 'H', + 4: 'I', + }[isize] + + assert struct.calcsize(format) == isize + + data = ibuf.read() + maxindex, minindex = 0, 0xffffffff + + sys.stdout.write('\t{\n') + for i in range(start, start + count): + if i >= start + 16: + sys.stdout.write('\t...\n') + break + offset = i*isize + index, = struct.unpack_from(format, data, offset) + sys.stdout.write('\t\t%u,\n' % index) + minindex = min(minindex, index) + maxindex = max(maxindex, index) + sys.stdout.write('\t},\n') + + return minindex, maxindex + + def draw_arrays(self, mode, start, count): + self.dump_vertices(start, count) + + self.real.draw_arrays(mode, start, count) + + def draw_elements(self, indexBuffer, indexSize, mode, start, count): + minindex, maxindex = self.dump_indices(indexBuffer, indexSize, start, count) + self.dump_vertices(minindex, maxindex - minindex) + + self.real.draw_elements(indexBuffer, indexSize, mode, start, count) + + def draw_range_elements(self, indexBuffer, indexSize, minIndex, maxIndex, mode, start, count): + minindex, maxindex = self.dump_indices(indexBuffer, indexSize, start, count) + minindex = min(minindex, minIndex) + maxindex = min(maxindex, maxIndex) + self.dump_vertices(minindex, maxindex - minindex) + + self.real.draw_range_elements(indexBuffer, indexSize, minIndex, maxIndex, mode, start, count) + + def flush(self, flags): + self.real.flush(flags) + if flags & gallium.PIPE_FLUSH_FRAME: + self._update() + return None + + def clear(self, surface, value): + self.real.surface_clear(surface, value) + + def _update(self): + self.real.flush() + + if self.cbufs and self.cbufs[0]: + show_image(self.cbufs[0]) + + +class Interpreter(parser.TraceDumper): + + ignore_calls = set(( + ('pipe_screen', 'is_format_supported'), + ('pipe_screen', 'get_param'), + ('pipe_screen', 'get_paramf'), + )) + + def __init__(self, stream): + parser.TraceDumper.__init__(self, stream) + 
self.objects = {} + self.result = None + self.globl = Global(self, None) + + def register_object(self, address, object): + self.objects[address] = object + + def unregister_object(self, object): + # FIXME: + pass + + def lookup_object(self, address): + return self.objects[address] + + def interpret(self, trace): + for call in trace.calls: + self.interpret_call(call) + + def handle_call(self, call): + + if (call.klass, call.method) in self.ignore_calls: + return + + parser.TraceDumper.handle_call(self, call) + + args = [self.interpret_arg(arg) for name, arg in call.args] + + if call.klass: + obj = args[0] + args = args[1:] + else: + obj = self.globl + + method = getattr(obj, call.method) + ret = method(*args) + + if call.ret and isinstance(call.ret, model.Pointer): + self.register_object(call.ret.address, ret) + + def interpret_arg(self, node): + translator = Translator(self) + return translator.visit(node) + + +if __name__ == '__main__': + parser.main(Interpreter) diff --git a/src/gallium/state_trackers/python/retrace/model.py b/src/gallium/state_trackers/python/retrace/model.py new file mode 100755 index 0000000000..ae0f4327d7 --- /dev/null +++ b/src/gallium/state_trackers/python/retrace/model.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python +########################################################################## +# +# Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. +# All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sub license, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice (including the +# next paragraph) shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +# IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR +# ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+# +########################################################################## + + +'''Trace data model.''' + + +import sys +import string +import format + +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + + +class Node: + + def visit(self, visitor): + raise NotImplementedError + + def __str__(self): + stream = StringIO() + formatter = format.DefaultFormatter(stream) + pretty_printer = PrettyPrinter(formatter) + self.visit(pretty_printer) + return stream.getvalue() + + +class Literal(Node): + + def __init__(self, value): + self.value = value + + def visit(self, visitor): + visitor.visit_literal(self) + + +class NamedConstant(Node): + + def __init__(self, name): + self.name = name + + def visit(self, visitor): + visitor.visit_named_constant(self) + + +class Array(Node): + + def __init__(self, elements): + self.elements = elements + + def visit(self, visitor): + visitor.visit_array(self) + + +class Struct(Node): + + def __init__(self, name, members): + self.name = name + self.members = members + + def visit(self, visitor): + visitor.visit_struct(self) + + +class Pointer(Node): + + def __init__(self, address): + self.address = address + + def visit(self, visitor): + visitor.visit_pointer(self) + + +class Call: + + def __init__(self, klass, method, args, ret): + self.klass = klass + self.method = method + self.args = args + self.ret = ret + + def visit(self, visitor): + visitor.visit_call(self) + + +class Trace: + + def __init__(self, calls): + self.calls = calls + + def visit(self, visitor): + visitor.visit_trace(self) + + +class Visitor: + + def visit_literal(self, node): + raise NotImplementedError + + def visit_named_constant(self, node): + raise NotImplementedError + + def visit_array(self, node): + raise NotImplementedError + + def visit_struct(self, node): + raise NotImplementedError + + def visit_pointer(self, node): + raise NotImplementedError + + def visit_call(self, node): + raise NotImplementedError + + def visit_trace(self, node): + raise NotImplementedError + + +class PrettyPrinter: + + def __init__(self, formatter): + self.formatter = formatter + + def visit_literal(self, node): + if isinstance(node.value, basestring): + if len(node.value) >= 4096 or node.value.strip(string.printable): + self.formatter.text('...') + return + + self.formatter.literal('"' + node.value + '"') + return + + self.formatter.literal(repr(node.value)) + + def visit_named_constant(self, node): + self.formatter.literal(node.name) + + def visit_array(self, node): + self.formatter.text('{') + sep = '' + for value in node.elements: + self.formatter.text(sep) + value.visit(self) + sep = ', ' + self.formatter.text('}') + + def visit_struct(self, node): + self.formatter.text('{') + sep = '' + for name, value in node.members: + self.formatter.text(sep) + self.formatter.variable(name) + self.formatter.text(' = ') + value.visit(self) + sep = ', ' + self.formatter.text('}') + + def visit_pointer(self, node): + self.formatter.address(node.address) + + def visit_call(self, node): + if node.klass is not None: + self.formatter.function(node.klass + '::' + node.method) + else: + self.formatter.function(node.method) + self.formatter.text('(') + sep = '' + for name, value in node.args: + self.formatter.text(sep) + self.formatter.variable(name) + self.formatter.text(' = ') + value.visit(self) + sep = ', ' + self.formatter.text(')') + if node.ret is not None: + self.formatter.text(' = ') + node.ret.visit(self) + + def visit_trace(self, node): + for call in node.calls: + 
call.visit(self) + self.formatter.newline() + diff --git a/src/gallium/state_trackers/python/retrace/parser.py b/src/gallium/state_trackers/python/retrace/parser.py new file mode 100755 index 0000000000..5205f2d8dd --- /dev/null +++ b/src/gallium/state_trackers/python/retrace/parser.py @@ -0,0 +1,357 @@ +#!/usr/bin/env python +########################################################################## +# +# Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. +# All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sub license, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice (including the +# next paragraph) shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +# IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR +# ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +########################################################################## + + +import sys +import xml.parsers.expat +import binascii + +from model import * + + +ELEMENT_START, ELEMENT_END, CHARACTER_DATA, EOF = range(4) + + +class XmlToken: + + def __init__(self, type, name_or_data, attrs = None, line = None, column = None): + assert type in (ELEMENT_START, ELEMENT_END, CHARACTER_DATA, EOF) + self.type = type + self.name_or_data = name_or_data + self.attrs = attrs + self.line = line + self.column = column + + def __str__(self): + if self.type == ELEMENT_START: + return '<' + self.name_or_data + ' ...>' + if self.type == ELEMENT_END: + return '</' + self.name_or_data + '>' + if self.type == CHARACTER_DATA: + return self.name_or_data + if self.type == EOF: + return 'end of file' + assert 0 + + +class XmlTokenizer: + """Expat based XML tokenizer.""" + + def __init__(self, fp, skip_ws = True): + self.fp = fp + self.tokens = [] + self.index = 0 + self.final = False + self.skip_ws = skip_ws + + self.character_pos = 0, 0 + self.character_data = '' + + self.parser = xml.parsers.expat.ParserCreate() + self.parser.StartElementHandler = self.handle_element_start + self.parser.EndElementHandler = self.handle_element_end + self.parser.CharacterDataHandler = self.handle_character_data + + def handle_element_start(self, name, attributes): + self.finish_character_data() + line, column = self.pos() + token = XmlToken(ELEMENT_START, name, attributes, line, column) + self.tokens.append(token) + + def handle_element_end(self, name): + self.finish_character_data() + line, column = self.pos() + token = XmlToken(ELEMENT_END, name, None, line, column) + self.tokens.append(token) + + def handle_character_data(self, data): + if not self.character_data: + self.character_pos = self.pos() + self.character_data += data + + def finish_character_data(self): + if self.character_data: + if not self.skip_ws or not 
self.character_data.isspace(): + line, column = self.character_pos + token = XmlToken(CHARACTER_DATA, self.character_data, None, line, column) + self.tokens.append(token) + self.character_data = '' + + def next(self): + size = 16*1024 + while self.index >= len(self.tokens) and not self.final: + self.tokens = [] + self.index = 0 + data = self.fp.read(size) + self.final = len(data) < size + data = data.rstrip('\0') + try: + self.parser.Parse(data, self.final) + except xml.parsers.expat.ExpatError, e: + #if e.code == xml.parsers.expat.errors.XML_ERROR_NO_ELEMENTS: + if e.code == 3: + pass + else: + raise e + if self.index >= len(self.tokens): + line, column = self.pos() + token = XmlToken(EOF, None, None, line, column) + else: + token = self.tokens[self.index] + self.index += 1 + return token + + def pos(self): + return self.parser.CurrentLineNumber, self.parser.CurrentColumnNumber + + +class TokenMismatch(Exception): + + def __init__(self, expected, found): + self.expected = expected + self.found = found + + def __str__(self): + return '%u:%u: %s expected, %s found' % (self.found.line, self.found.column, str(self.expected), str(self.found)) + + + +class XmlParser: + """Base XML document parser.""" + + def __init__(self, fp): + self.tokenizer = XmlTokenizer(fp) + self.consume() + + def consume(self): + self.token = self.tokenizer.next() + + def match_element_start(self, name): + return self.token.type == ELEMENT_START and self.token.name_or_data == name + + def match_element_end(self, name): + return self.token.type == ELEMENT_END and self.token.name_or_data == name + + def element_start(self, name): + while self.token.type == CHARACTER_DATA: + self.consume() + if self.token.type != ELEMENT_START: + raise TokenMismatch(XmlToken(ELEMENT_START, name), self.token) + if self.token.name_or_data != name: + raise TokenMismatch(XmlToken(ELEMENT_START, name), self.token) + attrs = self.token.attrs + self.consume() + return attrs + + def element_end(self, name): + while self.token.type == CHARACTER_DATA: + self.consume() + if self.token.type != ELEMENT_END: + raise TokenMismatch(XmlToken(ELEMENT_END, name), self.token) + if self.token.name_or_data != name: + raise TokenMismatch(XmlToken(ELEMENT_END, name), self.token) + self.consume() + + def character_data(self, strip = True): + data = '' + while self.token.type == CHARACTER_DATA: + data += self.token.name_or_data + self.consume() + if strip: + data = data.strip() + return data + + +class TraceParser(XmlParser): + + def parse(self): + self.element_start('trace') + while self.token.type not in (ELEMENT_END, EOF): + call = self.parse_call() + self.handle_call(call) + if self.token.type != EOF: + self.element_end('trace') + + def parse_call(self): + attrs = self.element_start('call') + klass = attrs['class'] + method = attrs['method'] + args = [] + ret = None + while self.token.type == ELEMENT_START: + if self.token.name_or_data == 'arg': + arg = self.parse_arg() + args.append(arg) + elif self.token.name_or_data == 'ret': + ret = self.parse_ret() + elif self.token.name_or_data == 'call': + # ignore nested function calls + self.parse_call() + else: + raise TokenMismatch("<arg ...> or <ret ...>", self.token) + self.element_end('call') + + return Call(klass, method, args, ret) + + def parse_arg(self): + attrs = self.element_start('arg') + name = attrs['name'] + value = self.parse_value() + self.element_end('arg') + + return name, value + + def parse_ret(self): + attrs = self.element_start('ret') + value = self.parse_value() + self.element_end('ret') + + 
return value + + def parse_value(self): + expected_tokens = ('null', 'bool', 'int', 'uint', 'float', 'string', 'enum', 'array', 'struct', 'ptr', 'bytes') + if self.token.type == ELEMENT_START: + if self.token.name_or_data in expected_tokens: + method = getattr(self, 'parse_' + self.token.name_or_data) + return method() + raise TokenMismatch(" or " .join(expected_tokens), self.token) + + def parse_null(self): + self.element_start('null') + self.element_end('null') + return Literal(None) + + def parse_bool(self): + self.element_start('bool') + value = int(self.character_data()) + self.element_end('bool') + return Literal(value) + + def parse_int(self): + self.element_start('int') + value = int(self.character_data()) + self.element_end('int') + return Literal(value) + + def parse_uint(self): + self.element_start('uint') + value = int(self.character_data()) + self.element_end('uint') + return Literal(value) + + def parse_float(self): + self.element_start('float') + value = float(self.character_data()) + self.element_end('float') + return Literal(value) + + def parse_enum(self): + self.element_start('enum') + name = self.character_data() + self.element_end('enum') + return NamedConstant(name) + + def parse_string(self): + self.element_start('string') + value = self.character_data() + self.element_end('string') + return Literal(value) + + def parse_bytes(self): + self.element_start('bytes') + value = binascii.a2b_hex(self.character_data()) + self.element_end('bytes') + return Literal(value) + + def parse_array(self): + self.element_start('array') + elems = [] + while self.token.type != ELEMENT_END: + elems.append(self.parse_elem()) + self.element_end('array') + return Array(elems) + + def parse_elem(self): + self.element_start('elem') + value = self.parse_value() + self.element_end('elem') + return value + + def parse_struct(self): + attrs = self.element_start('struct') + name = attrs['name'] + members = [] + while self.token.type != ELEMENT_END: + members.append(self.parse_member()) + self.element_end('struct') + return Struct(name, members) + + def parse_member(self): + attrs = self.element_start('member') + name = attrs['name'] + value = self.parse_value() + self.element_end('member') + + return name, value + + def parse_ptr(self): + self.element_start('ptr') + address = self.character_data() + self.element_end('ptr') + + return Pointer(address) + + def handle_call(self, call): + pass + + +class TraceDumper(TraceParser): + + def __init__(self, fp): + TraceParser.__init__(self, fp) + self.formatter = format.DefaultFormatter(sys.stdout) + self.pretty_printer = PrettyPrinter(self.formatter) + + def handle_call(self, call): + call.visit(self.pretty_printer) + self.formatter.newline() + + +def main(ParserFactory): + for arg in sys.argv[1:]: + if arg.endswith('.gz'): + import gzip + stream = gzip.GzipFile(arg, 'rt') + else: + stream = open(arg, 'rt') + parser = ParserFactory(stream) + parser.parse() + + +if __name__ == '__main__': + main(TraceDumper) diff --git a/src/gallium/state_trackers/python/samples/tri.py b/src/gallium/state_trackers/python/samples/tri.py new file mode 100644 index 0000000000..193479f7d6 --- /dev/null +++ b/src/gallium/state_trackers/python/samples/tri.py @@ -0,0 +1,229 @@ +#!/usr/bin/env python +########################################################################## +# +# Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. +# All Rights Reserved. 
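TraceParser.handle_call() is deliberately a no-op hook: TraceDumper overrides it to pretty-print each call and the retrace Interpreter overrides it to replay. Any other per-call analysis can plug in the same way; a hypothetical sketch (CallCounter and the 'tri.trace' file name are made up for illustration):

import parser

class CallCounter(parser.TraceParser):
    """Counts how often each method appears in a trace."""

    def __init__(self, fp):
        parser.TraceParser.__init__(self, fp)
        self.counts = {}

    def handle_call(self, call):
        if call.klass:
            name = call.klass + '::' + call.method
        else:
            name = call.method
        self.counts[name] = self.counts.get(name, 0) + 1

counter = CallCounter(open('tri.trace', 'rt'))
counter.parse()
print counter.counts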
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sub license, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice (including the +# next paragraph) shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +# IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR +# ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +########################################################################## + + +from gallium import * + + +def make_image(surface): + pixels = FloatArray(surface.height*surface.width*4) + surface.get_tile_rgba(0, 0, surface.width, surface.height, pixels) + + import Image + outimage = Image.new( + mode='RGB', + size=(surface.width, surface.height), + color=(0,0,0)) + outpixels = outimage.load() + for y in range(0, surface.height): + for x in range(0, surface.width): + offset = (y*surface.width + x)*4 + r, g, b, a = [int(pixels[offset + ch]*255) for ch in range(4)] + outpixels[x, y] = r, g, b + return outimage + +def save_image(filename, surface): + outimage = make_image(surface) + outimage.save(filename, "PNG") + +def show_image(surface): + outimage = make_image(surface) + + import Tkinter as tk + from PIL import Image, ImageTk + root = tk.Tk() + + root.title('background image') + + image1 = ImageTk.PhotoImage(outimage) + w = image1.width() + h = image1.height() + x = 100 + y = 100 + root.geometry("%dx%d+%d+%d" % (w, h, x, y)) + panel1 = tk.Label(root, image=image1) + panel1.pack(side='top', fill='both', expand='yes') + panel1.image = image1 + root.mainloop() + + +def test(dev): + ctx = dev.context_create() + + width = 255 + height = 255 + + # disabled blending/masking + blend = Blend() + blend.rgb_src_factor = PIPE_BLENDFACTOR_ONE + blend.alpha_src_factor = PIPE_BLENDFACTOR_ONE + blend.rgb_dst_factor = PIPE_BLENDFACTOR_ZERO + blend.alpha_dst_factor = PIPE_BLENDFACTOR_ZERO + blend.colormask = PIPE_MASK_RGBA + ctx.set_blend(blend) + + # no-op depth/stencil/alpha + depth_stencil_alpha = DepthStencilAlpha() + ctx.set_depth_stencil_alpha(depth_stencil_alpha) + + # rasterizer + rasterizer = Rasterizer() + rasterizer.front_winding = PIPE_WINDING_CW + rasterizer.cull_mode = PIPE_WINDING_NONE + rasterizer.bypass_clipping = 1 + rasterizer.scissor = 1 + #rasterizer.bypass_vs = 1 + ctx.set_rasterizer(rasterizer) + + # viewport (identity, we setup vertices in wincoords) + viewport = Viewport() + scale = FloatArray(4) + scale[0] = 1.0 + scale[1] = 1.0 + scale[2] = 1.0 + scale[3] = 1.0 + viewport.scale = scale + translate = FloatArray(4) + translate[0] = 0.0 + translate[1] = 0.0 + translate[2] = 0.0 + translate[3] = 0.0 + viewport.translate = translate + ctx.set_viewport(viewport) + + # samplers + sampler = Sampler() + sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE + sampler.wrap_t = 
PIPE_TEX_WRAP_CLAMP_TO_EDGE + sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE + sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE + sampler.min_img_filter = PIPE_TEX_MIPFILTER_NEAREST + sampler.mag_img_filter = PIPE_TEX_MIPFILTER_NEAREST + sampler.normalized_coords = 1 + ctx.set_sampler(0, sampler) + + # scissor + scissor = Scissor() + scissor.minx = 0 + scissor.miny = 0 + scissor.maxx = width + scissor.maxy = height + ctx.set_scissor(scissor) + + clip = Clip() + clip.nr = 0 + ctx.set_clip(clip) + + # framebuffer + cbuf = dev.texture_create( + PIPE_FORMAT_X8R8G8B8_UNORM, + width, height, + tex_usage=PIPE_TEXTURE_USAGE_DISPLAY_TARGET, + ) + _cbuf = cbuf.get_surface(usage = PIPE_BUFFER_USAGE_GPU_READ|PIPE_BUFFER_USAGE_GPU_WRITE) + fb = Framebuffer() + fb.width = width + fb.height = height + fb.num_cbufs = 1 + fb.set_cbuf(0, _cbuf) + ctx.set_framebuffer(fb) + _cbuf.clear_value = 0x00000000 + ctx.surface_clear(_cbuf, _cbuf.clear_value) + del _cbuf + + # vertex shader + vs = Shader(''' + VERT1.1 + DCL IN[0], POSITION, CONSTANT + DCL IN[1], COLOR, CONSTANT + DCL OUT[0], POSITION, CONSTANT + DCL OUT[1], COLOR, CONSTANT + 0:MOV OUT[0], IN[0] + 1:MOV OUT[1], IN[1] + 2:END + ''') + ctx.set_vertex_shader(vs) + + # fragment shader + fs = Shader(''' + FRAG1.1 + DCL IN[0], COLOR, LINEAR + DCL OUT[0], COLOR, CONSTANT + 0:MOV OUT[0], IN[0] + 1:END + ''') + ctx.set_fragment_shader(fs) + + nverts = 3 + nattrs = 2 + verts = FloatArray(nverts * nattrs * 4) + + verts[ 0] = 128.0 # x1 + verts[ 1] = 32.0 # y1 + verts[ 2] = 0.0 # z1 + verts[ 3] = 1.0 # w1 + verts[ 4] = 1.0 # r1 + verts[ 5] = 0.0 # g1 + verts[ 6] = 0.0 # b1 + verts[ 7] = 1.0 # a1 + verts[ 8] = 32.0 # x2 + verts[ 9] = 224.0 # y2 + verts[10] = 0.0 # z2 + verts[11] = 1.0 # w2 + verts[12] = 0.0 # r2 + verts[13] = 1.0 # g2 + verts[14] = 0.0 # b2 + verts[15] = 1.0 # a2 + verts[16] = 224.0 # x3 + verts[17] = 224.0 # y3 + verts[18] = 0.0 # z3 + verts[19] = 1.0 # w3 + verts[20] = 0.0 # r3 + verts[21] = 0.0 # g3 + verts[22] = 1.0 # b3 + verts[23] = 1.0 # a3 + + ctx.draw_vertices(PIPE_PRIM_TRIANGLES, + nverts, + nattrs, + verts) + + ctx.flush() + + show_image(cbuf.get_surface(usage = PIPE_BUFFER_USAGE_CPU_READ|PIPE_BUFFER_USAGE_CPU_WRITE)) + #save_image('tri.png', cbuf.get_surface(usage = PIPE_BUFFER_USAGE_CPU_READ|PIPE_BUFFER_USAGE_CPU_WRITE)) + + + +def main(): + dev = Device() + test(dev) + + +if __name__ == '__main__': + main() diff --git a/src/gallium/state_trackers/python/st_device.c b/src/gallium/state_trackers/python/st_device.c new file mode 100644 index 0000000000..95c1378a03 --- /dev/null +++ b/src/gallium/state_trackers/python/st_device.c @@ -0,0 +1,323 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
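With rasterizer.bypass_clipping set and an identity viewport, the positions in tri.py are already window coordinates: the triangle's corners land on pixels (128,32), (32,224) and (224,224) of the 255x255 color buffer. The FloatArray is laid out vertex-major, attribute-minor, four floats per attribute, so an index helper (purely illustrative, not part of the sample) would be:

nattrs = 2   # position + color, as in the sample above

def vert_index(vertex, attr, component):
    # verts[(vertex*nattrs + attr)*4 + component]
    return (vertex*nattrs + attr)*4 + component

assert vert_index(1, 0, 1) == 9   # verts[9] is y2 = 224.0 in the listing above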
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "pipe/p_winsys.h" +#include "pipe/p_context.h" +#include "pipe/p_shader_tokens.h" +#include "pipe/p_inlines.h" +#include "cso_cache/cso_context.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "util/u_simple_shaders.h" +#include "trace/tr_screen.h" +#include "trace/tr_context.h" + +#include "st_device.h" +#include "st_winsys.h" + + +static void +st_device_really_destroy(struct st_device *st_dev) +{ + if(st_dev->screen) + st_dev->screen->destroy(st_dev->screen); + + FREE(st_dev); +} + + +void +st_device_destroy(struct st_device *st_dev) +{ + if(!--st_dev->refcount) + st_device_really_destroy(st_dev); +} + + +static struct st_device * +st_device_create_from_st_winsys(const struct st_winsys *st_ws) +{ + struct st_device *st_dev; + + if(!st_ws->screen_create || + !st_ws->context_create) + return NULL; + + st_dev = CALLOC_STRUCT(st_device); + if(!st_dev) + return NULL; + + st_dev->refcount = 1; + st_dev->st_ws = st_ws; + + st_dev->real_screen = st_ws->screen_create(); + if(!st_dev->real_screen) { + st_device_destroy(st_dev); + return NULL; + } + + st_dev->screen = trace_screen_create(st_dev->real_screen); + if(!st_dev->screen) { + st_device_destroy(st_dev); + return NULL; + } + + return st_dev; +} + + +struct st_device * +st_device_create(boolean hardware) { + if(hardware) + return st_device_create_from_st_winsys(&st_hardpipe_winsys); + else + return st_device_create_from_st_winsys(&st_softpipe_winsys); +} + + +void +st_context_destroy(struct st_context *st_ctx) +{ + unsigned i; + + if(st_ctx) { + struct st_device *st_dev = st_ctx->st_dev; + + if(st_ctx->cso) { + cso_delete_vertex_shader(st_ctx->cso, st_ctx->vs); + cso_delete_fragment_shader(st_ctx->cso, st_ctx->fs); + + cso_destroy_context(st_ctx->cso); + } + + if(st_ctx->pipe) + st_ctx->pipe->destroy(st_ctx->pipe); + + for(i = 0; i < PIPE_MAX_SAMPLERS; ++i) + pipe_texture_reference(&st_ctx->sampler_textures[i], NULL); + pipe_texture_reference(&st_ctx->default_texture, NULL); + + FREE(st_ctx); + + if(!--st_dev->refcount) + st_device_really_destroy(st_dev); + } +} + + +struct st_context * +st_context_create(struct st_device *st_dev) +{ + struct st_context *st_ctx; + + st_ctx = CALLOC_STRUCT(st_context); + if(!st_ctx) + return NULL; + + st_ctx->st_dev = st_dev; + ++st_dev->refcount; + + st_ctx->real_pipe = st_dev->st_ws->context_create(st_dev->real_screen); + if(!st_ctx->real_pipe) { + st_context_destroy(st_ctx); + return NULL; + } + + st_ctx->pipe = trace_context_create(st_dev->screen, st_ctx->real_pipe); + if(!st_ctx->pipe) { + st_context_destroy(st_ctx); + return NULL; + } + + st_ctx->cso = cso_create_context(st_ctx->pipe); + if(!st_ctx->cso) { + st_context_destroy(st_ctx); + return NULL; + } + + /* disabled blending/masking */ + { + struct pipe_blend_state blend; + memset(&blend, 0, sizeof(blend)); + blend.rgb_src_factor = PIPE_BLENDFACTOR_ONE; + blend.alpha_src_factor = PIPE_BLENDFACTOR_ONE; + blend.rgb_dst_factor = PIPE_BLENDFACTOR_ZERO; 
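      /* ONE/ZERO blend factors with a full colormask just write the incoming
       * fragment color, i.e. blending is effectively disabled. */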
+ blend.alpha_dst_factor = PIPE_BLENDFACTOR_ZERO; + blend.colormask = PIPE_MASK_RGBA; + cso_set_blend(st_ctx->cso, &blend); + } + + /* no-op depth/stencil/alpha */ + { + struct pipe_depth_stencil_alpha_state depthstencil; + memset(&depthstencil, 0, sizeof(depthstencil)); + cso_set_depth_stencil_alpha(st_ctx->cso, &depthstencil); + } + + /* rasterizer */ + { + struct pipe_rasterizer_state rasterizer; + memset(&rasterizer, 0, sizeof(rasterizer)); + rasterizer.front_winding = PIPE_WINDING_CW; + rasterizer.cull_mode = PIPE_WINDING_NONE; + rasterizer.bypass_clipping = 1; + /*rasterizer.bypass_vs = 1;*/ + cso_set_rasterizer(st_ctx->cso, &rasterizer); + } + + /* identity viewport */ + { + struct pipe_viewport_state viewport; + viewport.scale[0] = 1.0; + viewport.scale[1] = 1.0; + viewport.scale[2] = 1.0; + viewport.scale[3] = 1.0; + viewport.translate[0] = 0.0; + viewport.translate[1] = 0.0; + viewport.translate[2] = 0.0; + viewport.translate[3] = 0.0; + cso_set_viewport(st_ctx->cso, &viewport); + } + + /* samplers */ + { + struct pipe_sampler_state sampler; + unsigned i; + memset(&sampler, 0, sizeof(sampler)); + sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE; + sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NEAREST; + sampler.min_img_filter = PIPE_TEX_MIPFILTER_NEAREST; + sampler.mag_img_filter = PIPE_TEX_MIPFILTER_NEAREST; + sampler.normalized_coords = 1; + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) + cso_single_sampler(st_ctx->cso, i, &sampler); + cso_single_sampler_done(st_ctx->cso); + } + + /* default textures */ + { + struct pipe_screen *screen = st_dev->screen; + struct pipe_texture templat; + struct pipe_surface *surface; + unsigned i; + + memset( &templat, 0, sizeof( templat ) ); + templat.target = PIPE_TEXTURE_2D; + templat.format = PIPE_FORMAT_A8R8G8B8_UNORM; + templat.block.size = 4; + templat.block.width = 1; + templat.block.height = 1; + templat.width[0] = 1; + templat.height[0] = 1; + templat.depth[0] = 1; + templat.last_level = 0; + + st_ctx->default_texture = screen->texture_create( screen, &templat ); + if(st_ctx->default_texture) { + surface = screen->get_tex_surface( screen, + st_ctx->default_texture, 0, 0, 0, + PIPE_BUFFER_USAGE_CPU_WRITE ); + if(surface) { + uint32_t *map; + map = (uint32_t *) pipe_surface_map(surface, PIPE_BUFFER_USAGE_CPU_WRITE ); + if(map) { + *map = 0x00000000; + pipe_surface_unmap( surface ); + } + pipe_surface_reference(&surface, NULL); + } + } + + for (i = 0; i < PIPE_MAX_SAMPLERS; i++) + pipe_texture_reference(&st_ctx->sampler_textures[i], st_ctx->default_texture); + + cso_set_sampler_textures(st_ctx->cso, PIPE_MAX_SAMPLERS, st_ctx->sampler_textures); + } + + /* vertex shader */ + { + struct pipe_shader_state vert_shader; + + const uint semantic_names[] = { TGSI_SEMANTIC_POSITION, + TGSI_SEMANTIC_GENERIC }; + const uint semantic_indexes[] = { 0, 0 }; + st_ctx->vs = util_make_vertex_passthrough_shader(st_ctx->pipe, + 2, + semantic_names, + semantic_indexes, + &vert_shader); + cso_set_vertex_shader_handle(st_ctx->cso, st_ctx->vs); + } + + /* fragment shader */ + { + struct pipe_shader_state frag_shader; + st_ctx->fs = util_make_fragment_passthrough_shader(st_ctx->pipe, + &frag_shader); + cso_set_fragment_shader_handle(st_ctx->cso, st_ctx->fs); + } + + return st_ctx; +} + + +void +st_buffer_destroy(struct st_buffer *st_buf) +{ + if(st_buf) { + struct pipe_screen *screen = st_buf->st_dev->screen; + pipe_buffer_reference(screen, &st_buf->buffer, NULL); + FREE(st_buf); 
+ } +} + + +struct st_buffer * +st_buffer_create(struct st_device *st_dev, + unsigned alignment, unsigned usage, unsigned size) +{ + struct pipe_screen *screen = st_dev->screen; + struct st_buffer *st_buf; + + st_buf = CALLOC_STRUCT(st_buffer); + if(!st_buf) + return NULL; + + st_buf->st_dev = st_dev; + + st_buf->buffer = pipe_buffer_create(screen, alignment, usage, size); + if(!st_buf->buffer) { + st_buffer_destroy(st_buf); + return NULL; + } + + return st_buf; +} + diff --git a/src/gallium/state_trackers/python/st_device.h b/src/gallium/state_trackers/python/st_device.h new file mode 100644 index 0000000000..7cfe6de9f6 --- /dev/null +++ b/src/gallium/state_trackers/python/st_device.h @@ -0,0 +1,101 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef ST_DEVICE_H_ +#define ST_DEVICE_H_ + + +#include "pipe/p_state.h" + +struct cso_context; +struct pipe_screen; +struct pipe_context; +struct st_winsys; + + +struct st_buffer { + struct st_device *st_dev; + + struct pipe_buffer *buffer; +}; + + +struct st_context { + struct st_device *st_dev; + + struct pipe_context *real_pipe; + struct pipe_context *pipe; + + struct cso_context *cso; + + void *vs; + void *fs; + + struct pipe_texture *default_texture; + struct pipe_texture *sampler_textures[PIPE_MAX_SAMPLERS]; + + unsigned num_vertex_buffers; + struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS]; + + unsigned num_vertex_elements; + struct pipe_vertex_element vertex_elements[PIPE_MAX_ATTRIBS]; +}; + + +struct st_device { + const struct st_winsys *st_ws; + + struct pipe_screen *real_screen; + struct pipe_screen *screen; + + /* FIXME: we also need to refcount for textures and surfaces... 
*/ + unsigned refcount; +}; + + +struct st_buffer * +st_buffer_create(struct st_device *st_dev, + unsigned alignment, unsigned usage, unsigned size); + +void +st_buffer_destroy(struct st_buffer *st_buf); + +struct st_context * +st_context_create(struct st_device *st_dev); + +void +st_context_destroy(struct st_context *st_ctx); + +struct st_device * +st_device_create(boolean hardware); + +void +st_device_destroy(struct st_device *st_dev); + + +#endif /* ST_DEVICE_H_ */ diff --git a/src/gallium/state_trackers/python/st_hardpipe_winsys.c b/src/gallium/state_trackers/python/st_hardpipe_winsys.c new file mode 100644 index 0000000000..8b33c70fd7 --- /dev/null +++ b/src/gallium/state_trackers/python/st_hardpipe_winsys.c @@ -0,0 +1,62 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * + **************************************************************************/ + +/** + * @file + * Stub for hardware pipe driver support. + */ + + +#include "pipe/p_compiler.h" + +#include "st_winsys.h" + + +/* XXX: Force init_gallium symbol to be linked */ +extern void init_gallium(void); +void (*force_init_gallium_linkage)(void) = &init_gallium; + + +static struct pipe_screen * +st_hardpipe_screen_create(void) +{ + return st_softpipe_winsys.screen_create(); +} + + +static struct pipe_context * +st_hardpipe_context_create(struct pipe_screen *screen) +{ + return st_softpipe_winsys.context_create(screen); +} + + +const struct st_winsys st_hardpipe_winsys = { + &st_hardpipe_screen_create, + &st_hardpipe_context_create +}; diff --git a/src/gallium/state_trackers/python/st_sample.c b/src/gallium/state_trackers/python/st_sample.c new file mode 100644 index 0000000000..7765df3c4a --- /dev/null +++ b/src/gallium/state_trackers/python/st_sample.c @@ -0,0 +1,549 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "pipe/p_compiler.h" +#include "pipe/p_format.h" +#include "pipe/p_state.h" +#include "pipe/p_inlines.h" +#include "util/u_tile.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "st_sample.h" + + +/** + * Use our own pseudo random generator to ensure consistent runs among + * multiple runs and platforms. + * + * @sa http://en.wikipedia.org/wiki/Linear_congruential_generator + */ +static uint32_t st_random(void) { + static uint64_t seed = UINT64_C(0xbb9a063afb0a739d); + + seed = UINT64_C(134775813) * seed + UINT64_C(1); + + return (uint16_t)(seed >> 32); +} + + +/** + * We don't want to include the patent-encumbered DXT code here, so instead + * we store several uncompressed/compressed data pairs for hardware testing + * purposes. 
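 * Each entry below pairs a 4x4 block of uncompressed RGBA pixels with its
 * pre-compressed block (8 bytes are used for DXT1, all 16 for DXT3/DXT5);
 * st_sample_dxt_pixel_block() picks one entry at random.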
+ */ +struct dxt_data +{ + uint8_t rgba[16*4]; + uint8_t raw[16]; +}; + + +static const struct dxt_data +dxt1_rgb_data[] = { + { + { + 0x99, 0xb0, 0x8e, 0xff, + 0x5d, 0x62, 0x89, 0xff, + 0x99, 0xb0, 0x8e, 0xff, + 0x99, 0xb0, 0x8e, 0xff, + 0xd6, 0xff, 0x94, 0xff, + 0x5d, 0x62, 0x89, 0xff, + 0x99, 0xb0, 0x8e, 0xff, + 0xd6, 0xff, 0x94, 0xff, + 0x5d, 0x62, 0x89, 0xff, + 0x5d, 0x62, 0x89, 0xff, + 0x99, 0xb0, 0x8e, 0xff, + 0x21, 0x14, 0x84, 0xff, + 0x5d, 0x62, 0x89, 0xff, + 0x21, 0x14, 0x84, 0xff, + 0x21, 0x14, 0x84, 0xff, + 0x99, 0xb0, 0x8e, 0xff + }, + {0xf2, 0xd7, 0xb0, 0x20, 0xae, 0x2c, 0x6f, 0x97} + }, + { + { + 0xb5, 0xcf, 0x9c, 0xff, + 0x83, 0x8c, 0x8b, 0xff, + 0x21, 0x08, 0x6b, 0xff, + 0x83, 0x8c, 0x8b, 0xff, + 0x52, 0x4a, 0x7b, 0xff, + 0x83, 0x8c, 0x8b, 0xff, + 0x83, 0x8c, 0x8b, 0xff, + 0xb5, 0xcf, 0x9c, 0xff, + 0x21, 0x08, 0x6b, 0xff, + 0xb5, 0xcf, 0x9c, 0xff, + 0x83, 0x8c, 0x8b, 0xff, + 0x52, 0x4a, 0x7b, 0xff, + 0xb5, 0xcf, 0x9c, 0xff, + 0x83, 0x8c, 0x8b, 0xff, + 0x52, 0x4a, 0x7b, 0xff, + 0x83, 0x8c, 0x8b, 0xff + }, + {0x73, 0xb6, 0x4d, 0x20, 0x98, 0x2b, 0xe1, 0xb8} + }, + { + { + 0x00, 0x2c, 0xff, 0xff, + 0x94, 0x8d, 0x7b, 0xff, + 0x4a, 0x5c, 0xbd, 0xff, + 0x4a, 0x5c, 0xbd, 0xff, + 0x4a, 0x5c, 0xbd, 0xff, + 0x94, 0x8d, 0x7b, 0xff, + 0x94, 0x8d, 0x7b, 0xff, + 0x94, 0x8d, 0x7b, 0xff, + 0xde, 0xbe, 0x39, 0xff, + 0x94, 0x8d, 0x7b, 0xff, + 0xde, 0xbe, 0x39, 0xff, + 0xde, 0xbe, 0x39, 0xff, + 0xde, 0xbe, 0x39, 0xff, + 0xde, 0xbe, 0x39, 0xff, + 0xde, 0xbe, 0x39, 0xff, + 0x94, 0x8d, 0x7b, 0xff + }, + {0xe7, 0xdd, 0x7f, 0x01, 0xf9, 0xab, 0x08, 0x80} + }, + { + { + 0x6b, 0x24, 0x21, 0xff, + 0x7b, 0x4f, 0x5d, 0xff, + 0x7b, 0x4f, 0x5d, 0xff, + 0x8b, 0x7a, 0x99, 0xff, + 0x7b, 0x4f, 0x5d, 0xff, + 0x7b, 0x4f, 0x5d, 0xff, + 0x6b, 0x24, 0x21, 0xff, + 0x8b, 0x7a, 0x99, 0xff, + 0x9c, 0xa6, 0xd6, 0xff, + 0x6b, 0x24, 0x21, 0xff, + 0x7b, 0x4f, 0x5d, 0xff, + 0x8b, 0x7a, 0x99, 0xff, + 0x6b, 0x24, 0x21, 0xff, + 0x8b, 0x7a, 0x99, 0xff, + 0x7b, 0x4f, 0x5d, 0xff, + 0x9c, 0xa6, 0xd6, 0xff + }, + {0x3a, 0x9d, 0x24, 0x69, 0xbd, 0x9f, 0xb4, 0x39} + } +}; + + +static const struct dxt_data +dxt1_rgba_data[] = { + { + { + 0x00, 0x00, 0x00, 0x00, + 0x4e, 0xaa, 0x90, 0xff, + 0x4e, 0xaa, 0x90, 0xff, + 0x00, 0x00, 0x00, 0x00, + 0x4e, 0xaa, 0x90, 0xff, + 0x29, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, + 0x4e, 0xaa, 0x90, 0xff, + 0x73, 0x55, 0x21, 0xff, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x4e, 0xaa, 0x90, 0xff, + 0x4e, 0xaa, 0x90, 0xff, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x4e, 0xaa, 0x90, 0xff + }, + {0xff, 0x2f, 0xa4, 0x72, 0xeb, 0xb2, 0xbd, 0xbe} + }, + { + { + 0xb5, 0xe3, 0x63, 0xff, + 0x00, 0x00, 0x00, 0x00, + 0x6b, 0x24, 0x84, 0xff, + 0xb5, 0xe3, 0x63, 0xff, + 0x00, 0x00, 0x00, 0x00, + 0xb5, 0xe3, 0x63, 0xff, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x6b, 0x24, 0x84, 0xff, + 0x6b, 0x24, 0x84, 0xff, + 0x00, 0x00, 0x00, 0x00, + 0xb5, 0xe3, 0x63, 0xff, + 0x90, 0x83, 0x73, 0xff, + 0xb5, 0xe3, 0x63, 0xff + }, + {0x30, 0x69, 0x0c, 0xb7, 0x4d, 0xf7, 0x0f, 0x67} + }, + { + { + 0x00, 0x00, 0x00, 0x00, + 0xc6, 0x86, 0x8c, 0xff, + 0xc6, 0x86, 0x8c, 0xff, + 0x21, 0x65, 0x42, 0xff, + 0x21, 0x65, 0x42, 0xff, + 0x21, 0x65, 0x42, 0xff, + 0x21, 0x65, 0x42, 0xff, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x21, 0x65, 0x42, 0xff, + 0xc6, 0x86, 0x8c, 0xff, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0xc6, 0x86, 0x8c, 0xff + }, + {0x28, 0x23, 0x31, 0xc4, 0x17, 0xc0, 
0xd3, 0x7f} + }, + { + { + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0xc6, 0xe3, 0x9c, 0xff, + 0x7b, 0x1c, 0x52, 0xff, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x7b, 0x1c, 0x52, 0xff, + 0x00, 0x00, 0x00, 0x00, + 0x7b, 0x1c, 0x52, 0xff, + 0xa0, 0x7f, 0x77, 0xff, + 0xc6, 0xe3, 0x9c, 0xff, + 0x00, 0x00, 0x00, 0x00, + 0xa0, 0x7f, 0x77, 0xff + }, + {0xea, 0x78, 0x13, 0xc7, 0x7f, 0xfc, 0x33, 0xb6} + }, +}; + + +static const struct dxt_data +dxt3_rgba_data[] = { + { + { + 0x6d, 0xc6, 0x96, 0x77, + 0x6d, 0xc6, 0x96, 0xee, + 0x6d, 0xc6, 0x96, 0xaa, + 0x8c, 0xff, 0xb5, 0x44, + 0x6d, 0xc6, 0x96, 0xff, + 0x6d, 0xc6, 0x96, 0x88, + 0x31, 0x55, 0x5a, 0x66, + 0x6d, 0xc6, 0x96, 0x99, + 0x31, 0x55, 0x5a, 0xbb, + 0x31, 0x55, 0x5a, 0x55, + 0x31, 0x55, 0x5a, 0x11, + 0x6d, 0xc6, 0x96, 0xcc, + 0x6d, 0xc6, 0x96, 0xcc, + 0x6d, 0xc6, 0x96, 0x11, + 0x31, 0x55, 0x5a, 0x44, + 0x31, 0x55, 0x5a, 0x88 + }, + {0xe7, 0x4a, 0x8f, 0x96, 0x5b, 0xc1, 0x1c, 0x84, 0xf6, 0x8f, 0xab, 0x32, 0x2a, 0x9a, 0x95, 0x5a} + }, + { + { + 0xad, 0xeb, 0x73, 0x99, + 0x97, 0xaa, 0x86, 0x66, + 0x6b, 0x28, 0xad, 0x99, + 0xad, 0xeb, 0x73, 0x99, + 0x6b, 0x28, 0xad, 0x22, + 0xad, 0xeb, 0x73, 0xff, + 0x97, 0xaa, 0x86, 0x55, + 0x6b, 0x28, 0xad, 0x55, + 0x6b, 0x28, 0xad, 0x44, + 0xad, 0xeb, 0x73, 0x33, + 0x6b, 0x28, 0xad, 0xee, + 0x6b, 0x28, 0xad, 0x99, + 0x97, 0xaa, 0x86, 0x66, + 0xad, 0xeb, 0x73, 0xbb, + 0x97, 0xaa, 0x86, 0x99, + 0xad, 0xeb, 0x73, 0xbb + }, + {0x69, 0x99, 0xf2, 0x55, 0x34, 0x9e, 0xb6, 0xb9, 0x4e, 0xaf, 0x55, 0x69, 0x18, 0x61, 0x51, 0x22} + }, + { + { + 0x63, 0xd7, 0xd6, 0x00, + 0x57, 0x62, 0x5d, 0xdd, + 0x57, 0x62, 0x5d, 0xcc, + 0x57, 0x62, 0x5d, 0xbb, + 0x52, 0x28, 0x21, 0xaa, + 0x57, 0x62, 0x5d, 0xcc, + 0x57, 0x62, 0x5d, 0xcc, + 0x57, 0x62, 0x5d, 0x66, + 0x57, 0x62, 0x5d, 0x22, + 0x57, 0x62, 0x5d, 0xdd, + 0x63, 0xd7, 0xd6, 0xee, + 0x57, 0x62, 0x5d, 0x33, + 0x63, 0xd7, 0xd6, 0x55, + 0x52, 0x28, 0x21, 0x55, + 0x57, 0x62, 0x5d, 0x11, + 0x5d, 0x9c, 0x99, 0xee + }, + {0xd0, 0xbc, 0xca, 0x6c, 0xd2, 0x3e, 0x55, 0xe1, 0xba, 0x66, 0x44, 0x51, 0xfc, 0xfd, 0xcf, 0xb4} + }, + { + { + 0x94, 0x6f, 0x60, 0x22, + 0x94, 0x6f, 0x60, 0x22, + 0xc5, 0xab, 0x76, 0x11, + 0xc5, 0xab, 0x76, 0xee, + 0x63, 0x34, 0x4a, 0xdd, + 0x63, 0x34, 0x4a, 0x33, + 0x94, 0x6f, 0x60, 0x77, + 0xf7, 0xe7, 0x8c, 0x00, + 0x94, 0x6f, 0x60, 0x33, + 0x63, 0x34, 0x4a, 0xaa, + 0x94, 0x6f, 0x60, 0x77, + 0x63, 0x34, 0x4a, 0xcc, + 0x94, 0x6f, 0x60, 0xaa, + 0xf7, 0xe7, 0x8c, 0x99, + 0x63, 0x34, 0x4a, 0x44, + 0xc5, 0xab, 0x76, 0xaa + }, + {0x22, 0xe1, 0x3d, 0x07, 0xa3, 0xc7, 0x9a, 0xa4, 0x31, 0xf7, 0xa9, 0x61, 0xaf, 0x35, 0x77, 0x93} + }, +}; + + +static const struct dxt_data +dxt5_rgba_data[] = { + { + { + 0x6d, 0xc6, 0x96, 0x74, + 0x6d, 0xc6, 0x96, 0xf8, + 0x6d, 0xc6, 0x96, 0xb6, + 0x8c, 0xff, 0xb5, 0x53, + 0x6d, 0xc6, 0x96, 0xf8, + 0x6d, 0xc6, 0x96, 0x95, + 0x31, 0x55, 0x5a, 0x53, + 0x6d, 0xc6, 0x96, 0x95, + 0x31, 0x55, 0x5a, 0xb6, + 0x31, 0x55, 0x5a, 0x53, + 0x31, 0x55, 0x5a, 0x11, + 0x6d, 0xc6, 0x96, 0xd7, + 0x6d, 0xc6, 0x96, 0xb6, + 0x6d, 0xc6, 0x96, 0x11, + 0x31, 0x55, 0x5a, 0x32, + 0x31, 0x55, 0x5a, 0x95 + }, + {0xf8, 0x11, 0xc5, 0x0c, 0x9a, 0x73, 0xb4, 0x9c, 0xf6, 0x8f, 0xab, 0x32, 0x2a, 0x9a, 0x95, 0x5a} + }, + { + { + 0xad, 0xeb, 0x73, 0xa1, + 0x97, 0xaa, 0x86, 0x65, + 0x6b, 0x28, 0xad, 0xa1, + 0xad, 0xeb, 0x73, 0xa1, + 0x6b, 0x28, 0xad, 0x2a, + 0xad, 0xeb, 0x73, 0xfb, + 0x97, 0xaa, 0x86, 0x47, + 0x6b, 0x28, 0xad, 0x65, + 0x6b, 0x28, 0xad, 0x47, + 0xad, 0xeb, 0x73, 0x47, + 
0x6b, 0x28, 0xad, 0xdd, + 0x6b, 0x28, 0xad, 0xa1, + 0x97, 0xaa, 0x86, 0x65, + 0xad, 0xeb, 0x73, 0xbf, + 0x97, 0xaa, 0x86, 0xa1, + 0xad, 0xeb, 0x73, 0xbf + }, + {0xfb, 0x2a, 0x34, 0x19, 0xdc, 0xbf, 0xe8, 0x71, 0x4e, 0xaf, 0x55, 0x69, 0x18, 0x61, 0x51, 0x22} + }, + { + { + 0x63, 0xd7, 0xd6, 0x00, + 0x57, 0x62, 0x5d, 0xf5, + 0x57, 0x62, 0x5d, 0xd2, + 0x57, 0x62, 0x5d, 0xaf, + 0x52, 0x28, 0x21, 0xaf, + 0x57, 0x62, 0x5d, 0xd2, + 0x57, 0x62, 0x5d, 0xd2, + 0x57, 0x62, 0x5d, 0x69, + 0x57, 0x62, 0x5d, 0x23, + 0x57, 0x62, 0x5d, 0xd2, + 0x63, 0xd7, 0xd6, 0xf5, + 0x57, 0x62, 0x5d, 0x46, + 0x63, 0xd7, 0xd6, 0x46, + 0x52, 0x28, 0x21, 0x69, + 0x57, 0x62, 0x5d, 0x23, + 0x5d, 0x9c, 0x99, 0xf5 + }, + {0xf5, 0x00, 0x81, 0x36, 0xa9, 0x17, 0xec, 0x1e, 0xba, 0x66, 0x44, 0x51, 0xfc, 0xfd, 0xcf, 0xb4} + }, + { + { + 0x94, 0x6f, 0x60, 0x25, + 0x94, 0x6f, 0x60, 0x25, + 0xc5, 0xab, 0x76, 0x05, + 0xc5, 0xab, 0x76, 0xe8, + 0x63, 0x34, 0x4a, 0xe8, + 0x63, 0x34, 0x4a, 0x25, + 0x94, 0x6f, 0x60, 0x86, + 0xf7, 0xe7, 0x8c, 0x05, + 0x94, 0x6f, 0x60, 0x25, + 0x63, 0x34, 0x4a, 0xa7, + 0x94, 0x6f, 0x60, 0x66, + 0x63, 0x34, 0x4a, 0xc7, + 0x94, 0x6f, 0x60, 0xa7, + 0xf7, 0xe7, 0x8c, 0xa7, + 0x63, 0x34, 0x4a, 0x45, + 0xc5, 0xab, 0x76, 0xa7 + }, + {0xe8, 0x05, 0x7f, 0x80, 0x33, 0x5f, 0xb5, 0x79, 0x31, 0xf7, 0xa9, 0x61, 0xaf, 0x35, 0x77, 0x93} + }, +}; + + +static INLINE void +st_sample_dxt_pixel_block(enum pipe_format format, + const struct pipe_format_block *block, + uint8_t *raw, + float *rgba, unsigned rgba_stride, + unsigned w, unsigned h) +{ + const struct dxt_data *data; + unsigned n; + unsigned i; + unsigned x, y, ch; + + switch(format) { + case PIPE_FORMAT_DXT1_RGB: + data = dxt1_rgb_data; + n = sizeof(dxt1_rgb_data)/sizeof(dxt1_rgb_data[0]); + break; + case PIPE_FORMAT_DXT1_RGBA: + data = dxt1_rgba_data; + n = sizeof(dxt1_rgba_data)/sizeof(dxt1_rgba_data[0]); + break; + case PIPE_FORMAT_DXT3_RGBA: + data = dxt3_rgba_data; + n = sizeof(dxt3_rgba_data)/sizeof(dxt3_rgba_data[0]); + break; + case PIPE_FORMAT_DXT5_RGBA: + data = dxt5_rgba_data; + n = sizeof(dxt5_rgba_data)/sizeof(dxt5_rgba_data[0]); + break; + default: + assert(0); + } + + i = st_random() % n; + + for(y = 0; y < h; ++y) + for(x = 0; x < w; ++x) + for(ch = 0; ch < 4; ++ch) + rgba[y*rgba_stride + x*4 + ch] = (float)(data[i].rgba[y*4*4 + x*4 + ch])/255.0f; + + memcpy(raw, data[i].raw, block->size); +} + + +static INLINE void +st_sample_generic_pixel_block(enum pipe_format format, + const struct pipe_format_block *block, + uint8_t *raw, + float *rgba, unsigned rgba_stride, + unsigned w, unsigned h) +{ + unsigned i; + unsigned x, y, ch; + + for(i = 0; i < block->size; ++i) + raw[i] = (uint8_t)st_random(); + + + pipe_tile_raw_to_rgba(format, + raw, + w, h, + rgba, rgba_stride); + + if(format == PIPE_FORMAT_YCBCR || format == PIPE_FORMAT_YCBCR_REV) { + for(y = 0; y < h; ++y) { + for(x = 0; x < w; ++x) { + for(ch = 0; ch < 4; ++ch) { + unsigned offset = y*rgba_stride + x*4 + ch; + rgba[offset] = CLAMP(rgba[offset], 0.0f, 1.0f); + } + } + } + } +} + + +/** + * Randomly sample pixels. 
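 * DXT-compressed formats are sampled from the canned pairs above; every other
 * format gets fully random raw bytes, decoded back to RGBA with
 * pipe_tile_raw_to_rgba() (YCbCr results are clamped to [0,1], since arbitrary
 * bytes can decode to colors outside that range).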
+ */ +void +st_sample_pixel_block(enum pipe_format format, + const struct pipe_format_block *block, + void *raw, + float *rgba, unsigned rgba_stride, + unsigned w, unsigned h) +{ + switch(format) { + case PIPE_FORMAT_DXT1_RGB: + case PIPE_FORMAT_DXT1_RGBA: + case PIPE_FORMAT_DXT3_RGBA: + case PIPE_FORMAT_DXT5_RGBA: + st_sample_dxt_pixel_block(format, block, raw, rgba, rgba_stride, w, h); + break; + + default: + st_sample_generic_pixel_block(format, block, raw, rgba, rgba_stride, w, h); + break; + } +} + + +void +st_sample_surface(struct pipe_surface *surface, float *rgba) +{ + const struct pipe_format_block *block = &surface->block; + unsigned rgba_stride = surface->width*4; + void *raw; + unsigned x, y; + + raw = pipe_surface_map(surface, PIPE_BUFFER_USAGE_CPU_READ); + if(!raw) + return; + + for (y = 0; y < surface->nblocksy; ++y) { + for(x = 0; x < surface->nblocksx; ++x) { + st_sample_pixel_block(surface->format, + block, + (uint8_t*)raw + y*surface->stride + x*block->size, + rgba + y*block->height*rgba_stride + x*block->width*4, + rgba_stride, + MIN2(block->width, surface->width - x*block->width), + MIN2(block->height, surface->height - y*block->height)); + } + } + + pipe_surface_unmap(surface); +} diff --git a/src/gallium/state_trackers/python/st_sample.h b/src/gallium/state_trackers/python/st_sample.h new file mode 100644 index 0000000000..ff04a12613 --- /dev/null +++ b/src/gallium/state_trackers/python/st_sample.h @@ -0,0 +1,47 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
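st_sample_surface() above walks the surface one block at a time; for block-compressed formats the last row and column of blocks can overhang the surface, which is why the width and height passed down are clamped with MIN2. Illustrative arithmetic: a 5x5 DXT1 surface uses 4x4 blocks, so nblocksx = nblocksy = 2, and the second block in each direction contributes only MIN2(4, 5 - 4) = 1 valid pixel.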
+ * + **************************************************************************/ + + +#ifndef ST_SAMPLE_H_ +#define ST_SAMPLE_H_ + + +#include "pipe/p_format.h" + + +void +st_sample_pixel_block(enum pipe_format format, + const struct pipe_format_block *block, + void *raw, + float *rgba, unsigned rgba_stride, + unsigned w, unsigned h); + +void +st_sample_surface(struct pipe_surface *surface, float *rgba); + + +#endif /* ST_SAMPLE_H_ */ diff --git a/src/gallium/state_trackers/python/st_softpipe_winsys.c b/src/gallium/state_trackers/python/st_softpipe_winsys.c new file mode 100644 index 0000000000..f62113a469 --- /dev/null +++ b/src/gallium/state_trackers/python/st_softpipe_winsys.c @@ -0,0 +1,311 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * + **************************************************************************/ + +/** + * @file + * Softpipe support. + * + * @author Keith Whitwell + * @author Brian Paul + * @author Jose Fonseca + */ + + +#include "pipe/p_winsys.h" +#include "pipe/p_format.h" +#include "pipe/p_context.h" +#include "pipe/p_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "softpipe/sp_winsys.h" +#include "st_winsys.h" + + +struct st_softpipe_buffer +{ + struct pipe_buffer base; + boolean userBuffer; /** Is this a user-space buffer? 
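If so, 'data' points at caller-owned memory and is left alone when the buffer is destroyed.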
*/ + void *data; + void *mapped; +}; + + +/** Cast wrapper */ +static INLINE struct st_softpipe_buffer * +st_softpipe_buffer( struct pipe_buffer *buf ) +{ + return (struct st_softpipe_buffer *)buf; +} + + +static void * +st_softpipe_buffer_map(struct pipe_winsys *winsys, + struct pipe_buffer *buf, + unsigned flags) +{ + struct st_softpipe_buffer *st_softpipe_buf = st_softpipe_buffer(buf); + st_softpipe_buf->mapped = st_softpipe_buf->data; + return st_softpipe_buf->mapped; +} + + +static void +st_softpipe_buffer_unmap(struct pipe_winsys *winsys, + struct pipe_buffer *buf) +{ + struct st_softpipe_buffer *st_softpipe_buf = st_softpipe_buffer(buf); + st_softpipe_buf->mapped = NULL; +} + + +static void +st_softpipe_buffer_destroy(struct pipe_winsys *winsys, + struct pipe_buffer *buf) +{ + struct st_softpipe_buffer *oldBuf = st_softpipe_buffer(buf); + + if (oldBuf->data) { + if (!oldBuf->userBuffer) + align_free(oldBuf->data); + + oldBuf->data = NULL; + } + + FREE(oldBuf); +} + + +static void +st_softpipe_flush_frontbuffer(struct pipe_winsys *winsys, + struct pipe_surface *surf, + void *context_private) +{ +} + + + +static const char * +st_softpipe_get_name(struct pipe_winsys *winsys) +{ + return "softpipe"; +} + + +static struct pipe_buffer * +st_softpipe_buffer_create(struct pipe_winsys *winsys, + unsigned alignment, + unsigned usage, + unsigned size) +{ + struct st_softpipe_buffer *buffer = CALLOC_STRUCT(st_softpipe_buffer); + + buffer->base.refcount = 1; + buffer->base.alignment = alignment; + buffer->base.usage = usage; + buffer->base.size = size; + + buffer->data = align_malloc(size, alignment); + + return &buffer->base; +} + + +/** + * Create buffer which wraps user-space data. + */ +static struct pipe_buffer * +st_softpipe_user_buffer_create(struct pipe_winsys *winsys, + void *ptr, + unsigned bytes) +{ + struct st_softpipe_buffer *buffer; + + buffer = CALLOC_STRUCT(st_softpipe_buffer); + if(!buffer) + return NULL; + + buffer->base.refcount = 1; + buffer->base.size = bytes; + buffer->userBuffer = TRUE; + buffer->data = ptr; + + return &buffer->base; +} + + +/** + * Round n up to next multiple. 
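 * The bitmask below assumes 'multiple' is a power of two, which holds for the
 * only caller here (alignment = 64 in st_softpipe_surface_alloc_storage).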
+ */ +static INLINE unsigned +round_up(unsigned n, unsigned multiple) +{ + return (n + multiple - 1) & ~(multiple - 1); +} + + +static int +st_softpipe_surface_alloc_storage(struct pipe_winsys *winsys, + struct pipe_surface *surf, + unsigned width, unsigned height, + enum pipe_format format, + unsigned flags, + unsigned tex_usage) +{ + const unsigned alignment = 64; + + surf->width = width; + surf->height = height; + surf->format = format; + pf_get_block(format, &surf->block); + surf->nblocksx = pf_get_nblocksx(&surf->block, width); + surf->nblocksy = pf_get_nblocksy(&surf->block, height); + surf->stride = round_up(surf->nblocksx * surf->block.size, alignment); + surf->usage = flags; + + assert(!surf->buffer); + surf->buffer = winsys->buffer_create(winsys, alignment, + PIPE_BUFFER_USAGE_PIXEL, + surf->stride * surf->nblocksy); + if(!surf->buffer) + return -1; + + return 0; +} + + +static struct pipe_surface * +st_softpipe_surface_alloc(struct pipe_winsys *winsys) +{ + struct pipe_surface *surface = CALLOC_STRUCT(pipe_surface); + + assert(winsys); + + surface->refcount = 1; + surface->winsys = winsys; + + return surface; +} + + +static void +st_softpipe_surface_release(struct pipe_winsys *winsys, + struct pipe_surface **s) +{ + struct pipe_surface *surf = *s; + assert(!surf->texture); + surf->refcount--; + if (surf->refcount == 0) { + if (surf->buffer) + winsys_buffer_reference(winsys, &surf->buffer, NULL); + free(surf); + } + *s = NULL; +} + + +static void +st_softpipe_fence_reference(struct pipe_winsys *winsys, + struct pipe_fence_handle **ptr, + struct pipe_fence_handle *fence) +{ +} + + +static int +st_softpipe_fence_signalled(struct pipe_winsys *winsys, + struct pipe_fence_handle *fence, + unsigned flag) +{ + return 0; +} + + +static int +st_softpipe_fence_finish(struct pipe_winsys *winsys, + struct pipe_fence_handle *fence, + unsigned flag) +{ + return 0; +} + + +static void +st_softpipe_destroy(struct pipe_winsys *winsys) +{ + FREE(winsys); +} + + +static struct pipe_screen * +st_softpipe_screen_create(void) +{ + static struct pipe_winsys *winsys; + struct pipe_screen *screen; + + winsys = CALLOC_STRUCT(pipe_winsys); + if(!winsys) + return NULL; + + winsys->destroy = st_softpipe_destroy; + + winsys->buffer_create = st_softpipe_buffer_create; + winsys->user_buffer_create = st_softpipe_user_buffer_create; + winsys->buffer_map = st_softpipe_buffer_map; + winsys->buffer_unmap = st_softpipe_buffer_unmap; + winsys->buffer_destroy = st_softpipe_buffer_destroy; + + winsys->surface_alloc = st_softpipe_surface_alloc; + winsys->surface_alloc_storage = st_softpipe_surface_alloc_storage; + winsys->surface_release = st_softpipe_surface_release; + + winsys->fence_reference = st_softpipe_fence_reference; + winsys->fence_signalled = st_softpipe_fence_signalled; + winsys->fence_finish = st_softpipe_fence_finish; + + winsys->flush_frontbuffer = st_softpipe_flush_frontbuffer; + winsys->get_name = st_softpipe_get_name; + + screen = softpipe_create_screen(winsys); + if(!screen) + st_softpipe_destroy(winsys); + + return screen; +} + + +static struct pipe_context * +st_softpipe_context_create(struct pipe_screen *screen) +{ + return softpipe_create(screen, screen->winsys, NULL); +} + + +const struct st_winsys st_softpipe_winsys = { + &st_softpipe_screen_create, + &st_softpipe_context_create, +}; diff --git a/src/gallium/state_trackers/python/st_winsys.h b/src/gallium/state_trackers/python/st_winsys.h new file mode 100644 index 0000000000..b8cb612d86 --- /dev/null +++ 
b/src/gallium/state_trackers/python/st_winsys.h @@ -0,0 +1,52 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef ST_WINSYS_H_ +#define ST_WINSYS_H_ + + +struct pipe_screen; +struct pipe_context; + + +struct st_winsys +{ + struct pipe_screen * + (*screen_create)(void); + + struct pipe_context * + (*context_create)(struct pipe_screen *screen); +}; + + +extern const struct st_winsys st_softpipe_winsys; + +extern const struct st_winsys st_hardpipe_winsys; + + +#endif /* ST_WINSYS_H_ */ diff --git a/src/gallium/state_trackers/python/tests/base.py b/src/gallium/state_trackers/python/tests/base.py new file mode 100644 index 0000000000..8477aa5fc9 --- /dev/null +++ b/src/gallium/state_trackers/python/tests/base.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python +########################################################################## +# +# Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. +# All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sub license, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice (including the +# next paragraph) shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +# IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR +# ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +########################################################################## + + +"""Base classes for tests. 
+ +Loosely inspired on Python's unittest module. +""" + + +from gallium import * + + +# Enumerate all pixel formats +formats = {} +for name, value in globals().items(): + if name.startswith("PIPE_FORMAT_") and isinstance(value, int): + formats[value] = name + + +def make_image(width, height, rgba): + import Image + outimage = Image.new( + mode='RGB', + size=(width, height), + color=(0,0,0)) + outpixels = outimage.load() + for y in range(0, height): + for x in range(0, width): + offset = (y*width + x)*4 + r, g, b, a = [int(min(max(rgba[offset + ch], 0.0), 1.0)*255) for ch in range(4)] + outpixels[x, y] = r, g, b + return outimage + +def save_image(width, height, rgba, filename): + outimage = make_image(width, height, rgba) + outimage.save(filename, "PNG") + +def show_image(width, height, **rgbas): + import Tkinter as tk + from PIL import Image, ImageTk + + root = tk.Tk() + + x = 64 + y = 64 + + labels = rgbas.keys() + labels.sort() + for i in range(len(labels)): + label = labels[i] + outimage = make_image(width, height, rgbas[label]) + + if i: + window = tk.Toplevel(root) + else: + window = root + window.title(label) + image1 = ImageTk.PhotoImage(outimage) + w = image1.width() + h = image1.height() + window.geometry("%dx%d+%d+%d" % (w, h, x, y)) + panel1 = tk.Label(window, image=image1) + panel1.pack(side='top', fill='both', expand='yes') + panel1.image = image1 + x += w + 2 + + root.mainloop() + + +class TestFailure(Exception): + + pass + +class TestSkip(Exception): + + pass + + +class Test: + + def __init__(self): + pass + + def _run(self, result): + raise NotImplementedError + + def run(self): + result = TestResult() + self._run(result) + result.summary() + + +class TestCase(Test): + + def __init__(self, dev, **kargs): + Test.__init__(self) + self.dev = dev + self.__dict__.update(kargs) + + def description(self): + raise NotImplementedError + + def test(self): + raise NotImplementedError + + def _run(self, result): + result.test_start(self) + try: + self.test() + except KeyboardInterrupt: + raise + except TestSkip: + result.test_skipped(self) + except TestFailure: + result.test_failed(self) + else: + result.test_passed(self) + + +class TestSuite(Test): + + def __init__(self, tests = None): + Test.__init__(self) + if tests is None: + self.tests = [] + else: + self.tests = tests + + def add_test(self, test): + self.tests.append(test) + + def _run(self, result): + for test in self.tests: + test._run(result) + + +class TestResult: + + def __init__(self): + self.tests = 0 + self.passed = 0 + self.skipped = 0 + self.failed = 0 + self.failed_descriptions = [] + + def test_start(self, test): + self.tests += 1 + print "Running %s..." % test.description() + + def test_passed(self, test): + self.passed += 1 + print "PASS" + + def test_skipped(self, test): + self.skipped += 1 + print "SKIP" + + def test_failed(self, test): + self.failed += 1 + self.failed_descriptions.append(test.description()) + print "FAIL" + + def summary(self): + print "%u tests, %u passed, %u skipped, %u failed" % (self.tests, self.passed, self.skipped, self.failed) + for description in self.failed_descriptions: + print " %s" % description +
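make_image() above converts each floating-point channel to a byte by clamping to [0, 1] and scaling by 255 before handing the pixel to PIL. The same conversion in C, as a point of reference (a sketch, not code from the diff):

#include <stdint.h>

static uint8_t float_to_ubyte(float f)
{
   if (f <= 0.0f)
      return 0;
   if (f >= 1.0f)
      return 255;
   return (uint8_t)(f * 255.0f);   /* truncates, matching Python's int() */
}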
\ No newline at end of file diff --git a/src/gallium/state_trackers/python/tests/texture.py b/src/gallium/state_trackers/python/tests/texture.py new file mode 100644 index 0000000000..880a61306c --- /dev/null +++ b/src/gallium/state_trackers/python/tests/texture.py @@ -0,0 +1,397 @@ +#!/usr/bin/env python +########################################################################## +# +# Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. +# All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sub license, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice (including the +# next paragraph) shall be included in all copies or substantial portions +# of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +# IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR +# ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +########################################################################## + + +import sys +from gallium import * +from base import * + + +def lods(*dims): + size = max(dims) + lods = 0 + while size: + lods += 1 + size >>= 1 + return lods + + +def minify(dims, level = 1): + return [max(dim>>level, 1) for dim in dims] + + +def tex_coords(texture, face, level, zslice): + st = [ + [0.0, 0.0], + [1.0, 0.0], + [1.0, 1.0], + [0.0, 1.0], + ] + + if texture.target == PIPE_TEXTURE_2D: + return [[s, t, 0.0] for s, t in st] + elif texture.target == PIPE_TEXTURE_3D: + depth = texture.get_depth(level) + if depth > 1: + r = float(zslice)/float(depth - 1) + else: + r = 0.0 + return [[s, t, r] for s, t in st] + elif texture.target == PIPE_TEXTURE_CUBE: + result = [] + for s, t in st: + # See http://developer.nvidia.com/object/cube_map_ogl_tutorial.html + sc = 2.0*s - 1.0 + tc = 2.0*t - 1.0 + if face == PIPE_TEX_FACE_POS_X: + rx = 1.0 + ry = -tc + rz = -sc + if face == PIPE_TEX_FACE_NEG_X: + rx = -1.0 + ry = -tc + rz = sc + if face == PIPE_TEX_FACE_POS_Y: + rx = sc + ry = 1.0 + rz = tc + if face == PIPE_TEX_FACE_NEG_Y: + rx = sc + ry = -1.0 + rz = -tc + if face == PIPE_TEX_FACE_POS_Z: + rx = sc + ry = -tc + rz = 1.0 + if face == PIPE_TEX_FACE_NEG_Z: + rx = -sc + ry = -tc + rz = -1.0 + result.append([rx, ry, rz]) + return result + +def is_pot(n): + return n & (n - 1) == 0 + + +class TextureTest(TestCase): + + def description(self): + target = { + PIPE_TEXTURE_1D: "1d", + PIPE_TEXTURE_2D: "2d", + PIPE_TEXTURE_3D: "3d", + PIPE_TEXTURE_CUBE: "cube", + }[self.target] + format = formats[self.format] + if self.target == PIPE_TEXTURE_CUBE: + face = { + PIPE_TEX_FACE_POS_X: "+x", + PIPE_TEX_FACE_NEG_X: "-x", + PIPE_TEX_FACE_POS_Y: "+y", + PIPE_TEX_FACE_NEG_Y: "-y", + PIPE_TEX_FACE_POS_Z: "+z", + PIPE_TEX_FACE_NEG_Z: "-z", + }[self.face] + else: + face = "" + return "%s %s %ux%ux%u last_level=%u face=%s level=%u zslice=%u" % ( + target, format, + 
self.width, self.height, self.depth, self.last_level, + face, self.level, self.zslice, + ) + + def test(self): + dev = self.dev + + target = self.target + format = self.format + width = self.width + height = self.height + depth = self.depth + last_level = self.last_level + face = self.face + level = self.level + zslice = self.zslice + + tex_usage = PIPE_TEXTURE_USAGE_SAMPLER + geom_flags = 0 + if width != height: + geom_flags |= PIPE_TEXTURE_GEOM_NON_SQUARE + if not is_pot(width) or not is_pot(height) or not is_pot(depth): + geom_flags |= PIPE_TEXTURE_GEOM_NON_POWER_OF_TWO + + if not dev.is_format_supported(format, target, tex_usage, geom_flags): + raise TestSkip + + ctx = self.dev.context_create() + + # disabled blending/masking + blend = Blend() + blend.rgb_src_factor = PIPE_BLENDFACTOR_ONE + blend.alpha_src_factor = PIPE_BLENDFACTOR_ONE + blend.rgb_dst_factor = PIPE_BLENDFACTOR_ZERO + blend.alpha_dst_factor = PIPE_BLENDFACTOR_ZERO + blend.colormask = PIPE_MASK_RGBA + ctx.set_blend(blend) + + # no-op depth/stencil/alpha + depth_stencil_alpha = DepthStencilAlpha() + ctx.set_depth_stencil_alpha(depth_stencil_alpha) + + # rasterizer + rasterizer = Rasterizer() + rasterizer.front_winding = PIPE_WINDING_CW + rasterizer.cull_mode = PIPE_WINDING_NONE + rasterizer.bypass_clipping = 1 + #rasterizer.bypass_vs = 1 + ctx.set_rasterizer(rasterizer) + + # viewport (identity, we setup vertices in wincoords) + viewport = Viewport() + scale = FloatArray(4) + scale[0] = 1.0 + scale[1] = 1.0 + scale[2] = 1.0 + scale[3] = 1.0 + viewport.scale = scale + translate = FloatArray(4) + translate[0] = 0.0 + translate[1] = 0.0 + translate[2] = 0.0 + translate[3] = 0.0 + viewport.translate = translate + ctx.set_viewport(viewport) + + # samplers + sampler = Sampler() + sampler.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE + sampler.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE + sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE + sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NEAREST + sampler.min_img_filter = PIPE_TEX_MIPFILTER_NEAREST + sampler.mag_img_filter = PIPE_TEX_MIPFILTER_NEAREST + sampler.normalized_coords = 1 + sampler.min_lod = 0 + sampler.max_lod = PIPE_MAX_TEXTURE_LEVELS - 1 + ctx.set_sampler(0, sampler) + + # texture + texture = dev.texture_create( + target = target, + format = format, + width = width, + height = height, + depth = depth, + last_level = last_level, + tex_usage = tex_usage, + ) + + expected_rgba = FloatArray(height*width*4) + texture.get_surface( + usage = PIPE_BUFFER_USAGE_CPU_READ|PIPE_BUFFER_USAGE_CPU_WRITE, + face = face, + level = level, + zslice = zslice, + ).sample_rgba(expected_rgba) + + ctx.set_sampler_texture(0, texture) + + # framebuffer + cbuf_tex = dev.texture_create( + PIPE_FORMAT_A8R8G8B8_UNORM, + width, + height, + tex_usage = PIPE_TEXTURE_USAGE_RENDER_TARGET, + ) + + cbuf = cbuf_tex.get_surface(usage = PIPE_BUFFER_USAGE_GPU_WRITE|PIPE_BUFFER_USAGE_GPU_READ) + fb = Framebuffer() + fb.width = width + fb.height = height + fb.num_cbufs = 1 + fb.set_cbuf(0, cbuf) + ctx.set_framebuffer(fb) + ctx.surface_clear(cbuf, 0x00000000) + del fb + + # vertex shader + vs = Shader(''' + VERT1.1 + DCL IN[0], POSITION, CONSTANT + DCL IN[1], GENERIC, CONSTANT + DCL OUT[0], POSITION, CONSTANT + DCL OUT[1], GENERIC, CONSTANT + 0:MOV OUT[0], IN[0] + 1:MOV OUT[1], IN[1] + 2:END + ''') + #vs.dump() + ctx.set_vertex_shader(vs) + + # fragment shader + op = { + PIPE_TEXTURE_1D: "1D", + PIPE_TEXTURE_2D: "2D", + PIPE_TEXTURE_3D: "3D", + PIPE_TEXTURE_CUBE: "CUBE", + }[target] + fs = Shader(''' + FRAG1.1 + DCL IN[0], 
GENERIC[0], LINEAR + DCL OUT[0], COLOR, CONSTANT + DCL SAMP[0], CONSTANT + 0:TEX OUT[0], IN[0], SAMP[0], %s + 1:END + ''' % op) + #fs.dump() + ctx.set_fragment_shader(fs) + + nverts = 4 + nattrs = 2 + verts = FloatArray(nverts * nattrs * 4) + + x = 0 + y = 0 + w, h = minify((width, height), level) + + pos = [ + [x, y], + [x+w, y], + [x+w, y+h], + [x, y+h], + ] + + tex = tex_coords(texture, face, level, zslice) + + for i in range(0, 4): + j = 8*i + verts[j + 0] = pos[i][0] # x + verts[j + 1] = pos[i][1] # y + verts[j + 2] = 0.0 # z + verts[j + 3] = 1.0 # w + verts[j + 4] = tex[i][0] # s + verts[j + 5] = tex[i][1] # r + verts[j + 6] = tex[i][2] # q + verts[j + 7] = 1.0 + + ctx.draw_vertices(PIPE_PRIM_TRIANGLE_FAN, + nverts, + nattrs, + verts) + + ctx.flush() + + cbuf = cbuf_tex.get_surface(usage = PIPE_BUFFER_USAGE_CPU_READ) + + total = h*w + different = cbuf.compare_tile_rgba(x, y, w, h, expected_rgba, tol=4.0/256) + if different: + sys.stderr.write("%u out of %u pixels differ\n" % (different, total)) + + if float(total - different)/float(total) < 0.85: + + if 0: + rgba = FloatArray(h*w*4) + cbuf.get_tile_rgba(x, y, w, h, rgba) + show_image(w, h, Result=rgba, Expected=expected_rgba) + save_image(w, h, rgba, "result.png") + save_image(w, h, expected_rgba, "expected.png") + #sys.exit(0) + + raise TestFailure + + del ctx + + + +def main(): + dev = Device() + suite = TestSuite() + + targets = [] + targets += [PIPE_TEXTURE_2D] + targets += [PIPE_TEXTURE_CUBE] + targets += [PIPE_TEXTURE_3D] + + formats = [] + formats += [PIPE_FORMAT_A8R8G8B8_UNORM] + formats += [PIPE_FORMAT_R5G6B5_UNORM] + formats += [PIPE_FORMAT_L8_UNORM] + formats += [PIPE_FORMAT_YCBCR] + formats += [PIPE_FORMAT_DXT1_RGB] + + sizes = [64, 32, 16, 8, 4, 2, 1] + #sizes = [1020, 508, 252, 62, 30, 14, 6, 3] + #sizes = [64] + #sizes = [63] + + for target in targets: + for format in formats: + for size in sizes: + if target == PIPE_TEXTURE_CUBE: + faces = [ + PIPE_TEX_FACE_POS_X, + PIPE_TEX_FACE_NEG_X, + PIPE_TEX_FACE_POS_Y, + PIPE_TEX_FACE_NEG_Y, + PIPE_TEX_FACE_POS_Z, + PIPE_TEX_FACE_NEG_Z, + ] + #faces = [PIPE_TEX_FACE_NEG_X] + else: + faces = [0] + if target == PIPE_TEXTURE_3D: + depth = size + else: + depth = 1 + for face in faces: + levels = lods(size) + for last_level in range(levels): + for level in range(0, last_level + 1): + zslice = 0 + while zslice < depth >> level: + test = TextureTest( + dev = dev, + target = target, + format = format, + width = size, + height = size, + depth = depth, + last_level = last_level, + face = face, + level = level, + zslice = zslice, + ) + suite.add_test(test) + zslice = (zslice + 1)*2 - 1 + suite.run() + + +if __name__ == '__main__': + main() diff --git a/src/gallium/winsys/Makefile b/src/gallium/winsys/Makefile new file mode 100644 index 0000000000..2360a6a94a --- /dev/null +++ b/src/gallium/winsys/Makefile @@ -0,0 +1,24 @@ +TOP = ../../.. +include $(TOP)/configs/current + + +SUBDIRS = $(GALLIUM_WINSYS_DIRS) + + +default: subdirs + + +subdirs: + @for dir in $(SUBDIRS) ; do \ + if [ -d $$dir ] ; then \ + (cd $$dir && $(MAKE)) || exit 1 ; \ + fi \ + done + + +clean: + rm -f `find . 
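The test driver above walks every target/format/size combination, deriving the mipmap chain length with lods() and shrinking the rendered quad with minify(). C equivalents of those two helpers, for reference (this single-dimension lods() matches the Python version whenever the dimensions are equal, as they are in main()):

static int lods(unsigned size)
{
   int n = 0;
   while (size) {        /* count levels down to (and including) 1x1 */
      n++;
      size >>= 1;
   }
   return n;
}

static unsigned minify(unsigned dim, unsigned level)
{
   unsigned d = dim >> level;
   return d ? d : 1;     /* never smaller than one texel */
}

/* e.g. lods(64) == 7 (64, 32, 16, 8, 4, 2, 1) and minify(64, 3) == 8 */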
-name \*.[oa]` + + +# Dummy install target +install: diff --git a/src/gallium/winsys/SConscript b/src/gallium/winsys/SConscript new file mode 100644 index 0000000000..30c3378dff --- /dev/null +++ b/src/gallium/winsys/SConscript @@ -0,0 +1,16 @@ +Import('*') + +if env['dri']: + SConscript([ + 'drm/SConscript', + ]) + +if 'xlib' in env['winsys']: + SConscript([ + 'xlib/SConscript', + ]) + +if 'gdi' in env['winsys']: + SConscript([ + 'gdi/SConscript', + ]) diff --git a/src/gallium/winsys/drm/Makefile b/src/gallium/winsys/drm/Makefile new file mode 100644 index 0000000000..f466ce6c3c --- /dev/null +++ b/src/gallium/winsys/drm/Makefile @@ -0,0 +1,38 @@ +# src/mesa/drivers/dri/Makefile + +TOP = ../../../.. + +include $(TOP)/configs/current + + + +default: $(TOP)/$(LIB_DIR) subdirs + + +$(TOP)/$(LIB_DIR): + -mkdir $(TOP)/$(LIB_DIR) + + +subdirs: + @for dir in $(DRI_DIRS) ; do \ + if [ -d $$dir ] ; then \ + (cd $$dir && $(MAKE)) || exit 1 ; \ + fi \ + done + + +install: + @for dir in $(DRI_DIRS) ; do \ + if [ -d $$dir ] ; then \ + (cd $$dir && $(MAKE) install) || exit 1 ; \ + fi \ + done + + +clean: + @for dir in $(DRI_DIRS) ; do \ + if [ -d $$dir ] ; then \ + (cd $$dir && $(MAKE) clean) ; \ + fi \ + done + -rm -f common/*.o diff --git a/src/gallium/winsys/drm/Makefile.template b/src/gallium/winsys/drm/Makefile.template new file mode 100644 index 0000000000..80e817b808 --- /dev/null +++ b/src/gallium/winsys/drm/Makefile.template @@ -0,0 +1,125 @@ +# -*-makefile-*- + +MESA_MODULES = \ + $(TOP)/src/mesa/libmesa.a \ + $(GALLIUM_AUXILIARIES) + +COMMON_GALLIUM_SOURCES = \ + $(TOP)/src/mesa/drivers/dri/common/utils.c \ + $(TOP)/src/mesa/drivers/dri/common/vblank.c \ + $(TOP)/src/mesa/drivers/dri/common/dri_util.c \ + $(TOP)/src/mesa/drivers/dri/common/xmlconfig.c + +COMMON_SOURCES = $(COMMON_GALLIUM_SOURCES) \ + $(TOP)/src/mesa/drivers/common/driverfuncs.c \ + $(TOP)/src/mesa/drivers/dri/common/texmem.c \ + $(TOP)/src/mesa/drivers/dri/common/drirenderbuffer.c + +COMMON_BM_SOURCES = \ + $(TOP)/src/mesa/drivers/dri/common/dri_bufmgr.c \ + $(TOP)/src/mesa/drivers/dri/common/dri_drmpool.c + + +ifeq ($(WINDOW_SYSTEM),dri) +WINOBJ= +WINLIB= +INCLUDES = $(SHARED_INCLUDES) $(EXPAT_INCLUDES) + +OBJECTS = \ + $(C_SOURCES:.c=.o) \ + $(ASM_SOURCES:.S=.o) + +else +# miniglx +WINOBJ= +WINLIB=-L$(MESA)/src/glx/mini +MINIGLX_INCLUDES = -I$(TOP)/src/glx/mini +INCLUDES = $(MINIGLX_INCLUDES) \ + $(SHARED_INCLUDES) \ + $(PCIACCESS_CFLAGS) + +OBJECTS = $(C_SOURCES:.c=.o) \ + $(MINIGLX_SOURCES:.c=.o) \ + $(ASM_SOURCES:.S=.o) +endif + + +### Include directories +SHARED_INCLUDES = \ + -I. 
\ + -I$(TOP)/src/mesa/drivers/dri/common \ + -Iserver \ + -I$(TOP)/include \ + -I$(TOP)/include/GL/internal \ + -I$(TOP)/src/gallium/include \ + -I$(TOP)/src/gallium/auxiliary \ + -I$(TOP)/src/gallium/drivers \ + -I$(TOP)/src/gallium/winsys/common \ + -I$(TOP)/src/mesa \ + -I$(TOP)/src/mesa/main \ + -I$(TOP)/src/mesa/glapi \ + -I$(TOP)/src/mesa/math \ + -I$(TOP)/src/mesa/transform \ + -I$(TOP)/src/mesa/shader \ + -I$(TOP)/src/mesa/swrast \ + -I$(TOP)/src/mesa/swrast_setup \ + -I$(TOP)/src/egl/main \ + -I$(TOP)/src/egl/drivers/dri \ + $(LIBDRM_CFLAGS) + + +##### RULES ##### + +.c.o: + $(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES) $< -o $@ + +.S.o: + $(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES) $< -o $@ + + +##### TARGETS ##### + +default: depend symlinks $(LIBNAME) $(TOP)/$(LIB_DIR)/$(LIBNAME) $(LIBNAME_EGL) $(TOP)/$(LIB_DIR)/$(LIBNAME_EGL) + + +$(LIBNAME): $(OBJECTS) $(MESA_MODULES) $(PIPE_DRIVERS) $(WINOBJ) Makefile $(TOP)/src/mesa/drivers/dri/Makefile.template + $(TOP)/bin/mklib -noprefix -o $@ \ + $(OBJECTS) $(PIPE_DRIVERS) $(MESA_MODULES) $(WINOBJ) $(DRI_LIB_DEPS) + +$(LIBNAME_EGL): $(WINSYS_OBJECTS) $(LIBS) + $(TOP)/bin/mklib -o $(LIBNAME_EGL) \ + -linker "$(CC)" \ + -noprefix \ + $(OBJECTS) $(MKLIB_OPTIONS) $(WINSYS_OBJECTS) $(PIPE_DRIVERS) $(WINOBJ) $(DRI_LIB_DEPS) \ + --whole-archive $(LIBS) $(GALLIUM_AUXILIARIES) --no-whole-archive + +$(TOP)/$(LIB_DIR)/$(LIBNAME): $(LIBNAME) + $(INSTALL) $(LIBNAME) $(TOP)/$(LIB_DIR) + +$(TOP)/$(LIB_DIR)/$(LIBNAME_EGL): $(LIBNAME_EGL) + $(INSTALL) $(LIBNAME_EGL) $(TOP)/$(LIB_DIR) + +depend: $(C_SOURCES) $(ASM_SOURCES) $(SYMLINKS) + rm -f depend + touch depend + $(MKDEP) $(MKDEP_OPTIONS) $(DRIVER_DEFINES) $(INCLUDES) $(C_SOURCES) \ + $(ASM_SOURCES) 2> /dev/null + + +# Emacs tags +tags: + etags `find . 
-name \*.[ch]` `find ../include` + + +# Remove .o and backup files +clean: + -rm -f *.o */*.o *~ *.so *~ server/*.o $(SYMLINKS) + -rm -f depend depend.bak + + +install: $(LIBNAME) + $(INSTALL) -d $(DRI_DRIVER_INSTALL_DIR) + $(INSTALL) -m 755 $(LIBNAME) $(DRI_DRIVER_INSTALL_DIR) + + +include depend diff --git a/src/gallium/winsys/drm/SConscript b/src/gallium/winsys/drm/SConscript new file mode 100644 index 0000000000..aef5210a32 --- /dev/null +++ b/src/gallium/winsys/drm/SConscript @@ -0,0 +1,54 @@ +Import('*') + +if env['dri']: + + drienv = env.Clone() + + drienv.Replace(CPPPATH = [ + '#src/mesa/drivers/dri/common', + '#include', + '#include/GL/internal', + '#src/gallium/include', + '#src/gallium/auxiliary', + '#src/gallium/drivers', + '#src/mesa', + '#src/mesa/main', + '#src/mesa/glapi', + '#src/mesa/math', + '#src/mesa/transform', + '#src/mesa/shader', + '#src/mesa/swrast', + '#src/mesa/swrast_setup', + '#src/egl/main', + '#src/egl/drivers/dri', + ]) + + drienv.ParseConfig('pkg-config --cflags --libs libdrm') + + COMMON_GALLIUM_SOURCES = [ + '#src/mesa/drivers/dri/common/utils.c', + '#src/mesa/drivers/dri/common/vblank.c', + '#src/mesa/drivers/dri/common/dri_util.c', + '#src/mesa/drivers/dri/common/xmlconfig.c', + ] + + COMMON_BM_SOURCES = [ + '#src/mesa/drivers/dri/common/dri_bufmgr.c', + '#src/mesa/drivers/dri/common/dri_drmpool.c', + ] + + Export([ + 'drienv', + 'COMMON_GALLIUM_SOURCES', + 'COMMON_BM_SOURCES', + ]) + + # TODO: Installation + #install: $(LIBNAME) + # $(INSTALL) -d $(DRI_DRIVER_INSTALL_DIR) + # $(INSTALL) -m 755 $(LIBNAME) $(DRI_DRIVER_INSTALL_DIR) + + if 'intel' in env['winsys']: + SConscript([ + 'intel/SConscript', + ]) diff --git a/src/gallium/winsys/drm/intel/Makefile b/src/gallium/winsys/drm/intel/Makefile new file mode 100644 index 0000000000..a670ac044d --- /dev/null +++ b/src/gallium/winsys/drm/intel/Makefile @@ -0,0 +1,25 @@ +TOP = ../../../../.. +include $(TOP)/configs/current + + +SUBDIRS = common dri egl + + +default: subdirs + + +subdirs: + @for dir in $(SUBDIRS) ; do \ + if [ -d $$dir ] ; then \ + (cd $$dir && $(MAKE)) || exit 1 ; \ + fi \ + done + + +clean: + rm -f `find . -name \*.[oa]` + rm -f `find . -name depend` + + +# Dummy install target +install: diff --git a/src/gallium/winsys/drm/intel/common/Makefile b/src/gallium/winsys/drm/intel/common/Makefile new file mode 100644 index 0000000000..bf1a7d691f --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/Makefile @@ -0,0 +1,23 @@ +TOP = ../../../../../.. +include $(TOP)/configs/current + +LIBNAME = inteldrm + +C_SOURCES = \ + intel_be_batchbuffer.c \ + intel_be_context.c \ + intel_be_device.c \ + ws_dri_bufmgr.c \ + ws_dri_drmpool.c \ + ws_dri_fencemgr.c \ + ws_dri_mallocpool.c \ + ws_dri_slabpool.c + + +include ./Makefile.template + +DRIVER_DEFINES = $(shell pkg-config libdrm --cflags \ + && pkg-config libdrm --atleast-version=2.3.1 \ + && echo "-DDRM_VBLANK_FLIP=DRM_VBLANK_FLIP") +symlinks: + diff --git a/src/gallium/winsys/drm/intel/common/Makefile.template b/src/gallium/winsys/drm/intel/common/Makefile.template new file mode 100644 index 0000000000..02ed363a43 --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/Makefile.template @@ -0,0 +1,64 @@ +# -*-makefile-*- + + +# We still have a dependency on the "dri" buffer manager. Most likely +# the interface can be reused in non-dri environments, and also as a +# frontend to simpler memory managers. 
+# +COMMON_SOURCES = + +OBJECTS = $(C_SOURCES:.c=.o) \ + $(CPP_SOURCES:.cpp=.o) \ + $(ASM_SOURCES:.S=.o) + + +### Include directories +INCLUDES = \ + -I. \ + -I$(TOP)/src/gallium/include \ + -I$(TOP)/src/gallium/auxiliary \ + -I$(TOP)/src/gallium/drivers \ + -I$(TOP)/include \ + $(DRIVER_INCLUDES) + + +##### RULES ##### + +.c.o: + $(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES) $< -o $@ + +.cpp.o: + $(CXX) -c $(INCLUDES) $(CXXFLAGS) $(DRIVER_DEFINES) $< -o $@ + +.S.o: + $(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES) $< -o $@ + + +##### TARGETS ##### + +default: depend symlinks $(LIBNAME) + + +$(LIBNAME): $(OBJECTS) Makefile Makefile.template + $(TOP)/bin/mklib -o $@ -static $(OBJECTS) $(DRIVER_LIBS) + + +depend: $(C_SOURCES) $(CPP_SOURCES) $(ASM_SOURCES) $(SYMLINKS) + rm -f depend + touch depend + $(MKDEP) $(MKDEP_OPTIONS) $(DRIVER_DEFINES) $(INCLUDES) $(C_SOURCES) $(CPP_SOURCES) \ + $(ASM_SOURCES) 2> /dev/null + + +# Emacs tags +tags: + etags `find . -name \*.[ch]` `find ../include` + + +# Remove .o and backup files +clean:: + -rm -f *.o */*.o *~ *.so *~ server/*.o $(SYMLINKS) + -rm -f depend depend.bak + + +include depend diff --git a/src/gallium/winsys/drm/intel/common/intel_be_batchbuffer.c b/src/gallium/winsys/drm/intel/common/intel_be_batchbuffer.c new file mode 100644 index 0000000000..bc13a5761e --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/intel_be_batchbuffer.c @@ -0,0 +1,429 @@ + +#include "intel_be_batchbuffer.h" +#include "intel_be_context.h" +#include "intel_be_device.h" +#include <errno.h> + +#include "xf86drm.h" + +static void +intel_realloc_relocs(struct intel_be_batchbuffer *batch, int num_relocs) +{ + unsigned long size = num_relocs * I915_RELOC0_STRIDE + I915_RELOC_HEADER; + + size *= sizeof(uint32_t); + batch->reloc = realloc(batch->reloc, size); + batch->reloc_size = num_relocs; +} + + +void +intel_be_batchbuffer_reset(struct intel_be_batchbuffer *batch) +{ + /* + * Get a new, free batchbuffer. + */ + drmBO *bo; + struct drm_bo_info_req *req; + + driBOUnrefUserList(batch->list); + driBOResetList(batch->list); + + /* base.size is the size available to the i915simple driver */ + batch->base.size = batch->device->max_batch_size - BATCH_RESERVED; + batch->base.actual_size = batch->device->max_batch_size; + driBOData(batch->buffer, batch->base.actual_size, NULL, NULL, 0); + + /* + * Add the batchbuffer to the validate list. + */ + + driBOAddListItem(batch->list, batch->buffer, + DRM_BO_FLAG_EXE | DRM_BO_FLAG_MEM_TT, + DRM_BO_FLAG_EXE | DRM_BO_MASK_MEM, + &batch->dest_location, &batch->node); + + req = &batch->node->bo_arg.d.req.bo_req; + + /* + * Set up information needed for us to make relocations + * relative to the underlying drm buffer objects. + */ + + driReadLockKernelBO(); + bo = driBOKernel(batch->buffer); + req->presumed_offset = (uint64_t) bo->offset; + req->hint = DRM_BO_HINT_PRESUMED_OFFSET; + batch->drmBOVirtual = (uint8_t *) bo->virtual; + driReadUnlockKernelBO(); + + /* + * Adjust the relocation buffer size. + */ + + if (batch->reloc_size > INTEL_MAX_RELOCS || + batch->reloc == NULL) + intel_realloc_relocs(batch, INTEL_DEFAULT_RELOCS); + + assert(batch->reloc != NULL); + batch->reloc[0] = 0; /* No relocs yet. */ + batch->reloc[1] = 1; /* Reloc type 1 */ + batch->reloc[2] = 0; /* Only a single relocation list. */ + batch->reloc[3] = 0; /* Only a single relocation list. 
*/ + + batch->base.map = driBOMap(batch->buffer, DRM_BO_FLAG_WRITE, 0); + batch->poolOffset = driBOPoolOffset(batch->buffer); + batch->base.ptr = batch->base.map; + batch->dirty_state = ~0; + batch->nr_relocs = 0; + batch->flags = 0; + batch->id = 0;//batch->intel->intelScreen->batch_id++; +} + +/*====================================================================== + * Public functions + */ +struct intel_be_batchbuffer * +intel_be_batchbuffer_alloc(struct intel_be_context *intel) +{ + struct intel_be_batchbuffer *batch = calloc(sizeof(*batch), 1); + + batch->intel = intel; + batch->device = intel->device; + + driGenBuffers(intel->device->batchPool, "batchbuffer", 1, + &batch->buffer, 4096, + DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_EXE, 0); + batch->last_fence = NULL; + batch->list = driBOCreateList(20); + batch->reloc = NULL; + intel_be_batchbuffer_reset(batch); + return batch; +} + +void +intel_be_batchbuffer_free(struct intel_be_batchbuffer *batch) +{ + if (batch->last_fence) { + driFenceFinish(batch->last_fence, + DRM_FENCE_TYPE_EXE, FALSE); + driFenceUnReference(&batch->last_fence); + } + if (batch->base.map) { + driBOUnmap(batch->buffer); + batch->base.map = NULL; + } + driBOUnReference(batch->buffer); + driBOFreeList(batch->list); + if (batch->reloc) + free(batch->reloc); + batch->buffer = NULL; + free(batch); +} + +void +intel_be_offset_relocation(struct intel_be_batchbuffer *batch, + unsigned pre_add, + struct _DriBufferObject *driBO, + uint64_t val_flags, + uint64_t val_mask) +{ + int itemLoc; + struct _drmBONode *node; + uint32_t *reloc; + struct drm_bo_info_req *req; + + driBOAddListItem(batch->list, driBO, val_flags, val_mask, + &itemLoc, &node); + req = &node->bo_arg.d.req.bo_req; + + if (!(req->hint & DRM_BO_HINT_PRESUMED_OFFSET)) { + + /* + * Stop other threads from tampering with the underlying + * drmBO while we're reading its offset. 
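Taken together, the alloc/reset/free functions above and the flush path further down give the batchbuffer a simple lifecycle: allocate once per context, write commands through batch->base, flush when full, free at teardown. A hypothetical caller, using only the entry points declared in intel_be_batchbuffer.h (the submit_and_wait() wrapper itself is illustrative):

#include "intel_be_batchbuffer.h"

static void submit_and_wait(struct intel_be_context *intel)
{
   struct intel_be_batchbuffer *batch = intel_be_batchbuffer_alloc(intel);

   /* ... commands are emitted through batch->base (the i915_batchbuffer
    * seen by i915simple), with intel_be_offset_relocation() recording
    * every buffer address written into the stream ... */

   intel_be_batchbuffer_finish(batch);   /* flush + wait on the fence */
   intel_be_batchbuffer_free(batch);
}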
+ */ + + driReadLockKernelBO(); + req->presumed_offset = (uint64_t) driBOKernel(driBO)->offset; + driReadUnlockKernelBO(); + req->hint = DRM_BO_HINT_PRESUMED_OFFSET; + } + + pre_add += driBOPoolOffset(driBO); + + if (batch->nr_relocs == batch->reloc_size) + intel_realloc_relocs(batch, batch->reloc_size * 2); + + reloc = batch->reloc + + (I915_RELOC_HEADER + batch->nr_relocs * I915_RELOC0_STRIDE); + + reloc[0] = ((uint8_t *)batch->base.ptr - batch->drmBOVirtual); + i915_batchbuffer_dword(&batch->base, req->presumed_offset + pre_add); + reloc[1] = pre_add; + reloc[2] = itemLoc; + reloc[3] = batch->dest_location; + batch->nr_relocs++; +} + +static void +i915_drm_copy_reply(const struct drm_bo_info_rep * rep, drmBO * buf) +{ + buf->handle = rep->handle; + buf->flags = rep->flags; + buf->size = rep->size; + buf->offset = rep->offset; + buf->mapHandle = rep->arg_handle; + buf->proposedFlags = rep->proposed_flags; + buf->start = rep->buffer_start; + buf->fenceFlags = rep->fence_flags; + buf->replyFlags = rep->rep_flags; + buf->pageAlignment = rep->page_alignment; +} + +static int +i915_execbuf(struct intel_be_batchbuffer *batch, + unsigned int used, + boolean ignore_cliprects, + drmBOList *list, + struct drm_i915_execbuffer *ea) +{ +// struct intel_be_context *intel = batch->intel; + drmBONode *node; + drmMMListHead *l; + struct drm_i915_op_arg *arg, *first; + struct drm_bo_op_req *req; + struct drm_bo_info_rep *rep; + uint64_t *prevNext = NULL; + drmBO *buf; + int ret = 0; + uint32_t count = 0; + + first = NULL; + for (l = list->list.next; l != &list->list; l = l->next) { + node = DRMLISTENTRY(drmBONode, l, head); + + arg = &node->bo_arg; + req = &arg->d.req; + + if (!first) + first = arg; + + if (prevNext) + *prevNext = (unsigned long)arg; + + prevNext = &arg->next; + req->bo_req.handle = node->buf->handle; + req->op = drm_bo_validate; + req->bo_req.flags = node->arg0; + req->bo_req.mask = node->arg1; + req->bo_req.hint |= 0; + count++; + } + + memset(ea, 0, sizeof(*ea)); + ea->num_buffers = count; + ea->batch.start = batch->poolOffset; + ea->batch.used = used; +#if 0 /* ZZZ JB: no cliprects used */ + ea->batch.cliprects = intel->pClipRects; + ea->batch.num_cliprects = ignore_cliprects ? 0 : intel->numClipRects; + ea->batch.DR1 = 0; + ea->batch.DR4 = 0;((((GLuint) intel->drawX) & 0xffff) | + (((GLuint) intel->drawY) << 16)); +#else + ea->batch.cliprects = NULL; + ea->batch.num_cliprects = 0; + ea->batch.DR1 = 0; + ea->batch.DR4 = 0; +#endif + ea->fence_arg.flags = DRM_I915_FENCE_FLAG_FLUSHED; + ea->ops_list = (unsigned long) first; + first->reloc_ptr = (unsigned long) batch->reloc; + batch->reloc[0] = batch->nr_relocs; + + //return -EFAULT; + do { + ret = drmCommandWriteRead(batch->device->fd, DRM_I915_EXECBUFFER, ea, + sizeof(*ea)); + } while (ret == -EAGAIN); + + if (ret != 0) + return ret; + + for (l = list->list.next; l != &list->list; l = l->next) { + node = DRMLISTENTRY(drmBONode, l, head); + arg = &node->bo_arg; + rep = &arg->d.rep.bo_info; + + if (!arg->handled) { + return -EFAULT; + } + if (arg->d.rep.ret) + return arg->d.rep.ret; + + buf = node->buf; + i915_drm_copy_reply(rep, buf); + } + return 0; +} + +/* TODO: Push this whole function into bufmgr. 
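intel_be_offset_relocation() above appends one four-word record per relocation behind the small header that intel_be_batchbuffer_reset() initialised. A sketch of that stream layout, assuming the header is exactly the four words reset() writes (the struct names are illustrative; the real code indexes a flat uint32_t array via I915_RELOC_HEADER and I915_RELOC0_STRIDE):

#include <stdint.h>

struct reloc_header {
   uint32_t count;          /* reloc[0]: number of records, patched at submit */
   uint32_t type;           /* reloc[1]: relocation type, 1 */
   uint32_t reserved[2];    /* reloc[2..3]: only a single relocation list */
};

struct reloc_record {
   uint32_t batch_offset;   /* offset of the emitted address dword in the batch BO */
   uint32_t delta;          /* pre_add plus the target buffer's pool offset */
   uint32_t buffer_index;   /* itemLoc: position on the validate list */
   uint32_t target_index;   /* dest_location of the batch buffer itself */
};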
+ */ +static struct _DriFenceObject * +do_flush_locked(struct intel_be_batchbuffer *batch, + unsigned int used, + boolean ignore_cliprects, boolean allow_unlock) +{ + struct intel_be_context *intel = batch->intel; + struct _DriFenceObject *fo; + drmFence fence; + drmBOList *boList; + struct drm_i915_execbuffer ea; + int ret = 0; + + driBOValidateUserList(batch->list); + boList = driGetdrmBOList(batch->list); + +#if 0 /* ZZZ JB Allways run */ + if (!(intel->numClipRects == 0 && !ignore_cliprects)) { +#else + if (1) { +#endif + ret = i915_execbuf(batch, used, ignore_cliprects, boList, &ea); + } else { + driPutdrmBOList(batch->list); + fo = NULL; + goto out; + } + driPutdrmBOList(batch->list); + if (ret) + abort(); + + if (ea.fence_arg.error != 0) { + + /* + * The hardware has been idled by the kernel. + * Don't fence the driBOs. + */ + + if (batch->last_fence) + driFenceUnReference(&batch->last_fence); +#if 0 /* ZZZ JB: no _mesa_* funcs in gallium */ + _mesa_printf("fence error\n"); +#endif + batch->last_fence = NULL; + fo = NULL; + goto out; + } + + fence.handle = ea.fence_arg.handle; + fence.fence_class = ea.fence_arg.fence_class; + fence.type = ea.fence_arg.type; + fence.flags = ea.fence_arg.flags; + fence.signaled = ea.fence_arg.signaled; + + fo = driBOFenceUserList(batch->device->fenceMgr, batch->list, + "SuperFence", &fence); + + if (driFenceType(fo) & DRM_I915_FENCE_TYPE_RW) { + if (batch->last_fence) + driFenceUnReference(&batch->last_fence); + /* + * FIXME: Context last fence?? + */ + batch->last_fence = fo; + driFenceReference(fo); + } + out: +#if 0 /* ZZZ JB: fix this */ + intel->vtbl.lost_hardware(intel); +#else + (void)intel; +#endif + return fo; +} + + +struct _DriFenceObject * +intel_be_batchbuffer_flush(struct intel_be_batchbuffer *batch) +{ + struct intel_be_context *intel = batch->intel; + unsigned int used = batch->base.ptr - batch->base.map; + boolean was_locked = batch->intel->hardware_locked(intel); + struct _DriFenceObject *fence; + + if (used == 0) { + driFenceReference(batch->last_fence); + return batch->last_fence; + } + + /* Add the MI_BATCH_BUFFER_END. Always add an MI_FLUSH - this is a + * performance drain that we would like to avoid. + */ +#if 0 /* ZZZ JB: what should we do here? */ + if (used & 4) { + ((int *) batch->base.ptr)[0] = intel->vtbl.flush_cmd(); + ((int *) batch->base.ptr)[1] = 0; + ((int *) batch->base.ptr)[2] = MI_BATCH_BUFFER_END; + used += 12; + } + else { + ((int *) batch->base.ptr)[0] = intel->vtbl.flush_cmd(); + ((int *) batch->base.ptr)[1] = MI_BATCH_BUFFER_END; + used += 8; + } +#else + if (used & 4) { + ((int *) batch->base.ptr)[0] = ((0<<29)|(4<<23)); // MI_FLUSH; + ((int *) batch->base.ptr)[1] = 0; + ((int *) batch->base.ptr)[2] = (0xA<<23); // MI_BATCH_BUFFER_END; + used += 12; + } + else { + ((int *) batch->base.ptr)[0] = ((0<<29)|(4<<23)); // MI_FLUSH; + ((int *) batch->base.ptr)[1] = (0xA<<23); // MI_BATCH_BUFFER_END; + used += 8; + } +#endif + driBOUnmap(batch->buffer); + batch->base.ptr = NULL; + batch->base.map = NULL; + + /* TODO: Just pass the relocation list and dma buffer up to the + * kernel. 
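The flush path just above pads the batch before ending it: if the write pointer sits on an odd dword it emits MI_FLUSH, one padding dword and MI_BATCH_BUFFER_END (12 bytes), otherwise just MI_FLUSH and MI_BATCH_BUFFER_END (8 bytes), so the batch always ends on an 8-byte boundary. This tail is what the 16-byte BATCH_RESERVED headroom in intel_be_batchbuffer.h leaves room for. A one-line sketch of that size decision:

static unsigned batch_tail_bytes(unsigned used)
{
   /* MI_FLUSH + MI_BATCH_BUFFER_END, plus one dword of padding when the
    * batch currently ends on an odd dword boundary */
   return (used & 4) ? 12 : 8;
}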
+ */ + if (!was_locked) + intel->hardware_lock(intel); + + fence = do_flush_locked(batch, used, !(batch->flags & INTEL_BATCH_CLIPRECTS), + FALSE); + + if (!was_locked) + intel->hardware_unlock(intel); + + /* Reset the buffer: + */ + intel_be_batchbuffer_reset(batch); + return fence; +} + +void +intel_be_batchbuffer_finish(struct intel_be_batchbuffer *batch) +{ + struct _DriFenceObject *fence = intel_be_batchbuffer_flush(batch); + driFenceFinish(fence, driFenceType(fence), FALSE); + driFenceUnReference(&fence); +} + +#if 0 +void +intel_be_batchbuffer_data(struct intel_be_batchbuffer *batch, + const void *data, unsigned int bytes, unsigned int flags) +{ + assert((bytes & 3) == 0); + intel_batchbuffer_require_space(batch, bytes, flags); + memcpy(batch->base.ptr, data, bytes); + batch->base.ptr += bytes; +} +#endif diff --git a/src/gallium/winsys/drm/intel/common/intel_be_batchbuffer.h b/src/gallium/winsys/drm/intel/common/intel_be_batchbuffer.h new file mode 100644 index 0000000000..f150e3a674 --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/intel_be_batchbuffer.h @@ -0,0 +1,69 @@ + +#ifndef INTEL_BE_BATCHBUFFER_H +#define INTEL_BE_BATCHBUFFER_H + +#include "i915simple/i915_batch.h" + +#include "ws_dri_bufmgr.h" + +#define BATCH_RESERVED 16 + +#define INTEL_DEFAULT_RELOCS 100 +#define INTEL_MAX_RELOCS 400 + +#define INTEL_BATCH_NO_CLIPRECTS 0x1 +#define INTEL_BATCH_CLIPRECTS 0x2 + +struct intel_be_context; +struct intel_be_device; + +struct intel_be_batchbuffer +{ + struct i915_batchbuffer base; + + struct intel_be_context *intel; + struct intel_be_device *device; + + struct _DriBufferObject *buffer; + struct _DriFenceObject *last_fence; + uint32_t flags; + + struct _DriBufferList *list; + size_t list_count; + + uint32_t *reloc; + size_t reloc_size; + size_t nr_relocs; + + uint32_t dirty_state; + uint32_t id; + + uint32_t poolOffset; + uint8_t *drmBOVirtual; + struct _drmBONode *node; /* Validation list node for this buffer */ + int dest_location; /* Validation list sequence for this buffer */ +}; + +struct intel_be_batchbuffer * +intel_be_batchbuffer_alloc(struct intel_be_context *intel); + +void +intel_be_batchbuffer_free(struct intel_be_batchbuffer *batch); + +void +intel_be_batchbuffer_finish(struct intel_be_batchbuffer *batch); + +struct _DriFenceObject * +intel_be_batchbuffer_flush(struct intel_be_batchbuffer *batch); + +void +intel_be_batchbuffer_reset(struct intel_be_batchbuffer *batch); + +void +intel_be_offset_relocation(struct intel_be_batchbuffer *batch, + unsigned pre_add, + struct _DriBufferObject *driBO, + uint64_t val_flags, + uint64_t val_mask); + +#endif diff --git a/src/gallium/winsys/drm/intel/common/intel_be_context.c b/src/gallium/winsys/drm/intel/common/intel_be_context.c new file mode 100644 index 0000000000..1af39674f4 --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/intel_be_context.c @@ -0,0 +1,107 @@ + +/* + * Authors: Jakob Bornecrantz <jakob-at-tungstengraphics.com> + */ + +#include "ws_dri_fencemgr.h" +#include "intel_be_device.h" +#include "intel_be_context.h" +#include "intel_be_batchbuffer.h" + +static INLINE struct intel_be_context * +intel_be_context(struct i915_winsys *sws) +{ + return (struct intel_be_context *)sws; +} + +/* Simple batchbuffer interface: + */ + +static struct i915_batchbuffer* +intel_i915_batch_get(struct i915_winsys *sws) +{ + struct intel_be_context *intel = intel_be_context(sws); + return &intel->batch->base; +} + +static void intel_i915_batch_reloc(struct i915_winsys *sws, + struct pipe_buffer *buf, + unsigned 
access_flags, + unsigned delta) +{ + struct intel_be_context *intel = intel_be_context(sws); + + unsigned flags = DRM_BO_FLAG_MEM_TT; + unsigned mask = DRM_BO_MASK_MEM; + + if (access_flags & I915_BUFFER_ACCESS_WRITE) { + flags |= DRM_BO_FLAG_WRITE; + mask |= DRM_BO_FLAG_WRITE; + } + + if (access_flags & I915_BUFFER_ACCESS_READ) { + flags |= DRM_BO_FLAG_READ; + mask |= DRM_BO_FLAG_READ; + } + + intel_be_offset_relocation(intel->batch, + delta, + dri_bo(buf), + flags, + mask); +} + +static void intel_i915_batch_flush(struct i915_winsys *sws, + struct pipe_fence_handle **fence) +{ + struct intel_be_context *intel = intel_be_context(sws); + + union { + struct _DriFenceObject *dri; + struct pipe_fence_handle *pipe; + } fu; + + if (fence) + assert(!*fence); + + fu.dri = intel_be_batchbuffer_flush(intel->batch); + + if (!fu.dri) { + assert(0); + *fence = NULL; + return; + } + + if (fu.dri) { + if (fence) + *fence = fu.pipe; + else + driFenceUnReference(&fu.dri); + } + +} + +boolean +intel_be_init_context(struct intel_be_context *intel, struct intel_be_device *device) +{ + assert(intel); + assert(device); + + intel->device = device; + + /* TODO move framebuffer createion to the driver */ + + intel->base.batch_get = intel_i915_batch_get; + intel->base.batch_reloc = intel_i915_batch_reloc; + intel->base.batch_flush = intel_i915_batch_flush; + + intel->batch = intel_be_batchbuffer_alloc(intel); + + return true; +} + +void +intel_be_destroy_context(struct intel_be_context *intel) +{ + intel_be_batchbuffer_free(intel->batch); +} diff --git a/src/gallium/winsys/drm/intel/common/intel_be_context.h b/src/gallium/winsys/drm/intel/common/intel_be_context.h new file mode 100644 index 0000000000..d5cbc93594 --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/intel_be_context.h @@ -0,0 +1,40 @@ +/* These need to be diffrent from the intel winsys */ +#ifndef INTEL_BE_CONTEXT_H +#define INTEL_BE_CONTEXT_H + +#include "i915simple/i915_winsys.h" + +struct intel_be_context +{ + /** Interface to i915simple driver */ + struct i915_winsys base; + + struct intel_be_device *device; + struct intel_be_batchbuffer *batch; + + /* + * Hardware lock functions. + * + * Needs to be filled in by the winsys. + */ + void (*hardware_lock)(struct intel_be_context *context); + void (*hardware_unlock)(struct intel_be_context *context); + boolean (*hardware_locked)(struct intel_be_context *context); +}; + +/** + * Intialize a allocated intel_be_context struct. + * + * Remember to set the hardware_* functions. + */ +boolean +intel_be_init_context(struct intel_be_context *intel, + struct intel_be_device *device); + +/** + * Destroy a intel_be_context. + * Does not free the struct that is up to the winsys. 
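intel_be_init_context() above fills in the i915_winsys callbacks but deliberately leaves hardware_lock/hardware_unlock/hardware_locked for the window system to provide, as the header comments say. A hypothetical winsys-side wrapper sketched against that header (my_context and the my_* lock functions are illustrative, not part of the diff):

#include "intel_be_context.h"

struct my_context {
   struct intel_be_context base;   /* base first, so winsys casts keep working */
   /* ... winsys-specific state, e.g. the DRI lock ... */
};

static void my_lock(struct intel_be_context *ctx)     { (void)ctx; /* take HW lock */ }
static void my_unlock(struct intel_be_context *ctx)   { (void)ctx; /* drop HW lock */ }
static boolean my_locked(struct intel_be_context *ctx) { (void)ctx; return 1; }

static boolean my_context_init(struct my_context *ctx,
                               struct intel_be_device *dev)
{
   ctx->base.hardware_lock = my_lock;
   ctx->base.hardware_unlock = my_unlock;
   ctx->base.hardware_locked = my_locked;
   return intel_be_init_context(&ctx->base, dev);
}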
+ */ +void +intel_be_destroy_context(struct intel_be_context *intel); +#endif diff --git a/src/gallium/winsys/drm/intel/common/intel_be_device.c b/src/gallium/winsys/drm/intel/common/intel_be_device.c new file mode 100644 index 0000000000..019ee5cbd2 --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/intel_be_device.c @@ -0,0 +1,308 @@ + + +/* + * Authors: Keith Whitwell <keithw-at-tungstengraphics-dot-com> + * Jakob Bornecrantz <jakob-at-tungstengraphics-dot-com> + */ + +#include "intel_be_device.h" +#include "ws_dri_bufmgr.h" +#include "ws_dri_bufpool.h" +#include "ws_dri_fencemgr.h" + +#include "pipe/p_winsys.h" +#include "pipe/p_defines.h" +#include "pipe/p_state.h" +#include "pipe/p_inlines.h" +#include "util/u_memory.h" + +#include "i915simple/i915_screen.h" + +/* Turn a pipe winsys into an intel/pipe winsys: + */ +static INLINE struct intel_be_device * +intel_be_device( struct pipe_winsys *winsys ) +{ + return (struct intel_be_device *)winsys; +} + + +/* + * Buffer functions. + * + * Most callbacks map direcly onto dri_bufmgr operations: + */ + +static void *intel_be_buffer_map(struct pipe_winsys *winsys, + struct pipe_buffer *buf, + unsigned flags ) +{ + unsigned drm_flags = 0; + + if (flags & PIPE_BUFFER_USAGE_CPU_WRITE) + drm_flags |= DRM_BO_FLAG_WRITE; + + if (flags & PIPE_BUFFER_USAGE_CPU_READ) + drm_flags |= DRM_BO_FLAG_READ; + + return driBOMap( dri_bo(buf), drm_flags, 0 ); +} + +static void intel_be_buffer_unmap(struct pipe_winsys *winsys, + struct pipe_buffer *buf) +{ + driBOUnmap( dri_bo(buf) ); +} + +static void +intel_be_buffer_destroy(struct pipe_winsys *winsys, + struct pipe_buffer *buf) +{ + driBOUnReference( dri_bo(buf) ); + FREE(buf); +} + +static struct pipe_buffer * +intel_be_buffer_create(struct pipe_winsys *winsys, + unsigned alignment, + unsigned usage, + unsigned size ) +{ + struct intel_be_buffer *buffer = CALLOC_STRUCT( intel_be_buffer ); + struct intel_be_device *iws = intel_be_device(winsys); + unsigned flags = 0; + struct _DriBufferPool *pool; + + buffer->base.refcount = 1; + buffer->base.alignment = alignment; + buffer->base.usage = usage; + buffer->base.size = size; + + if (usage & (PIPE_BUFFER_USAGE_VERTEX | PIPE_BUFFER_USAGE_CONSTANT)) { + flags |= DRM_BO_FLAG_MEM_LOCAL | DRM_BO_FLAG_CACHED; + pool = iws->mallocPool; + } else if (usage & PIPE_BUFFER_USAGE_CUSTOM) { + /* For vertex buffers */ + flags |= DRM_BO_FLAG_MEM_VRAM | DRM_BO_FLAG_MEM_TT; + pool = iws->vertexPool; + } else { + flags |= DRM_BO_FLAG_MEM_VRAM | DRM_BO_FLAG_MEM_TT; + pool = iws->regionPool; + } + + if (usage & PIPE_BUFFER_USAGE_GPU_READ) + flags |= DRM_BO_FLAG_READ; + + if (usage & PIPE_BUFFER_USAGE_GPU_WRITE) + flags |= DRM_BO_FLAG_WRITE; + + /* drm complains if we don't set any read/write flags. 
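The buffer callbacks above give pipe drivers a uniform CPU path: create with usage flags (which also pick the backing pool), map, write, unmap, and eventually destroy. A hypothetical helper showing that round trip through the pipe_winsys interface wired up in this file (upload_bytes() itself is illustrative):

#include <string.h>
#include "pipe/p_winsys.h"
#include "pipe/p_defines.h"

static struct pipe_buffer *
upload_bytes(struct pipe_winsys *ws, const void *data, unsigned size)
{
   struct pipe_buffer *buf =
      ws->buffer_create(ws, 64, PIPE_BUFFER_USAGE_CPU_WRITE, size);
   void *map;

   if (!buf)
      return NULL;
   map = ws->buffer_map(ws, buf, PIPE_BUFFER_USAGE_CPU_WRITE);
   if (map) {
      memcpy(map, data, size);
      ws->buffer_unmap(ws, buf);
   }
   return buf;               /* released later with ws->buffer_destroy() */
}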
+ */ + if ((flags & (DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE)) == 0) + flags |= DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE; + + buffer->pool = pool; + driGenBuffers( buffer->pool, + "pipe buffer", 1, &buffer->driBO, alignment, flags, 0 ); + + driBOData( buffer->driBO, size, NULL, buffer->pool, 0 ); + + return &buffer->base; +} + + +static struct pipe_buffer * +intel_be_user_buffer_create(struct pipe_winsys *winsys, void *ptr, unsigned bytes) +{ + struct intel_be_buffer *buffer = CALLOC_STRUCT( intel_be_buffer ); + struct intel_be_device *iws = intel_be_device(winsys); + + driGenUserBuffer( iws->regionPool, + "pipe user buffer", &buffer->driBO, ptr, bytes ); + + buffer->base.refcount = 1; + + return &buffer->base; +} + +struct pipe_buffer * +intel_be_buffer_from_handle(struct intel_be_device *device, + const char* name, unsigned handle) +{ + struct intel_be_buffer *be_buf = malloc(sizeof(*be_buf)); + struct pipe_buffer *buffer; + + if (!be_buf) + goto err; + + memset(be_buf, 0, sizeof(*be_buf)); + + driGenBuffers(device->staticPool, name, 1, &be_buf->driBO, 0, 0, 0); + driBOSetReferenced(be_buf->driBO, handle); + + if (0) /** XXX TODO check error */ + goto err_bo; + + buffer = &be_buf->base; + buffer->refcount = 1; + buffer->alignment = 0; + buffer->usage = 0; + buffer->size = driBOSize(be_buf->driBO); + + return buffer; +err_bo: + free(be_buf); +err: + return NULL; +} + + +/* + * Surface functions. + * + * Deprecated! + */ + +static struct pipe_surface * +intel_i915_surface_alloc(struct pipe_winsys *winsys) +{ + assert((size_t)"intel_i915_surface_alloc is deprecated" & 0); + return NULL; +} + +static int +intel_i915_surface_alloc_storage(struct pipe_winsys *winsys, + struct pipe_surface *surf, + unsigned width, unsigned height, + enum pipe_format format, + unsigned flags, + unsigned tex_usage) +{ + assert((size_t)"intel_i915_surface_alloc_storage is deprecated" & 0); + return -1; +} + +static void +intel_i915_surface_release(struct pipe_winsys *winsys, struct pipe_surface **s) +{ + assert((size_t)"intel_i915_surface_release is deprecated" & 0); +} + + +/* + * Fence functions + */ + +static void +intel_be_fence_reference( struct pipe_winsys *sws, + struct pipe_fence_handle **ptr, + struct pipe_fence_handle *fence ) +{ + if (*ptr) + driFenceUnReference((struct _DriFenceObject **)ptr); + + if (fence) + *ptr = (struct pipe_fence_handle *)driFenceReference((struct _DriFenceObject *)fence); +} + +static int +intel_be_fence_signalled( struct pipe_winsys *sws, + struct pipe_fence_handle *fence, + unsigned flag ) +{ + return driFenceSignaled((struct _DriFenceObject *)fence, flag); +} + +static int +intel_be_fence_finish( struct pipe_winsys *sws, + struct pipe_fence_handle *fence, + unsigned flag ) +{ + return driFenceFinish((struct _DriFenceObject *)fence, flag, 0); +} + + +/* + * Misc functions + */ + +boolean +intel_be_init_device(struct intel_be_device *dev, int fd, unsigned id) +{ + dev->fd = fd; + dev->max_batch_size = 16 * 4096; + dev->max_vertex_size = 128 * 4096; + + dev->base.buffer_create = intel_be_buffer_create; + dev->base.user_buffer_create = intel_be_user_buffer_create; + dev->base.buffer_map = intel_be_buffer_map; + dev->base.buffer_unmap = intel_be_buffer_unmap; + dev->base.buffer_destroy = intel_be_buffer_destroy; + dev->base.surface_alloc = intel_i915_surface_alloc; + dev->base.surface_alloc_storage = intel_i915_surface_alloc_storage; + dev->base.surface_release = intel_i915_surface_release; + dev->base.fence_reference = intel_be_fence_reference; + dev->base.fence_signalled = 
intel_be_fence_signalled; + dev->base.fence_finish = intel_be_fence_finish; + +#if 0 /* Set by the winsys */ + dev->base.flush_frontbuffer = intel_flush_frontbuffer; + dev->base.get_name = intel_get_name; +#endif + + dev->fMan = driInitFreeSlabManager(10, 10); + dev->fenceMgr = driFenceMgrTTMInit(dev->fd); + + dev->mallocPool = driMallocPoolInit(); + dev->staticPool = driDRMPoolInit(dev->fd); + /* Sizes: 64 128 256 512 1024 2048 4096 8192 16384 32768 */ + dev->regionPool = driSlabPoolInit(dev->fd, + DRM_BO_FLAG_READ | + DRM_BO_FLAG_WRITE | + DRM_BO_FLAG_MEM_TT, + DRM_BO_FLAG_READ | + DRM_BO_FLAG_WRITE | + DRM_BO_FLAG_MEM_TT, + 64, + 10, 120, 4096 * 64, 0, + dev->fMan); + + dev->vertexPool = driSlabPoolInit(dev->fd, + DRM_BO_FLAG_READ | + DRM_BO_FLAG_WRITE | + DRM_BO_FLAG_MEM_TT, + DRM_BO_FLAG_READ | + DRM_BO_FLAG_WRITE | + DRM_BO_FLAG_MEM_TT, + dev->max_vertex_size, + 1, 120, dev->max_vertex_size * 4, 0, + dev->fMan); + + dev->batchPool = driSlabPoolInit(dev->fd, + DRM_BO_FLAG_EXE | + DRM_BO_FLAG_MEM_TT, + DRM_BO_FLAG_EXE | + DRM_BO_FLAG_MEM_TT, + dev->max_batch_size, + 1, 40, dev->max_batch_size * 16, 0, + dev->fMan); + + /* Fill in this struct with callbacks that i915simple will need to + * communicate with the window system, buffer manager, etc. + */ + dev->screen = i915_create_screen(&dev->base, id); + + return true; +} + +void +intel_be_destroy_device(struct intel_be_device *dev) +{ + driPoolTakeDown(dev->mallocPool); + driPoolTakeDown(dev->staticPool); + driPoolTakeDown(dev->regionPool); + driPoolTakeDown(dev->vertexPool); + driPoolTakeDown(dev->batchPool); + + /** TODO takedown fenceMgr and fMan */ +} diff --git a/src/gallium/winsys/drm/intel/common/intel_be_device.h b/src/gallium/winsys/drm/intel/common/intel_be_device.h new file mode 100644 index 0000000000..3f8b3f585c --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/intel_be_device.h @@ -0,0 +1,72 @@ +#ifndef INTEL_DRM_DEVICE_H +#define INTEL_DRM_DEVICE_H + +#include "pipe/p_winsys.h" +#include "pipe/p_context.h" + +/* + * Device + */ + +struct intel_be_device +{ + struct pipe_winsys base; + + /** + * Hw level screen + */ + struct pipe_screen *screen; + + int fd; /**< Drm file discriptor */ + + size_t max_batch_size; + size_t max_vertex_size; + + struct _DriFenceMgr *fenceMgr; + + struct _DriBufferPool *batchPool; + struct _DriBufferPool *regionPool; + struct _DriBufferPool *mallocPool; + struct _DriBufferPool *vertexPool; + struct _DriBufferPool *staticPool; + struct _DriFreeSlabManager *fMan; +}; + +boolean +intel_be_init_device(struct intel_be_device *device, int fd, unsigned id); + +void +intel_be_destroy_device(struct intel_be_device *dev); + +/* + * Buffer + */ + +struct intel_be_buffer { + struct pipe_buffer base; + struct _DriBufferPool *pool; + struct _DriBufferObject *driBO; +}; + +/** + * Create a be buffer from a drm bo handle + * + * Takes a reference + */ +struct pipe_buffer * +intel_be_buffer_from_handle(struct intel_be_device *device, + const char* name, unsigned handle); + +static INLINE struct intel_be_buffer * +intel_be_buffer(struct pipe_buffer *buf) +{ + return (struct intel_be_buffer *)buf; +} + +static INLINE struct _DriBufferObject * +dri_bo(struct pipe_buffer *buf) +{ + return intel_be_buffer(buf)->driBO; +} + +#endif diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_bufmgr.c b/src/gallium/winsys/drm/intel/common/ws_dri_bufmgr.c new file mode 100644 index 0000000000..517a97b3ee --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/ws_dri_bufmgr.c @@ -0,0 +1,949 @@ 
+/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * + **************************************************************************/ +/* + * Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com> + * Keith Whitwell <keithw-at-tungstengraphics-dot-com> + */ + +#include <xf86drm.h> +#include <stdlib.h> +#include <stdio.h> +#include "pipe/p_thread.h" +#include "errno.h" +#include "ws_dri_bufmgr.h" +#include "string.h" +#include "pipe/p_debug.h" +#include "ws_dri_bufpool.h" +#include "ws_dri_fencemgr.h" + + +/* + * This lock is here to protect drmBO structs changing underneath us during a + * validate list call, since validatelist cannot take individiual locks for + * each drmBO. Validatelist takes this lock in write mode. Any access to an + * individual drmBO should take this lock in read mode, since in that case, the + * driBufferObject mutex will protect the access. Locking order is + * driBufferObject mutex - > this rw lock. 
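The locking comment above describes a small reader/writer lock: readers (accesses to individual drmBOs) just bump a count under the mutex, while the writer (the validate-list path) holds the mutex and waits until the reader count drains to zero. The code below builds it from pipe_static_mutex/pipe_condvar; the same pattern in plain pthreads, as a sketch:

#include <pthread.h>

static pthread_mutex_t bm_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  bm_cond  = PTHREAD_COND_INITIALIZER;
static int kernel_readers = 0;

static void write_lock(void)          /* held across the whole validate call */
{
   pthread_mutex_lock(&bm_mutex);
   while (kernel_readers != 0)
      pthread_cond_wait(&bm_cond, &bm_mutex);
   /* mutex stays held until write_unlock() */
}

static void write_unlock(void)
{
   pthread_mutex_unlock(&bm_mutex);
}

static void read_lock(void)
{
   pthread_mutex_lock(&bm_mutex);
   kernel_readers++;
   pthread_mutex_unlock(&bm_mutex);
}

static void read_unlock(void)
{
   pthread_mutex_lock(&bm_mutex);
   if (--kernel_readers == 0)
      pthread_cond_broadcast(&bm_cond);
   pthread_mutex_unlock(&bm_mutex);
}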
+ */ + +pipe_static_mutex(bmMutex); +pipe_static_condvar(bmCond); + +static int kernelReaders = 0; +static int num_buffers = 0; +static int num_user_buffers = 0; + +static drmBO *drmBOListBuf(void *iterator) +{ + drmBONode *node; + drmMMListHead *l = (drmMMListHead *) iterator; + node = DRMLISTENTRY(drmBONode, l, head); + return node->buf; +} + +static void *drmBOListIterator(drmBOList *list) +{ + void *ret = list->list.next; + + if (ret == &list->list) + return NULL; + return ret; +} + +static void *drmBOListNext(drmBOList *list, void *iterator) +{ + void *ret; + + drmMMListHead *l = (drmMMListHead *) iterator; + ret = l->next; + if (ret == &list->list) + return NULL; + return ret; +} + +static drmBONode *drmAddListItem(drmBOList *list, drmBO *item, + uint64_t arg0, + uint64_t arg1) +{ + drmBONode *node; + drmMMListHead *l; + + l = list->free.next; + if (l == &list->free) { + node = (drmBONode *) malloc(sizeof(*node)); + if (!node) { + return NULL; + } + list->numCurrent++; + } + else { + DRMLISTDEL(l); + node = DRMLISTENTRY(drmBONode, l, head); + } + node->buf = item; + node->arg0 = arg0; + node->arg1 = arg1; + DRMLISTADD(&node->head, &list->list); + list->numOnList++; + return node; +} + +static int drmAddValidateItem(drmBOList *list, drmBO *buf, uint64_t flags, + uint64_t mask, int *newItem) +{ + drmBONode *node, *cur; + drmMMListHead *l; + + *newItem = 0; + cur = NULL; + + for (l = list->list.next; l != &list->list; l = l->next) { + node = DRMLISTENTRY(drmBONode, l, head); + if (node->buf == buf) { + cur = node; + break; + } + } + if (!cur) { + cur = drmAddListItem(list, buf, flags, mask); + if (!cur) { + return -ENOMEM; + } + *newItem = 1; + cur->arg0 = flags; + cur->arg1 = mask; + } + else { + uint64_t memFlags = cur->arg0 & flags & DRM_BO_MASK_MEM; + uint64_t accFlags = (cur->arg0 | flags) & ~DRM_BO_MASK_MEM; + + if (mask & cur->arg1 & ~DRM_BO_MASK_MEM & (cur->arg0 ^ flags)) { + return -EINVAL; + } + + cur->arg1 |= mask; + cur->arg0 = (cur->arg0 & ~mask) | ((memFlags | accFlags) & mask); + + if (((cur->arg1 & DRM_BO_MASK_MEM) != 0) && + (cur->arg0 & DRM_BO_MASK_MEM) == 0) { + return -EINVAL; + } + } + return 0; +} + +static void drmBOFreeList(drmBOList *list) +{ + drmBONode *node; + drmMMListHead *l; + + l = list->list.next; + while(l != &list->list) { + DRMLISTDEL(l); + node = DRMLISTENTRY(drmBONode, l, head); + free(node); + l = list->list.next; + list->numCurrent--; + list->numOnList--; + } + + l = list->free.next; + while(l != &list->free) { + DRMLISTDEL(l); + node = DRMLISTENTRY(drmBONode, l, head); + free(node); + l = list->free.next; + list->numCurrent--; + } +} + +static int drmAdjustListNodes(drmBOList *list) +{ + drmBONode *node; + drmMMListHead *l; + int ret = 0; + + while(list->numCurrent < list->numTarget) { + node = (drmBONode *) malloc(sizeof(*node)); + if (!node) { + ret = -ENOMEM; + break; + } + list->numCurrent++; + DRMLISTADD(&node->head, &list->free); + } + + while(list->numCurrent > list->numTarget) { + l = list->free.next; + if (l == &list->free) + break; + DRMLISTDEL(l); + node = DRMLISTENTRY(drmBONode, l, head); + free(node); + list->numCurrent--; + } + return ret; +} + +static int drmBOCreateList(int numTarget, drmBOList *list) +{ + DRMINITLISTHEAD(&list->list); + DRMINITLISTHEAD(&list->free); + list->numTarget = numTarget; + list->numCurrent = 0; + list->numOnList = 0; + return drmAdjustListNodes(list); +} + +static int drmBOResetList(drmBOList *list) +{ + drmMMListHead *l; + int ret; + + ret = drmAdjustListNodes(list); + if (ret) + return ret; + + l = 
list->list.next; + while (l != &list->list) { + DRMLISTDEL(l); + DRMLISTADD(l, &list->free); + list->numOnList--; + l = list->list.next; + } + return drmAdjustListNodes(list); +} + +void driWriteLockKernelBO(void) +{ + pipe_mutex_lock(bmMutex); + while(kernelReaders != 0) + pipe_condvar_wait(bmCond, bmMutex); +} + +void driWriteUnlockKernelBO(void) +{ + pipe_mutex_unlock(bmMutex); +} + +void driReadLockKernelBO(void) +{ + pipe_mutex_lock(bmMutex); + kernelReaders++; + pipe_mutex_unlock(bmMutex); +} + +void driReadUnlockKernelBO(void) +{ + pipe_mutex_lock(bmMutex); + if (--kernelReaders == 0) + pipe_condvar_broadcast(bmCond); + pipe_mutex_unlock(bmMutex); +} + + + + +/* + * TODO: Introduce fence pools in the same way as + * buffer object pools. + */ + +typedef struct _DriBufferObject +{ + DriBufferPool *pool; + pipe_mutex mutex; + int refCount; + const char *name; + uint64_t flags; + unsigned hint; + unsigned alignment; + unsigned createdByReference; + void *private; + /* user-space buffer: */ + unsigned userBuffer; + void *userData; + unsigned userSize; +} DriBufferObject; + +typedef struct _DriBufferList { + drmBOList drmBuffers; /* List of kernel buffers needing validation */ + drmBOList driBuffers; /* List of user-space buffers needing validation */ +} DriBufferList; + + +void +bmError(int val, const char *file, const char *function, int line) +{ + printf("Fatal video memory manager error \"%s\".\n" + "Check kernel logs or set the LIBGL_DEBUG\n" + "environment variable to \"verbose\" for more info.\n" + "Detected in file %s, line %d, function %s.\n", + strerror(-val), file, line, function); +#ifndef NDEBUG + abort(); +#else + abort(); +#endif +} + +extern drmBO * +driBOKernel(struct _DriBufferObject *buf) +{ + drmBO *ret; + + driReadLockKernelBO(); + pipe_mutex_lock(buf->mutex); + assert(buf->private != NULL); + ret = buf->pool->kernel(buf->pool, buf->private); + if (!ret) + BM_CKFATAL(-EINVAL); + pipe_mutex_unlock(buf->mutex); + driReadUnlockKernelBO(); + + return ret; +} + +void +driBOWaitIdle(struct _DriBufferObject *buf, int lazy) +{ + + /* + * This function may block. Is it sane to keep the mutex held during + * that time?? + */ + + pipe_mutex_lock(buf->mutex); + BM_CKFATAL(buf->pool->waitIdle(buf->pool, buf->private, &buf->mutex, lazy)); + pipe_mutex_unlock(buf->mutex); +} + +void * +driBOMap(struct _DriBufferObject *buf, unsigned flags, unsigned hint) +{ + void *virtual; + int retval; + + if (buf->userBuffer) { + return buf->userData; + } + + pipe_mutex_lock(buf->mutex); + assert(buf->private != NULL); + retval = buf->pool->map(buf->pool, buf->private, flags, hint, + &buf->mutex, &virtual); + pipe_mutex_unlock(buf->mutex); + + return retval == 0 ? 
virtual : NULL; +} + +void +driBOUnmap(struct _DriBufferObject *buf) +{ + if (buf->userBuffer) + return; + + assert(buf->private != NULL); + pipe_mutex_lock(buf->mutex); + BM_CKFATAL(buf->pool->unmap(buf->pool, buf->private)); + pipe_mutex_unlock(buf->mutex); +} + +unsigned long +driBOOffset(struct _DriBufferObject *buf) +{ + unsigned long ret; + + assert(buf->private != NULL); + + pipe_mutex_lock(buf->mutex); + ret = buf->pool->offset(buf->pool, buf->private); + pipe_mutex_unlock(buf->mutex); + return ret; +} + +unsigned long +driBOPoolOffset(struct _DriBufferObject *buf) +{ + unsigned long ret; + + assert(buf->private != NULL); + + pipe_mutex_lock(buf->mutex); + ret = buf->pool->poolOffset(buf->pool, buf->private); + pipe_mutex_unlock(buf->mutex); + return ret; +} + +uint64_t +driBOFlags(struct _DriBufferObject *buf) +{ + uint64_t ret; + + assert(buf->private != NULL); + + driReadLockKernelBO(); + pipe_mutex_lock(buf->mutex); + ret = buf->pool->flags(buf->pool, buf->private); + pipe_mutex_unlock(buf->mutex); + driReadUnlockKernelBO(); + return ret; +} + +struct _DriBufferObject * +driBOReference(struct _DriBufferObject *buf) +{ + pipe_mutex_lock(buf->mutex); + if (++buf->refCount == 1) { + pipe_mutex_unlock(buf->mutex); + BM_CKFATAL(-EINVAL); + } + pipe_mutex_unlock(buf->mutex); + return buf; +} + +void +driBOUnReference(struct _DriBufferObject *buf) +{ + int tmp; + + if (!buf) + return; + + pipe_mutex_lock(buf->mutex); + tmp = --buf->refCount; + if (!tmp) { + pipe_mutex_unlock(buf->mutex); + if (buf->private) { + if (buf->createdByReference) + buf->pool->unreference(buf->pool, buf->private); + else + buf->pool->destroy(buf->pool, buf->private); + } + if (buf->userBuffer) + num_user_buffers--; + else + num_buffers--; + free(buf); + } else + pipe_mutex_unlock(buf->mutex); + +} + + +int +driBOData(struct _DriBufferObject *buf, + unsigned size, const void *data, + DriBufferPool *newPool, + uint64_t flags) +{ + void *virtual = NULL; + int newBuffer; + int retval = 0; + struct _DriBufferPool *pool; + + assert(!buf->userBuffer); /* XXX just do a memcpy? */ + + pipe_mutex_lock(buf->mutex); + pool = buf->pool; + + if (pool == NULL && newPool != NULL) { + buf->pool = newPool; + pool = newPool; + } + if (newPool == NULL) + newPool = pool; + + if (!pool->create) { + assert((size_t)"driBOData called on invalid buffer\n" & 0); + BM_CKFATAL(-EINVAL); + } + + newBuffer = (!buf->private || pool != newPool || + pool->size(pool, buf->private) < size); + + if (!flags) + flags = buf->flags; + + if (newBuffer) { + + if (buf->createdByReference) { + assert((size_t)"driBOData requiring resizing called on shared buffer.\n" & 0); + BM_CKFATAL(-EINVAL); + } + + if (buf->private) + buf->pool->destroy(buf->pool, buf->private); + + pool = newPool; + buf->pool = newPool; + buf->private = pool->create(pool, size, flags, DRM_BO_HINT_DONT_FENCE, + buf->alignment); + if (!buf->private) + retval = -ENOMEM; + + if (retval == 0) + retval = pool->map(pool, buf->private, + DRM_BO_FLAG_WRITE, + DRM_BO_HINT_DONT_BLOCK, &buf->mutex, &virtual); + } else if (pool->map(pool, buf->private, DRM_BO_FLAG_WRITE, + DRM_BO_HINT_DONT_BLOCK, &buf->mutex, &virtual)) { + /* + * Buffer is busy. need to create a new one. 
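+       * If a replacement can be created, the old storage is destroyed and
+       * its previous contents are discarded; the blocking map that follows
+       * then targets the fresh, idle buffer.  If creation fails we keep
+       * the busy buffer and simply block in pool->map() until it idles.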
+ */ + + void *newBuf; + + newBuf = pool->create(pool, size, flags, DRM_BO_HINT_DONT_FENCE, + buf->alignment); + if (newBuf) { + buf->pool->destroy(buf->pool, buf->private); + buf->private = newBuf; + } + + retval = pool->map(pool, buf->private, + DRM_BO_FLAG_WRITE, 0, &buf->mutex, &virtual); + } else { + uint64_t flag_diff = flags ^ buf->flags; + + /* + * We might need to change buffer flags. + */ + + if (flag_diff){ + assert(pool->setStatus != NULL); + BM_CKFATAL(pool->unmap(pool, buf->private)); + BM_CKFATAL(pool->setStatus(pool, buf->private, flag_diff, + buf->flags)); + if (!data) + goto out; + + retval = pool->map(pool, buf->private, + DRM_BO_FLAG_WRITE, 0, &buf->mutex, &virtual); + } + } + + if (retval == 0) { + if (data) + memcpy(virtual, data, size); + + BM_CKFATAL(pool->unmap(pool, buf->private)); + } + + out: + pipe_mutex_unlock(buf->mutex); + + return retval; +} + +void +driBOSubData(struct _DriBufferObject *buf, + unsigned long offset, unsigned long size, const void *data) +{ + void *virtual; + + assert(!buf->userBuffer); /* XXX just do a memcpy? */ + + pipe_mutex_lock(buf->mutex); + if (size && data) { + BM_CKFATAL(buf->pool->map(buf->pool, buf->private, + DRM_BO_FLAG_WRITE, 0, &buf->mutex, + &virtual)); + memcpy((unsigned char *) virtual + offset, data, size); + BM_CKFATAL(buf->pool->unmap(buf->pool, buf->private)); + } + pipe_mutex_unlock(buf->mutex); +} + +void +driBOGetSubData(struct _DriBufferObject *buf, + unsigned long offset, unsigned long size, void *data) +{ + void *virtual; + + assert(!buf->userBuffer); /* XXX just do a memcpy? */ + + pipe_mutex_lock(buf->mutex); + if (size && data) { + BM_CKFATAL(buf->pool->map(buf->pool, buf->private, + DRM_BO_FLAG_READ, 0, &buf->mutex, &virtual)); + memcpy(data, (unsigned char *) virtual + offset, size); + BM_CKFATAL(buf->pool->unmap(buf->pool, buf->private)); + } + pipe_mutex_unlock(buf->mutex); +} + +void +driBOSetReferenced(struct _DriBufferObject *buf, + unsigned long handle) +{ + pipe_mutex_lock(buf->mutex); + if (buf->private != NULL) { + assert((size_t)"Invalid buffer for setReferenced\n" & 0); + BM_CKFATAL(-EINVAL); + + } + if (buf->pool->reference == NULL) { + assert((size_t)"Invalid buffer pool for setReferenced\n" & 0); + BM_CKFATAL(-EINVAL); + } + buf->private = buf->pool->reference(buf->pool, handle); + if (!buf->private) { + assert((size_t)"Invalid buffer pool for setStatic\n" & 0); + BM_CKFATAL(-ENOMEM); + } + buf->createdByReference = TRUE; + buf->flags = buf->pool->kernel(buf->pool, buf->private)->flags; + pipe_mutex_unlock(buf->mutex); +} + +int +driGenBuffers(struct _DriBufferPool *pool, + const char *name, + unsigned n, + struct _DriBufferObject *buffers[], + unsigned alignment, uint64_t flags, unsigned hint) +{ + struct _DriBufferObject *buf; + int i; + + flags = (flags) ? 
flags : DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_MEM_VRAM | + DRM_BO_FLAG_MEM_LOCAL | DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE; + + ++num_buffers; + + assert(pool); + + for (i = 0; i < n; ++i) { + buf = (struct _DriBufferObject *) calloc(1, sizeof(*buf)); + if (!buf) + return -ENOMEM; + + pipe_mutex_init(buf->mutex); + pipe_mutex_lock(buf->mutex); + buf->refCount = 1; + buf->flags = flags; + buf->hint = hint; + buf->name = name; + buf->alignment = alignment; + buf->pool = pool; + buf->createdByReference = 0; + pipe_mutex_unlock(buf->mutex); + buffers[i] = buf; + } + return 0; +} + +void +driGenUserBuffer(struct _DriBufferPool *pool, + const char *name, + struct _DriBufferObject **buffers, + void *ptr, unsigned bytes) +{ + const unsigned alignment = 1, flags = 0, hint = 0; + + --num_buffers; /* JB: is inced in GenBuffes */ + driGenBuffers(pool, name, 1, buffers, alignment, flags, hint); + ++num_user_buffers; + + (*buffers)->userBuffer = 1; + (*buffers)->userData = ptr; + (*buffers)->userSize = bytes; +} + +void +driDeleteBuffers(unsigned n, struct _DriBufferObject *buffers[]) +{ + int i; + + for (i = 0; i < n; ++i) { + driBOUnReference(buffers[i]); + } +} + + +void +driInitBufMgr(int fd) +{ + ; +} + +/* + * Note that lists are per-context and don't need mutex protection. + */ + +struct _DriBufferList * +driBOCreateList(int target) +{ + struct _DriBufferList *list = calloc(sizeof(*list), 1); + + BM_CKFATAL(drmBOCreateList(target, &list->drmBuffers)); + BM_CKFATAL(drmBOCreateList(target, &list->driBuffers)); + return list; +} + +int +driBOResetList(struct _DriBufferList * list) +{ + int ret; + ret = drmBOResetList(&list->drmBuffers); + if (ret) + return ret; + ret = drmBOResetList(&list->driBuffers); + return ret; +} + +void +driBOFreeList(struct _DriBufferList * list) +{ + drmBOFreeList(&list->drmBuffers); + drmBOFreeList(&list->driBuffers); + free(list); +} + + +/* + * Copied from libdrm, because it is needed by driAddValidateItem. + */ + +static drmBONode * +driAddListItem(drmBOList * list, drmBO * item, + uint64_t arg0, uint64_t arg1) +{ + drmBONode *node; + drmMMListHead *l; + + l = list->free.next; + if (l == &list->free) { + node = (drmBONode *) malloc(sizeof(*node)); + if (!node) { + return NULL; + } + list->numCurrent++; + } else { + DRMLISTDEL(l); + node = DRMLISTENTRY(drmBONode, l, head); + } + memset(&node->bo_arg, 0, sizeof(node->bo_arg)); + node->buf = item; + node->arg0 = arg0; + node->arg1 = arg1; + DRMLISTADDTAIL(&node->head, &list->list); + list->numOnList++; + return node; +} + +/* + * Slightly modified version compared to the libdrm version. + * This one returns the list index of the buffer put on the list. 
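+ * On success, *itemLoc receives the zero-based position of the buffer in
+ * the validate list (new entries are appended at the tail, so the index is
+ * stable), and *pnode receives the list node itself, which also carries a
+ * freshly zeroed drm_i915_op_arg (see driAddListItem above).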
+ */ + +static int +driAddValidateItem(drmBOList * list, drmBO * buf, uint64_t flags, + uint64_t mask, int *itemLoc, + struct _drmBONode **pnode) +{ + drmBONode *node, *cur; + drmMMListHead *l; + int count = 0; + + cur = NULL; + + for (l = list->list.next; l != &list->list; l = l->next) { + node = DRMLISTENTRY(drmBONode, l, head); + if (node->buf == buf) { + cur = node; + break; + } + count++; + } + if (!cur) { + cur = driAddListItem(list, buf, flags, mask); + if (!cur) + return -ENOMEM; + + cur->arg0 = flags; + cur->arg1 = mask; + } else { + uint64_t memFlags = cur->arg0 & flags & DRM_BO_MASK_MEM; + uint64_t accFlags = (cur->arg0 | flags) & ~DRM_BO_MASK_MEM; + + if (mask & cur->arg1 & ~DRM_BO_MASK_MEM & (cur->arg0 ^ flags)) { + return -EINVAL; + } + + cur->arg1 |= mask; + cur->arg0 = (cur->arg0 & ~mask) | ((memFlags | accFlags) & mask); + + if (((cur->arg1 & DRM_BO_MASK_MEM) != 0) && + (cur->arg0 & DRM_BO_MASK_MEM) == 0) { + return -EINVAL; + } + } + *itemLoc = count; + *pnode = cur; + return 0; +} + + +void +driBOAddListItem(struct _DriBufferList * list, struct _DriBufferObject *buf, + uint64_t flags, uint64_t mask, int *itemLoc, + struct _drmBONode **node) +{ + int newItem; + + pipe_mutex_lock(buf->mutex); + BM_CKFATAL(driAddValidateItem(&list->drmBuffers, + buf->pool->kernel(buf->pool, buf->private), + flags, mask, itemLoc, node)); + BM_CKFATAL(drmAddValidateItem(&list->driBuffers, (drmBO *) buf, + flags, mask, &newItem)); + if (newItem) + buf->refCount++; + + pipe_mutex_unlock(buf->mutex); +} + +drmBOList *driGetdrmBOList(struct _DriBufferList *list) +{ + driWriteLockKernelBO(); + return &list->drmBuffers; +} + +void driPutdrmBOList(struct _DriBufferList *list) +{ + driWriteUnlockKernelBO(); +} + + +void +driBOFence(struct _DriBufferObject *buf, struct _DriFenceObject *fence) +{ + pipe_mutex_lock(buf->mutex); + if (buf->pool->fence) + BM_CKFATAL(buf->pool->fence(buf->pool, buf->private, fence)); + pipe_mutex_unlock(buf->mutex); + +} + +void +driBOUnrefUserList(struct _DriBufferList *list) +{ + struct _DriBufferObject *buf; + void *curBuf; + + curBuf = drmBOListIterator(&list->driBuffers); + while (curBuf) { + buf = (struct _DriBufferObject *)drmBOListBuf(curBuf); + driBOUnReference(buf); + curBuf = drmBOListNext(&list->driBuffers, curBuf); + } +} + +struct _DriFenceObject * +driBOFenceUserList(struct _DriFenceMgr *mgr, + struct _DriBufferList *list, const char *name, + drmFence *kFence) +{ + struct _DriFenceObject *fence; + struct _DriBufferObject *buf; + void *curBuf; + + fence = driFenceCreate(mgr, kFence->fence_class, kFence->type, + kFence, sizeof(*kFence)); + curBuf = drmBOListIterator(&list->driBuffers); + + /* + * User-space fencing callbacks. + */ + + while (curBuf) { + buf = (struct _DriBufferObject *) drmBOListBuf(curBuf); + driBOFence(buf, fence); + driBOUnReference(buf); + curBuf = drmBOListNext(&list->driBuffers, curBuf); + } + + driBOResetList(list); + return fence; +} + +void +driBOValidateUserList(struct _DriBufferList * list) +{ + void *curBuf; + struct _DriBufferObject *buf; + + curBuf = drmBOListIterator(&list->driBuffers); + + /* + * User-space validation callbacks. 
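+    * Walks the list of user-space buffer objects and calls each pool's
+    * optional validate() hook with the buffer mutex held; pools that
+    * leave validate at NULL (e.g. the plain DRM pool) are skipped.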
+ */ + + while (curBuf) { + buf = (struct _DriBufferObject *) drmBOListBuf(curBuf); + pipe_mutex_lock(buf->mutex); + if (buf->pool->validate) + BM_CKFATAL(buf->pool->validate(buf->pool, buf->private, &buf->mutex)); + pipe_mutex_unlock(buf->mutex); + curBuf = drmBOListNext(&list->driBuffers, curBuf); + } +} + + +void +driPoolTakeDown(struct _DriBufferPool *pool) +{ + pool->takeDown(pool); + +} + +unsigned long +driBOSize(struct _DriBufferObject *buf) +{ + unsigned long size; + + pipe_mutex_lock(buf->mutex); + size = buf->pool->size(buf->pool, buf->private); + pipe_mutex_unlock(buf->mutex); + + return size; + +} + +drmBOList *driBOGetDRMBuffers(struct _DriBufferList *list) +{ + return &list->drmBuffers; +} + +drmBOList *driBOGetDRIBuffers(struct _DriBufferList *list) +{ + return &list->driBuffers; +} + diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_bufmgr.h b/src/gallium/winsys/drm/intel/common/ws_dri_bufmgr.h new file mode 100644 index 0000000000..e6c0cff0a0 --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/ws_dri_bufmgr.h @@ -0,0 +1,138 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * + **************************************************************************/ +/* + * Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com> + * Keith Whitwell <keithw-at-tungstengraphics-dot-com> + */ + +#ifndef _PSB_BUFMGR_H_ +#define _PSB_BUFMGR_H_ +#include <xf86mm.h> +#include "i915_drm.h" +#include "ws_dri_fencemgr.h" + +typedef struct _drmBONode +{ + drmMMListHead head; + drmBO *buf; + struct drm_i915_op_arg bo_arg; + uint64_t arg0; + uint64_t arg1; +} drmBONode; + +typedef struct _drmBOList { + unsigned numTarget; + unsigned numCurrent; + unsigned numOnList; + drmMMListHead list; + drmMMListHead free; +} drmBOList; + + +struct _DriFenceObject; +struct _DriBufferObject; +struct _DriBufferPool; +struct _DriBufferList; + +/* + * Return a pointer to the libdrm buffer object this DriBufferObject + * uses. 
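+ * The returned drmBO is owned by the buffer's pool and remains valid only
+ * while the caller holds a reference to the DriBufferObject.  Reads of its
+ * mutable fields should be bracketed by driReadLockKernelBO() /
+ * driReadUnlockKernelBO(), as driBOKernel() itself does internally.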
+ */ + +extern drmBO *driBOKernel(struct _DriBufferObject *buf); +extern void *driBOMap(struct _DriBufferObject *buf, unsigned flags, + unsigned hint); +extern void driBOUnmap(struct _DriBufferObject *buf); +extern unsigned long driBOOffset(struct _DriBufferObject *buf); +extern unsigned long driBOPoolOffset(struct _DriBufferObject *buf); + +extern uint64_t driBOFlags(struct _DriBufferObject *buf); +extern struct _DriBufferObject *driBOReference(struct _DriBufferObject *buf); +extern void driBOUnReference(struct _DriBufferObject *buf); + +extern int driBOData(struct _DriBufferObject *r_buf, + unsigned size, const void *data, + struct _DriBufferPool *pool, uint64_t flags); + +extern void driBOSubData(struct _DriBufferObject *buf, + unsigned long offset, unsigned long size, + const void *data); +extern void driBOGetSubData(struct _DriBufferObject *buf, + unsigned long offset, unsigned long size, + void *data); +extern int driGenBuffers(struct _DriBufferPool *pool, + const char *name, + unsigned n, + struct _DriBufferObject *buffers[], + unsigned alignment, uint64_t flags, unsigned hint); +extern void driGenUserBuffer(struct _DriBufferPool *pool, + const char *name, + struct _DriBufferObject *buffers[], + void *ptr, unsigned bytes); +extern void driDeleteBuffers(unsigned n, struct _DriBufferObject *buffers[]); +extern void driInitBufMgr(int fd); +extern struct _DriBufferList *driBOCreateList(int target); +extern int driBOResetList(struct _DriBufferList * list); +extern void driBOAddListItem(struct _DriBufferList * list, + struct _DriBufferObject *buf, + uint64_t flags, uint64_t mask, int *itemLoc, + struct _drmBONode **node); + +extern void driBOValidateList(int fd, struct _DriBufferList * list); +extern void driBOFreeList(struct _DriBufferList * list); +extern struct _DriFenceObject *driBOFenceUserList(struct _DriFenceMgr *mgr, + struct _DriBufferList *list, + const char *name, + drmFence *kFence); +extern void driBOUnrefUserList(struct _DriBufferList *list); +extern void driBOValidateUserList(struct _DriBufferList * list); +extern drmBOList *driGetdrmBOList(struct _DriBufferList *list); +extern void driPutdrmBOList(struct _DriBufferList *list); + +extern void driBOFence(struct _DriBufferObject *buf, + struct _DriFenceObject *fence); + +extern void driPoolTakeDown(struct _DriBufferPool *pool); +extern void driBOSetReferenced(struct _DriBufferObject *buf, + unsigned long handle); +unsigned long driBOSize(struct _DriBufferObject *buf); +extern void driBOWaitIdle(struct _DriBufferObject *buf, int lazy); +extern void driPoolTakeDown(struct _DriBufferPool *pool); + +extern void driReadLockKernelBO(void); +extern void driReadUnlockKernelBO(void); +extern void driWriteLockKernelBO(void); +extern void driWriteUnlockKernelBO(void); + +/* + * For debugging purposes. + */ + +extern drmBOList *driBOGetDRMBuffers(struct _DriBufferList *list); +extern drmBOList *driBOGetDRIBuffers(struct _DriBufferList *list); +#endif diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_bufpool.h b/src/gallium/winsys/drm/intel/common/ws_dri_bufpool.h new file mode 100644 index 0000000000..ad3b6f3931 --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/ws_dri_bufpool.h @@ -0,0 +1,102 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * + **************************************************************************/ +/* + * Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com> + */ + +#ifndef _PSB_BUFPOOL_H_ +#define _PSB_BUFPOOL_H_ + +#include <xf86drm.h> +#include "pipe/p_thread.h" +struct _DriFenceObject; + +typedef struct _DriBufferPool +{ + int fd; + int (*map) (struct _DriBufferPool * pool, void *private, + unsigned flags, int hint, pipe_mutex *mutex, + void **virtual); + int (*unmap) (struct _DriBufferPool * pool, void *private); + int (*destroy) (struct _DriBufferPool * pool, void *private); + unsigned long (*offset) (struct _DriBufferPool * pool, void *private); + unsigned long (*poolOffset) (struct _DriBufferPool * pool, void *private); + uint64_t (*flags) (struct _DriBufferPool * pool, void *private); + unsigned long (*size) (struct _DriBufferPool * pool, void *private); + void *(*create) (struct _DriBufferPool * pool, unsigned long size, + uint64_t flags, unsigned hint, unsigned alignment); + void *(*reference) (struct _DriBufferPool * pool, unsigned handle); + int (*unreference) (struct _DriBufferPool * pool, void *private); + int (*fence) (struct _DriBufferPool * pool, void *private, + struct _DriFenceObject * fence); + drmBO *(*kernel) (struct _DriBufferPool * pool, void *private); + int (*validate) (struct _DriBufferPool * pool, void *private, pipe_mutex *mutex); + int (*waitIdle) (struct _DriBufferPool *pool, void *private, pipe_mutex *mutex, + int lazy); + int (*setStatus) (struct _DriBufferPool *pool, void *private, + uint64_t flag_diff, uint64_t old_flags); + void (*takeDown) (struct _DriBufferPool * pool); + void *data; +} DriBufferPool; + +extern void bmError(int val, const char *file, const char *function, + int line); +#define BM_CKFATAL(val) \ + do{ \ + int tstVal = (val); \ + if (tstVal) \ + bmError(tstVal, __FILE__, __FUNCTION__, __LINE__); \ + } while(0); + + +/* + * Builtin pools. + */ + +/* + * Kernel buffer objects. Size in multiples of page size. Page size aligned. 
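+ * driDRMPoolInit() backs every allocation with its own kernel drmBO.
+ * driMallocPoolInit() hands out plain malloc()ed storage with no kernel
+ * object behind it (its offset()/kernel() hooks abort if called), while
+ * driSlabPoolInit() sub-allocates fixed-size buffers out of larger shared
+ * kernel buffer objects.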
+ */ + +extern struct _DriBufferPool *driDRMPoolInit(int fd); +extern struct _DriBufferPool *driMallocPoolInit(void); + +struct _DriFreeSlabManager; +extern struct _DriBufferPool * driSlabPoolInit(int fd, uint64_t flags, + uint64_t validMask, + uint32_t smallestSize, + uint32_t numSizes, + uint32_t desiredNumBuffers, + uint32_t maxSlabSize, + uint32_t pageAlignment, + struct _DriFreeSlabManager *fMan); +extern void driFinishFreeSlabManager(struct _DriFreeSlabManager *fMan); +extern struct _DriFreeSlabManager * +driInitFreeSlabManager(uint32_t checkIntervalMsec, uint32_t slabTimeoutMsec); + + +#endif diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_drmpool.c b/src/gallium/winsys/drm/intel/common/ws_dri_drmpool.c new file mode 100644 index 0000000000..54618b1c82 --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/ws_dri_drmpool.c @@ -0,0 +1,268 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * + **************************************************************************/ +/* + * Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com> + */ + +#include <xf86drm.h> +#include <stdlib.h> +#include <unistd.h> +#include "ws_dri_bufpool.h" +#include "ws_dri_bufmgr.h" +#include "assert.h" + +/* + * Buffer pool implementation using DRM buffer objects as DRI buffer objects. 
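+ * Each buffer created from this pool is backed one-to-one by a kernel
+ * drmBO; create/map/unmap/destroy forward to drmBOCreate/drmBOMap/
+ * drmBOUnmap/drmBOUnreference, and the map/unmap/destroy/query paths take
+ * the global kernel-BO read lock so that driWriteLockKernelBO() can
+ * exclude them.
+ *
+ * Illustrative use only: a hypothetical caller, with error handling
+ * omitted and "fd"/"data" assumed to be supplied by the winsys:
+ *
+ *   struct _DriBufferPool *pool = driDRMPoolInit(fd);
+ *   struct _DriBufferObject *buf;
+ *   driGenBuffers(pool, "scratch", 1, &buf, 0,
+ *                 DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ |
+ *                 DRM_BO_FLAG_WRITE, 0);
+ *   driBOData(buf, 4096, data, NULL, 0);
+ *   driBOUnReference(buf);
+ *   driPoolTakeDown(pool);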
+ */ + +static void * +pool_create(struct _DriBufferPool *pool, + unsigned long size, uint64_t flags, unsigned hint, + unsigned alignment) +{ + drmBO *buf = (drmBO *) malloc(sizeof(*buf)); + int ret; + unsigned pageSize = getpagesize(); + + if (!buf) + return NULL; + + if ((alignment > pageSize) && (alignment % pageSize)) { + free(buf); + return NULL; + } + + ret = drmBOCreate(pool->fd, size, alignment / pageSize, + NULL, + flags, hint, buf); + if (ret) { + free(buf); + return NULL; + } + + return (void *) buf; +} + +static void * +pool_reference(struct _DriBufferPool *pool, unsigned handle) +{ + drmBO *buf = (drmBO *) malloc(sizeof(*buf)); + int ret; + + if (!buf) + return NULL; + + ret = drmBOReference(pool->fd, handle, buf); + + if (ret) { + free(buf); + return NULL; + } + + return (void *) buf; +} + +static int +pool_destroy(struct _DriBufferPool *pool, void *private) +{ + int ret; + drmBO *buf = (drmBO *) private; + driReadLockKernelBO(); + ret = drmBOUnreference(pool->fd, buf); + free(buf); + driReadUnlockKernelBO(); + return ret; +} + +static int +pool_unreference(struct _DriBufferPool *pool, void *private) +{ + int ret; + drmBO *buf = (drmBO *) private; + driReadLockKernelBO(); + ret = drmBOUnreference(pool->fd, buf); + free(buf); + driReadUnlockKernelBO(); + return ret; +} + +static int +pool_map(struct _DriBufferPool *pool, void *private, unsigned flags, + int hint, pipe_mutex *mutex, void **virtual) +{ + drmBO *buf = (drmBO *) private; + int ret; + + driReadLockKernelBO(); + ret = drmBOMap(pool->fd, buf, flags, hint, virtual); + driReadUnlockKernelBO(); + return ret; +} + +static int +pool_unmap(struct _DriBufferPool *pool, void *private) +{ + drmBO *buf = (drmBO *) private; + int ret; + + driReadLockKernelBO(); + ret = drmBOUnmap(pool->fd, buf); + driReadUnlockKernelBO(); + + return ret; +} + +static unsigned long +pool_offset(struct _DriBufferPool *pool, void *private) +{ + drmBO *buf = (drmBO *) private; + unsigned long offset; + + driReadLockKernelBO(); + assert(buf->flags & DRM_BO_FLAG_NO_MOVE); + offset = buf->offset; + driReadUnlockKernelBO(); + + return buf->offset; +} + +static unsigned long +pool_poolOffset(struct _DriBufferPool *pool, void *private) +{ + return 0; +} + +static uint64_t +pool_flags(struct _DriBufferPool *pool, void *private) +{ + drmBO *buf = (drmBO *) private; + uint64_t flags; + + driReadLockKernelBO(); + flags = buf->flags; + driReadUnlockKernelBO(); + + return flags; +} + + +static unsigned long +pool_size(struct _DriBufferPool *pool, void *private) +{ + drmBO *buf = (drmBO *) private; + unsigned long size; + + driReadLockKernelBO(); + size = buf->size; + driReadUnlockKernelBO(); + + return buf->size; +} + +static int +pool_fence(struct _DriBufferPool *pool, void *private, + struct _DriFenceObject *fence) +{ + /* + * Noop. The kernel handles all fencing. + */ + + return 0; +} + +static drmBO * +pool_kernel(struct _DriBufferPool *pool, void *private) +{ + return (drmBO *) private; +} + +static int +pool_waitIdle(struct _DriBufferPool *pool, void *private, pipe_mutex *mutex, + int lazy) +{ + drmBO *buf = (drmBO *) private; + int ret; + + driReadLockKernelBO(); + ret = drmBOWaitIdle(pool->fd, buf, (lazy) ? 
DRM_BO_HINT_WAIT_LAZY:0); + driReadUnlockKernelBO(); + + return ret; +} + + +static void +pool_takedown(struct _DriBufferPool *pool) +{ + free(pool); +} + +/*static int +pool_setStatus(struct _DriBufferPool *pool, void *private, + uint64_t flag_diff, uint64_t old_flags) +{ + drmBO *buf = (drmBO *) private; + uint64_t new_flags = old_flags ^ flag_diff; + int ret; + + driReadLockKernelBO(); + ret = drmBOSetStatus(pool->fd, buf, new_flags, flag_diff, + 0, 0, 0); + driReadUnlockKernelBO(); + return ret; +}*/ + +struct _DriBufferPool * +driDRMPoolInit(int fd) +{ + struct _DriBufferPool *pool; + + pool = (struct _DriBufferPool *) malloc(sizeof(*pool)); + + if (!pool) + return NULL; + + pool->fd = fd; + pool->map = &pool_map; + pool->unmap = &pool_unmap; + pool->destroy = &pool_destroy; + pool->offset = &pool_offset; + pool->poolOffset = &pool_poolOffset; + pool->flags = &pool_flags; + pool->size = &pool_size; + pool->create = &pool_create; + pool->fence = &pool_fence; + pool->kernel = &pool_kernel; + pool->validate = NULL; + pool->waitIdle = &pool_waitIdle; + pool->takeDown = &pool_takedown; + pool->reference = &pool_reference; + pool->unreference = &pool_unreference; + pool->data = NULL; + return pool; +} diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_fencemgr.c b/src/gallium/winsys/drm/intel/common/ws_dri_fencemgr.c new file mode 100644 index 0000000000..831c75d30c --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/ws_dri_fencemgr.c @@ -0,0 +1,377 @@ +#include "ws_dri_fencemgr.h" +#include "pipe/p_thread.h" +#include <xf86mm.h> +#include <string.h> +#include <unistd.h> + +/* + * Note: Locking order is + * _DriFenceObject::mutex + * _DriFenceMgr::mutex + */ + +struct _DriFenceMgr { + /* + * Constant members. Need no mutex protection. + */ + struct _DriFenceMgrCreateInfo info; + void *private; + + /* + * These members are protected by this->mutex + */ + pipe_mutex mutex; + int refCount; + drmMMListHead *heads; + int num_fences; +}; + +struct _DriFenceObject { + + /* + * These members are constant and need no mutex protection. + */ + struct _DriFenceMgr *mgr; + uint32_t fence_class; + uint32_t fence_type; + + /* + * These members are protected by mgr->mutex. + */ + drmMMListHead head; + int refCount; + + /* + * These members are protected by this->mutex. 
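+    * signaled_type accumulates the fence_type bits observed as signaled so
+    * far; once it equals fence_type the fence is fully signaled.  Note the
+    * locking order documented at the top of this file: a fence's own mutex
+    * is taken before the manager's mutex.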
+ */ + pipe_mutex mutex; + uint32_t signaled_type; + void *private; +}; + +uint32_t +driFenceType(struct _DriFenceObject *fence) +{ + return fence->fence_type; +} + +struct _DriFenceMgr * +driFenceMgrCreate(const struct _DriFenceMgrCreateInfo *info) +{ + struct _DriFenceMgr *tmp; + uint32_t i; + + tmp = calloc(1, sizeof(*tmp)); + if (!tmp) + return NULL; + + pipe_mutex_init(tmp->mutex); + pipe_mutex_lock(tmp->mutex); + tmp->refCount = 1; + tmp->info = *info; + tmp->num_fences = 0; + tmp->heads = calloc(tmp->info.num_classes, sizeof(*tmp->heads)); + if (!tmp->heads) + goto out_err; + + for (i=0; i<tmp->info.num_classes; ++i) { + DRMINITLISTHEAD(&tmp->heads[i]); + } + pipe_mutex_unlock(tmp->mutex); + return tmp; + + out_err: + if (tmp) + free(tmp); + return NULL; +} + +static void +driFenceMgrUnrefUnlock(struct _DriFenceMgr **pMgr) +{ + struct _DriFenceMgr *mgr = *pMgr; + + *pMgr = NULL; + if (--mgr->refCount == 0) + free(mgr); + else + pipe_mutex_unlock(mgr->mutex); +} + +void +driFenceMgrUnReference(struct _DriFenceMgr **pMgr) +{ + pipe_mutex_lock((*pMgr)->mutex); + driFenceMgrUnrefUnlock(pMgr); +} + +static void +driFenceUnReferenceLocked(struct _DriFenceObject **pFence) +{ + struct _DriFenceObject *fence = *pFence; + struct _DriFenceMgr *mgr = fence->mgr; + + *pFence = NULL; + if (--fence->refCount == 0) { + DRMLISTDELINIT(&fence->head); + if (fence->private) + mgr->info.unreference(mgr, &fence->private); + --mgr->num_fences; + fence->mgr = NULL; + --mgr->refCount; + free(fence); + + } +} + + +static void +driSignalPreviousFencesLocked(struct _DriFenceMgr *mgr, + drmMMListHead *list, + uint32_t fence_class, + uint32_t fence_type) +{ + struct _DriFenceObject *entry; + drmMMListHead *prev; + + while(list != &mgr->heads[fence_class]) { + entry = DRMLISTENTRY(struct _DriFenceObject, list, head); + + /* + * Up refcount so that entry doesn't disappear from under us + * when we unlock-relock mgr to get the correct locking order. + */ + + ++entry->refCount; + pipe_mutex_unlock(mgr->mutex); + pipe_mutex_lock(entry->mutex); + pipe_mutex_lock(mgr->mutex); + + prev = list->prev; + + + + if (list->prev == list) { + + /* + * Somebody else removed the entry from the list. 
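+           * This can happen because mgr->mutex was dropped and re-acquired
+           * above to honour the fence-before-manager locking order; another
+           * thread may have signaled and unlinked the entry meanwhile.  In
+           * that case just drop the temporary reference and stop walking.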
+ */ + + pipe_mutex_unlock(entry->mutex); + driFenceUnReferenceLocked(&entry); + return; + } + + entry->signaled_type |= (fence_type & entry->fence_type); + if (entry->signaled_type == entry->fence_type) { + DRMLISTDELINIT(list); + mgr->info.unreference(mgr, &entry->private); + } + pipe_mutex_unlock(entry->mutex); + driFenceUnReferenceLocked(&entry); + list = prev; + } +} + + +int +driFenceFinish(struct _DriFenceObject *fence, uint32_t fence_type, + int lazy_hint) +{ + struct _DriFenceMgr *mgr = fence->mgr; + int ret = 0; + + pipe_mutex_lock(fence->mutex); + + if ((fence->signaled_type & fence_type) == fence_type) + goto out0; + + ret = mgr->info.finish(mgr, fence->private, fence_type, lazy_hint); + if (ret) + goto out0; + + pipe_mutex_lock(mgr->mutex); + pipe_mutex_unlock(fence->mutex); + + driSignalPreviousFencesLocked(mgr, &fence->head, fence->fence_class, + fence_type); + pipe_mutex_unlock(mgr->mutex); + return 0; + + out0: + pipe_mutex_unlock(fence->mutex); + return ret; +} + +uint32_t driFenceSignaledTypeCached(struct _DriFenceObject *fence) +{ + uint32_t ret; + + pipe_mutex_lock(fence->mutex); + ret = fence->signaled_type; + pipe_mutex_unlock(fence->mutex); + + return ret; +} + +int +driFenceSignaledType(struct _DriFenceObject *fence, uint32_t flush_type, + uint32_t *signaled) +{ + int ret = 0; + struct _DriFenceMgr *mgr; + + pipe_mutex_lock(fence->mutex); + mgr = fence->mgr; + *signaled = fence->signaled_type; + if ((fence->signaled_type & flush_type) == flush_type) + goto out0; + + ret = mgr->info.signaled(mgr, fence->private, flush_type, signaled); + if (ret) { + *signaled = fence->signaled_type; + goto out0; + } + + if ((fence->signaled_type | *signaled) == fence->signaled_type) + goto out0; + + pipe_mutex_lock(mgr->mutex); + pipe_mutex_unlock(fence->mutex); + + driSignalPreviousFencesLocked(mgr, &fence->head, fence->fence_class, + *signaled); + + pipe_mutex_unlock(mgr->mutex); + return 0; + out0: + pipe_mutex_unlock(fence->mutex); + return ret; +} + +struct _DriFenceObject * +driFenceReference(struct _DriFenceObject *fence) +{ + pipe_mutex_lock(fence->mgr->mutex); + ++fence->refCount; + pipe_mutex_unlock(fence->mgr->mutex); + return fence; +} + +void +driFenceUnReference(struct _DriFenceObject **pFence) +{ + struct _DriFenceMgr *mgr; + + if (*pFence == NULL) + return; + + mgr = (*pFence)->mgr; + pipe_mutex_lock(mgr->mutex); + ++mgr->refCount; + driFenceUnReferenceLocked(pFence); + driFenceMgrUnrefUnlock(&mgr); +} + +struct _DriFenceObject +*driFenceCreate(struct _DriFenceMgr *mgr, uint32_t fence_class, + uint32_t fence_type, void *private, size_t private_size) +{ + struct _DriFenceObject *fence; + size_t fence_size = sizeof(*fence); + + if (private_size) + fence_size = ((fence_size + 15) & ~15); + + fence = calloc(1, fence_size + private_size); + + if (!fence) { + int ret = mgr->info.finish(mgr, private, fence_type, 0); + + if (ret) + usleep(10000000); + + return NULL; + } + + pipe_mutex_init(fence->mutex); + pipe_mutex_lock(fence->mutex); + pipe_mutex_lock(mgr->mutex); + fence->refCount = 1; + DRMLISTADDTAIL(&fence->head, &mgr->heads[fence_class]); + fence->mgr = mgr; + ++mgr->refCount; + ++mgr->num_fences; + pipe_mutex_unlock(mgr->mutex); + fence->fence_class = fence_class; + fence->fence_type = fence_type; + fence->signaled_type = 0; + fence->private = private; + if (private_size) { + fence->private = (void *)(((uint8_t *) fence) + fence_size); + memcpy(fence->private, private, private_size); + } + + pipe_mutex_unlock(fence->mutex); + return fence; +} + + +static int 
+tSignaled(struct _DriFenceMgr *mgr, void *private, uint32_t flush_type, + uint32_t *signaled_type) +{ + long fd = (long) mgr->private; + int dummy; + drmFence *fence = (drmFence *) private; + int ret; + + *signaled_type = 0; + ret = drmFenceSignaled((int) fd, fence, flush_type, &dummy); + if (ret) + return ret; + + *signaled_type = fence->signaled; + + return 0; +} + +static int +tFinish(struct _DriFenceMgr *mgr, void *private, uint32_t fence_type, + int lazy_hint) +{ + long fd = (long) mgr->private; + unsigned flags = lazy_hint ? DRM_FENCE_FLAG_WAIT_LAZY : 0; + + return drmFenceWait((int)fd, flags, (drmFence *) private, fence_type); +} + +static int +tUnref(struct _DriFenceMgr *mgr, void **private) +{ + long fd = (long) mgr->private; + drmFence *fence = (drmFence *) *private; + *private = NULL; + + return drmFenceUnreference(fd, fence); +} + +struct _DriFenceMgr *driFenceMgrTTMInit(int fd) +{ + struct _DriFenceMgrCreateInfo info; + struct _DriFenceMgr *mgr; + + info.flags = DRI_FENCE_CLASS_ORDERED; + info.num_classes = 4; + info.signaled = tSignaled; + info.finish = tFinish; + info.unreference = tUnref; + + mgr = driFenceMgrCreate(&info); + if (mgr == NULL) + return NULL; + + mgr->private = (void *) (long) fd; + return mgr; +} + diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_fencemgr.h b/src/gallium/winsys/drm/intel/common/ws_dri_fencemgr.h new file mode 100644 index 0000000000..4ea58dfe18 --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/ws_dri_fencemgr.h @@ -0,0 +1,115 @@ +#ifndef DRI_FENCEMGR_H +#define DRI_FENCEMGR_H + +#include <stdint.h> +#include <stdlib.h> + +struct _DriFenceObject; +struct _DriFenceMgr; + +/* + * Do a quick check to see if the fence manager has registered the fence + * object as signaled. Note that this function may return a false negative + * answer. + */ +extern uint32_t driFenceSignaledTypeCached(struct _DriFenceObject *fence); + +/* + * Check if the fence object is signaled. This function can be substantially + * more expensive to call than the above function, but will not return a false + * negative answer. The argument "flush_type" sets the types that the + * underlying mechanism must make sure will eventually signal. + */ +extern int driFenceSignaledType(struct _DriFenceObject *fence, + uint32_t flush_type, uint32_t *signaled); + +/* + * Convenience functions. + */ + +static inline int driFenceSignaled(struct _DriFenceObject *fence, + uint32_t flush_type) +{ + uint32_t signaled_types; + int ret = driFenceSignaledType(fence, flush_type, &signaled_types); + if (ret) + return 0; + return ((signaled_types & flush_type) == flush_type); +} + +static inline int driFenceSignaledCached(struct _DriFenceObject *fence, + uint32_t flush_type) +{ + uint32_t signaled_types = + driFenceSignaledTypeCached(fence); + + return ((signaled_types & flush_type) == flush_type); +} + +/* + * Reference a fence object. + */ +extern struct _DriFenceObject *driFenceReference(struct _DriFenceObject *fence); + +/* + * Unreference a fence object. The fence object pointer will be reset to NULL. + */ + +extern void driFenceUnReference(struct _DriFenceObject **pFence); + + +/* + * Wait for a fence to signal the indicated fence_type. + * If "lazy_hint" is true, it indicates that the wait may sleep to avoid + * busy-wait polling. + */ +extern int driFenceFinish(struct _DriFenceObject *fence, uint32_t fence_type, + int lazy_hint); + +/* + * Create a DriFenceObject for manager "mgr". 
+ * + * "private" is a pointer that should be used for the callbacks in + * struct _DriFenceMgrCreateInfo. + * + * if private_size is nonzero, then the info stored at *private, with size + * private size will be copied and the fence manager will instead use a + * pointer to the copied data for the callbacks in + * struct _DriFenceMgrCreateInfo. In that case, the object pointed to by + * "private" may be destroyed after the call to driFenceCreate. + */ +extern struct _DriFenceObject *driFenceCreate(struct _DriFenceMgr *mgr, + uint32_t fence_class, + uint32_t fence_type, + void *private, + size_t private_size); + +extern uint32_t driFenceType(struct _DriFenceObject *fence); + +/* + * Fence creations are ordered. If a fence signals a fence_type, + * it is safe to assume that all fences of the same class that was + * created before that fence has signaled the same type. + */ + +#define DRI_FENCE_CLASS_ORDERED (1 << 0) + +struct _DriFenceMgrCreateInfo { + uint32_t flags; + uint32_t num_classes; + int (*signaled) (struct _DriFenceMgr *mgr, void *private, uint32_t flush_type, + uint32_t *signaled_type); + int (*finish) (struct _DriFenceMgr *mgr, void *private, uint32_t fence_type, int lazy_hint); + int (*unreference) (struct _DriFenceMgr *mgr, void **private); +}; + +extern struct _DriFenceMgr * +driFenceMgrCreate(const struct _DriFenceMgrCreateInfo *info); + +void +driFenceMgrUnReference(struct _DriFenceMgr **pMgr); + +extern struct _DriFenceMgr * +driFenceMgrTTMInit(int fd); + +#endif diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_mallocpool.c b/src/gallium/winsys/drm/intel/common/ws_dri_mallocpool.c new file mode 100644 index 0000000000..60924eac9e --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/ws_dri_mallocpool.c @@ -0,0 +1,161 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, TX., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * + **************************************************************************/ +/* + * Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com> + */ + +#include <xf86drm.h> +#include <stdlib.h> +#include <errno.h> +#include "pipe/p_debug.h" +#include "pipe/p_thread.h" +#include "ws_dri_bufpool.h" +#include "ws_dri_bufmgr.h" + +static void * +pool_create(struct _DriBufferPool *pool, + unsigned long size, uint64_t flags, unsigned hint, + unsigned alignment) +{ + unsigned long *private = malloc(size + 2*sizeof(unsigned long)); + if ((flags & DRM_BO_MASK_MEM) != DRM_BO_FLAG_MEM_LOCAL) + abort(); + + *private = size; + return (void *)private; +} + + +static int +pool_destroy(struct _DriBufferPool *pool, void *private) +{ + free(private); + return 0; +} + +static int +pool_waitIdle(struct _DriBufferPool *pool, void *private, + pipe_mutex *mutex, int lazy) +{ + return 0; +} + +static int +pool_map(struct _DriBufferPool *pool, void *private, unsigned flags, + int hint, pipe_mutex *mutex, void **virtual) +{ + *virtual = (void *)((unsigned long *)private + 2); + return 0; +} + +static int +pool_unmap(struct _DriBufferPool *pool, void *private) +{ + return 0; +} + +static unsigned long +pool_offset(struct _DriBufferPool *pool, void *private) +{ + /* + * BUG + */ + abort(); + return 0UL; +} + +static unsigned long +pool_poolOffset(struct _DriBufferPool *pool, void *private) +{ + /* + * BUG + */ + abort(); +} + +static uint64_t +pool_flags(struct _DriBufferPool *pool, void *private) +{ + return DRM_BO_FLAG_MEM_LOCAL | DRM_BO_FLAG_CACHED; +} + +static unsigned long +pool_size(struct _DriBufferPool *pool, void *private) +{ + return *(unsigned long *) private; +} + + +static int +pool_fence(struct _DriBufferPool *pool, void *private, + struct _DriFenceObject *fence) +{ + abort(); + return 0UL; +} + +static drmBO * +pool_kernel(struct _DriBufferPool *pool, void *private) +{ + abort(); + return NULL; +} + +static void +pool_takedown(struct _DriBufferPool *pool) +{ + free(pool); +} + + +struct _DriBufferPool * +driMallocPoolInit(void) +{ + struct _DriBufferPool *pool; + + pool = (struct _DriBufferPool *) malloc(sizeof(*pool)); + if (!pool) + return NULL; + + pool->data = NULL; + pool->fd = -1; + pool->map = &pool_map; + pool->unmap = &pool_unmap; + pool->destroy = &pool_destroy; + pool->offset = &pool_offset; + pool->poolOffset = &pool_poolOffset; + pool->flags = &pool_flags; + pool->size = &pool_size; + pool->create = &pool_create; + pool->fence = &pool_fence; + pool->kernel = &pool_kernel; + pool->validate = NULL; + pool->waitIdle = &pool_waitIdle; + pool->takeDown = &pool_takedown; + return pool; +} diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_slabpool.c b/src/gallium/winsys/drm/intel/common/ws_dri_slabpool.c new file mode 100644 index 0000000000..391cea50a7 --- /dev/null +++ b/src/gallium/winsys/drm/intel/common/ws_dri_slabpool.c @@ -0,0 +1,968 @@ +/************************************************************************** + * + * Copyright 2006-2008 Tungsten Graphics, Inc., Cedar Park, TX., USA + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * + **************************************************************************/ +/* + * Authors: Thomas Hellstrom <thomas-at-tungstengraphics-dot-com> + */ + +#include <stdint.h> +#include <sys/time.h> +#include <errno.h> +#include <unistd.h> +#include <assert.h> +#include "ws_dri_bufpool.h" +#include "ws_dri_fencemgr.h" +#include "ws_dri_bufmgr.h" +#include "pipe/p_thread.h" + +#define DRI_SLABPOOL_ALLOC_RETRIES 100 + +struct _DriSlab; + +struct _DriSlabBuffer { + int isSlabBuffer; + drmBO *bo; + struct _DriFenceObject *fence; + struct _DriSlab *parent; + drmMMListHead head; + uint32_t mapCount; + uint32_t start; + uint32_t fenceType; + int unFenced; + pipe_condvar event; +}; + +struct _DriKernelBO { + int fd; + drmBO bo; + drmMMListHead timeoutHead; + drmMMListHead head; + struct timeval timeFreed; + uint32_t pageAlignment; + void *virtual; +}; + +struct _DriSlab{ + drmMMListHead head; + drmMMListHead freeBuffers; + uint32_t numBuffers; + uint32_t numFree; + struct _DriSlabBuffer *buffers; + struct _DriSlabSizeHeader *header; + struct _DriKernelBO *kbo; +}; + + +struct _DriSlabSizeHeader { + drmMMListHead slabs; + drmMMListHead freeSlabs; + drmMMListHead delayedBuffers; + uint32_t numDelayed; + struct _DriSlabPool *slabPool; + uint32_t bufSize; + pipe_mutex mutex; +}; + +struct _DriFreeSlabManager { + struct timeval slabTimeout; + struct timeval checkInterval; + struct timeval nextCheck; + drmMMListHead timeoutList; + drmMMListHead unCached; + drmMMListHead cached; + pipe_mutex mutex; +}; + + +struct _DriSlabPool { + + /* + * The data of this structure remains constant after + * initialization and thus needs no mutex protection. + */ + + struct _DriFreeSlabManager *fMan; + uint64_t proposedFlags; + uint64_t validMask; + uint32_t *bucketSizes; + uint32_t numBuckets; + uint32_t pageSize; + int fd; + int pageAlignment; + int maxSlabSize; + int desiredNumBuffers; + struct _DriSlabSizeHeader *headers; +}; + +/* + * FIXME: Perhaps arrange timeout slabs in size buckets for fast + * retreival?? 
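+ * Today driAllocKernelBO() below walks the manager's cached/unCached lists
+ * linearly, looking for a kernel BO of matching size (and preferably
+ * matching proposedFlags).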
+ */ + + +static inline int +driTimeAfterEq(struct timeval *arg1, struct timeval *arg2) +{ + return ((arg1->tv_sec > arg2->tv_sec) || + ((arg1->tv_sec == arg2->tv_sec) && + (arg1->tv_usec > arg2->tv_usec))); +} + +static inline void +driTimeAdd(struct timeval *arg, struct timeval *add) +{ + unsigned int sec; + + arg->tv_sec += add->tv_sec; + arg->tv_usec += add->tv_usec; + sec = arg->tv_usec / 1000000; + arg->tv_sec += sec; + arg->tv_usec -= sec*1000000; +} + +static void +driFreeKernelBO(struct _DriKernelBO *kbo) +{ + if (!kbo) + return; + + (void) drmBOUnreference(kbo->fd, &kbo->bo); + free(kbo); +} + + +static void +driFreeTimeoutKBOsLocked(struct _DriFreeSlabManager *fMan, + struct timeval *time) +{ + drmMMListHead *list, *next; + struct _DriKernelBO *kbo; + + if (!driTimeAfterEq(time, &fMan->nextCheck)) + return; + + for (list = fMan->timeoutList.next, next = list->next; + list != &fMan->timeoutList; + list = next, next = list->next) { + + kbo = DRMLISTENTRY(struct _DriKernelBO, list, timeoutHead); + + if (!driTimeAfterEq(time, &kbo->timeFreed)) + break; + + DRMLISTDELINIT(&kbo->timeoutHead); + DRMLISTDELINIT(&kbo->head); + driFreeKernelBO(kbo); + } + + fMan->nextCheck = *time; + driTimeAdd(&fMan->nextCheck, &fMan->checkInterval); +} + + +/* + * Add a _DriKernelBO to the free slab manager. + * This means that it is available for reuse, but if it's not + * reused in a while, it will be freed. + */ + +static void +driSetKernelBOFree(struct _DriFreeSlabManager *fMan, + struct _DriKernelBO *kbo) +{ + struct timeval time; + + pipe_mutex_lock(fMan->mutex); + gettimeofday(&time, NULL); + driTimeAdd(&time, &fMan->slabTimeout); + + kbo->timeFreed = time; + + if (kbo->bo.flags & DRM_BO_FLAG_CACHED) + DRMLISTADD(&kbo->head, &fMan->cached); + else + DRMLISTADD(&kbo->head, &fMan->unCached); + + DRMLISTADDTAIL(&kbo->timeoutHead, &fMan->timeoutList); + driFreeTimeoutKBOsLocked(fMan, &time); + + pipe_mutex_unlock(fMan->mutex); +} + +/* + * Get a _DriKernelBO for us to use as storage for a slab. + * + */ + +static struct _DriKernelBO * +driAllocKernelBO(struct _DriSlabSizeHeader *header) + +{ + struct _DriSlabPool *slabPool = header->slabPool; + struct _DriFreeSlabManager *fMan = slabPool->fMan; + drmMMListHead *list, *next, *head; + uint32_t size = header->bufSize * slabPool->desiredNumBuffers; + struct _DriKernelBO *kbo; + struct _DriKernelBO *kboTmp; + int ret; + + /* + * FIXME: We should perhaps allow some variation in slabsize in order + * to efficiently reuse slabs. + */ + + size = (size <= slabPool->maxSlabSize) ? size : slabPool->maxSlabSize; + size = (size + slabPool->pageSize - 1) & ~(slabPool->pageSize - 1); + pipe_mutex_lock(fMan->mutex); + + kbo = NULL; + + retry: + head = (slabPool->proposedFlags & DRM_BO_FLAG_CACHED) ? 
+ &fMan->cached : &fMan->unCached; + + for (list = head->next, next = list->next; + list != head; + list = next, next = list->next) { + + kboTmp = DRMLISTENTRY(struct _DriKernelBO, list, head); + + if ((kboTmp->bo.size == size) && + (slabPool->pageAlignment == 0 || + (kboTmp->pageAlignment % slabPool->pageAlignment) == 0)) { + + if (!kbo) + kbo = kboTmp; + + if ((kbo->bo.proposedFlags ^ slabPool->proposedFlags) == 0) + break; + + } + } + + if (kbo) { + DRMLISTDELINIT(&kbo->head); + DRMLISTDELINIT(&kbo->timeoutHead); + } + + pipe_mutex_unlock(fMan->mutex); + + if (kbo) { + uint64_t new_mask = kbo->bo.proposedFlags ^ slabPool->proposedFlags; + + ret = 0; + if (new_mask) { + ret = drmBOSetStatus(kbo->fd, &kbo->bo, slabPool->proposedFlags, + new_mask, DRM_BO_HINT_DONT_FENCE, 0, 0); + } + if (ret == 0) + return kbo; + + driFreeKernelBO(kbo); + kbo = NULL; + goto retry; + } + + kbo = calloc(1, sizeof(struct _DriKernelBO)); + if (!kbo) + return NULL; + + kbo->fd = slabPool->fd; + DRMINITLISTHEAD(&kbo->head); + DRMINITLISTHEAD(&kbo->timeoutHead); + ret = drmBOCreate(kbo->fd, size, slabPool->pageAlignment, NULL, + slabPool->proposedFlags, + DRM_BO_HINT_DONT_FENCE, &kbo->bo); + if (ret) + goto out_err0; + + ret = drmBOMap(kbo->fd, &kbo->bo, + DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE, + 0, &kbo->virtual); + + if (ret) + goto out_err1; + + ret = drmBOUnmap(kbo->fd, &kbo->bo); + if (ret) + goto out_err1; + + return kbo; + + out_err1: + drmBOUnreference(kbo->fd, &kbo->bo); + out_err0: + free(kbo); + return NULL; +} + + +static int +driAllocSlab(struct _DriSlabSizeHeader *header) +{ + struct _DriSlab *slab; + struct _DriSlabBuffer *buf; + uint32_t numBuffers; + int ret; + int i; + + slab = calloc(1, sizeof(*slab)); + if (!slab) + return -ENOMEM; + + slab->kbo = driAllocKernelBO(header); + if (!slab->kbo) { + ret = -ENOMEM; + goto out_err0; + } + + numBuffers = slab->kbo->bo.size / header->bufSize; + + slab->buffers = calloc(numBuffers, sizeof(*slab->buffers)); + if (!slab->buffers) { + ret = -ENOMEM; + goto out_err1; + } + + DRMINITLISTHEAD(&slab->head); + DRMINITLISTHEAD(&slab->freeBuffers); + slab->numBuffers = numBuffers; + slab->numFree = 0; + slab->header = header; + + buf = slab->buffers; + for (i=0; i < numBuffers; ++i) { + buf->parent = slab; + buf->start = i* header->bufSize; + buf->mapCount = 0; + buf->isSlabBuffer = 1; + pipe_condvar_init(buf->event); + DRMLISTADDTAIL(&buf->head, &slab->freeBuffers); + slab->numFree++; + buf++; + } + + DRMLISTADDTAIL(&slab->head, &header->slabs); + + return 0; + + out_err1: + driSetKernelBOFree(header->slabPool->fMan, slab->kbo); + free(slab->buffers); + out_err0: + free(slab); + return ret; +} + +/* + * Delete a buffer from the slab header delayed list and put + * it on the slab free list. 
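+ * If the slab becomes entirely free it is moved from the header's slabs
+ * list to its freeSlabs list; the cleanup loop at the end of this function
+ * then decides whether such slabs are kept around for reuse or handed back
+ * to the free-slab manager via driSetKernelBOFree(), which destroys their
+ * kernel BOs after a timeout.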
+ */ + +static void +driSlabFreeBufferLocked(struct _DriSlabBuffer *buf) +{ + struct _DriSlab *slab = buf->parent; + struct _DriSlabSizeHeader *header = slab->header; + drmMMListHead *list = &buf->head; + + DRMLISTDEL(list); + DRMLISTADDTAIL(list, &slab->freeBuffers); + slab->numFree++; + + if (slab->head.next == &slab->head) + DRMLISTADDTAIL(&slab->head, &header->slabs); + + if (slab->numFree == slab->numBuffers) { + list = &slab->head; + DRMLISTDEL(list); + DRMLISTADDTAIL(list, &header->freeSlabs); + } + + if (header->slabs.next == &header->slabs || + slab->numFree != slab->numBuffers) { + + drmMMListHead *next; + struct _DriFreeSlabManager *fMan = header->slabPool->fMan; + + for (list = header->freeSlabs.next, next = list->next; + list != &header->freeSlabs; + list = next, next = list->next) { + + slab = DRMLISTENTRY(struct _DriSlab, list, head); + + DRMLISTDELINIT(list); + driSetKernelBOFree(fMan, slab->kbo); + free(slab->buffers); + free(slab); + } + } +} + +static void +driSlabCheckFreeLocked(struct _DriSlabSizeHeader *header, int wait) +{ + drmMMListHead *list, *prev, *first; + struct _DriSlabBuffer *buf; + struct _DriSlab *slab; + int firstWasSignaled = 1; + int signaled; + int i; + int ret; + + /* + * Rerun the freeing test if the youngest tested buffer + * was signaled, since there might be more idle buffers + * in the delay list. + */ + + while (firstWasSignaled) { + firstWasSignaled = 0; + signaled = 0; + first = header->delayedBuffers.next; + + /* Only examine the oldest 1/3 of delayed buffers: + */ + if (header->numDelayed > 3) { + for (i = 0; i < header->numDelayed; i += 3) { + first = first->next; + } + } + + for (list = first, prev = list->prev; + list != &header->delayedBuffers; + list = prev, prev = list->prev) { + buf = DRMLISTENTRY(struct _DriSlabBuffer, list, head); + slab = buf->parent; + + if (!signaled) { + if (wait) { + ret = driFenceFinish(buf->fence, buf->fenceType, 0); + if (ret) + break; + signaled = 1; + wait = 0; + } else { + signaled = driFenceSignaled(buf->fence, buf->fenceType); + } + if (signaled) { + if (list == first) + firstWasSignaled = 1; + driFenceUnReference(&buf->fence); + header->numDelayed--; + driSlabFreeBufferLocked(buf); + } + } else if (driFenceSignaledCached(buf->fence, buf->fenceType)) { + driFenceUnReference(&buf->fence); + header->numDelayed--; + driSlabFreeBufferLocked(buf); + } + } + } +} + + +static struct _DriSlabBuffer * +driSlabAllocBuffer(struct _DriSlabSizeHeader *header) +{ + static struct _DriSlabBuffer *buf; + struct _DriSlab *slab; + drmMMListHead *list; + int count = DRI_SLABPOOL_ALLOC_RETRIES; + + pipe_mutex_lock(header->mutex); + while(header->slabs.next == &header->slabs && count > 0) { + driSlabCheckFreeLocked(header, 0); + if (header->slabs.next != &header->slabs) + break; + + pipe_mutex_unlock(header->mutex); + if (count != DRI_SLABPOOL_ALLOC_RETRIES) + usleep(1); + pipe_mutex_lock(header->mutex); + (void) driAllocSlab(header); + count--; + } + + list = header->slabs.next; + if (list == &header->slabs) { + pipe_mutex_unlock(header->mutex); + return NULL; + } + slab = DRMLISTENTRY(struct _DriSlab, list, head); + if (--slab->numFree == 0) + DRMLISTDELINIT(list); + + list = slab->freeBuffers.next; + DRMLISTDELINIT(list); + + pipe_mutex_unlock(header->mutex); + buf = DRMLISTENTRY(struct _DriSlabBuffer, list, head); + return buf; +} + +static void * +pool_create(struct _DriBufferPool *driPool, unsigned long size, + uint64_t flags, unsigned hint, unsigned alignment) +{ + struct _DriSlabPool *pool = (struct _DriSlabPool *) 
driPool->data; + struct _DriSlabSizeHeader *header; + struct _DriSlabBuffer *buf; + void *dummy; + int i; + int ret; + + /* + * FIXME: Check for compatibility. + */ + + header = pool->headers; + for (i=0; i<pool->numBuckets; ++i) { + if (header->bufSize >= size) + break; + header++; + } + + if (i < pool->numBuckets) + return driSlabAllocBuffer(header); + + + /* + * Fall back to allocate a buffer object directly from DRM. + * and wrap it in a driBO structure. + */ + + + buf = calloc(1, sizeof(*buf)); + + if (!buf) + return NULL; + + buf->bo = calloc(1, sizeof(*buf->bo)); + if (!buf->bo) + goto out_err0; + + if (alignment) { + if ((alignment < pool->pageSize) && (pool->pageSize % alignment)) + goto out_err1; + if ((alignment > pool->pageSize) && (alignment % pool->pageSize)) + goto out_err1; + } + + ret = drmBOCreate(pool->fd, size, alignment / pool->pageSize, NULL, + flags, hint, buf->bo); + if (ret) + goto out_err1; + + ret = drmBOMap(pool->fd, buf->bo, DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE, + 0, &dummy); + if (ret) + goto out_err2; + + ret = drmBOUnmap(pool->fd, buf->bo); + if (ret) + goto out_err2; + + return buf; + out_err2: + drmBOUnreference(pool->fd, buf->bo); + out_err1: + free(buf->bo); + out_err0: + free(buf); + return NULL; +} + +static int +pool_destroy(struct _DriBufferPool *driPool, void *private) +{ + struct _DriSlabBuffer *buf = + (struct _DriSlabBuffer *) private; + struct _DriSlab *slab; + struct _DriSlabSizeHeader *header; + + if (!buf->isSlabBuffer) { + struct _DriSlabPool *pool = (struct _DriSlabPool *) driPool->data; + int ret; + + ret = drmBOUnreference(pool->fd, buf->bo); + free(buf->bo); + free(buf); + return ret; + } + + slab = buf->parent; + header = slab->header; + + pipe_mutex_lock(header->mutex); + buf->unFenced = 0; + buf->mapCount = 0; + + if (buf->fence && !driFenceSignaledCached(buf->fence, buf->fenceType)) { + DRMLISTADDTAIL(&buf->head, &header->delayedBuffers); + header->numDelayed++; + } else { + if (buf->fence) + driFenceUnReference(&buf->fence); + driSlabFreeBufferLocked(buf); + } + + pipe_mutex_unlock(header->mutex); + return 0; +} + +static int +pool_waitIdle(struct _DriBufferPool *driPool, void *private, + pipe_mutex *mutex, int lazy) +{ + struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private; + + while(buf->unFenced) + pipe_condvar_wait(buf->event, *mutex); + + if (!buf->fence) + return 0; + + driFenceFinish(buf->fence, buf->fenceType, lazy); + driFenceUnReference(&buf->fence); + + return 0; +} + +static int +pool_map(struct _DriBufferPool *pool, void *private, unsigned flags, + int hint, pipe_mutex *mutex, void **virtual) +{ + struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private; + int busy; + + if (buf->isSlabBuffer) + busy = buf->unFenced || (buf->fence && !driFenceSignaledCached(buf->fence, buf->fenceType)); + else + busy = buf->fence && !driFenceSignaled(buf->fence, buf->fenceType); + + + if (busy) { + if (hint & DRM_BO_HINT_DONT_BLOCK) + return -EBUSY; + else { + (void) pool_waitIdle(pool, private, mutex, 0); + } + } + + ++buf->mapCount; + *virtual = (buf->isSlabBuffer) ? 
+ (void *) ((uint8_t *) buf->parent->kbo->virtual + buf->start) : + (void *) buf->bo->virtual; + + return 0; +} + +static int +pool_unmap(struct _DriBufferPool *pool, void *private) +{ + struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private; + + --buf->mapCount; + if (buf->mapCount == 0 && buf->isSlabBuffer) + pipe_condvar_broadcast(buf->event); + + return 0; +} + +static unsigned long +pool_offset(struct _DriBufferPool *pool, void *private) +{ + struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private; + struct _DriSlab *slab; + struct _DriSlabSizeHeader *header; + + if (!buf->isSlabBuffer) { + assert(buf->bo->proposedFlags & DRM_BO_FLAG_NO_MOVE); + return buf->bo->offset; + } + + slab = buf->parent; + header = slab->header; + + (void) header; + assert(header->slabPool->proposedFlags & DRM_BO_FLAG_NO_MOVE); + return slab->kbo->bo.offset + buf->start; +} + +static unsigned long +pool_poolOffset(struct _DriBufferPool *pool, void *private) +{ + struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private; + + return buf->start; +} + +static uint64_t +pool_flags(struct _DriBufferPool *pool, void *private) +{ + struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private; + + if (!buf->isSlabBuffer) + return buf->bo->flags; + + return buf->parent->kbo->bo.flags; +} + +static unsigned long +pool_size(struct _DriBufferPool *pool, void *private) +{ + struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private; + if (!buf->isSlabBuffer) + return buf->bo->size; + + return buf->parent->header->bufSize; +} + +static int +pool_fence(struct _DriBufferPool *pool, void *private, + struct _DriFenceObject *fence) +{ + struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private; + drmBO *bo; + + if (buf->fence) + driFenceUnReference(&buf->fence); + + buf->fence = driFenceReference(fence); + bo = (buf->isSlabBuffer) ? + &buf->parent->kbo->bo: + buf->bo; + buf->fenceType = bo->fenceFlags; + + buf->unFenced = 0; + pipe_condvar_broadcast(buf->event); + + return 0; +} + +static drmBO * +pool_kernel(struct _DriBufferPool *pool, void *private) +{ + struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private; + + return (buf->isSlabBuffer) ? 
&buf->parent->kbo->bo : buf->bo; +} + +static int +pool_validate(struct _DriBufferPool *pool, void *private, + pipe_mutex *mutex) +{ + struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private; + + if (!buf->isSlabBuffer) + return 0; + + while(buf->mapCount != 0) + pipe_condvar_wait(buf->event, *mutex); + + buf->unFenced = 1; + return 0; +} + + +struct _DriFreeSlabManager * +driInitFreeSlabManager(uint32_t checkIntervalMsec, uint32_t slabTimeoutMsec) +{ + struct _DriFreeSlabManager *tmp; + + tmp = calloc(1, sizeof(*tmp)); + if (!tmp) + return NULL; + + pipe_mutex_init(tmp->mutex); + pipe_mutex_lock(tmp->mutex); + tmp->slabTimeout.tv_usec = slabTimeoutMsec*1000; + tmp->slabTimeout.tv_sec = tmp->slabTimeout.tv_usec / 1000000; + tmp->slabTimeout.tv_usec -= tmp->slabTimeout.tv_sec*1000000; + + tmp->checkInterval.tv_usec = checkIntervalMsec*1000; + tmp->checkInterval.tv_sec = tmp->checkInterval.tv_usec / 1000000; + tmp->checkInterval.tv_usec -= tmp->checkInterval.tv_sec*1000000; + + gettimeofday(&tmp->nextCheck, NULL); + driTimeAdd(&tmp->nextCheck, &tmp->checkInterval); + DRMINITLISTHEAD(&tmp->timeoutList); + DRMINITLISTHEAD(&tmp->unCached); + DRMINITLISTHEAD(&tmp->cached); + pipe_mutex_unlock(tmp->mutex); + + return tmp; +} + +void +driFinishFreeSlabManager(struct _DriFreeSlabManager *fMan) +{ + struct timeval time; + + time = fMan->nextCheck; + driTimeAdd(&time, &fMan->checkInterval); + + pipe_mutex_lock(fMan->mutex); + driFreeTimeoutKBOsLocked(fMan, &time); + pipe_mutex_unlock(fMan->mutex); + + assert(fMan->timeoutList.next == &fMan->timeoutList); + assert(fMan->unCached.next == &fMan->unCached); + assert(fMan->cached.next == &fMan->cached); + + free(fMan); +} + +static void +driInitSizeHeader(struct _DriSlabPool *pool, uint32_t size, + struct _DriSlabSizeHeader *header) +{ + pipe_mutex_init(header->mutex); + pipe_mutex_lock(header->mutex); + + DRMINITLISTHEAD(&header->slabs); + DRMINITLISTHEAD(&header->freeSlabs); + DRMINITLISTHEAD(&header->delayedBuffers); + + header->numDelayed = 0; + header->slabPool = pool; + header->bufSize = size; + + pipe_mutex_unlock(header->mutex); +} + +static void +driFinishSizeHeader(struct _DriSlabSizeHeader *header) +{ + drmMMListHead *list, *next; + struct _DriSlabBuffer *buf; + + pipe_mutex_lock(header->mutex); + for (list = header->delayedBuffers.next, next = list->next; + list != &header->delayedBuffers; + list = next, next = list->next) { + + buf = DRMLISTENTRY(struct _DriSlabBuffer, list , head); + if (buf->fence) { + (void) driFenceFinish(buf->fence, buf->fenceType, 0); + driFenceUnReference(&buf->fence); + } + header->numDelayed--; + driSlabFreeBufferLocked(buf); + } + pipe_mutex_unlock(header->mutex); +} + +static void +pool_takedown(struct _DriBufferPool *driPool) +{ + struct _DriSlabPool *pool = driPool->data; + int i; + + for (i=0; i<pool->numBuckets; ++i) { + driFinishSizeHeader(&pool->headers[i]); + } + + free(pool->headers); + free(pool->bucketSizes); + free(pool); + free(driPool); +} + +struct _DriBufferPool * +driSlabPoolInit(int fd, uint64_t flags, + uint64_t validMask, + uint32_t smallestSize, + uint32_t numSizes, + uint32_t desiredNumBuffers, + uint32_t maxSlabSize, + uint32_t pageAlignment, + struct _DriFreeSlabManager *fMan) +{ + struct _DriBufferPool *driPool; + struct _DriSlabPool *pool; + uint32_t i; + + driPool = calloc(1, sizeof(*driPool)); + if (!driPool) + return NULL; + + pool = calloc(1, sizeof(*pool)); + if (!pool) + goto out_err0; + + pool->bucketSizes = calloc(numSizes, sizeof(*pool->bucketSizes)); + if 
(!pool->bucketSizes) + goto out_err1; + + pool->headers = calloc(numSizes, sizeof(*pool->headers)); + if (!pool->headers) + goto out_err2; + + pool->fMan = fMan; + pool->proposedFlags = flags; + pool->validMask = validMask; + pool->numBuckets = numSizes; + pool->pageSize = getpagesize(); + pool->fd = fd; + pool->pageAlignment = pageAlignment; + pool->maxSlabSize = maxSlabSize; + pool->desiredNumBuffers = desiredNumBuffers; + + for (i=0; i<pool->numBuckets; ++i) { + pool->bucketSizes[i] = (smallestSize << i); + driInitSizeHeader(pool, pool->bucketSizes[i], + &pool->headers[i]); + } + + driPool->data = (void *) pool; + driPool->map = &pool_map; + driPool->unmap = &pool_unmap; + driPool->destroy = &pool_destroy; + driPool->offset = &pool_offset; + driPool->poolOffset = &pool_poolOffset; + driPool->flags = &pool_flags; + driPool->size = &pool_size; + driPool->create = &pool_create; + driPool->fence = &pool_fence; + driPool->kernel = &pool_kernel; + driPool->validate = &pool_validate; + driPool->waitIdle = &pool_waitIdle; + driPool->takeDown = &pool_takedown; + + return driPool; + + out_err2: + free(pool->bucketSizes); + out_err1: + free(pool); + out_err0: + free(driPool); + + return NULL; +} diff --git a/src/gallium/winsys/drm/intel/dri/Makefile b/src/gallium/winsys/drm/intel/dri/Makefile new file mode 100644 index 0000000000..2046441a22 --- /dev/null +++ b/src/gallium/winsys/drm/intel/dri/Makefile @@ -0,0 +1,33 @@ +TOP = ../../../../../.. +include $(TOP)/configs/current + +LIBNAME = i915_dri.so +LIBNAME_EGL = egl_i915_dri.so + +PIPE_DRIVERS = \ + $(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \ + ../common/libinteldrm.a \ + $(TOP)/src/gallium/drivers/i915simple/libi915simple.a + + +DRIVER_SOURCES = \ + intel_winsys_softpipe.c \ + intel_swapbuffers.c \ + intel_context.c \ + intel_lock.c \ + intel_screen.c + +C_SOURCES = \ + $(COMMON_GALLIUM_SOURCES) \ + $(DRIVER_SOURCES) + +ASM_SOURCES = + +DRIVER_DEFINES = -I../common $(shell pkg-config libdrm --atleast-version=2.3.1 \ + && echo "-DDRM_VBLANK_FLIP=DRM_VBLANK_FLIP") + +include ../../Makefile.template + +#intel_tex_layout.o: $(TOP)/src/mesa/drivers/dri/intel/intel_tex_layout.c + +symlinks: diff --git a/src/gallium/winsys/drm/intel/dri/SConscript b/src/gallium/winsys/drm/intel/dri/SConscript new file mode 100644 index 0000000000..6a4f50afcc --- /dev/null +++ b/src/gallium/winsys/drm/intel/dri/SConscript @@ -0,0 +1,41 @@ +Import('*') + +if 'mesa' in env['statetrackers']: + + env = drienv.Clone() + + env.Append(CPPPATH = [ + '../intel', + 'server' + ]) + + #MINIGLX_SOURCES = server/intel_dri.c + + DRIVER_SOURCES = [ + 'intel_winsys_pipe.c', + 'intel_winsys_softpipe.c', + 'intel_winsys_i915.c', + 'intel_batchbuffer.c', + 'intel_swapbuffers.c', + 'intel_context.c', + 'intel_lock.c', + 'intel_screen.c', + 'intel_batchpool.c', + ] + + sources = \ + COMMON_GALLIUM_SOURCES + \ + COMMON_BM_SOURCES + \ + DRIVER_SOURCES + + drivers = [ + softpipe, + i915simple + ] + + # TODO: write a wrapper function http://www.scons.org/wiki/WrapperFunctions + env.SharedLibrary( + target ='i915tex_dri.so', + source = sources, + LIBS = drivers + mesa + auxiliaries + env['LIBS'], + ) diff --git a/src/gallium/winsys/drm/intel/dri/intel_batchbuffer.h b/src/gallium/winsys/drm/intel/dri/intel_batchbuffer.h new file mode 100644 index 0000000000..3e95326168 --- /dev/null +++ b/src/gallium/winsys/drm/intel/dri/intel_batchbuffer.h @@ -0,0 +1,24 @@ +#ifndef INTEL_BATCHBUFFER_H +#define INTEL_BATCHBUFFER_H + +#include "intel_be_batchbuffer.h" + +/* + * Need to redefine the BATCH 
defines + */ + +#undef BEGIN_BATCH +#define BEGIN_BATCH(dwords, relocs) \ + (i915_batchbuffer_check(&intel->base.batch->base, dwords, relocs)) + +#undef OUT_BATCH +#define OUT_BATCH(d) \ + i915_batchbuffer_dword(&intel->base.batch->base, d) + +#undef OUT_RELOC +#define OUT_RELOC(buf,flags,mask,delta) do { \ + assert((delta) >= 0); \ + intel_be_offset_relocation(intel->base.batch, delta, buf, flags, mask); \ +} while (0) + +#endif diff --git a/src/gallium/winsys/drm/intel/dri/intel_context.c b/src/gallium/winsys/drm/intel/dri/intel_context.c new file mode 100644 index 0000000000..97ef731aaa --- /dev/null +++ b/src/gallium/winsys/drm/intel/dri/intel_context.c @@ -0,0 +1,337 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "i830_dri.h" + +#include "intel_screen.h" +#include "intel_context.h" +#include "intel_swapbuffers.h" +#include "intel_batchbuffer.h" +#include "intel_winsys_softpipe.h" + +#include "i915simple/i915_screen.h" + +#include "state_tracker/st_public.h" +#include "state_tracker/st_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_context.h" + +#include "utils.h" + + +#ifdef DEBUG +int __intel_debug = 0; +#endif + + +#define need_GL_ARB_multisample +#define need_GL_ARB_point_parameters +#define need_GL_ARB_texture_compression +#define need_GL_ARB_vertex_buffer_object +#define need_GL_ARB_vertex_program +#define need_GL_ARB_window_pos +#define need_GL_EXT_blend_color +#define need_GL_EXT_blend_equation_separate +#define need_GL_EXT_blend_func_separate +#define need_GL_EXT_blend_minmax +#define need_GL_EXT_cull_vertex +#define need_GL_EXT_fog_coord +#define need_GL_EXT_framebuffer_object +#define need_GL_EXT_multi_draw_arrays +#define need_GL_EXT_secondary_color +#define need_GL_NV_vertex_program +#include "extension_helper.h" + + +/** + * Extension strings exported by the intel driver. + * + * \note + * It appears that ARB_texture_env_crossbar has "disappeared" compared to the + * old i830-specific driver. 
+ */ +const struct dri_extension card_extensions[] = { + {"GL_ARB_multisample", GL_ARB_multisample_functions}, + {"GL_ARB_multitexture", NULL}, + {"GL_ARB_point_parameters", GL_ARB_point_parameters_functions}, + {"GL_ARB_texture_border_clamp", NULL}, + {"GL_ARB_texture_compression", GL_ARB_texture_compression_functions}, + {"GL_ARB_texture_cube_map", NULL}, + {"GL_ARB_texture_env_add", NULL}, + {"GL_ARB_texture_env_combine", NULL}, + {"GL_ARB_texture_env_dot3", NULL}, + {"GL_ARB_texture_mirrored_repeat", NULL}, + {"GL_ARB_texture_rectangle", NULL}, + {"GL_ARB_vertex_buffer_object", GL_ARB_vertex_buffer_object_functions}, + {"GL_ARB_pixel_buffer_object", NULL}, + {"GL_ARB_vertex_program", GL_ARB_vertex_program_functions}, + {"GL_ARB_window_pos", GL_ARB_window_pos_functions}, + {"GL_EXT_blend_color", GL_EXT_blend_color_functions}, + {"GL_EXT_blend_equation_separate", GL_EXT_blend_equation_separate_functions}, + {"GL_EXT_blend_func_separate", GL_EXT_blend_func_separate_functions}, + {"GL_EXT_blend_minmax", GL_EXT_blend_minmax_functions}, + {"GL_EXT_blend_subtract", NULL}, + {"GL_EXT_cull_vertex", GL_EXT_cull_vertex_functions}, + {"GL_EXT_fog_coord", GL_EXT_fog_coord_functions}, + {"GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions}, + {"GL_EXT_multi_draw_arrays", GL_EXT_multi_draw_arrays_functions}, + {"GL_EXT_packed_depth_stencil", NULL}, + {"GL_EXT_pixel_buffer_object", NULL}, + {"GL_EXT_secondary_color", GL_EXT_secondary_color_functions}, + {"GL_EXT_stencil_wrap", NULL}, + {"GL_EXT_texture_edge_clamp", NULL}, + {"GL_EXT_texture_env_combine", NULL}, + {"GL_EXT_texture_env_dot3", NULL}, + {"GL_EXT_texture_filter_anisotropic", NULL}, + {"GL_EXT_texture_lod_bias", NULL}, + {"GL_3DFX_texture_compression_FXT1", NULL}, + {"GL_APPLE_client_storage", NULL}, + {"GL_MESA_pack_invert", NULL}, + {"GL_MESA_ycbcr_texture", NULL}, + {"GL_NV_blend_square", NULL}, + {"GL_NV_vertex_program", GL_NV_vertex_program_functions}, + {"GL_NV_vertex_program1_1", NULL}, + {"GL_SGIS_generate_mipmap", NULL }, + {NULL, NULL} +}; + + + +#ifdef DEBUG +static const struct dri_debug_control debug_control[] = { + {"ioctl", DEBUG_IOCTL}, + {"bat", DEBUG_BATCH}, + {"lock", DEBUG_LOCK}, + {"swap", DEBUG_SWAP}, + {NULL, 0} +}; +#endif + + + +static void +intel_lock_hardware(struct intel_be_context *context) +{ + struct intel_context *intel = (struct intel_context *)context; + LOCK_HARDWARE(intel); +} + +static void +intel_unlock_hardware(struct intel_be_context *context) +{ + struct intel_context *intel = (struct intel_context *)context; + UNLOCK_HARDWARE(intel); +} + +static boolean +intel_locked_hardware(struct intel_be_context *context) +{ + struct intel_context *intel = (struct intel_context *)context; + return intel->locked ? 
TRUE : FALSE; +} + +GLboolean +intelCreateContext(const __GLcontextModes * visual, + __DRIcontextPrivate * driContextPriv, + void *sharedContextPrivate) +{ + struct intel_context *intel = CALLOC_STRUCT(intel_context); + __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv; + struct intel_screen *intelScreen = intel_screen(sPriv); + drmI830Sarea *saPriv = intelScreen->sarea; + int fthrottle_mode; + GLboolean havePools; + struct pipe_context *pipe; + struct st_context *st_share = NULL; + + if (sharedContextPrivate) { + st_share = ((struct intel_context *) sharedContextPrivate)->st; + } + + driContextPriv->driverPrivate = intel; + intel->intelScreen = intelScreen; + intel->driScreen = sPriv; + intel->sarea = saPriv; + + driParseConfigFiles(&intel->optionCache, &intelScreen->optionCache, + intel->driScreen->myNum, "i915"); + + + /* + * memory pools + */ + DRM_LIGHT_LOCK(sPriv->fd, &sPriv->pSAREA->lock, driContextPriv->hHWContext); + // ZZZ JB should be per screen and not be done per context + havePools = intelCreatePools(sPriv); + DRM_UNLOCK(sPriv->fd, &sPriv->pSAREA->lock, driContextPriv->hHWContext); + if (!havePools) + return GL_FALSE; + + + /* Dri stuff */ + intel->hHWContext = driContextPriv->hHWContext; + intel->driFd = sPriv->fd; + intel->driHwLock = (drmLock *) & sPriv->pSAREA->lock; + + fthrottle_mode = driQueryOptioni(&intel->optionCache, "fthrottle_mode"); + intel->iw.irq_seq = -1; + intel->irqsEmitted = 0; + + intel->last_swap_fence = NULL; + intel->first_swap_fence = NULL; + +#ifdef DEBUG + __intel_debug = driParseDebugString(getenv("INTEL_DEBUG"), debug_control); +#endif + intel->base.hardware_lock = intel_lock_hardware; + intel->base.hardware_unlock = intel_unlock_hardware; + intel->base.hardware_locked = intel_locked_hardware; + + intel_be_init_context(&intel->base, &intelScreen->base); + + /* + * Pipe-related setup + */ + if (getenv("INTEL_SP")) { + /* use softpipe driver instead of hw */ + pipe = intel_create_softpipe( intel, &intelScreen->base.base ); + } + else { + switch (intel->intelScreen->deviceID) { + case PCI_CHIP_I945_G: + case PCI_CHIP_I945_GM: + case PCI_CHIP_I945_GME: + case PCI_CHIP_G33_G: + case PCI_CHIP_Q33_G: + case PCI_CHIP_Q35_G: + case PCI_CHIP_I915_G: + case PCI_CHIP_I915_GM: + pipe = i915_create_context(intelScreen->base.screen, + &intelScreen->base.base, + &intel->base.base); + break; + default: + fprintf(stderr, "Unknown PCIID %x in %s, using software driver\n", + intel->intelScreen->deviceID, __FUNCTION__); + + pipe = intel_create_softpipe( intel, &intelScreen->base.base ); + break; + } + } + + pipe->priv = intel; + + intel->st = st_create_context(pipe, visual, st_share); + + driInitExtensions( intel->st->ctx, card_extensions, GL_TRUE ); + + return GL_TRUE; +} + + +void +intelDestroyContext(__DRIcontextPrivate * driContextPriv) +{ + struct intel_context *intel = intel_context(driContextPriv); + + assert(intel); /* should never be null */ + if (intel) { + st_finish(intel->st); + + if (intel->last_swap_fence) { + driFenceFinish(intel->last_swap_fence, DRM_FENCE_TYPE_EXE, GL_TRUE); + driFenceUnReference(&intel->last_swap_fence); + intel->last_swap_fence = NULL; + } + if (intel->first_swap_fence) { + driFenceFinish(intel->first_swap_fence, DRM_FENCE_TYPE_EXE, GL_TRUE); + driFenceUnReference(&intel->first_swap_fence); + intel->first_swap_fence = NULL; + } + + if (intel->intelScreen->dummyContext == intel) + intel->intelScreen->dummyContext = NULL; + + st_destroy_context(intel->st); + intel_be_destroy_context(&intel->base); + free(intel); + } +} + + 
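
Reviewer aside (not part of the patch): intelCreateContext()/intelDestroyContext() above set up and tear down the last_swap_fence / first_swap_fence pair, and intelDisplaySurface() later in this patch rotates them so that at most two swaps are ever outstanding on the GPU. The following is a minimal, self-contained sketch of just that rotation; struct dummy_fence, swap_begin() and swap_end() are invented stand-ins for this illustration, whereas the real driver uses _DriFenceObject with driFenceFinish()/driFenceUnReference() and the fence returned by intel_be_batchbuffer_flush().

#include <stdio.h>

struct dummy_fence { int seqno; };            /* stand-in for _DriFenceObject */

static void dummy_fence_finish(struct dummy_fence *f)
{
   /* The real driFenceFinish() blocks until the GPU has passed the fence. */
   printf("waiting for swap %d to retire\n", f->seqno);
}

struct swap_throttle {
   struct dummy_fence *last;    /* fence of the swap before the previous one */
   struct dummy_fence *first;   /* fence of the most recent swap */
};

/* Start of a swap: wait for the older of the two in-flight swaps. */
static void swap_begin(struct swap_throttle *t)
{
   if (t->last)
      dummy_fence_finish(t->last);
   t->last = t->first;          /* previous swap becomes the one to wait on */
   t->first = NULL;
}

/* End of a swap: remember the fence returned by flushing the blit batch. */
static void swap_end(struct swap_throttle *t, struct dummy_fence *f)
{
   t->first = f;
}

int main(void)
{
   struct dummy_fence fences[4] = { {1}, {2}, {3}, {4} };
   struct swap_throttle t = { NULL, NULL };
   int i;

   for (i = 0; i < 4; ++i) {
      swap_begin(&t);           /* waits on the fence from two swaps ago */
      swap_end(&t, &fences[i]);
   }
   return 0;
}

The net effect is simple frame throttling: swap N only starts once swap N-2 has retired, which keeps the CPU from running arbitrarily far ahead of the blitter.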
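
A second aside on the slab pool earlier in this patch: driSlabPoolInit() sets bucketSizes[i] = smallestSize << i, and pool_create() walks the headers until it finds the first bucket whose bufSize covers the request, falling back to a direct drmBOCreate() when none does. The sketch below only illustrates that bucket selection; pick_bucket() and its parameter names are invented for the example and are not part of the driver.

#include <stdint.h>
#include <stdio.h>

/* Mirrors driSlabPoolInit(): bucket i holds buffers of smallest_size << i. */
static int pick_bucket(uint32_t size, uint32_t smallest_size, uint32_t num_buckets)
{
   uint32_t i;

   for (i = 0; i < num_buckets; ++i) {
      if ((smallest_size << i) >= size)
         return (int) i;        /* served from this bucket's slabs */
   }
   return -1;                   /* too big: pool_create() goes straight to drmBOCreate() */
}

int main(void)
{
   /* e.g. 8 buckets starting at 4096 bytes: 4 KiB, 8 KiB, ... 512 KiB */
   printf("%d\n", pick_bucket(4096, 4096, 8));     /* 0  (4 KiB bucket) */
   printf("%d\n", pick_bucket(10000, 4096, 8));    /* 2  (16 KiB bucket) */
   printf("%d\n", pick_bucket(1 << 20, 4096, 8));  /* -1 (direct kernel BO) */
   return 0;
}

Rounding every request up to a power-of-two multiple of the smallest size wastes some memory per buffer, but it lets each size header reuse whole kernel BOs as slabs and recycle freed buffers without talking to the kernel.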
+GLboolean +intelUnbindContext(__DRIcontextPrivate * driContextPriv) +{ + struct intel_context *intel = intel_context(driContextPriv); + st_flush(intel->st, PIPE_FLUSH_RENDER_CACHE, NULL); + /* XXX make_current(NULL)? */ + return GL_TRUE; +} + + +GLboolean +intelMakeCurrent(__DRIcontextPrivate * driContextPriv, + __DRIdrawablePrivate * driDrawPriv, + __DRIdrawablePrivate * driReadPriv) +{ + if (driContextPriv) { + struct intel_context *intel = intel_context(driContextPriv); + struct intel_framebuffer *draw_fb = intel_framebuffer(driDrawPriv); + struct intel_framebuffer *read_fb = intel_framebuffer(driReadPriv); + + assert(draw_fb->stfb); + assert(read_fb->stfb); + + /* This is for situations in which we need a rendering context but + * there may not be any currently bound. + */ + intel->intelScreen->dummyContext = intel; + + st_make_current(intel->st, draw_fb->stfb, read_fb->stfb); + + if ((intel->driDrawable != driDrawPriv) || + (intel->lastStamp != driDrawPriv->lastStamp)) { + intel->driDrawable = driDrawPriv; + intelUpdateWindowSize(driDrawPriv); + intel->lastStamp = driDrawPriv->lastStamp; + } + + /* The size of the draw buffer will have been updated above. + * If the readbuffer is a different window, check/update its size now. + */ + if (driReadPriv != driDrawPriv) { + intelUpdateWindowSize(driReadPriv); + } + + } + else { + st_make_current(NULL, NULL, NULL); + } + + return GL_TRUE; +} diff --git a/src/gallium/winsys/drm/intel/dri/intel_context.h b/src/gallium/winsys/drm/intel/dri/intel_context.h new file mode 100644 index 0000000000..5d22a422af --- /dev/null +++ b/src/gallium/winsys/drm/intel/dri/intel_context.h @@ -0,0 +1,164 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef INTEL_CONTEXT_H +#define INTEL_CONTEXT_H + +#include <stdint.h> +#include "drm.h" + +#include "pipe/p_debug.h" + +#include "intel_screen.h" +#include "i915_drm.h" + +#include "intel_be_context.h" + + +struct pipe_context; +struct intel_context; +struct _DriBufferObject; +struct st_context; + + +#define INTEL_MAX_FIXUP 64 + +/** + * Intel rendering context, contains a state tracker and intel-specific info. 
+ */ +struct intel_context +{ + struct intel_be_context base; + struct st_context *st; + + struct _DriFenceObject *last_swap_fence; + struct _DriFenceObject *first_swap_fence; + +// struct intel_batchbuffer *batch; + + boolean locked; + char *prevLockFile; + int prevLockLine; + + uint irqsEmitted; + drm_i915_irq_wait_t iw; + + drm_context_t hHWContext; + drmLock *driHwLock; + int driFd; + + __DRIdrawablePrivate *driDrawable; + __DRIscreenPrivate *driScreen; + struct intel_screen *intelScreen; + drmI830Sarea *sarea; + + uint lastStamp; + + /** + * Configuration cache + */ + driOptionCache optionCache; +}; + + + +/** + * Intel framebuffer. + */ +struct intel_framebuffer +{ + struct st_framebuffer *stfb; + + /* other fields TBD */ + int other; +}; + + + + +/* These are functions now: + */ +void LOCK_HARDWARE( struct intel_context *intel ); +void UNLOCK_HARDWARE( struct intel_context *intel ); + +extern char *__progname; + + + +/* ================================================================ + * Debugging: + */ +#ifdef DEBUG +extern int __intel_debug; + +#define DEBUG_SWAP 0x1 +#define DEBUG_LOCK 0x2 +#define DEBUG_IOCTL 0x4 +#define DEBUG_BATCH 0x8 + +#define DBG(flag, ...) do { \ + if (__intel_debug & (DEBUG_##flag)) \ + printf(__VA_ARGS__); \ +} while(0) + +#else +#define DBG(flag, ...) +#endif + + + +#define PCI_CHIP_845_G 0x2562 +#define PCI_CHIP_I830_M 0x3577 +#define PCI_CHIP_I855_GM 0x3582 +#define PCI_CHIP_I865_G 0x2572 +#define PCI_CHIP_I915_G 0x2582 +#define PCI_CHIP_I915_GM 0x2592 +#define PCI_CHIP_I945_G 0x2772 +#define PCI_CHIP_I945_GM 0x27A2 +#define PCI_CHIP_I945_GME 0x27AE +#define PCI_CHIP_G33_G 0x29C2 +#define PCI_CHIP_Q35_G 0x29B2 +#define PCI_CHIP_Q33_G 0x29D2 + + +/** Cast wrapper */ +static INLINE struct intel_context * +intel_context(__DRIcontextPrivate *driContextPriv) +{ + return (struct intel_context *) driContextPriv->driverPrivate; +} + + +/** Cast wrapper */ +static INLINE struct intel_framebuffer * +intel_framebuffer(__DRIdrawablePrivate * driDrawPriv) +{ + return (struct intel_framebuffer *) driDrawPriv->driverPrivate; +} + + +#endif diff --git a/src/gallium/winsys/drm/intel/dri/intel_lock.c b/src/gallium/winsys/drm/intel/dri/intel_lock.c new file mode 100644 index 0000000000..ad1c202429 --- /dev/null +++ b/src/gallium/winsys/drm/intel/dri/intel_lock.c @@ -0,0 +1,102 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "main/glheader.h" +#include "pipe/p_thread.h" +#include <GL/internal/glcore.h> +#include "state_tracker/st_public.h" +#include "intel_context.h" +#include "i830_dri.h" + + + +pipe_static_mutex( lockMutex ); + + +static void +intelContendedLock(struct intel_context *intel, uint flags) +{ + __DRIdrawablePrivate *dPriv = intel->driDrawable; + __DRIscreenPrivate *sPriv = intel->driScreen; + struct intel_screen *intelScreen = intel_screen(sPriv); + drmI830Sarea *sarea = intel->sarea; + + drmGetLock(intel->driFd, intel->hHWContext, flags); + + DBG(LOCK, "%s - got contended lock\n", __progname); + + /* If the window moved, may need to set a new cliprect now. + * + * NOTE: This releases and regains the hw lock, so all state + * checking must be done *after* this call: + */ + if (dPriv) + DRI_VALIDATE_DRAWABLE_INFO(sPriv, dPriv); + + if (sarea->width != intelScreen->front.width || + sarea->height != intelScreen->front.height) { + + intelUpdateScreenRotation(sPriv, sarea); + } +} + + +/* Lock the hardware and validate our state. + */ +void LOCK_HARDWARE( struct intel_context *intel ) +{ + char __ret = 0; + + pipe_mutex_lock(lockMutex); + assert(!intel->locked); + + DRM_CAS(intel->driHwLock, intel->hHWContext, + (DRM_LOCK_HELD|intel->hHWContext), __ret); + + if (__ret) + intelContendedLock( intel, 0 ); + + DBG(LOCK, "%s - locked\n", __progname); + + intel->locked = 1; +} + + +/* Unlock the hardware using the global current context + */ +void UNLOCK_HARDWARE( struct intel_context *intel ) +{ + assert(intel->locked); + intel->locked = 0; + + DRM_UNLOCK(intel->driFd, intel->driHwLock, intel->hHWContext); + + pipe_mutex_unlock(lockMutex); + + DBG(LOCK, "%s - unlocked\n", __progname); +} diff --git a/src/gallium/winsys/drm/intel/dri/intel_reg.h b/src/gallium/winsys/drm/intel/dri/intel_reg.h new file mode 100644 index 0000000000..4f33bee438 --- /dev/null +++ b/src/gallium/winsys/drm/intel/dri/intel_reg.h @@ -0,0 +1,53 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef _INTEL_REG_H_ +#define _INTEL_REG_H_ + + +#define BR00_BITBLT_CLIENT 0x40000000 +#define BR00_OP_COLOR_BLT 0x10000000 +#define BR00_OP_SRC_COPY_BLT 0x10C00000 +#define BR13_SOLID_PATTERN 0x80000000 + +#define XY_COLOR_BLT_CMD ((2<<29)|(0x50<<22)|0x4) +#define XY_COLOR_BLT_WRITE_ALPHA (1<<21) +#define XY_COLOR_BLT_WRITE_RGB (1<<20) + +#define XY_SRC_COPY_BLT_CMD ((2<<29)|(0x53<<22)|6) +#define XY_SRC_COPY_BLT_WRITE_ALPHA (1<<21) +#define XY_SRC_COPY_BLT_WRITE_RGB (1<<20) + +#define MI_WAIT_FOR_EVENT ((0x3<<23)) +#define MI_WAIT_FOR_PLANE_B_FLIP (1<<6) +#define MI_WAIT_FOR_PLANE_A_FLIP (1<<2) + +#define MI_BATCH_BUFFER_END (0xA<<23) + + +#endif diff --git a/src/gallium/winsys/drm/intel/dri/intel_screen.c b/src/gallium/winsys/drm/intel/dri/intel_screen.c new file mode 100644 index 0000000000..ed75368982 --- /dev/null +++ b/src/gallium/winsys/drm/intel/dri/intel_screen.c @@ -0,0 +1,703 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "utils.h" +#include "vblank.h" +#include "xmlpool.h" + +#include "intel_context.h" +#include "intel_screen.h" +#include "intel_batchbuffer.h" +#include "intel_swapbuffers.h" + +#include "i830_dri.h" +#include "ws_dri_bufpool.h" + +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "pipe/p_inlines.h" +#include "state_tracker/st_public.h" +#include "state_tracker/st_cb_fbo.h" + +static void +intelCreateSurface(struct intel_screen *intelScreen, struct pipe_winsys *winsys, unsigned handle); + +static void +intelCreateSurface(struct intel_screen *intelScreen, struct pipe_winsys *winsys, unsigned handle) +{ + struct pipe_screen *screen = intelScreen->base.screen; + struct pipe_texture *texture; + struct pipe_texture templat; + struct pipe_surface *surface; + struct pipe_buffer *buffer; + unsigned pitch; + + assert(intelScreen->front.cpp == 4); + + buffer = intel_be_buffer_from_handle(&intelScreen->base, + "front", handle); + + if (!buffer) + return; + + intelScreen->front.buffer = dri_bo(buffer); + + memset(&templat, 0, sizeof(templat)); + templat.tex_usage |= PIPE_TEXTURE_USAGE_DISPLAY_TARGET; + templat.target = PIPE_TEXTURE_2D; + templat.last_level = 0; + templat.depth[0] = 1; + templat.format = PIPE_FORMAT_A8R8G8B8_UNORM; + templat.width[0] = intelScreen->front.width; + templat.height[0] = intelScreen->front.height; + pf_get_block(templat.format, &templat.block); + pitch = intelScreen->front.pitch; + + texture = screen->texture_blanket(screen, + &templat, + &pitch, + buffer); + + /* Unref the buffer we don't need it anyways */ + pipe_buffer_reference(screen, &buffer, NULL); + + surface = screen->get_tex_surface(screen, + texture, + 0, + 0, + 0, + PIPE_BUFFER_USAGE_GPU_WRITE); + + intelScreen->front.texture = texture; + intelScreen->front.surface = surface; +} + +PUBLIC const char __driConfigOptions[] = + DRI_CONF_BEGIN DRI_CONF_SECTION_PERFORMANCE + DRI_CONF_FTHROTTLE_MODE(DRI_CONF_FTHROTTLE_IRQS) + DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_DEF_INTERVAL_0) + DRI_CONF_SECTION_END DRI_CONF_SECTION_QUALITY +// DRI_CONF_FORCE_S3TC_ENABLE(false) + DRI_CONF_ALLOW_LARGE_TEXTURES(1) + DRI_CONF_SECTION_END DRI_CONF_END; + +const uint __driNConfigOptions = 3; + +#ifdef USE_NEW_INTERFACE +static PFNGLXCREATECONTEXTMODES create_context_modes = NULL; +#endif /*USE_NEW_INTERFACE */ + +extern const struct dri_extension card_extensions[]; + +static GLboolean +intel_get_param(__DRIscreenPrivate *psp, int param, int *value) +{ + int ret; + struct drm_i915_getparam gp; + + gp.param = param; + gp.value = value; + + ret = drmCommandWriteRead(psp->fd, DRM_I915_GETPARAM, &gp, sizeof(gp)); + if (ret) { + fprintf(stderr, "drm_i915_getparam: %d\n", ret); + return GL_FALSE; + } + + return GL_TRUE; +} + +static void +intelSetTexOffset(__DRIcontext *pDRICtx, int texname, + unsigned long long offset, int depth, uint pitch) +{ + abort(); +#if 0 + struct intel_context *intel = (struct intel_context*) + ((__DRIcontextPrivate*)pDRICtx->private)->driverPrivate; + struct gl_texture_object *tObj = _mesa_lookup_texture(&intel->ctx, texname); + struct st_texture_object *stObj = st_texture_object(tObj); + + if (!stObj) + return; + + if (stObj->pt) + st->pipe->texture_release(intel->st->pipe, &stObj->pt); + + stObj->imageOverride = GL_TRUE; + stObj->depthOverride = depth; + stObj->pitchOverride = pitch; + + if (offset) + stObj->textureOffset = offset; +#endif +} + + +#if 0 +static void 
+intelHandleDrawableConfig(__DRIdrawablePrivate *dPriv, + __DRIcontextPrivate *pcp, + __DRIDrawableConfigEvent *event) +{ + (void) dPriv; + (void) pcp; + (void) event; +} +#endif + +#if 0 +static void +intelHandleBufferAttach(__DRIdrawablePrivate *dPriv, + __DRIcontextPrivate *pcp, + __DRIBufferAttachEvent *ba) +{ + struct intel_screen *intelScreen = intel_screen(dPriv->driScreenPriv); + + switch (ba->buffer.attachment) { + case DRI_DRAWABLE_BUFFER_FRONT_LEFT: + intelScreen->front.width = dPriv->w; + intelScreen->front.height = dPriv->h; + intelScreen->front.cpp = ba->buffer.cpp; + intelScreen->front.pitch = ba->buffer.pitch; + driGenBuffers(intelScreen->base.staticPool, "front", 1, &intelScreen->front.buffer, 0, 0, 0); + driBOSetReferenced(intelScreen->front.buffer, ba->buffer.handle); + break; + + case DRI_DRAWABLE_BUFFER_BACK_LEFT: + case DRI_DRAWABLE_BUFFER_DEPTH: + case DRI_DRAWABLE_BUFFER_STENCIL: + case DRI_DRAWABLE_BUFFER_ACCUM: + /* anything ?? */ + break; + + default: + fprintf(stderr, "unhandled buffer attach event, attachment type %d\n", + ba->buffer.attachment); + return; + } +} +#endif + +static const __DRItexOffsetExtension intelTexOffsetExtension = { + { __DRI_TEX_OFFSET }, + intelSetTexOffset, +}; + +#if 0 +static const __DRItexBufferExtension intelTexBufferExtension = { + { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION }, + intelSetTexBuffer, +}; +#endif + +static const __DRIextension *intelScreenExtensions[] = { + &driReadDrawableExtension, + &driCopySubBufferExtension.base, + &driSwapControlExtension.base, + &driFrameTrackingExtension.base, + &driMediaStreamCounterExtension.base, + &intelTexOffsetExtension.base, +// &intelTexBufferExtension.base, + NULL +}; + + +static void +intelPrintDRIInfo(struct intel_screen * intelScreen, + __DRIscreenPrivate * sPriv, I830DRIPtr gDRIPriv) +{ + fprintf(stderr, "*** Front size: 0x%x offset: 0x%x pitch: %d\n", + intelScreen->front.size, intelScreen->front.offset, + intelScreen->front.pitch); + fprintf(stderr, "*** Memory : 0x%x\n", gDRIPriv->mem); +} + + +#if 0 +static void +intelPrintSAREA(const drmI830Sarea * sarea) +{ + fprintf(stderr, "SAREA: sarea width %d height %d\n", sarea->width, + sarea->height); + fprintf(stderr, "SAREA: pitch: %d\n", sarea->pitch); + fprintf(stderr, + "SAREA: front offset: 0x%08x size: 0x%x handle: 0x%x\n", + sarea->front_offset, sarea->front_size, + (unsigned) sarea->front_handle); + fprintf(stderr, + "SAREA: back offset: 0x%08x size: 0x%x handle: 0x%x\n", + sarea->back_offset, sarea->back_size, + (unsigned) sarea->back_handle); + fprintf(stderr, "SAREA: depth offset: 0x%08x size: 0x%x handle: 0x%x\n", + sarea->depth_offset, sarea->depth_size, + (unsigned) sarea->depth_handle); + fprintf(stderr, "SAREA: tex offset: 0x%08x size: 0x%x handle: 0x%x\n", + sarea->tex_offset, sarea->tex_size, (unsigned) sarea->tex_handle); + fprintf(stderr, "SAREA: rotation: %d\n", sarea->rotation); + fprintf(stderr, + "SAREA: rotated offset: 0x%08x size: 0x%x\n", + sarea->rotated_offset, sarea->rotated_size); + fprintf(stderr, "SAREA: rotated pitch: %d\n", sarea->rotated_pitch); +} +#endif + + +/** + * Use the information in the sarea to update the screen parameters + * related to screen rotation. Needs to be called locked. 
+ */ +void +intelUpdateScreenRotation(__DRIscreenPrivate * sPriv, drmI830Sarea * sarea) +{ + struct intel_screen *intelScreen = intel_screen(sPriv); + + if (intelScreen->front.map) { + drmUnmap(intelScreen->front.map, intelScreen->front.size); + intelScreen->front.map = NULL; + } + + if (intelScreen->front.buffer) + driDeleteBuffers(1, &intelScreen->front.buffer); + + intelScreen->front.width = sarea->width; + intelScreen->front.height = sarea->height; + intelScreen->front.offset = sarea->front_offset; + intelScreen->front.pitch = sarea->pitch * intelScreen->front.cpp; + intelScreen->front.size = sarea->front_size; + intelScreen->front.handle = sarea->front_handle; + + assert( sarea->front_size >= + intelScreen->front.pitch * intelScreen->front.height ); + +#if 0 /* JB not important */ + if (!sarea->front_handle) + return; + + if (drmMap(sPriv->fd, + sarea->front_handle, + intelScreen->front.size, + (drmAddress *) & intelScreen->front.map) != 0) { + fprintf(stderr, "drmMap(frontbuffer) failed!\n"); + return; + } +#endif + +#if 0 /* JB */ + if (intelScreen->staticPool) { + driGenBuffers(intelScreen->staticPool, "static region", 1, + &intelScreen->front.buffer, 64, + DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_NO_MOVE | + DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE, 0); + + driBOSetStatic(intelScreen->front.buffer, + intelScreen->front.offset, + intelScreen->front.pitch * intelScreen->front.height, + intelScreen->front.map, 0); + } +#else + if (intelScreen->base.staticPool) { + if (intelScreen->front.buffer) { + driBOUnReference(intelScreen->front.buffer); + pipe_surface_reference(&intelScreen->front.surface, NULL); + pipe_texture_reference(&intelScreen->front.texture, NULL); + } + intelCreateSurface(intelScreen, &intelScreen->base.base, sarea->front_bo_handle); + } +#endif +} + + +boolean +intelCreatePools(__DRIscreenPrivate * sPriv) +{ + //unsigned batchPoolSize = 1024*1024; + struct intel_screen *intelScreen = intel_screen(sPriv); + + if (intelScreen->havePools) + return GL_TRUE; + + intelScreen->havePools = GL_TRUE; + + if (intelScreen->sarea) + intelUpdateScreenRotation(sPriv, intelScreen->sarea); + + return GL_TRUE; +} + +static const char * +intel_get_name( struct pipe_winsys *winsys ) +{ + return "Intel/DRI/ttm"; +} + +/* + * The state tracker (should!) keep track of whether the fake + * frontbuffer has been touched by any rendering since the last time + * we copied its contents to the real frontbuffer. Our task is easy: + */ +static void +intel_flush_frontbuffer( struct pipe_winsys *winsys, + struct pipe_surface *surf, + void *context_private) +{ + struct intel_context *intel = (struct intel_context *) context_private; + __DRIdrawablePrivate *dPriv = intel->driDrawable; + + intelDisplaySurface(dPriv, surf, NULL); +} + +static boolean +intelInitDriver(__DRIscreenPrivate * sPriv) +{ + struct intel_screen *intelScreen; + I830DRIPtr gDRIPriv = (I830DRIPtr) sPriv->pDevPriv; + + if (sPriv->devPrivSize != sizeof(I830DRIRec)) { + fprintf(stderr, + "\nERROR! 
sizeof(I830DRIRec) does not match passed size from device driver\n"); + return GL_FALSE; + } + + /* Allocate the private area */ + intelScreen = CALLOC_STRUCT(intel_screen); + if (!intelScreen) + return GL_FALSE; + + /* parse information in __driConfigOptions */ + driParseOptionInfo(&intelScreen->optionCache, + __driConfigOptions, __driNConfigOptions); + + sPriv->private = (void *) intelScreen; + intelScreen->sarea = (drmI830Sarea *) (((GLubyte *) sPriv->pSAREA) + + gDRIPriv->sarea_priv_offset); + + intelScreen->deviceID = gDRIPriv->deviceID; + + intelScreen->front.cpp = gDRIPriv->cpp; + intelScreen->drmMinor = sPriv->drm_version.minor; + intelUpdateScreenRotation(sPriv, intelScreen->sarea); + + if (0) + intelPrintDRIInfo(intelScreen, sPriv, gDRIPriv); + + sPriv->extensions = intelScreenExtensions; + + intelScreen->base.base.flush_frontbuffer = intel_flush_frontbuffer; + intelScreen->base.base.get_name = intel_get_name; + intel_be_init_device(&intelScreen->base, sPriv->fd, intelScreen->deviceID); + + return GL_TRUE; +} + + +static void +intelDestroyScreen(__DRIscreenPrivate * sPriv) +{ + struct intel_screen *intelScreen = intel_screen(sPriv); + + intel_be_destroy_device(&intelScreen->base); + /* intelUnmapScreenRegions(intelScreen); */ + + FREE(intelScreen); + sPriv->private = NULL; +} + + +/** + * This is called when we need to set up GL rendering to a new X window. + */ +static boolean +intelCreateBuffer(__DRIscreenPrivate * driScrnPriv, + __DRIdrawablePrivate * driDrawPriv, + const __GLcontextModes * visual, boolean isPixmap) +{ + if (isPixmap) { + return GL_FALSE; /* not implemented */ + } + else { + enum pipe_format colorFormat, depthFormat, stencilFormat; + struct intel_framebuffer *intelfb = CALLOC_STRUCT(intel_framebuffer); + + if (!intelfb) + return GL_FALSE; + + if (visual->redBits == 5) + colorFormat = PIPE_FORMAT_R5G6B5_UNORM; + else + colorFormat = PIPE_FORMAT_A8R8G8B8_UNORM; + + if (visual->depthBits == 16) + depthFormat = PIPE_FORMAT_Z16_UNORM; + else if (visual->depthBits == 24) + depthFormat = PIPE_FORMAT_S8Z24_UNORM; + else + depthFormat = PIPE_FORMAT_NONE; + + if (visual->stencilBits == 8) + stencilFormat = PIPE_FORMAT_S8Z24_UNORM; + else + stencilFormat = PIPE_FORMAT_NONE; + + intelfb->stfb = st_create_framebuffer(visual, + colorFormat, + depthFormat, + stencilFormat, + driDrawPriv->w, + driDrawPriv->h, + (void*) intelfb); + if (!intelfb->stfb) { + free(intelfb); + return GL_FALSE; + } + + driDrawPriv->driverPrivate = (void *) intelfb; + return GL_TRUE; + } +} + +static void +intelDestroyBuffer(__DRIdrawablePrivate * driDrawPriv) +{ + struct intel_framebuffer *intelfb = intel_framebuffer(driDrawPriv); + assert(intelfb->stfb); + st_unreference_framebuffer(intelfb->stfb); + free(intelfb); +} + + +/** + * Get information about previous buffer swaps. + */ +static int +intelGetSwapInfo(__DRIdrawablePrivate * dPriv, __DRIswapInfo * sInfo) +{ + if ((dPriv == NULL) || (dPriv->driverPrivate == NULL) + || (sInfo == NULL)) { + return -1; + } + + return 0; +} + +static __DRIconfig ** +intelFillInModes(__DRIscreenPrivate *psp, + unsigned pixel_bits, unsigned depth_bits, + unsigned stencil_bits, GLboolean have_back_buffer) +{ + __DRIconfig **configs; + __GLcontextModes *m; + unsigned num_modes; + unsigned depth_buffer_factor; + unsigned back_buffer_factor; + GLenum fb_format; + GLenum fb_type; + int i; + + /* GLX_SWAP_COPY_OML is only supported because the Intel driver doesn't + * support pageflipping at all. 
+ */ + static const GLenum back_buffer_modes[] = { + GLX_NONE, GLX_SWAP_UNDEFINED_OML, GLX_SWAP_COPY_OML + }; + + uint8_t depth_bits_array[3]; + uint8_t stencil_bits_array[3]; + uint8_t msaa_samples_array[1]; + + + depth_bits_array[0] = 0; + depth_bits_array[1] = depth_bits; + depth_bits_array[2] = depth_bits; + msaa_samples_array[0] = 0; + + /* Just like with the accumulation buffer, always provide some modes + * with a stencil buffer. It will be a sw fallback, but some apps won't + * care about that. + */ + stencil_bits_array[0] = 0; + stencil_bits_array[1] = 0; + if (depth_bits == 24) + stencil_bits_array[1] = (stencil_bits == 0) ? 8 : stencil_bits; + + stencil_bits_array[2] = (stencil_bits == 0) ? 8 : stencil_bits; + + depth_buffer_factor = ((depth_bits != 0) || (stencil_bits != 0)) ? 3 : 1; + back_buffer_factor = (have_back_buffer) ? 3 : 1; + + num_modes = depth_buffer_factor * back_buffer_factor * 4; + + if (pixel_bits == 16) { + fb_format = GL_RGB; + fb_type = GL_UNSIGNED_SHORT_5_6_5; + } + else { + fb_format = GL_BGRA; + fb_type = GL_UNSIGNED_INT_8_8_8_8_REV; + } + + configs = driCreateConfigs(fb_format, fb_type, + depth_bits_array, stencil_bits_array, + depth_buffer_factor, back_buffer_modes, + back_buffer_factor, msaa_samples_array, 1); + if (configs == NULL) { + fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__, + __LINE__); + return NULL; + } + + /* Mark the visual as slow if there are "fake" stencil bits. + */ + for (i = 0; configs[i]; i++) { + m = &configs[i]->modes; + if ((m->stencilBits != 0) && (m->stencilBits != stencil_bits)) { + m->visualRating = GLX_SLOW_CONFIG; + } + } + + return configs; +} + +/** + * This is the driver specific part of the createNewScreen entry point. + * + * \todo maybe fold this into intelInitDriver + * + * \return the __GLcontextModes supported by this driver + */ +static const __DRIconfig **intelInitScreen(__DRIscreenPrivate *psp) +{ +#ifdef I915 + static const __DRIversion ddx_expected = { 1, 5, 0 }; +#else + static const __DRIversion ddx_expected = { 1, 6, 0 }; +#endif + static const __DRIversion dri_expected = { 4, 0, 0 }; + static const __DRIversion drm_expected = { 1, 5, 0 }; + I830DRIPtr dri_priv = (I830DRIPtr) psp->pDevPriv; + + if (!driCheckDriDdxDrmVersions2("i915", + &psp->dri_version, &dri_expected, + &psp->ddx_version, &ddx_expected, + &psp->drm_version, &drm_expected)) { + return NULL; + } + + /* Calling driInitExtensions here, with a NULL context pointer, + * does not actually enable the extensions. It just makes sure + * that all the dispatch offsets for all the extensions that + * *might* be enables are known. This is needed because the + * dispatch offsets need to be known when _mesa_context_create is + * called, but we can't enable the extensions until we have a + * context pointer. + * + * Hello chicken. Hello egg. How are you two today? + */ + driInitExtensions( NULL, card_extensions, GL_FALSE ); + //intelInitExtensions(NULL, GL_TRUE); + + if (!intelInitDriver(psp)) + return NULL; + + psp->extensions = intelScreenExtensions; + + return (const __DRIconfig **) + intelFillInModes(psp, dri_priv->cpp * 8, + (dri_priv->cpp == 2) ? 16 : 24, + (dri_priv->cpp == 2) ? 0 : 8, 1); +} + +/** + * This is the driver specific part of the createNewScreen entry point. 
+ * + * \return the __GLcontextModes supported by this driver + */ +static const +__DRIconfig **intelInitScreen2(__DRIscreenPrivate *psp) +{ + struct intel_screen *intelScreen; + + /* Calling driInitExtensions here, with a NULL context pointer, + * does not actually enable the extensions. It just makes sure + * that all the dispatch offsets for all the extensions that + * *might* be enables are known. This is needed because the + * dispatch offsets need to be known when _mesa_context_create is + * called, but we can't enable the extensions until we have a + * context pointer. + * + * Hello chicken. Hello egg. How are you two today? + */ + //intelInitExtensions(NULL, GL_TRUE); + + /* Allocate the private area */ + intelScreen = CALLOC_STRUCT(intel_screen); + if (!intelScreen) { + fprintf(stderr, "\nERROR! Allocating private area failed\n"); + return GL_FALSE; + } + /* parse information in __driConfigOptions */ + driParseOptionInfo(&intelScreen->optionCache, + __driConfigOptions, __driNConfigOptions); + + psp->private = (void *) intelScreen; + + intelScreen->drmMinor = psp->drm_version.minor; + + /* Determine chipset ID? */ + if (!intel_get_param(psp, I915_PARAM_CHIPSET_ID, + &intelScreen->deviceID)) + return GL_FALSE; + + psp->extensions = intelScreenExtensions; + + intel_be_init_device(&intelScreen->base, psp->fd, intelScreen->deviceID); + intelScreen->base.base.flush_frontbuffer = intel_flush_frontbuffer; + intelScreen->base.base.get_name = intel_get_name; + + return driConcatConfigs(intelFillInModes(psp, 16, 16, 0, 1), + intelFillInModes(psp, 32, 24, 8, 1)); +} + +const struct __DriverAPIRec driDriverAPI = { + .InitScreen = intelInitScreen, + .DestroyScreen = intelDestroyScreen, + .CreateContext = intelCreateContext, + .DestroyContext = intelDestroyContext, + .CreateBuffer = intelCreateBuffer, + .DestroyBuffer = intelDestroyBuffer, + .SwapBuffers = intelSwapBuffers, + .MakeCurrent = intelMakeCurrent, + .UnbindContext = intelUnbindContext, + .GetSwapInfo = intelGetSwapInfo, + .GetDrawableMSC = driDrawableGetMSC32, + .WaitForMSC = driWaitForMSC32, + .CopySubBuffer = intelCopySubBuffer, + + //.InitScreen2 = intelInitScreen2, + //.HandleDrawableConfig = intelHandleDrawableConfig, + //.HandleBufferAttach = intelHandleBufferAttach, +}; diff --git a/src/gallium/winsys/drm/intel/dri/intel_screen.h b/src/gallium/winsys/drm/intel/dri/intel_screen.h new file mode 100644 index 0000000000..0bb43a915c --- /dev/null +++ b/src/gallium/winsys/drm/intel/dri/intel_screen.h @@ -0,0 +1,122 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef _INTEL_SCREEN_H_ +#define _INTEL_SCREEN_H_ + +#include "dri_util.h" +#include "i830_common.h" +#include "xmlconfig.h" +#include "ws_dri_bufpool.h" + +#include "pipe/p_compiler.h" + +#include "intel_be_device.h" + +struct intel_screen +{ + struct intel_be_device base; + + struct { + drm_handle_t handle; + + /* We create a static dri buffer for the frontbuffer. + */ + struct _DriBufferObject *buffer; + struct pipe_surface *surface; + struct pipe_texture *texture; + + char *map; /* memory map */ + int offset; /* from start of video mem, in bytes */ + int pitch; /* row stride, in bytes */ + int width; + int height; + int size; + int cpp; /* for front and back buffers */ + } front; + + int deviceID; + int drmMinor; + + drmI830Sarea *sarea; + + /** + * Configuration cache with default values for all contexts + */ + driOptionCache optionCache; + + boolean havePools; + + /** + * Temporary(?) context to use for SwapBuffers or other situations in + * which we need a rendering context, but none is currently bound. + */ + struct intel_context *dummyContext; + + /* + * New stuff form the i915tex integration + */ + unsigned batch_id; + + + struct pipe_winsys *winsys; +}; + + + +/** cast wrapper */ +static INLINE struct intel_screen * +intel_screen(__DRIscreenPrivate *sPriv) +{ + return (struct intel_screen *) sPriv->private; +} + + +extern void +intelUpdateScreenRotation(__DRIscreenPrivate * sPriv, drmI830Sarea * sarea); + + +extern void intelDestroyContext(__DRIcontextPrivate * driContextPriv); + +extern boolean intelUnbindContext(__DRIcontextPrivate * driContextPriv); + +extern boolean +intelMakeCurrent(__DRIcontextPrivate * driContextPriv, + __DRIdrawablePrivate * driDrawPriv, + __DRIdrawablePrivate * driReadPriv); + + +extern boolean +intelCreatePools(__DRIscreenPrivate *sPriv); + +extern boolean +intelCreateContext(const __GLcontextModes * visual, + __DRIcontextPrivate * driContextPriv, + void *sharedContextPrivate); + + +#endif diff --git a/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c b/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c new file mode 100644 index 0000000000..34ad7eebe1 --- /dev/null +++ b/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.c @@ -0,0 +1,260 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "intel_screen.h" +#include "intel_context.h" +#include "intel_swapbuffers.h" + +#include "intel_reg.h" + +#include "pipe/p_context.h" +#include "state_tracker/st_public.h" +#include "state_tracker/st_context.h" +#include "state_tracker/st_cb_fbo.h" + +#include "ws_dri_bufmgr.h" +#include "intel_batchbuffer.h" + +/** + * Display a colorbuffer surface in an X window. + * Used for SwapBuffers and flushing front buffer rendering. + * + * \param dPriv the window/drawable to display into + * \param surf the surface to display + * \param rect optional subrect of surface to display (may be NULL). + */ +void +intelDisplaySurface(__DRIdrawablePrivate *dPriv, + struct pipe_surface *surf, + const drm_clip_rect_t *rect) +{ + struct intel_screen *intelScreen = intel_screen(dPriv->driScreenPriv); + struct intel_context *intel = intelScreen->dummyContext; + + DBG(SWAP, "%s\n", __FUNCTION__); + + if (!intel) { + /* XXX this is where some kind of extra/meta context could be useful */ + return; + } + + if (intel->last_swap_fence) { + driFenceFinish(intel->last_swap_fence, DRM_FENCE_TYPE_EXE, TRUE); + driFenceUnReference(&intel->last_swap_fence); + intel->last_swap_fence = NULL; + } + intel->last_swap_fence = intel->first_swap_fence; + intel->first_swap_fence = NULL; + + /* The LOCK_HARDWARE is required for the cliprects. Buffer offsets + * should work regardless. 
+ */ + LOCK_HARDWARE(intel); + /* if this drawable isn't currently bound the LOCK_HARDWARE done on the + * current context (which is what intelScreenContext should return) might + * not get a contended lock and thus cliprects not updated (tests/manywin) + */ + if (intel_context(dPriv->driContextPriv) != intel) + DRI_VALIDATE_DRAWABLE_INFO(intel->driScreen, dPriv); + + + if (dPriv && dPriv->numClipRects) { + const int srcWidth = surf->width; + const int srcHeight = surf->height; + const int nbox = dPriv->numClipRects; + const drm_clip_rect_t *pbox = dPriv->pClipRects; + const int pitch = intelScreen->front.pitch / intelScreen->front.cpp; + const int cpp = intelScreen->front.cpp; + const int srcpitch = surf->stride / cpp; + int BR13, CMD; + int i; + + ASSERT(surf->buffer); + + DBG(SWAP, "screen pitch %d src surface pitch %d\n", + pitch, surf->stride); + + if (cpp == 2) { + BR13 = (pitch * cpp) | (0xCC << 16) | (1 << 24); + CMD = XY_SRC_COPY_BLT_CMD; + } + else { + BR13 = (pitch * cpp) | (0xCC << 16) | (1 << 24) | (1 << 25); + CMD = (XY_SRC_COPY_BLT_CMD | XY_SRC_COPY_BLT_WRITE_ALPHA | + XY_SRC_COPY_BLT_WRITE_RGB); + } + + for (i = 0; i < nbox; i++, pbox++) { + drm_clip_rect_t box; + drm_clip_rect_t sbox; + + if (pbox->x1 > pbox->x2 || + pbox->y1 > pbox->y2 || + pbox->x2 > intelScreen->front.width || + pbox->y2 > intelScreen->front.height) { + /* invalid cliprect, skip it */ + continue; + } + + box = *pbox; + + if (rect) { + /* intersect cliprect with user-provided src rect */ + drm_clip_rect_t rrect; + + rrect.x1 = dPriv->x + rect->x1; + rrect.y1 = (dPriv->h - rect->y1 - rect->y2) + dPriv->y; + rrect.x2 = rect->x2 + rrect.x1; + rrect.y2 = rect->y2 + rrect.y1; + if (rrect.x1 > box.x1) + box.x1 = rrect.x1; + if (rrect.y1 > box.y1) + box.y1 = rrect.y1; + if (rrect.x2 < box.x2) + box.x2 = rrect.x2; + if (rrect.y2 < box.y2) + box.y2 = rrect.y2; + + if (box.x1 > box.x2 || box.y1 > box.y2) + continue; + } + + /* restrict blit to size of actually rendered area */ + if (box.x2 - box.x1 > srcWidth) + box.x2 = srcWidth + box.x1; + if (box.y2 - box.y1 > srcHeight) + box.y2 = srcHeight + box.y1; + + DBG(SWAP, "box x1 x2 y1 y2 %d %d %d %d\n", + box.x1, box.x2, box.y1, box.y2); + + sbox.x1 = box.x1 - dPriv->x; + sbox.y1 = box.y1 - dPriv->y; + + assert(box.x1 < box.x2); + assert(box.y1 < box.y2); + + /* XXX this could be done with pipe->surface_copy() */ + /* XXX should have its own batch buffer */ + if (!BEGIN_BATCH(8, 2)) { + /* + * Since we share this batch buffer with a context + * we can't flush it since that risks a GPU lockup + */ + assert(0); + continue; + } + + OUT_BATCH(CMD); + OUT_BATCH(BR13); + OUT_BATCH((box.y1 << 16) | box.x1); + OUT_BATCH((box.y2 << 16) | box.x2); + + OUT_RELOC(intelScreen->front.buffer, + DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_WRITE, + DRM_BO_MASK_MEM | DRM_BO_FLAG_WRITE, 0); + OUT_BATCH((sbox.y1 << 16) | sbox.x1); + OUT_BATCH((srcpitch * cpp) & 0xffff); + OUT_RELOC(dri_bo(surf->buffer), + DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, + DRM_BO_MASK_MEM | DRM_BO_FLAG_READ, 0); + + } + + if (intel->first_swap_fence) + driFenceUnReference(&intel->first_swap_fence); + intel->first_swap_fence = intel_be_batchbuffer_flush(intel->base.batch); + } + + UNLOCK_HARDWARE(intel); + + if (intel->lastStamp != dPriv->lastStamp) { + intelUpdateWindowSize(dPriv); + intel->lastStamp = dPriv->lastStamp; + } +} + + + +/** + * This will be called whenever the currently bound window is moved/resized. 
+ */ +void +intelUpdateWindowSize(__DRIdrawablePrivate *dPriv) +{ + struct intel_framebuffer *intelfb = intel_framebuffer(dPriv); + assert(intelfb->stfb); + st_resize_framebuffer(intelfb->stfb, dPriv->w, dPriv->h); +} + + + +void +intelSwapBuffers(__DRIdrawablePrivate * dPriv) +{ + struct intel_framebuffer *intel_fb = intel_framebuffer(dPriv); + struct pipe_surface *back_surf; + + assert(intel_fb); + assert(intel_fb->stfb); + + back_surf = st_get_framebuffer_surface(intel_fb->stfb, + ST_SURFACE_BACK_LEFT); + if (back_surf) { + st_notify_swapbuffers(intel_fb->stfb); + intelDisplaySurface(dPriv, back_surf, NULL); + st_notify_swapbuffers_complete(intel_fb->stfb); + } +} + + +/** + * Called via glXCopySubBufferMESA() to copy a subrect of the back + * buffer to the front buffer/screen. + */ +void +intelCopySubBuffer(__DRIdrawablePrivate * dPriv, int x, int y, int w, int h) +{ + struct intel_framebuffer *intel_fb = intel_framebuffer(dPriv); + struct pipe_surface *back_surf; + + assert(intel_fb); + assert(intel_fb->stfb); + + back_surf = st_get_framebuffer_surface(intel_fb->stfb, + ST_SURFACE_BACK_LEFT); + if (back_surf) { + drm_clip_rect_t rect; + rect.x1 = x; + rect.y1 = y; + rect.x2 = w; + rect.y2 = h; + + st_notify_swapbuffers(intel_fb->stfb); + intelDisplaySurface(dPriv, back_surf, &rect); + } +} diff --git a/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.h b/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.h new file mode 100644 index 0000000000..46c9bab3af --- /dev/null +++ b/src/gallium/winsys/drm/intel/dri/intel_swapbuffers.h @@ -0,0 +1,47 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
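The blit emitted by intelDisplaySurface() above packs the destination pitch in bytes, the 0xCC copy ROP and the color depth into BR13, with 32 bpp destinations additionally requesting alpha and RGB writes in the command dword. A self-contained restatement of that packing, mirroring the cpp == 2 and cpp != 2 branches (the XY_SRC_COPY_* opcodes themselves come from intel_reg.h and are not repeated here):

#include <stdint.h>

/* Mirrors the BR13 setup in intelDisplaySurface(): pitch in bytes,
 * ROP 0xCC (straight copy), bit 24 for 16 bpp, bits 24 and 25 for
 * 32 bpp destinations.
 */
static uint32_t
pack_blit_br13(uint32_t dst_pitch_bytes, int cpp)
{
   uint32_t br13 = dst_pitch_bytes | (0xCC << 16) | (1 << 24);

   if (cpp != 2)
      br13 |= (1 << 25);
   return br13;
}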
+ * + **************************************************************************/ + +#ifndef INTEL_SWAPBUFFERS_H +#define INTEL_SWAPBUFFERS_H + + +struct pipe_surface; + + +extern void intelDisplaySurface(__DRIdrawablePrivate * dPriv, + struct pipe_surface *surf, + const drm_clip_rect_t * rect); + +extern void intelSwapBuffers(__DRIdrawablePrivate * dPriv); + +extern void intelCopySubBuffer(__DRIdrawablePrivate * dPriv, + int x, int y, int w, int h); + +extern void intelUpdateWindowSize(__DRIdrawablePrivate *dPriv); + + +#endif /* INTEL_SWAPBUFFERS_H */ diff --git a/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.c b/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.c new file mode 100644 index 0000000000..20920a2052 --- /dev/null +++ b/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.c @@ -0,0 +1,82 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * + **************************************************************************/ +/* + * Authors: Keith Whitwell <keithw-at-tungstengraphics-dot-com> + */ + +#include "intel_context.h" +#include "intel_winsys_softpipe.h" +#include "pipe/p_defines.h" +#include "pipe/p_format.h" +#include "util/u_memory.h" +#include "softpipe/sp_winsys.h" + + +struct intel_softpipe_winsys { + struct softpipe_winsys sws; + struct intel_context *intel; +}; + +/** + * Return list of surface formats supported by this driver. + */ +static boolean +intel_is_format_supported(struct softpipe_winsys *sws, + enum pipe_format format) +{ + switch(format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_S8Z24_UNORM: + return TRUE; + default: + return FALSE; + } +} + + +/** + * Create rendering context which uses software rendering. + */ +struct pipe_context * +intel_create_softpipe( struct intel_context *intel, + struct pipe_winsys *winsys ) +{ + struct intel_softpipe_winsys *isws = CALLOC_STRUCT( intel_softpipe_winsys ); + struct pipe_screen *screen = softpipe_create_screen(winsys); + + /* Fill in this struct with callbacks that softpipe will need to + * communicate with the window system, buffer manager, etc. 
+ */ + isws->sws.is_format_supported = intel_is_format_supported; + isws->intel = intel; + + /* Create the softpipe context: + */ + return softpipe_create( screen, winsys, &isws->sws ); +} diff --git a/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.h b/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.h new file mode 100644 index 0000000000..5fa14cb749 --- /dev/null +++ b/src/gallium/winsys/drm/intel/dri/intel_winsys_softpipe.h @@ -0,0 +1,39 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef INTEL_SOFTPIPE_H +#define INTEL_SOFTPIPE_H + +struct pipe_winsys; +struct pipe_context; +struct intel_context; + +struct pipe_context * +intel_create_softpipe( struct intel_context *intel, + struct pipe_winsys *winsys ); + +#endif diff --git a/src/gallium/winsys/drm/intel/dri/server/i830_common.h b/src/gallium/winsys/drm/intel/dri/server/i830_common.h new file mode 100644 index 0000000000..3452ddb3c9 --- /dev/null +++ b/src/gallium/winsys/drm/intel/dri/server/i830_common.h @@ -0,0 +1,255 @@ +/************************************************************************** + +Copyright 2001 VA Linux Systems Inc., Fremont, California. +Copyright 2002 Tungsten Graphics Inc., Cedar Park, Texas. + +All Rights Reserved. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +on the rights to use, copy, modify, merge, publish, distribute, sub +license, and/or sell copies of the Software, and to permit persons to whom +the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice (including the next +paragraph) shall be included in all copies or substantial portions of the +Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
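intel_winsys_softpipe.c above relies on the same subclassing idiom as the rest of the winsys code (compare the intel_screen() cast wrapper earlier): the base struct is placed first in the derived struct, so a base pointer handed back through callbacks can simply be cast to the derived type. A stripped-down illustration with placeholder type names:

/* Placeholder types; only the layout matters: the base object must be
 * the first member so that casting a base pointer back to the derived
 * type is valid.
 */
struct base_winsys {
   int id;
};

struct derived_winsys {
   struct base_winsys base;   /* must come first */
   void *driver_private;
};

static struct derived_winsys *
derived_winsys(struct base_winsys *ws)
{
   return (struct derived_winsys *) ws;
}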
IN NO EVENT SHALL +ATI, VA LINUX SYSTEMS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE +USE OR OTHER DEALINGS IN THE SOFTWARE. + +**************************************************************************/ + + +#ifndef _I830_COMMON_H_ +#define _I830_COMMON_H_ + + +#define I830_NR_TEX_REGIONS 255 /* maximum due to use of chars for next/prev */ +#define I830_LOG_MIN_TEX_REGION_SIZE 14 + + +/* Driver specific DRM command indices + * NOTE: these are not OS specific, but they are driver specific + */ +#define DRM_I830_INIT 0x00 +#define DRM_I830_FLUSH 0x01 +#define DRM_I830_FLIP 0x02 +#define DRM_I830_BATCHBUFFER 0x03 +#define DRM_I830_IRQ_EMIT 0x04 +#define DRM_I830_IRQ_WAIT 0x05 +#define DRM_I830_GETPARAM 0x06 +#define DRM_I830_SETPARAM 0x07 +#define DRM_I830_ALLOC 0x08 +#define DRM_I830_FREE 0x09 +#define DRM_I830_INIT_HEAP 0x0a +#define DRM_I830_CMDBUFFER 0x0b +#define DRM_I830_DESTROY_HEAP 0x0c +#define DRM_I830_SET_VBLANK_PIPE 0x0d +#define DRM_I830_GET_VBLANK_PIPE 0x0e +#define DRM_I830_MMIO 0x10 + +typedef struct { + enum { + I830_INIT_DMA = 0x01, + I830_CLEANUP_DMA = 0x02, + I830_RESUME_DMA = 0x03 + } func; + unsigned int mmio_offset; + int sarea_priv_offset; + unsigned int ring_start; + unsigned int ring_end; + unsigned int ring_size; + unsigned int front_offset; + unsigned int back_offset; + unsigned int depth_offset; + unsigned int w; + unsigned int h; + unsigned int pitch; + unsigned int pitch_bits; + unsigned int back_pitch; + unsigned int depth_pitch; + unsigned int cpp; + unsigned int chipset; +} drmI830Init; + +typedef struct { + drmTextureRegion texList[I830_NR_TEX_REGIONS+1]; + int last_upload; /* last time texture was uploaded */ + int last_enqueue; /* last time a buffer was enqueued */ + int last_dispatch; /* age of the most recently dispatched buffer */ + int ctxOwner; /* last context to upload state */ + /** Last context that used the buffer manager. */ + int texAge; + int pf_enabled; /* is pageflipping allowed? */ + int pf_active; + int pf_current_page; /* which buffer is being displayed? */ + int perf_boxes; /* performance boxes to be displayed */ + int width, height; /* screen size in pixels */ + + drm_handle_t front_handle; + int front_offset; + int front_size; + + drm_handle_t back_handle; + int back_offset; + int back_size; + + drm_handle_t depth_handle; + int depth_offset; + int depth_size; + + drm_handle_t tex_handle; + int tex_offset; + int tex_size; + int log_tex_granularity; + int pitch; + int rotation; /* 0, 90, 180 or 270 */ + int rotated_offset; + int rotated_size; + int rotated_pitch; + int virtualX, virtualY; + + unsigned int front_tiled; + unsigned int back_tiled; + unsigned int depth_tiled; + unsigned int rotated_tiled; + unsigned int rotated2_tiled; + + int planeA_x; + int planeA_y; + int planeA_w; + int planeA_h; + int planeB_x; + int planeB_y; + int planeB_w; + int planeB_h; + + /* Triple buffering */ + drm_handle_t third_handle; + int third_offset; + int third_size; + unsigned int third_tiled; + + /* buffer object handles for the static buffers. May change + * over the lifetime of the client, though it doesn't in our current + * implementation. 
+ */ + unsigned int front_bo_handle; + unsigned int back_bo_handle; + unsigned int third_bo_handle; + unsigned int depth_bo_handle; +} drmI830Sarea; + +/* Flags for perf_boxes + */ +#define I830_BOX_RING_EMPTY 0x1 /* populated by kernel */ +#define I830_BOX_FLIP 0x2 /* populated by kernel */ +#define I830_BOX_WAIT 0x4 /* populated by kernel & client */ +#define I830_BOX_TEXTURE_LOAD 0x8 /* populated by kernel */ +#define I830_BOX_LOST_CONTEXT 0x10 /* populated by client */ + + +typedef struct { + int start; /* agp offset */ + int used; /* nr bytes in use */ + int DR1; /* hw flags for GFX_OP_DRAWRECT_INFO */ + int DR4; /* window origin for GFX_OP_DRAWRECT_INFO*/ + int num_cliprects; /* mulitpass with multiple cliprects? */ + drm_clip_rect_t *cliprects; /* pointer to userspace cliprects */ +} drmI830BatchBuffer; + +typedef struct { + char *buf; /* agp offset */ + int sz; /* nr bytes in use */ + int DR1; /* hw flags for GFX_OP_DRAWRECT_INFO */ + int DR4; /* window origin for GFX_OP_DRAWRECT_INFO*/ + int num_cliprects; /* mulitpass with multiple cliprects? */ + drm_clip_rect_t *cliprects; /* pointer to userspace cliprects */ +} drmI830CmdBuffer; + +typedef struct { + int *irq_seq; +} drmI830IrqEmit; + +typedef struct { + int irq_seq; +} drmI830IrqWait; + +typedef struct { + int param; + int *value; +} drmI830GetParam; + +#define I830_PARAM_IRQ_ACTIVE 1 +#define I830_PARAM_ALLOW_BATCHBUFFER 2 + +typedef struct { + int param; + int value; +} drmI830SetParam; + +#define I830_SETPARAM_USE_MI_BATCHBUFFER_START 1 +#define I830_SETPARAM_TEX_LRU_LOG_GRANULARITY 2 +#define I830_SETPARAM_ALLOW_BATCHBUFFER 3 + + +/* A memory manager for regions of shared memory: + */ +#define I830_MEM_REGION_AGP 1 + +typedef struct { + int region; + int alignment; + int size; + int *region_offset; /* offset from start of fb or agp */ +} drmI830MemAlloc; + +typedef struct { + int region; + int region_offset; +} drmI830MemFree; + +typedef struct { + int region; + int size; + int start; +} drmI830MemInitHeap; + +typedef struct { + int region; +} drmI830MemDestroyHeap; + +#define DRM_I830_VBLANK_PIPE_A 1 +#define DRM_I830_VBLANK_PIPE_B 2 + +typedef struct { + int pipe; +} drmI830VBlankPipe; + +#define MMIO_READ 0 +#define MMIO_WRITE 1 + +#define MMIO_REGS_IA_PRIMATIVES_COUNT 0 +#define MMIO_REGS_IA_VERTICES_COUNT 1 +#define MMIO_REGS_VS_INVOCATION_COUNT 2 +#define MMIO_REGS_GS_PRIMITIVES_COUNT 3 +#define MMIO_REGS_GS_INVOCATION_COUNT 4 +#define MMIO_REGS_CL_PRIMITIVES_COUNT 5 +#define MMIO_REGS_CL_INVOCATION_COUNT 6 +#define MMIO_REGS_PS_INVOCATION_COUNT 7 +#define MMIO_REGS_PS_DEPTH_COUNT 8 + +typedef struct { + unsigned int read_write:1; + unsigned int reg:31; + void __user *data; +} drmI830MMIO; + +#endif /* _I830_DRM_H_ */ diff --git a/src/gallium/winsys/drm/intel/dri/server/i830_dri.h b/src/gallium/winsys/drm/intel/dri/server/i830_dri.h new file mode 100644 index 0000000000..0d514b6c38 --- /dev/null +++ b/src/gallium/winsys/drm/intel/dri/server/i830_dri.h @@ -0,0 +1,62 @@ + +#ifndef _I830_DRI_H +#define _I830_DRI_H + +#include "xf86drm.h" +#include "i830_common.h" + +#define I830_MAX_DRAWABLES 256 + +#define I830_MAJOR_VERSION 1 +#define I830_MINOR_VERSION 7 +#define I830_PATCHLEVEL 2 + +#define I830_REG_SIZE 0x80000 + +typedef struct _I830DRIRec { + drm_handle_t regs; + drmSize regsSize; + + drmSize unused1; /* backbufferSize */ + drm_handle_t unused2; /* backbuffer */ + + drmSize unused3; /* depthbufferSize */ + drm_handle_t unused4; /* depthbuffer */ + + drmSize unused5; /* rotatedSize */ + drm_handle_t unused6; /* 
rotatedbuffer */ + + drm_handle_t unused7; /* textures */ + int unused8; /* textureSize */ + + drm_handle_t unused9; /* agp_buffers */ + drmSize unused10; /* agp_buf_size */ + + int deviceID; + int width; + int height; + int mem; + int cpp; + int bitsPerPixel; + + int unused11[8]; /* was front/back/depth/rotated offset/pitch */ + + int unused12; /* logTextureGranularity */ + int unused13; /* textureOffset */ + + int irq; + int sarea_priv_offset; +} I830DRIRec, *I830DRIPtr; + +typedef struct { + /* Nothing here yet */ + int dummy; +} I830ConfigPrivRec, *I830ConfigPrivPtr; + +typedef struct { + /* Nothing here yet */ + int dummy; +} I830DRIContextRec, *I830DRIContextPtr; + + +#endif diff --git a/src/gallium/winsys/drm/intel/egl/Makefile b/src/gallium/winsys/drm/intel/egl/Makefile new file mode 100644 index 0000000000..f0b5a44389 --- /dev/null +++ b/src/gallium/winsys/drm/intel/egl/Makefile @@ -0,0 +1,28 @@ +TOP = ../../../../../.. +include $(TOP)/configs/current + +LIBNAME = EGL_i915.so + +PIPE_DRIVERS = \ + $(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \ + $(TOP)/src/gallium/drivers/i915simple/libi915simple.a \ + ../common/libinteldrm.a + +DRIVER_SOURCES = \ + intel_swapbuffers.c \ + intel_context.c \ + intel_device.c \ + intel_egl.c + +C_SOURCES = \ + $(COMMON_GALLIUM_SOURCES) \ + $(DRIVER_SOURCES) + +ASM_SOURCES = + +DRIVER_DEFINES = -I../common $(shell pkg-config libdrm --atleast-version=2.3.1 \ + && echo "-DDRM_VBLANK_FLIP=DRM_VBLANK_FLIP") + +include ../../Makefile.template + +symlinks: diff --git a/src/gallium/winsys/drm/intel/egl/SConscript b/src/gallium/winsys/drm/intel/egl/SConscript new file mode 100644 index 0000000000..0ad19d42a8 --- /dev/null +++ b/src/gallium/winsys/drm/intel/egl/SConscript @@ -0,0 +1,39 @@ +Import('*') + +env = drienv.Clone() + +env.Append(CPPPATH = [ + '../intel', + 'server' +]) + +#MINIGLX_SOURCES = server/intel_dri.c + +DRIVER_SOURCES = [ + 'intel_winsys_pipe.c', + 'intel_winsys_softpipe.c', + 'intel_winsys_i915.c', + 'intel_batchbuffer.c', + 'intel_swapbuffers.c', + 'intel_context.c', + 'intel_lock.c', + 'intel_screen.c', + 'intel_batchpool.c', +] + +sources = \ + COMMON_GALLIUM_SOURCES + \ + COMMON_BM_SOURCES + \ + DRIVER_SOURCES + +drivers = [ + softpipe, + i915simple +] + +# TODO: write a wrapper function http://www.scons.org/wiki/WrapperFunctions +env.SharedLibrary( + target ='i915tex_dri.so', + source = sources, + LIBS = drivers + mesa + auxiliaries + env['LIBS'], +)
\ No newline at end of file diff --git a/src/gallium/winsys/drm/intel/egl/intel_batchbuffer.h b/src/gallium/winsys/drm/intel/egl/intel_batchbuffer.h new file mode 100644 index 0000000000..3e95326168 --- /dev/null +++ b/src/gallium/winsys/drm/intel/egl/intel_batchbuffer.h @@ -0,0 +1,24 @@ +#ifndef INTEL_BATCHBUFFER_H +#define INTEL_BATCHBUFFER_H + +#include "intel_be_batchbuffer.h" + +/* + * Need to redefine the BATCH defines + */ + +#undef BEGIN_BATCH +#define BEGIN_BATCH(dwords, relocs) \ + (i915_batchbuffer_check(&intel->base.batch->base, dwords, relocs)) + +#undef OUT_BATCH +#define OUT_BATCH(d) \ + i915_batchbuffer_dword(&intel->base.batch->base, d) + +#undef OUT_RELOC +#define OUT_RELOC(buf,flags,mask,delta) do { \ + assert((delta) >= 0); \ + intel_be_offset_relocation(intel->base.batch, delta, buf, flags, mask); \ +} while (0) + +#endif diff --git a/src/gallium/winsys/drm/intel/egl/intel_context.c b/src/gallium/winsys/drm/intel/egl/intel_context.c new file mode 100644 index 0000000000..927addb834 --- /dev/null +++ b/src/gallium/winsys/drm/intel/egl/intel_context.c @@ -0,0 +1,242 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "i915simple/i915_screen.h" + +#include "intel_device.h" +#include "intel_context.h" +#include "intel_batchbuffer.h" + +#include "state_tracker/st_public.h" +#include "pipe/p_defines.h" +#include "pipe/p_context.h" +#include "intel_egl.h" +#include "utils.h" + +#ifdef DEBUG +int __intel_debug = 0; +#endif + + +#define need_GL_ARB_multisample +#define need_GL_ARB_point_parameters +#define need_GL_ARB_texture_compression +#define need_GL_ARB_vertex_buffer_object +#define need_GL_ARB_vertex_program +#define need_GL_ARB_window_pos +#define need_GL_EXT_blend_color +#define need_GL_EXT_blend_equation_separate +#define need_GL_EXT_blend_func_separate +#define need_GL_EXT_blend_minmax +#define need_GL_EXT_cull_vertex +#define need_GL_EXT_fog_coord +#define need_GL_EXT_framebuffer_object +#define need_GL_EXT_multi_draw_arrays +#define need_GL_EXT_secondary_color +#define need_GL_NV_vertex_program +#include "extension_helper.h" + + +/** + * Extension strings exported by the intel driver. 
+ * + * \note + * It appears that ARB_texture_env_crossbar has "disappeared" compared to the + * old i830-specific driver. + */ +const struct dri_extension card_extensions[] = { + {"GL_ARB_multisample", GL_ARB_multisample_functions}, + {"GL_ARB_multitexture", NULL}, + {"GL_ARB_point_parameters", GL_ARB_point_parameters_functions}, + {"GL_ARB_texture_border_clamp", NULL}, + {"GL_ARB_texture_compression", GL_ARB_texture_compression_functions}, + {"GL_ARB_texture_cube_map", NULL}, + {"GL_ARB_texture_env_add", NULL}, + {"GL_ARB_texture_env_combine", NULL}, + {"GL_ARB_texture_env_dot3", NULL}, + {"GL_ARB_texture_mirrored_repeat", NULL}, + {"GL_ARB_texture_rectangle", NULL}, + {"GL_ARB_vertex_buffer_object", GL_ARB_vertex_buffer_object_functions}, + {"GL_ARB_pixel_buffer_object", NULL}, + {"GL_ARB_vertex_program", GL_ARB_vertex_program_functions}, + {"GL_ARB_window_pos", GL_ARB_window_pos_functions}, + {"GL_EXT_blend_color", GL_EXT_blend_color_functions}, + {"GL_EXT_blend_equation_separate", GL_EXT_blend_equation_separate_functions}, + {"GL_EXT_blend_func_separate", GL_EXT_blend_func_separate_functions}, + {"GL_EXT_blend_minmax", GL_EXT_blend_minmax_functions}, + {"GL_EXT_blend_subtract", NULL}, + {"GL_EXT_cull_vertex", GL_EXT_cull_vertex_functions}, + {"GL_EXT_fog_coord", GL_EXT_fog_coord_functions}, + {"GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions}, + {"GL_EXT_multi_draw_arrays", GL_EXT_multi_draw_arrays_functions}, + {"GL_EXT_packed_depth_stencil", NULL}, + {"GL_EXT_pixel_buffer_object", NULL}, + {"GL_EXT_secondary_color", GL_EXT_secondary_color_functions}, + {"GL_EXT_stencil_wrap", NULL}, + {"GL_EXT_texture_edge_clamp", NULL}, + {"GL_EXT_texture_env_combine", NULL}, + {"GL_EXT_texture_env_dot3", NULL}, + {"GL_EXT_texture_filter_anisotropic", NULL}, + {"GL_EXT_texture_lod_bias", NULL}, + {"GL_3DFX_texture_compression_FXT1", NULL}, + {"GL_APPLE_client_storage", NULL}, + {"GL_MESA_pack_invert", NULL}, + {"GL_MESA_ycbcr_texture", NULL}, + {"GL_NV_blend_square", NULL}, + {"GL_NV_vertex_program", GL_NV_vertex_program_functions}, + {"GL_NV_vertex_program1_1", NULL}, + {"GL_SGIS_generate_mipmap", NULL }, + {NULL, NULL} +}; + + +/* + * Hardware lock functions. + * Doesn't do anything in EGL + */ + +static void +intel_lock_hardware(struct intel_be_context *context) +{ + (void)context; +} + +static void +intel_unlock_hardware(struct intel_be_context *context) +{ + (void)context; +} + +static boolean +intel_locked_hardware(struct intel_be_context *context) +{ + (void)context; + return FALSE; +} + + +/* + * Misc functions. 
+ */ + +int +intel_create_context(struct egl_drm_context *egl_context, const __GLcontextModes *visual, void *sharedContextPrivate) +{ + struct intel_context *intel = CALLOC_STRUCT(intel_context); + struct intel_device *device = (struct intel_device *)egl_context->device->priv; + struct pipe_context *pipe; + struct st_context *st_share = NULL; + + egl_context->priv = intel; + + intel->intel_device = device; + intel->egl_context = egl_context; + intel->egl_device = egl_context->device; + + intel->base.hardware_lock = intel_lock_hardware; + intel->base.hardware_unlock = intel_unlock_hardware; + intel->base.hardware_locked = intel_locked_hardware; + + intel_be_init_context(&intel->base, &device->base); + +#if 0 + pipe = intel_create_softpipe(intel, screen->winsys); +#else + pipe = i915_create_context(device->pipe, &device->base.base, &intel->base.base); +#endif + + pipe->priv = intel; + + intel->st = st_create_context(pipe, visual, st_share); + + device->dummy = intel; + + return TRUE; +} + +int +intel_destroy_context(struct egl_drm_context *egl_context) +{ + struct intel_context *intel = egl_context->priv; + + if (intel->intel_device->dummy == intel) + intel->intel_device->dummy = NULL; + + st_destroy_context(intel->st); + intel_be_destroy_context(&intel->base); + free(intel); + return TRUE; +} + +void +intel_make_current(struct egl_drm_context *context, struct egl_drm_drawable *draw, struct egl_drm_drawable *read) +{ + if (context) { + struct intel_context *intel = (struct intel_context *)context->priv; + struct intel_framebuffer *draw_fb = (struct intel_framebuffer *)draw->priv; + struct intel_framebuffer *read_fb = (struct intel_framebuffer *)read->priv; + + assert(draw_fb->stfb); + assert(read_fb->stfb); + + st_make_current(intel->st, draw_fb->stfb, read_fb->stfb); + + intel->egl_drawable = draw; + + st_resize_framebuffer(draw_fb->stfb, draw->w, draw->h); + + if (draw != read) + st_resize_framebuffer(read_fb->stfb, read->w, read->h); + + } else { + st_make_current(NULL, NULL, NULL); + } +} + +void +intel_bind_frontbuffer(struct egl_drm_drawable *draw, struct egl_drm_frontbuffer *front) +{ + struct intel_device *device = (struct intel_device *)draw->device->priv; + struct intel_framebuffer *draw_fb = (struct intel_framebuffer *)draw->priv; + + if (draw_fb->front_buffer) + driBOUnReference(draw_fb->front_buffer); + + draw_fb->front_buffer = NULL; + draw_fb->front = NULL; + + /* to unbind just call this function with front == NULL */ + if (!front) + return; + + draw_fb->front = front; + + driGenBuffers(device->base.staticPool, "front", 1, &draw_fb->front_buffer, 0, 0, 0); + driBOSetReferenced(draw_fb->front_buffer, front->handle); + + st_resize_framebuffer(draw_fb->stfb, draw->w, draw->h); +} diff --git a/src/gallium/winsys/drm/intel/egl/intel_context.h b/src/gallium/winsys/drm/intel/egl/intel_context.h new file mode 100644 index 0000000000..477fdec7f7 --- /dev/null +++ b/src/gallium/winsys/drm/intel/egl/intel_context.h @@ -0,0 +1,118 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef INTEL_CONTEXT_H +#define INTEL_CONTEXT_H + +#include "pipe/p_debug.h" +#include "intel_be_context.h" + + +struct st_context; +struct egl_drm_device; +struct egl_drm_context; +struct egl_drm_frontbuffer; + + +/** + * Intel rendering context, contains a state tracker and intel-specific info. + */ +struct intel_context +{ + struct intel_be_context base; + + struct st_context *st; + + struct intel_device *intel_device; + + /* new egl stuff */ + struct egl_drm_device *egl_device; + struct egl_drm_context *egl_context; + struct egl_drm_drawable *egl_drawable; +}; + + + +/** + * Intel framebuffer. + */ +struct intel_framebuffer +{ + struct st_framebuffer *stfb; + + struct intel_device *device; + struct _DriBufferObject *front_buffer; + struct egl_drm_frontbuffer *front; +}; + + + + +/* These are functions now: + */ +void LOCK_HARDWARE( struct intel_context *intel ); +void UNLOCK_HARDWARE( struct intel_context *intel ); + +extern char *__progname; + + + +/* ================================================================ + * Debugging: + */ +#ifdef DEBUG +extern int __intel_debug; + +#define DEBUG_SWAP 0x1 +#define DEBUG_LOCK 0x2 +#define DEBUG_IOCTL 0x4 +#define DEBUG_BATCH 0x8 + +#define DBG(flag, ...) do { \ + if (__intel_debug & (DEBUG_##flag)) \ + printf(__VA_ARGS__); \ +} while(0) + +#else +#define DBG(flag, ...) +#endif + + +#define PCI_CHIP_845_G 0x2562 +#define PCI_CHIP_I830_M 0x3577 +#define PCI_CHIP_I855_GM 0x3582 +#define PCI_CHIP_I865_G 0x2572 +#define PCI_CHIP_I915_G 0x2582 +#define PCI_CHIP_I915_GM 0x2592 +#define PCI_CHIP_I945_G 0x2772 +#define PCI_CHIP_I945_GM 0x27A2 +#define PCI_CHIP_I945_GME 0x27AE +#define PCI_CHIP_G33_G 0x29C2 +#define PCI_CHIP_Q35_G 0x29B2 +#define PCI_CHIP_Q33_G 0x29D2 + +#endif diff --git a/src/gallium/winsys/drm/intel/egl/intel_device.c b/src/gallium/winsys/drm/intel/egl/intel_device.c new file mode 100644 index 0000000000..1964745c99 --- /dev/null +++ b/src/gallium/winsys/drm/intel/egl/intel_device.c @@ -0,0 +1,137 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
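The DBG() macro above keys off the __intel_debug bitmask, which only exists in DEBUG builds. One plausible way to fill it in at run time, shown purely as a sketch with a hypothetical INTEL_DEBUG environment variable (the driver may wire this up differently):

#include <stdlib.h>

/* Sketch only: derive a debug bitmask from an environment variable so
 * that DBG(SWAP, ...) style output can be enabled without recompiling;
 * 0x1 selects swap, 0x2 lock, 0x4 ioctl and 0x8 batch messages, as in
 * the DEBUG_* defines above.
 */
static int
parse_debug_flags(void)
{
   const char *s = getenv("INTEL_DEBUG");   /* hypothetical variable */

   return s ? (int) strtol(s, NULL, 0) : 0;
}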
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#include "utils.h" + +#include "state_tracker/st_public.h" +#include "i915simple/i915_screen.h" + +#include "intel_context.h" +#include "intel_device.h" +#include "intel_batchbuffer.h" +#include "intel_egl.h" + + +extern const struct dri_extension card_extensions[]; + + +int +intel_create_device(struct egl_drm_device *device) +{ + struct intel_device *intel_device; + + /* Allocate the private area */ + intel_device = CALLOC_STRUCT(intel_device); + if (!intel_device) + return FALSE; + + device->priv = (void *)intel_device; + intel_device->device = device; + + intel_device->deviceID = device->deviceID; + + intel_be_init_device(&intel_device->base, device->drmFD, intel_device->deviceID); + + intel_device->pipe = i915_create_screen(&intel_device->base.base, intel_device->deviceID); + + /* hack */ + driInitExtensions(NULL, card_extensions, GL_FALSE); + + return TRUE; +} + +int +intel_destroy_device(struct egl_drm_device *device) +{ + struct intel_device *intel_device = (struct intel_device *)device->priv; + + intel_be_destroy_device(&intel_device->base); + + free(intel_device); + device->priv = NULL; + + return TRUE; +} + +int +intel_create_drawable(struct egl_drm_drawable *drawable, + const __GLcontextModes * visual) +{ + enum pipe_format colorFormat, depthFormat, stencilFormat; + struct intel_framebuffer *intelfb = CALLOC_STRUCT(intel_framebuffer); + + if (!intelfb) + return GL_FALSE; + + intelfb->device = drawable->device->priv; + + if (visual->redBits == 5) + colorFormat = PIPE_FORMAT_R5G6B5_UNORM; + else + colorFormat = PIPE_FORMAT_A8R8G8B8_UNORM; + + if (visual->depthBits == 16) + depthFormat = PIPE_FORMAT_Z16_UNORM; + else if (visual->depthBits == 24) + depthFormat = PIPE_FORMAT_S8Z24_UNORM; + else + depthFormat = PIPE_FORMAT_NONE; + + if (visual->stencilBits == 8) + stencilFormat = PIPE_FORMAT_S8Z24_UNORM; + else + stencilFormat = PIPE_FORMAT_NONE; + + intelfb->stfb = st_create_framebuffer(visual, + colorFormat, + depthFormat, + stencilFormat, + drawable->w, + drawable->h, + (void*) intelfb); + + if (!intelfb->stfb) { + free(intelfb); + return GL_FALSE; + } + + drawable->priv = (void *) intelfb; + return GL_TRUE; +} + +int +intel_destroy_drawable(struct egl_drm_drawable *drawable) +{ + struct intel_framebuffer *intelfb = (struct intel_framebuffer *)drawable->priv; + drawable->priv = 
NULL; + + assert(intelfb->stfb); + st_unreference_framebuffer(intelfb->stfb); + free(intelfb); + return TRUE; +} diff --git a/src/gallium/winsys/drm/intel/egl/intel_device.h b/src/gallium/winsys/drm/intel/egl/intel_device.h new file mode 100644 index 0000000000..323a7c2aef --- /dev/null +++ b/src/gallium/winsys/drm/intel/egl/intel_device.h @@ -0,0 +1,50 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#ifndef _INTEL_SCREEN_H_ +#define _INTEL_SCREEN_H_ + +#include "intel_be_device.h" + +#include "pipe/p_compiler.h" + +struct pipe_screen; +struct egl_drm_device; +struct intel_context; + +struct intel_device +{ + struct intel_be_device base; + struct pipe_screen *pipe; + + int deviceID; + struct egl_drm_device *device; + + struct intel_context *dummy; +}; + +#endif diff --git a/src/gallium/winsys/drm/intel/egl/intel_egl.c b/src/gallium/winsys/drm/intel/egl/intel_egl.c new file mode 100644 index 0000000000..ed464076ee --- /dev/null +++ b/src/gallium/winsys/drm/intel/egl/intel_egl.c @@ -0,0 +1,796 @@ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <stdint.h> + +#include "eglconfig.h" +#include "eglcontext.h" +#include "egldisplay.h" +#include "egldriver.h" +#include "eglglobals.h" +#include "eglmode.h" +#include "eglscreen.h" +#include "eglsurface.h" +#include "egllog.h" + +#include "intel_egl.h" + +#include "xf86drm.h" +#include "xf86drmMode.h" + +#include "intel_context.h" + +#include "state_tracker/st_public.h" + +#define MAX_SCREENS 16 + +static void +drm_get_device_id(struct egl_drm_device *device) +{ + char path[512]; + FILE *file; + + /* TODO get the real minor */ + int minor = 0; + + snprintf(path, sizeof(path), "/sys/class/drm/card%d/device/device", minor); + file = fopen(path, "r"); + if (!file) { + _eglLog(_EGL_WARNING, "Could not retrive device ID\n"); + return; + } + + fgets(path, sizeof( path ), file); + sscanf(path, "%x", &device->deviceID); + fclose(file); +} + +static struct egl_drm_device* +egl_drm_create_device(int drmFD) +{ + struct egl_drm_device *device = malloc(sizeof(*device)); + memset(device, 0, sizeof(*device)); + device->drmFD = drmFD; + + device->version = drmGetVersion(device->drmFD); + + 
drm_get_device_id(device); + + if (!intel_create_device(device)) { + free(device); + return NULL; + } + + return device; +} + +static void +_egl_context_modes_destroy(__GLcontextModes *modes) +{ + _eglLog(_EGL_DEBUG, "%s", __FUNCTION__); + + while (modes) { + __GLcontextModes * const next = modes->next; + free(modes); + modes = next; + } +} +/** + * Create a linked list of 'count' GLcontextModes. + * These are used during the client/server visual negotiation phase, + * then discarded. + */ +static __GLcontextModes * +_egl_context_modes_create(unsigned count, size_t minimum_size) +{ + /* This code copied from libGLX, and modified */ + const size_t size = (minimum_size > sizeof(__GLcontextModes)) + ? minimum_size : sizeof(__GLcontextModes); + __GLcontextModes * head = NULL; + __GLcontextModes ** next; + unsigned i; + + _eglLog(_EGL_DEBUG, "%s %d %d", __FUNCTION__, count, minimum_size); + + next = & head; + for (i = 0 ; i < count ; i++) { + *next = (__GLcontextModes *) calloc(1, size); + if (*next == NULL) { + _egl_context_modes_destroy(head); + head = NULL; + break; + } + + (*next)->doubleBufferMode = 1; + (*next)->visualID = GLX_DONT_CARE; + (*next)->visualType = GLX_DONT_CARE; + (*next)->visualRating = GLX_NONE; + (*next)->transparentPixel = GLX_NONE; + (*next)->transparentRed = GLX_DONT_CARE; + (*next)->transparentGreen = GLX_DONT_CARE; + (*next)->transparentBlue = GLX_DONT_CARE; + (*next)->transparentAlpha = GLX_DONT_CARE; + (*next)->transparentIndex = GLX_DONT_CARE; + (*next)->xRenderable = GLX_DONT_CARE; + (*next)->fbconfigID = GLX_DONT_CARE; + (*next)->swapMethod = GLX_SWAP_UNDEFINED_OML; + (*next)->bindToTextureRgb = GLX_DONT_CARE; + (*next)->bindToTextureRgba = GLX_DONT_CARE; + (*next)->bindToMipmapTexture = GLX_DONT_CARE; + (*next)->bindToTextureTargets = 0; + (*next)->yInverted = GLX_DONT_CARE; + + next = & ((*next)->next); + } + + return head; +} + +struct drm_screen; + +struct drm_driver +{ + _EGLDriver base; /* base class/object */ + + drmModeResPtr res; + + struct drm_screen *screens[MAX_SCREENS]; + size_t count_screens; + + struct egl_drm_device *device; +}; + +struct drm_surface +{ + _EGLSurface base; /* base class/object */ + + struct egl_drm_drawable *drawable; +}; + +struct drm_context +{ + _EGLContext base; /* base class/object */ + + struct egl_drm_context *context; +}; + +struct drm_screen +{ + _EGLScreen base; + + /* currently only support one connector */ + drmModeConnectorPtr connector; + + /* Has this screen been shown */ + int shown; + + /* Surface that is currently attached to this screen */ + struct drm_surface *surf; + + /* backing buffer */ + drmBO buffer; + + /* framebuffer */ + drmModeFBPtr fb; + uint32_t fbID; + + /* crtc and mode used */ + drmModeCrtcPtr crtc; + uint32_t crtcID; + + struct drm_mode_modeinfo *mode; + + /* geometry of the screen */ + struct egl_drm_frontbuffer front; +}; + +static void +drm_update_res(struct drm_driver *drm_drv) +{ + drmModeFreeResources(drm_drv->res); + drm_drv->res = drmModeGetResources(drm_drv->device->drmFD); +} + +static void +drm_add_modes_from_connector(_EGLScreen *screen, drmModeConnectorPtr connector) +{ + struct drm_mode_modeinfo *m; + int i; + + for (i = 0; i < connector->count_modes; i++) { + m = &connector->modes[i]; + _eglAddNewMode(screen, m->hdisplay, m->vdisplay, m->vrefresh, m->name); + } +} + + +static EGLBoolean +drm_initialize(_EGLDriver *drv, EGLDisplay dpy, EGLint *major, EGLint *minor) +{ + _EGLDisplay *disp = _eglLookupDisplay(dpy); + struct drm_driver *drm_drv = (struct drm_driver *)drv; + struct 
drm_screen *screen = NULL; + drmModeConnectorPtr connector = NULL; + drmModeResPtr res = NULL; + unsigned count_connectors = 0; + int num_screens = 0; + + EGLint i; + int fd; + + fd = drmOpen("i915", NULL); + if (fd < 0) { + return EGL_FALSE; + } + + drm_drv->device = egl_drm_create_device(fd); + if (!drm_drv->device) { + drmClose(fd); + return EGL_FALSE; + } + + drm_update_res(drm_drv); + res = drm_drv->res; + if (res) + count_connectors = res->count_connectors; + + for(i = 0; i < count_connectors && i < MAX_SCREENS; i++) { + connector = drmModeGetConnector(fd, res->connectors[i]); + + if (!connector) + continue; + + if (connector->connection != DRM_MODE_CONNECTED) { + drmModeFreeConnector(connector); + continue; + } + + screen = malloc(sizeof(struct drm_screen)); + memset(screen, 0, sizeof(*screen)); + screen->connector = connector; + _eglInitScreen(&screen->base); + _eglAddScreen(disp, &screen->base); + drm_add_modes_from_connector(&screen->base, connector); + drm_drv->screens[num_screens++] = screen; + } + drm_drv->count_screens = num_screens; + + /* for now we only have one config */ + _EGLConfig *config = calloc(1, sizeof(*config)); + memset(config, 1, sizeof(*config)); + _eglInitConfig(config, 1); + _eglSetConfigAttrib(config, EGL_RED_SIZE, 8); + _eglSetConfigAttrib(config, EGL_GREEN_SIZE, 8); + _eglSetConfigAttrib(config, EGL_BLUE_SIZE, 8); + _eglSetConfigAttrib(config, EGL_ALPHA_SIZE, 8); + _eglSetConfigAttrib(config, EGL_BUFFER_SIZE, 32); + _eglSetConfigAttrib(config, EGL_DEPTH_SIZE, 24); + _eglSetConfigAttrib(config, EGL_STENCIL_SIZE, 8); + _eglSetConfigAttrib(config, EGL_SURFACE_TYPE, EGL_PBUFFER_BIT); + _eglAddConfig(disp, config); + + drv->Initialized = EGL_TRUE; + + *major = 1; + *minor = 4; + + return EGL_TRUE; +} + +static void +drm_takedown_shown_screen(_EGLDriver *drv, struct drm_screen *screen) +{ + struct drm_driver *drm_drv = (struct drm_driver *)drv; + unsigned int i; + + intel_bind_frontbuffer(screen->surf->drawable, NULL); + screen->surf = NULL; + + for (i = 0; i < drm_drv->res->count_crtcs; i++) { + drmModeSetCrtc( + drm_drv->device->drmFD, + drm_drv->res->crtcs[i], + 0, // FD + 0, 0, + NULL, 0, // List of output ids + NULL); + } + + drmModeRmFB(drm_drv->device->drmFD, screen->fbID); + drmModeFreeFB(screen->fb); + screen->fb = NULL; + + drmBOUnreference(drm_drv->device->drmFD, &screen->buffer); + + screen->shown = 0; +} + +static EGLBoolean +drm_terminate(_EGLDriver *drv, EGLDisplay dpy) +{ + struct drm_driver *drm_drv = (struct drm_driver *)drv; + struct drm_screen *screen; + int i = 0; + + intel_destroy_device(drm_drv->device); + drmFreeVersion(drm_drv->device->version); + + for (i = 0; i < drm_drv->count_screens; i++) { + screen = drm_drv->screens[i]; + + if (screen->shown) + drm_takedown_shown_screen(drv, screen); + + drmModeFreeConnector(screen->connector); + _eglDestroyScreen(&screen->base); + drm_drv->screens[i] = NULL; + } + + drmClose(drm_drv->device->drmFD); + + free(drm_drv->device); + + _eglCleanupDisplay(_eglLookupDisplay(dpy)); + free(drm_drv); + + return EGL_TRUE; +} + + +static struct drm_context * +lookup_drm_context(EGLContext context) +{ + _EGLContext *c = _eglLookupContext(context); + return (struct drm_context *) c; +} + + +static struct drm_surface * +lookup_drm_surface(EGLSurface surface) +{ + _EGLSurface *s = _eglLookupSurface(surface); + return (struct drm_surface *) s; +} + +static struct drm_screen * +lookup_drm_screen(EGLDisplay dpy, EGLScreenMESA screen) +{ + _EGLScreen *s = _eglLookupScreen(dpy, screen); + return (struct drm_screen *) 
s; +} + +static __GLcontextModes* +visual_from_config(_EGLConfig *conf) +{ + __GLcontextModes *visual; + (void)conf; + + visual = _egl_context_modes_create(1, sizeof(*visual)); + visual->redBits = 8; + visual->greenBits = 8; + visual->blueBits = 8; + visual->alphaBits = 8; + + visual->rgbBits = 32; + visual->doubleBufferMode = 1; + + visual->depthBits = 24; + visual->haveDepthBuffer = visual->depthBits > 0; + visual->stencilBits = 8; + visual->haveStencilBuffer = visual->stencilBits > 0; + + return visual; +} + + + +static EGLContext +drm_create_context(_EGLDriver *drv, EGLDisplay dpy, EGLConfig config, EGLContext share_list, const EGLint *attrib_list) +{ + struct drm_driver *drm_drv = (struct drm_driver *)drv; + struct drm_context *c; + struct drm_egl_context *share = NULL; + _EGLConfig *conf; + int i; + int ret; + __GLcontextModes *visual; + struct egl_drm_context *context; + + conf = _eglLookupConfig(drv, dpy, config); + if (!conf) { + _eglError(EGL_BAD_CONFIG, "eglCreateContext"); + return EGL_NO_CONTEXT; + } + + for (i = 0; attrib_list && attrib_list[i] != EGL_NONE; i++) { + switch (attrib_list[i]) { + /* no attribs defined for now */ + default: + _eglError(EGL_BAD_ATTRIBUTE, "eglCreateContext"); + return EGL_NO_CONTEXT; + } + } + + c = (struct drm_context *) calloc(1, sizeof(struct drm_context)); + if (!c) + return EGL_NO_CONTEXT; + + _eglInitContext(drv, dpy, &c->base, config, attrib_list); + + context = malloc(sizeof(*context)); + memset(context, 0, sizeof(*context)); + + if (!context) + goto err_c; + + context->device = drm_drv->device; + visual = visual_from_config(conf); + + ret = intel_create_context(context, visual, share); + free(visual); + + if (!ret) + goto err_gl; + + c->context = context; + + /* generate handle and insert into hash table */ + _eglSaveContext(&c->base); + assert(_eglGetContextHandle(&c->base)); + + return _eglGetContextHandle(&c->base); +err_gl: + free(context); +err_c: + free(c); + return EGL_NO_CONTEXT; +} + +static EGLBoolean +drm_destroy_context(_EGLDriver *drv, EGLDisplay dpy, EGLContext context) +{ + struct drm_context *fc = lookup_drm_context(context); + _eglRemoveContext(&fc->base); + if (fc->base.IsBound) { + fc->base.DeletePending = EGL_TRUE; + } else { + intel_destroy_context(fc->context); + free(fc->context); + free(fc); + } + return EGL_TRUE; +} + + +static EGLSurface +drm_create_window_surface(_EGLDriver *drv, EGLDisplay dpy, EGLConfig config, NativeWindowType window, const EGLint *attrib_list) +{ + return EGL_NO_SURFACE; +} + + +static EGLSurface +drm_create_pixmap_surface(_EGLDriver *drv, EGLDisplay dpy, EGLConfig config, NativePixmapType pixmap, const EGLint *attrib_list) +{ + return EGL_NO_SURFACE; +} + + +static EGLSurface +drm_create_pbuffer_surface(_EGLDriver *drv, EGLDisplay dpy, EGLConfig config, + const EGLint *attrib_list) +{ + struct drm_driver *drm_drv = (struct drm_driver *)drv; + int i; + int ret; + int width = -1; + int height = -1; + struct drm_surface *surf = NULL; + struct egl_drm_drawable *drawable = NULL; + __GLcontextModes *visual; + _EGLConfig *conf; + + conf = _eglLookupConfig(drv, dpy, config); + if (!conf) { + _eglError(EGL_BAD_CONFIG, "eglCreatePbufferSurface"); + return EGL_NO_CONTEXT; + } + + for (i = 0; attrib_list && attrib_list[i] != EGL_NONE; i++) { + switch (attrib_list[i]) { + case EGL_WIDTH: + width = attrib_list[++i]; + break; + case EGL_HEIGHT: + height = attrib_list[++i]; + break; + default: + _eglError(EGL_BAD_ATTRIBUTE, "eglCreatePbufferSurface"); + return EGL_NO_SURFACE; + } + } + + if (width < 1 || 
height < 1) { + _eglError(EGL_BAD_ATTRIBUTE, "eglCreatePbufferSurface"); + return EGL_NO_SURFACE; + } + + surf = (struct drm_surface *) calloc(1, sizeof(struct drm_surface)); + if (!surf) + goto err; + + if (!_eglInitSurface(drv, dpy, &surf->base, EGL_PBUFFER_BIT, config, attrib_list)) + goto err_surf; + + drawable = malloc(sizeof(*drawable)); + memset(drawable, 0, sizeof(*drawable)); + + drawable->w = width; + drawable->h = height; + + visual = visual_from_config(conf); + + drawable->device = drm_drv->device; + ret = intel_create_drawable(drawable, visual); + free(visual); + + if (!ret) + goto err_draw; + + surf->drawable = drawable; + + _eglSaveSurface(&surf->base); + return surf->base.Handle; + +err_draw: + free(drawable); +err_surf: + free(surf); +err: + return EGL_NO_SURFACE; +} + +static EGLSurface +drm_create_screen_surface_mesa(_EGLDriver *drv, EGLDisplay dpy, EGLConfig cfg, + const EGLint *attrib_list) +{ + EGLSurface surf = drm_create_pbuffer_surface(drv, dpy, cfg, attrib_list); + + return surf; +} + +static struct drm_mode_modeinfo * +drm_find_mode(drmModeConnectorPtr connector, _EGLMode *mode) +{ + int i; + struct drm_mode_modeinfo *m = NULL; + + for (i = 0; i < connector->count_modes; i++) { + m = &connector->modes[i]; + if (m->hdisplay == mode->Width && m->vdisplay == mode->Height && m->vrefresh == mode->RefreshRate) + break; + m = &connector->modes[0]; /* if we can't find one, return first */ + } + + return m; +} +static void +draw(size_t x, size_t y, size_t w, size_t h, size_t pitch, size_t v, unsigned int *ptr) +{ + int i, j; + + for (i = x; i < x + w; i++) + for(j = y; j < y + h; j++) + ptr[(i * pitch / 4) + j] = v; + +} + +static void +prettyColors(int fd, unsigned int handle, size_t pitch) +{ + drmBO bo; + unsigned int *ptr; + void *p; + int i; + + drmBOReference(fd, handle, &bo); + drmBOMap(fd, &bo, DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE, 0, &p); + ptr = (unsigned int*)p; + + for (i = 0; i < (bo.size / 4); i++) + ptr[i] = 0xFFFFFFFF; + + for (i = 0; i < 4; i++) + draw(i * 40, i * 40, 40, 40, pitch, 0, ptr); + + + draw(200, 100, 40, 40, pitch, 0xff00ff, ptr); + draw(100, 200, 40, 40, pitch, 0xff00ff, ptr); + + drmBOUnmap(fd, &bo); +} + +static EGLBoolean +drm_show_screen_surface_mesa(_EGLDriver *drv, EGLDisplay dpy, + EGLScreenMESA screen, + EGLSurface surface, EGLModeMESA m) +{ + struct drm_driver *drm_drv = (struct drm_driver *)drv; + struct drm_surface *surf = lookup_drm_surface(surface); + struct drm_screen *scrn = lookup_drm_screen(dpy, screen); + _EGLMode *mode = _eglLookupMode(dpy, m); + size_t pitch = mode->Width * 4; + size_t size = mode->Height * pitch; + int ret; + unsigned int i,j,k; + + if (scrn->shown) + drm_takedown_shown_screen(drv, scrn); + + ret = drmBOCreate(drm_drv->device->drmFD, size, 0, 0, + DRM_BO_FLAG_READ | + DRM_BO_FLAG_WRITE | + DRM_BO_FLAG_MEM_TT | + DRM_BO_FLAG_MEM_VRAM | + DRM_BO_FLAG_NO_EVICT, + DRM_BO_HINT_DONT_FENCE, &scrn->buffer); + + if (ret) + return EGL_FALSE; + + prettyColors(drm_drv->device->drmFD, scrn->buffer.handle, pitch); + + ret = drmModeAddFB(drm_drv->device->drmFD, mode->Width, mode->Height, + 32, 32, pitch, + scrn->buffer.handle, + &scrn->fbID); + + if (ret) + goto err_bo; + + scrn->fb = drmModeGetFB(drm_drv->device->drmFD, scrn->fbID); + if (!scrn->fb) + goto err_bo; + + for (j = 0; j < drm_drv->res->count_connectors; j++) { + drmModeConnector *con = drmModeGetConnector(drm_drv->device->drmFD, drm_drv->res->connectors[j]); + scrn->mode = drm_find_mode(con, mode); + if (!scrn->mode) + goto err_fb; + + for (k = 0; k < 
con->count_encoders; k++) { + drmModeEncoder *enc = drmModeGetEncoder(drm_drv->device->drmFD, con->encoders[k]); + for (i = 0; i < drm_drv->res->count_crtcs; i++) { + if (enc->possible_crtcs & (1<<i)) { + ret = drmModeSetCrtc( + drm_drv->device->drmFD, + drm_drv->res->crtcs[i], + scrn->fbID, + 0, 0, + &drm_drv->res->connectors[j], 1, + scrn->mode); + /* skip the other crtcs now */ + i = drm_drv->res->count_crtcs; + } + } + } + } + + scrn->front.handle = scrn->buffer.handle; + scrn->front.pitch = pitch; + scrn->front.width = mode->Width; + scrn->front.height = mode->Height; + + scrn->surf = surf; + intel_bind_frontbuffer(surf->drawable, &scrn->front); + + scrn->shown = 1; + + return EGL_TRUE; + +err_fb: + /* TODO remove fb */ + +err_bo: + drmBOUnreference(drm_drv->device->drmFD, &scrn->buffer); + return EGL_FALSE; +} + +static EGLBoolean +drm_destroy_surface(_EGLDriver *drv, EGLDisplay dpy, EGLSurface surface) +{ + struct drm_surface *fs = lookup_drm_surface(surface); + _eglRemoveSurface(&fs->base); + if (fs->base.IsBound) { + fs->base.DeletePending = EGL_TRUE; + } else { + intel_bind_frontbuffer(fs->drawable, NULL); + intel_destroy_drawable(fs->drawable); + free(fs->drawable); + free(fs); + } + return EGL_TRUE; +} + + +static EGLBoolean +drm_make_current(_EGLDriver *drv, EGLDisplay dpy, EGLSurface draw, EGLSurface read, EGLContext context) +{ + struct drm_surface *readSurf = lookup_drm_surface(read); + struct drm_surface *drawSurf = lookup_drm_surface(draw); + struct drm_context *ctx = lookup_drm_context(context); + EGLBoolean b; + + b = _eglMakeCurrent(drv, dpy, draw, read, context); + if (!b) + return EGL_FALSE; + + if (ctx) { + if (!drawSurf || !readSurf) + return EGL_FALSE; + + intel_make_current(ctx->context, drawSurf->drawable, readSurf->drawable); + } else { + intel_make_current(NULL, NULL, NULL); + } + + return EGL_TRUE; +} + +static EGLBoolean +drm_swap_buffers(_EGLDriver *drv, EGLDisplay dpy, EGLSurface draw) +{ + struct drm_surface *surf = lookup_drm_surface(draw); + if (!surf) + return EGL_FALSE; + + /* error checking */ + if (!_eglSwapBuffers(drv, dpy, draw)) + return EGL_FALSE; + + intel_swap_buffers(surf->drawable); + return EGL_TRUE; +} + + +/** + * The bootstrap function. Return a new drm_driver object and + * plug in API functions. 
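+ *
+ * A rough sketch of the assumed loader interaction (the loader itself
+ * is not part of this file, so the call below is illustrative only):
+ *
+ *   _EGLDriver *drv = _eglMain(dpy, NULL);   // build dispatch table
+ *   // ... the EGL library then calls drv->API.Initialize() and friends
+ *
+ * Fallbacks are installed first so that any entry point not overridden
+ * below still resolves to a default implementation.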
+ */ +_EGLDriver * +_eglMain(_EGLDisplay *dpy, const char *args) +{ + struct drm_driver *drm; + + drm = (struct drm_driver *) calloc(1, sizeof(struct drm_driver)); + if (!drm) { + return NULL; + } + + /* First fill in the dispatch table with defaults */ + _eglInitDriverFallbacks(&drm->base); + /* then plug in our Drm-specific functions */ + drm->base.API.Initialize = drm_initialize; + drm->base.API.Terminate = drm_terminate; + drm->base.API.CreateContext = drm_create_context; + drm->base.API.MakeCurrent = drm_make_current; + drm->base.API.CreateWindowSurface = drm_create_window_surface; + drm->base.API.CreatePixmapSurface = drm_create_pixmap_surface; + drm->base.API.CreatePbufferSurface = drm_create_pbuffer_surface; + drm->base.API.DestroySurface = drm_destroy_surface; + drm->base.API.DestroyContext = drm_destroy_context; + drm->base.API.CreateScreenSurfaceMESA = drm_create_screen_surface_mesa; + drm->base.API.ShowScreenSurfaceMESA = drm_show_screen_surface_mesa; + drm->base.API.SwapBuffers = drm_swap_buffers; + + drm->base.ClientAPIsMask = EGL_OPENGL_BIT /*| EGL_OPENGL_ES_BIT*/; + drm->base.Name = "DRM/Gallium"; + + /* enable supported extensions */ + drm->base.Extensions.MESA_screen_surface = EGL_TRUE; + drm->base.Extensions.MESA_copy_context = EGL_TRUE; + + return &drm->base; +} diff --git a/src/gallium/winsys/drm/intel/egl/intel_egl.h b/src/gallium/winsys/drm/intel/egl/intel_egl.h new file mode 100644 index 0000000000..1ee27e0847 --- /dev/null +++ b/src/gallium/winsys/drm/intel/egl/intel_egl.h @@ -0,0 +1,53 @@ + +#ifndef _INTEL_EGL_H_ +#define _INTEL_EGL_H_ + +#include <xf86drm.h> + +struct egl_drm_device +{ + void *priv; + int drmFD; + + drmVersionPtr version; + int deviceID; +}; + +struct egl_drm_context +{ + void *priv; + struct egl_drm_device *device; +}; + +struct egl_drm_drawable +{ + void *priv; + struct egl_drm_device *device; + size_t h; + size_t w; +}; + +struct egl_drm_frontbuffer +{ + uint32_t handle; + uint32_t pitch; + uint32_t width; + uint32_t height; +}; + +#include "GL/internal/glcore.h" + +int intel_create_device(struct egl_drm_device *device); +int intel_destroy_device(struct egl_drm_device *device); + +int intel_create_context(struct egl_drm_context *context, const __GLcontextModes *visual, void *sharedContextPrivate); +int intel_destroy_context(struct egl_drm_context *context); + +int intel_create_drawable(struct egl_drm_drawable *drawable, const __GLcontextModes * visual); +int intel_destroy_drawable(struct egl_drm_drawable *drawable); + +void intel_make_current(struct egl_drm_context *context, struct egl_drm_drawable *draw, struct egl_drm_drawable *read); +void intel_swap_buffers(struct egl_drm_drawable *draw); +void intel_bind_frontbuffer(struct egl_drm_drawable *draw, struct egl_drm_frontbuffer *front); + +#endif diff --git a/src/gallium/winsys/drm/intel/egl/intel_reg.h b/src/gallium/winsys/drm/intel/egl/intel_reg.h new file mode 100644 index 0000000000..4f33bee438 --- /dev/null +++ b/src/gallium/winsys/drm/intel/egl/intel_reg.h @@ -0,0 +1,53 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef _INTEL_REG_H_ +#define _INTEL_REG_H_ + + +#define BR00_BITBLT_CLIENT 0x40000000 +#define BR00_OP_COLOR_BLT 0x10000000 +#define BR00_OP_SRC_COPY_BLT 0x10C00000 +#define BR13_SOLID_PATTERN 0x80000000 + +#define XY_COLOR_BLT_CMD ((2<<29)|(0x50<<22)|0x4) +#define XY_COLOR_BLT_WRITE_ALPHA (1<<21) +#define XY_COLOR_BLT_WRITE_RGB (1<<20) + +#define XY_SRC_COPY_BLT_CMD ((2<<29)|(0x53<<22)|6) +#define XY_SRC_COPY_BLT_WRITE_ALPHA (1<<21) +#define XY_SRC_COPY_BLT_WRITE_RGB (1<<20) + +#define MI_WAIT_FOR_EVENT ((0x3<<23)) +#define MI_WAIT_FOR_PLANE_B_FLIP (1<<6) +#define MI_WAIT_FOR_PLANE_A_FLIP (1<<2) + +#define MI_BATCH_BUFFER_END (0xA<<23) + + +#endif diff --git a/src/gallium/winsys/drm/intel/egl/intel_swapbuffers.c b/src/gallium/winsys/drm/intel/egl/intel_swapbuffers.c new file mode 100644 index 0000000000..2edcbc79ff --- /dev/null +++ b/src/gallium/winsys/drm/intel/egl/intel_swapbuffers.c @@ -0,0 +1,111 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include "intel_device.h" +#include "intel_context.h" +#include "intel_batchbuffer.h" +#include "intel_reg.h" + +#include "pipe/p_context.h" +#include "state_tracker/st_public.h" +#include "state_tracker/st_context.h" +#include "state_tracker/st_cb_fbo.h" +#include "intel_egl.h" + + +static void +intel_display_surface(struct egl_drm_drawable *draw, + struct pipe_surface *surf); + +void intel_swap_buffers(struct egl_drm_drawable *draw) +{ + struct intel_framebuffer *intel_fb = (struct intel_framebuffer *)draw->priv; + struct pipe_surface *back_surf; + + assert(intel_fb); + assert(intel_fb->stfb); + + back_surf = st_get_framebuffer_surface(intel_fb->stfb, ST_SURFACE_BACK_LEFT); + if (back_surf) { + st_notify_swapbuffers(intel_fb->stfb); + if (intel_fb->front) + intel_display_surface(draw, back_surf); + st_notify_swapbuffers_complete(intel_fb->stfb); + } +} + +static void +intel_display_surface(struct egl_drm_drawable *draw, + struct pipe_surface *surf) +{ + struct intel_context *intel = NULL; + struct intel_framebuffer *intel_fb = (struct intel_framebuffer *)draw->priv; + struct _DriFenceObject *fence; + + //const int srcWidth = surf->width; + //const int srcHeight = surf->height; + + intel = intel_fb->device->dummy; + if (!intel) { + printf("No dummy context\n"); + return; + } + + const int dstWidth = intel_fb->front->width; + const int dstHeight = intel_fb->front->height; + const int dstPitch = intel_fb->front->pitch / 4;//draw->front.cpp; + + const int cpp = 4;//intel_fb->front->cpp; + const int srcPitch = surf->stride / cpp; + + int BR13, CMD; + //int i; + + BR13 = (dstPitch * cpp) | (0xCC << 16) | (1 << 24) | (1 << 25); + CMD = (XY_SRC_COPY_BLT_CMD | XY_SRC_COPY_BLT_WRITE_ALPHA | + XY_SRC_COPY_BLT_WRITE_RGB); + + BEGIN_BATCH(8, 2); + OUT_BATCH(CMD); + OUT_BATCH(BR13); + OUT_BATCH((0 << 16) | 0); + OUT_BATCH((dstHeight << 16) | dstWidth); + + OUT_RELOC(intel_fb->front_buffer, + DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_WRITE, + DRM_BO_MASK_MEM | DRM_BO_FLAG_WRITE, 0); + + OUT_BATCH((0 << 16) | 0); + OUT_BATCH((srcPitch * cpp) & 0xffff); + OUT_RELOC(dri_bo(surf->buffer), + DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, + DRM_BO_MASK_MEM | DRM_BO_FLAG_READ, 0); + + fence = intel_be_batchbuffer_flush(intel->base.batch); + driFenceUnReference(&fence); + intel_be_batchbuffer_finish(intel->base.batch); +} diff --git a/src/gallium/winsys/drm/nouveau/Makefile b/src/gallium/winsys/drm/nouveau/Makefile new file mode 100644 index 0000000000..81562ca78d --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/Makefile @@ -0,0 +1,46 @@ + +TOP = ../../../../.. 
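+
+# All of the nouveau pipe drivers (nv04-nv50) plus softpipe are linked
+# into the single nouveau_dri.so; which one actually drives the card is
+# decided at runtime from the chipset id (see nouveau_context.c).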
+include $(TOP)/configs/current + +LIBNAME = nouveau_dri.so + +MINIGLX_SOURCES = + +PIPE_DRIVERS = \ + $(TOP)/src/gallium/drivers/softpipe/libsoftpipe.a \ + $(TOP)/src/gallium/drivers/nv04/libnv04.a \ + $(TOP)/src/gallium/drivers/nv10/libnv10.a \ + $(TOP)/src/gallium/drivers/nv20/libnv20.a \ + $(TOP)/src/gallium/drivers/nv30/libnv30.a \ + $(TOP)/src/gallium/drivers/nv40/libnv40.a \ + $(TOP)/src/gallium/drivers/nv50/libnv50.a + +DRIVER_SOURCES = \ + nouveau_bo.c \ + nouveau_channel.c \ + nouveau_context.c \ + nouveau_device.c \ + nouveau_dma.c \ + nouveau_fence.c \ + nouveau_grobj.c \ + nouveau_lock.c \ + nouveau_notifier.c \ + nouveau_pushbuf.c \ + nouveau_resource.c \ + nouveau_screen.c \ + nouveau_swapbuffers.c \ + nouveau_winsys.c \ + nouveau_winsys_pipe.c \ + nouveau_winsys_softpipe.c \ + nv04_surface.c \ + nv50_surface.c + +C_SOURCES = \ + $(COMMON_GALLIUM_SOURCES) \ + $(DRIVER_SOURCES) + +ASM_SOURCES = + +include ../Makefile.template + +symlinks: diff --git a/src/gallium/winsys/drm/nouveau/nouveau_bo.c b/src/gallium/winsys/drm/nouveau/nouveau_bo.c new file mode 100644 index 0000000000..b5942994d9 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_bo.c @@ -0,0 +1,470 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <stdint.h> +#include <stdlib.h> +#include <errno.h> +#include <assert.h> + +#include "nouveau_drmif.h" +#include "nouveau_dma.h" +#include "nouveau_local.h" + +static void +nouveau_mem_free(struct nouveau_device *dev, struct drm_nouveau_mem_alloc *ma, + void **map) +{ + struct nouveau_device_priv *nvdev = nouveau_device(dev); + struct drm_nouveau_mem_free mf; + + if (map && *map) { + drmUnmap(*map, ma->size); + *map = NULL; + } + + if (ma->size) { + mf.offset = ma->offset; + mf.flags = ma->flags; + drmCommandWrite(nvdev->fd, DRM_NOUVEAU_MEM_FREE, + &mf, sizeof(mf)); + ma->size = 0; + } +} + +static int +nouveau_mem_alloc(struct nouveau_device *dev, unsigned size, unsigned align, + uint32_t flags, struct drm_nouveau_mem_alloc *ma, void **map) +{ + struct nouveau_device_priv *nvdev = nouveau_device(dev); + int ret; + + ma->alignment = align; + ma->size = size; + ma->flags = flags; + if (map) + ma->flags |= NOUVEAU_MEM_MAPPED; + ret = drmCommandWriteRead(nvdev->fd, DRM_NOUVEAU_MEM_ALLOC, ma, + sizeof(struct drm_nouveau_mem_alloc)); + if (ret) + return ret; + + if (map) { + ret = drmMap(nvdev->fd, ma->map_handle, ma->size, map); + if (ret) { + *map = NULL; + nouveau_mem_free(dev, ma, map); + return ret; + } + } + + return 0; +} + +static void +nouveau_bo_tmp_del(void *priv) +{ + struct nouveau_resource *r = priv; + + nouveau_fence_ref(NULL, (struct nouveau_fence **)&r->priv); + nouveau_resource_free(&r); +} + +static unsigned +nouveau_bo_tmp_max(struct nouveau_device_priv *nvdev) +{ + struct nouveau_resource *r = nvdev->sa_heap; + unsigned max = 0; + + while (r) { + if (r->in_use && !nouveau_fence(r->priv)->emitted) { + r = r->next; + continue; + } + + if (max < r->size) + max = r->size; + r = r->next; + } + + return max; +} + +static struct nouveau_resource * +nouveau_bo_tmp(struct nouveau_channel *chan, unsigned size, + struct nouveau_fence *fence) +{ + struct nouveau_device_priv *nvdev = nouveau_device(chan->device); + struct nouveau_resource *r = NULL; + struct nouveau_fence *ref = NULL; + + if (fence) + nouveau_fence_ref(fence, &ref); + else + nouveau_fence_new(chan, &ref); + assert(ref); + + while (nouveau_resource_alloc(nvdev->sa_heap, size, ref, &r)) { + if (nouveau_bo_tmp_max(nvdev) < size) { + nouveau_fence_ref(NULL, &ref); + return NULL; + } + + nouveau_fence_flush(chan); + } + nouveau_fence_signal_cb(ref, nouveau_bo_tmp_del, r); + + return r; +} + +int +nouveau_bo_init(struct nouveau_device *dev) +{ + struct nouveau_device_priv *nvdev = nouveau_device(dev); + int ret; + + ret = nouveau_mem_alloc(dev, 128*1024, 0, NOUVEAU_MEM_AGP | + NOUVEAU_MEM_PCI, &nvdev->sa, &nvdev->sa_map); + if (ret) + return ret; + + ret = nouveau_resource_init(&nvdev->sa_heap, 0, nvdev->sa.size); + if (ret) { + nouveau_mem_free(dev, &nvdev->sa, &nvdev->sa_map); + return ret; + } + + return 0; +} + +void +nouveau_bo_takedown(struct nouveau_device *dev) +{ + struct nouveau_device_priv *nvdev = nouveau_device(dev); + + nouveau_mem_free(dev, &nvdev->sa, &nvdev->sa_map); +} + +int +nouveau_bo_new(struct nouveau_device *dev, uint32_t flags, int align, + int size, struct nouveau_bo **bo) +{ + struct nouveau_bo_priv *nvbo; + int ret; + + if (!dev || !bo || *bo) + return -EINVAL; + + nvbo = calloc(1, sizeof(struct nouveau_bo_priv)); + if (!nvbo) + return -ENOMEM; + nvbo->base.device = dev; + nvbo->base.size = size; + nvbo->base.handle = bo_to_ptr(nvbo); + nvbo->drm.alignment = align; + nvbo->refcount = 1; + + if (flags & NOUVEAU_BO_TILED) { + nvbo->tiled = 1; + if (flags & NOUVEAU_BO_ZTILE) + 
nvbo->tiled |= 2; + flags &= ~NOUVEAU_BO_TILED; + } + + ret = nouveau_bo_set_status(&nvbo->base, flags); + if (ret) { + free(nvbo); + return ret; + } + + *bo = &nvbo->base; + return 0; +} + +int +nouveau_bo_user(struct nouveau_device *dev, void *ptr, int size, + struct nouveau_bo **bo) +{ + struct nouveau_bo_priv *nvbo; + + if (!dev || !bo || *bo) + return -EINVAL; + + nvbo = calloc(1, sizeof(*nvbo)); + if (!nvbo) + return -ENOMEM; + nvbo->base.device = dev; + + nvbo->sysmem = ptr; + nvbo->user = 1; + + nvbo->base.size = size; + nvbo->base.offset = nvbo->drm.offset; + nvbo->base.handle = bo_to_ptr(nvbo); + nvbo->refcount = 1; + *bo = &nvbo->base; + return 0; +} + +int +nouveau_bo_ref(struct nouveau_device *dev, uint64_t handle, + struct nouveau_bo **bo) +{ + struct nouveau_bo_priv *nvbo = ptr_to_bo(handle); + + if (!dev || !bo || *bo) + return -EINVAL; + + nvbo->refcount++; + *bo = &nvbo->base; + return 0; +} + +static void +nouveau_bo_del_cb(void *priv) +{ + struct nouveau_bo_priv *nvbo = priv; + + nouveau_fence_ref(NULL, &nvbo->fence); + nouveau_mem_free(nvbo->base.device, &nvbo->drm, &nvbo->map); + if (nvbo->sysmem && !nvbo->user) + free(nvbo->sysmem); + free(nvbo); +} + +void +nouveau_bo_del(struct nouveau_bo **bo) +{ + struct nouveau_bo_priv *nvbo; + + if (!bo || !*bo) + return; + nvbo = nouveau_bo(*bo); + *bo = NULL; + + if (--nvbo->refcount) + return; + + if (nvbo->pending) + nouveau_pushbuf_flush(nvbo->pending->channel, 0); + + if (nvbo->fence) + nouveau_fence_signal_cb(nvbo->fence, nouveau_bo_del_cb, nvbo); + else + nouveau_bo_del_cb(nvbo); +} + +int +nouveau_bo_map(struct nouveau_bo *bo, uint32_t flags) +{ + struct nouveau_bo_priv *nvbo = nouveau_bo(bo); + + if (!nvbo) + return -EINVAL; + + if (nvbo->pending && + (nvbo->pending->flags & NOUVEAU_BO_WR || flags & NOUVEAU_BO_WR)) { + nouveau_pushbuf_flush(nvbo->pending->channel, 0); + } + + if (flags & NOUVEAU_BO_WR) + nouveau_fence_wait(&nvbo->fence); + else + nouveau_fence_wait(&nvbo->wr_fence); + + if (nvbo->sysmem) + bo->map = nvbo->sysmem; + else + bo->map = nvbo->map; + return 0; +} + +void +nouveau_bo_unmap(struct nouveau_bo *bo) +{ + bo->map = NULL; +} + +static int +nouveau_bo_upload(struct nouveau_bo_priv *nvbo) +{ + if (nvbo->fence) + nouveau_fence_wait(&nvbo->fence); + memcpy(nvbo->map, nvbo->sysmem, nvbo->drm.size); + return 0; +} + +int +nouveau_bo_set_status(struct nouveau_bo *bo, uint32_t flags) +{ + struct nouveau_bo_priv *nvbo = nouveau_bo(bo); + struct drm_nouveau_mem_alloc new; + void *new_map = NULL, *new_sysmem = NULL; + unsigned new_flags = 0, ret; + + assert(!bo->map); + + /* Check current memtype vs requested, if they match do nothing */ + if ((nvbo->drm.flags & NOUVEAU_MEM_FB) && (flags & NOUVEAU_BO_VRAM)) + return 0; + if ((nvbo->drm.flags & (NOUVEAU_MEM_AGP | NOUVEAU_MEM_PCI)) && + (flags & NOUVEAU_BO_GART)) + return 0; + if (nvbo->drm.size == 0 && nvbo->sysmem && (flags & NOUVEAU_BO_LOCAL)) + return 0; + + memset(&new, 0x00, sizeof(new)); + + /* Allocate new memory */ + if (flags & NOUVEAU_BO_VRAM) + new_flags |= NOUVEAU_MEM_FB; + else + if (flags & NOUVEAU_BO_GART) + new_flags |= (NOUVEAU_MEM_AGP | NOUVEAU_MEM_PCI); + + if (nvbo->tiled && flags) { + new_flags |= NOUVEAU_MEM_TILE; + if (nvbo->tiled & 2) + new_flags |= NOUVEAU_MEM_TILE_ZETA; + } + + if (new_flags) { + ret = nouveau_mem_alloc(bo->device, bo->size, + nvbo->drm.alignment, new_flags, + &new, &new_map); + if (ret) + return ret; + } else + if (!nvbo->user) { + new_sysmem = malloc(bo->size); + } + + /* Copy old -> new */ + /*XXX: use M2MF */ + 
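+	/* Note: nvbo->pending is cleared around the map below so that
+	 * nouveau_bo_map() does not flush the pushbuf just to read back
+	 * the old contents; the pending reference is restored once the
+	 * data has been copied into the new backing store. */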
if (nvbo->sysmem || nvbo->map) { + struct nouveau_pushbuf_bo *pbo = nvbo->pending; + nvbo->pending = NULL; + nouveau_bo_map(bo, NOUVEAU_BO_RD); + memcpy(new_map, bo->map, bo->size); + nouveau_bo_unmap(bo); + nvbo->pending = pbo; + } + + /* Free old memory */ + if (nvbo->fence) + nouveau_fence_wait(&nvbo->fence); + nouveau_mem_free(bo->device, &nvbo->drm, &nvbo->map); + if (nvbo->sysmem && !nvbo->user) + free(nvbo->sysmem); + + nvbo->drm = new; + nvbo->map = new_map; + if (!nvbo->user) + nvbo->sysmem = new_sysmem; + bo->flags = flags; + bo->offset = nvbo->drm.offset; + return 0; +} + +static int +nouveau_bo_validate_user(struct nouveau_channel *chan, struct nouveau_bo *bo, + struct nouveau_fence *fence, uint32_t flags) +{ + struct nouveau_channel_priv *nvchan = nouveau_channel(chan); + struct nouveau_device_priv *nvdev = nouveau_device(chan->device); + struct nouveau_bo_priv *nvbo = nouveau_bo(bo); + struct nouveau_resource *r; + + if (nvchan->user_charge + bo->size > nvdev->sa.size) + return 1; + + if (!(flags & NOUVEAU_BO_GART)) + return 1; + + r = nouveau_bo_tmp(chan, bo->size, fence); + if (!r) + return 1; + nvchan->user_charge += bo->size; + + memcpy(nvdev->sa_map + r->start, nvbo->sysmem, bo->size); + + nvbo->offset = nvdev->sa.offset + r->start; + nvbo->flags = NOUVEAU_BO_GART; + return 0; +} + +static int +nouveau_bo_validate_bo(struct nouveau_channel *chan, struct nouveau_bo *bo, + struct nouveau_fence *fence, uint32_t flags) +{ + struct nouveau_bo_priv *nvbo = nouveau_bo(bo); + int ret; + + ret = nouveau_bo_set_status(bo, flags); + if (ret) { + nouveau_fence_flush(chan); + + ret = nouveau_bo_set_status(bo, flags); + if (ret) + return ret; + } + + if (nvbo->user) + nouveau_bo_upload(nvbo); + + nvbo->offset = nvbo->drm.offset; + if (nvbo->drm.flags & (NOUVEAU_MEM_AGP | NOUVEAU_MEM_PCI)) + nvbo->flags = NOUVEAU_BO_GART; + else + nvbo->flags = NOUVEAU_BO_VRAM; + + return 0; +} + +int +nouveau_bo_validate(struct nouveau_channel *chan, struct nouveau_bo *bo, + uint32_t flags) +{ + struct nouveau_bo_priv *nvbo = nouveau_bo(bo); + struct nouveau_fence *fence = nouveau_pushbuf(chan->pushbuf)->fence; + int ret; + + assert(bo->map == NULL); + + if (nvbo->user) { + ret = nouveau_bo_validate_user(chan, bo, fence, flags); + if (ret) { + ret = nouveau_bo_validate_bo(chan, bo, fence, flags); + if (ret) + return ret; + } + } else { + ret = nouveau_bo_validate_bo(chan, bo, fence, flags); + if (ret) + return ret; + } + + if (flags & NOUVEAU_BO_WR) + nouveau_fence_ref(fence, &nvbo->wr_fence); + nouveau_fence_ref(fence, &nvbo->fence); + return 0; +} + diff --git a/src/gallium/winsys/drm/nouveau/nouveau_channel.c b/src/gallium/winsys/drm/nouveau/nouveau_channel.c new file mode 100644 index 0000000000..3b4dcd1ecf --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_channel.c @@ -0,0 +1,126 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdlib.h> +#include <string.h> +#include <errno.h> + +#include "nouveau_drmif.h" +#include "nouveau_dma.h" + +int +nouveau_channel_alloc(struct nouveau_device *dev, uint32_t fb_ctxdma, + uint32_t tt_ctxdma, struct nouveau_channel **chan) +{ + struct nouveau_device_priv *nvdev = nouveau_device(dev); + struct nouveau_channel_priv *nvchan; + int ret; + + if (!nvdev || !chan || *chan) + return -EINVAL; + + nvchan = calloc(1, sizeof(struct nouveau_channel_priv)); + if (!nvchan) + return -ENOMEM; + nvchan->base.device = dev; + + nvchan->drm.fb_ctxdma_handle = fb_ctxdma; + nvchan->drm.tt_ctxdma_handle = tt_ctxdma; + ret = drmCommandWriteRead(nvdev->fd, DRM_NOUVEAU_CHANNEL_ALLOC, + &nvchan->drm, sizeof(nvchan->drm)); + if (ret) { + free(nvchan); + return ret; + } + + nvchan->base.id = nvchan->drm.channel; + if (nouveau_grobj_ref(&nvchan->base, nvchan->drm.fb_ctxdma_handle, + &nvchan->base.vram) || + nouveau_grobj_ref(&nvchan->base, nvchan->drm.tt_ctxdma_handle, + &nvchan->base.gart)) { + nouveau_channel_free((void *)&nvchan); + return -EINVAL; + } + + ret = drmMap(nvdev->fd, nvchan->drm.ctrl, nvchan->drm.ctrl_size, + (void*)&nvchan->user); + if (ret) { + nouveau_channel_free((void *)&nvchan); + return ret; + } + nvchan->put = &nvchan->user[0x40/4]; + nvchan->get = &nvchan->user[0x44/4]; + nvchan->ref_cnt = &nvchan->user[0x48/4]; + + ret = drmMap(nvdev->fd, nvchan->drm.notifier, nvchan->drm.notifier_size, + (drmAddressPtr)&nvchan->notifier_block); + if (ret) { + nouveau_channel_free((void *)&nvchan); + return ret; + } + + ret = drmMap(nvdev->fd, nvchan->drm.cmdbuf, nvchan->drm.cmdbuf_size, + (void*)&nvchan->pushbuf); + if (ret) { + nouveau_channel_free((void *)&nvchan); + return ret; + } + + ret = nouveau_grobj_alloc(&nvchan->base, 0x00000000, 0x0030, + &nvchan->base.nullobj); + if (ret) { + nouveau_channel_free((void *)&nvchan); + return ret; + } + + nouveau_dma_channel_init(&nvchan->base); + nouveau_pushbuf_init(&nvchan->base); + + *chan = &nvchan->base; + return 0; +} + +void +nouveau_channel_free(struct nouveau_channel **chan) +{ + struct nouveau_channel_priv *nvchan; + struct nouveau_device_priv *nvdev; + struct drm_nouveau_channel_free cf; + + if (!chan || !*chan) + return; + nvchan = nouveau_channel(*chan); + *chan = NULL; + nvdev = nouveau_device(nvchan->base.device); + + FIRE_RING_CH(&nvchan->base); + + nouveau_grobj_free(&nvchan->base.vram); + nouveau_grobj_free(&nvchan->base.gart); + nouveau_grobj_free(&nvchan->base.nullobj); + + cf.channel = nvchan->drm.channel; + drmCommandWrite(nvdev->fd, DRM_NOUVEAU_CHANNEL_FREE, &cf, sizeof(cf)); + free(nvchan); +} + + diff --git a/src/gallium/winsys/drm/nouveau/nouveau_context.c b/src/gallium/winsys/drm/nouveau/nouveau_context.c new file mode 100644 index 0000000000..74413c408f --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_context.c @@ -0,0 +1,346 @@ +#include "main/glheader.h" +#include "glapi/glthread.h" +#include <GL/internal/glcore.h> +#include "utils.h" + +#include "state_tracker/st_public.h" +#include "state_tracker/st_context.h" +#include 
"pipe/p_defines.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" + +#include "nouveau_context.h" +#include "nouveau_dri.h" +#include "nouveau_local.h" +#include "nouveau_screen.h" +#include "nouveau_winsys_pipe.h" + +#ifdef DEBUG +static const struct dri_debug_control debug_control[] = { + { "bo", DEBUG_BO }, + { NULL, 0 } +}; +int __nouveau_debug = 0; +#endif + +static void +nouveau_channel_context_destroy(struct nouveau_channel_context *nvc) +{ + nouveau_grobj_free(&nvc->NvCtxSurf2D); + nouveau_grobj_free(&nvc->NvImageBlit); + nouveau_grobj_free(&nvc->NvGdiRect); + nouveau_grobj_free(&nvc->NvM2MF); + nouveau_grobj_free(&nvc->Nv2D); + nouveau_grobj_free(&nvc->NvSwzSurf); + nouveau_grobj_free(&nvc->NvSIFM); + + nouveau_notifier_free(&nvc->sync_notifier); + + nouveau_channel_free(&nvc->channel); + + FREE(nvc); +} + +static struct nouveau_channel_context * +nouveau_channel_context_create(struct nouveau_device *dev) +{ + struct nouveau_channel_context *nvc; + int ret; + + nvc = CALLOC_STRUCT(nouveau_channel_context); + if (!nvc) + return NULL; + + if ((ret = nouveau_channel_alloc(dev, 0x8003d001, 0x8003d002, + &nvc->channel))) { + NOUVEAU_ERR("Error creating GPU channel: %d\n", ret); + nouveau_channel_context_destroy(nvc); + return NULL; + } + + nvc->next_handle = 0x80000000; + + if ((ret = nouveau_notifier_alloc(nvc->channel, nvc->next_handle++, 1, + &nvc->sync_notifier))) { + NOUVEAU_ERR("Error creating channel sync notifier: %d\n", ret); + nouveau_channel_context_destroy(nvc); + return NULL; + } + + switch (dev->chipset & 0xf0) { + case 0x50: + case 0x80: + case 0x90: + ret = nouveau_surface_channel_create_nv50(nvc); + break; + default: + ret = nouveau_surface_channel_create_nv04(nvc); + break; + } + + if (ret) { + NOUVEAU_ERR("Error initialising surface objects: %d\n", ret); + nouveau_channel_context_destroy(nvc); + return NULL; + } + + return nvc; +} + +GLboolean +nouveau_context_create(const __GLcontextModes *glVis, + __DRIcontextPrivate *driContextPriv, + void *sharedContextPrivate) +{ + __DRIscreenPrivate *driScrnPriv = driContextPriv->driScreenPriv; + struct nouveau_screen *nv_screen = driScrnPriv->private; + struct nouveau_context *nv = CALLOC_STRUCT(nouveau_context); + struct pipe_context *pipe = NULL; + struct st_context *st_share = NULL; + struct nouveau_channel_context *nvc = NULL; + struct nouveau_device *dev = nv_screen->device; + int i; + + if (sharedContextPrivate) { + st_share = ((struct nouveau_context *)sharedContextPrivate)->st; + } + + switch (dev->chipset & 0xf0) { + case 0x10: + case 0x20: + /* NV10 */ + case 0x30: + /* NV30 */ + case 0x40: + case 0x60: + /* NV40 */ + case 0x50: + case 0x80: + case 0x90: + /* G80 */ + break; + default: + NOUVEAU_ERR("Unsupported chipset: NV%02x\n", dev->chipset); + return GL_FALSE; + } + + driContextPriv->driverPrivate = (void *)nv; + nv->nv_screen = nv_screen; + nv->dri_screen = driScrnPriv; + + { + struct nouveau_device_priv *nvdev = nouveau_device(dev); + + nvdev->ctx = driContextPriv->hHWContext; + nvdev->lock = (drmLock *)&driScrnPriv->pSAREA->lock; + } + + driParseConfigFiles(&nv->dri_option_cache, &nv_screen->option_cache, + nv->dri_screen->myNum, "nouveau"); +#ifdef DEBUG + __nouveau_debug = driParseDebugString(getenv("NOUVEAU_DEBUG"), + debug_control); +#endif + + /*XXX: Hack up a fake region and buffer object for front buffer. + * This will go away with TTM, replaced with a simple reference + * of the front buffer handle passed to us by the DDX. 
+ */ + { + struct pipe_surface *fb_surf; + struct nouveau_pipe_buffer *fb_buf; + struct nouveau_bo_priv *fb_bo; + + fb_bo = calloc(1, sizeof(struct nouveau_bo_priv)); + fb_bo->drm.offset = nv_screen->front_offset; + fb_bo->drm.flags = NOUVEAU_MEM_FB; + fb_bo->drm.size = nv_screen->front_pitch * + nv_screen->front_height; + fb_bo->refcount = 1; + fb_bo->base.flags = NOUVEAU_BO_PIN | NOUVEAU_BO_VRAM; + fb_bo->base.offset = fb_bo->drm.offset; + fb_bo->base.handle = (unsigned long)fb_bo; + fb_bo->base.size = fb_bo->drm.size; + fb_bo->base.device = nv_screen->device; + + fb_buf = calloc(1, sizeof(struct nouveau_pipe_buffer)); + fb_buf->bo = &fb_bo->base; + + fb_surf = calloc(1, sizeof(struct pipe_surface)); + if (nv_screen->front_cpp == 2) + fb_surf->format = PIPE_FORMAT_R5G6B5_UNORM; + else + fb_surf->format = PIPE_FORMAT_A8R8G8B8_UNORM; + pf_get_block(fb_surf->format, &fb_surf->block); + fb_surf->width = nv_screen->front_pitch / nv_screen->front_cpp; + fb_surf->height = nv_screen->front_height; + fb_surf->stride = fb_surf->width * fb_surf->block.size; + fb_surf->refcount = 1; + fb_surf->buffer = &fb_buf->base; + + nv->frontbuffer = fb_surf; + } + + /* Attempt to share a single channel between multiple contexts from + * a single process. + */ + nvc = nv_screen->nvc; + if (!nvc && st_share) { + struct nouveau_context *snv = st_share->pipe->priv; + if (snv) { + nvc = snv->nvc; + } + } + + /*XXX: temporary - disable multi-context/single-channel on pre-NV4x */ + switch (dev->chipset & 0xf0) { + case 0x40: + case 0x60: + /* NV40 class */ + case 0x50: + case 0x80: + case 0x90: + /* G80 class */ + break; + default: + nvc = NULL; + break; + } + + if (!nvc) { + nvc = nouveau_channel_context_create(dev); + if (!nvc) { + NOUVEAU_ERR("Failed initialising GPU context\n"); + return GL_FALSE; + } + nv_screen->nvc = nvc; + } + + nvc->refcount++; + nv->nvc = nvc; + + /* Find a free slot for a pipe context, allocate a new one if needed */ + nv->pctx_id = -1; + for (i = 0; i < nvc->nr_pctx; i++) { + if (nvc->pctx[i] == NULL) { + nv->pctx_id = i; + break; + } + } + + if (nv->pctx_id < 0) { + nv->pctx_id = nvc->nr_pctx++; + nvc->pctx = + realloc(nvc->pctx, + sizeof(struct pipe_context *) * nvc->nr_pctx); + } + + /* Create pipe */ + switch (dev->chipset & 0xf0) { + case 0x50: + case 0x80: + case 0x90: + if (nouveau_surface_init_nv50(nv)) + return GL_FALSE; + break; + default: + if (nouveau_surface_init_nv04(nv)) + return GL_FALSE; + break; + } + + if (!getenv("NOUVEAU_FORCE_SOFTPIPE")) { + struct pipe_screen *pscreen; + + pipe = nouveau_pipe_create(nv); + if (!pipe) + NOUVEAU_ERR("Couldn't create hw pipe\n"); + pscreen = nvc->pscreen; + + nv->cap.hw_vertex_buffer = + pscreen->get_param(pscreen, NOUVEAU_CAP_HW_VTXBUF); + nv->cap.hw_index_buffer = + pscreen->get_param(pscreen, NOUVEAU_CAP_HW_IDXBUF); + } + + if (!pipe) { + NOUVEAU_MSG("Using softpipe\n"); + pipe = nouveau_create_softpipe(nv); + if (!pipe) { + NOUVEAU_ERR("Error creating pipe, bailing\n"); + return GL_FALSE; + } + } + + pipe->priv = nv; + nv->st = st_create_context(pipe, glVis, st_share); + return GL_TRUE; +} + +void +nouveau_context_destroy(__DRIcontextPrivate *driContextPriv) +{ + struct nouveau_context *nv = driContextPriv->driverPrivate; + struct nouveau_channel_context *nvc = nv->nvc; + + assert(nv); + + st_finish(nv->st); + st_destroy_context(nv->st); + + if (nv->pctx_id >= 0) { + nvc->pctx[nv->pctx_id] = NULL; + if (--nvc->refcount <= 0) { + nouveau_channel_context_destroy(nvc); + nv->nv_screen->nvc = NULL; + } + } + + free(nv); +} + +GLboolean 
+nouveau_context_bind(__DRIcontextPrivate *driContextPriv, + __DRIdrawablePrivate *driDrawPriv, + __DRIdrawablePrivate *driReadPriv) +{ + struct nouveau_context *nv; + struct nouveau_framebuffer *draw, *read; + + if (!driContextPriv) { + st_make_current(NULL, NULL, NULL); + return GL_TRUE; + } + + nv = driContextPriv->driverPrivate; + draw = driDrawPriv->driverPrivate; + read = driReadPriv->driverPrivate; + + st_make_current(nv->st, draw->stfb, read->stfb); + + if ((nv->dri_drawable != driDrawPriv) || + (nv->last_stamp != driDrawPriv->lastStamp)) { + nv->dri_drawable = driDrawPriv; + st_resize_framebuffer(draw->stfb, driDrawPriv->w, + driDrawPriv->h); + nv->last_stamp = driDrawPriv->lastStamp; + } + + if (driDrawPriv != driReadPriv) { + st_resize_framebuffer(read->stfb, driReadPriv->w, + driReadPriv->h); + } + + return GL_TRUE; +} + +GLboolean +nouveau_context_unbind(__DRIcontextPrivate *driContextPriv) +{ + struct nouveau_context *nv = driContextPriv->driverPrivate; + (void)nv; + + st_flush(nv->st, 0, NULL); + return GL_TRUE; +} + diff --git a/src/gallium/winsys/drm/nouveau/nouveau_context.h b/src/gallium/winsys/drm/nouveau/nouveau_context.h new file mode 100644 index 0000000000..77e2147a2c --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_context.h @@ -0,0 +1,113 @@ +#ifndef __NOUVEAU_CONTEXT_H__ +#define __NOUVEAU_CONTEXT_H__ + +#include "dri_util.h" +#include "xmlconfig.h" + +#include "nouveau/nouveau_winsys.h" +#include "nouveau_drmif.h" +#include "nouveau_dma.h" + +struct nouveau_framebuffer { + struct st_framebuffer *stfb; +}; + +struct nouveau_channel_context { + struct pipe_screen *pscreen; + int refcount; + + unsigned cur_pctx; + unsigned nr_pctx; + struct pipe_context **pctx; + + struct nouveau_channel *channel; + + struct nouveau_notifier *sync_notifier; + + /* Common */ + struct nouveau_grobj *NvM2MF; + /* NV04-NV40 */ + struct nouveau_grobj *NvCtxSurf2D; + struct nouveau_grobj *NvSwzSurf; + struct nouveau_grobj *NvImageBlit; + struct nouveau_grobj *NvGdiRect; + struct nouveau_grobj *NvSIFM; + /* G80 */ + struct nouveau_grobj *Nv2D; + + uint32_t next_handle; + uint32_t next_subchannel; + uint32_t next_sequence; +}; + +struct nouveau_context { + struct st_context *st; + + /* DRI stuff */ + __DRIscreenPrivate *dri_screen; + __DRIdrawablePrivate *dri_drawable; + unsigned int last_stamp; + driOptionCache dri_option_cache; + drm_context_t drm_context; + drmLock drm_lock; + GLboolean locked; + struct nouveau_screen *nv_screen; + struct pipe_surface *frontbuffer; + + struct { + int hw_vertex_buffer; + int hw_index_buffer; + } cap; + + /* Hardware context */ + struct nouveau_channel_context *nvc; + int pctx_id; + + /* pipe_surface accel */ + struct pipe_surface *surf_src, *surf_dst; + unsigned surf_src_offset, surf_dst_offset; + int (*surface_copy_prep)(struct nouveau_context *, + struct pipe_surface *dst, + struct pipe_surface *src); + void (*surface_copy)(struct nouveau_context *, unsigned dx, unsigned dy, + unsigned sx, unsigned sy, unsigned w, unsigned h); + void (*surface_copy_done)(struct nouveau_context *); + int (*surface_fill)(struct nouveau_context *, struct pipe_surface *, + unsigned, unsigned, unsigned, unsigned, unsigned); +}; + +extern GLboolean nouveau_context_create(const __GLcontextModes *, + __DRIcontextPrivate *, void *); +extern void nouveau_context_destroy(__DRIcontextPrivate *); +extern GLboolean nouveau_context_bind(__DRIcontextPrivate *, + __DRIdrawablePrivate *draw, + __DRIdrawablePrivate *read); +extern GLboolean 
nouveau_context_unbind(__DRIcontextPrivate *); + +#ifdef DEBUG +extern int __nouveau_debug; + +#define DEBUG_BO (1 << 0) + +#define DBG(flag, ...) do { \ + if (__nouveau_debug & (DEBUG_##flag)) \ + NOUVEAU_ERR(__VA_ARGS__); \ +} while(0) +#else +#define DBG(flag, ...) +#endif + +extern void LOCK_HARDWARE(struct nouveau_context *); +extern void UNLOCK_HARDWARE(struct nouveau_context *); + +extern int +nouveau_surface_channel_create_nv04(struct nouveau_channel_context *); +extern int +nouveau_surface_channel_create_nv50(struct nouveau_channel_context *); +extern int nouveau_surface_init_nv04(struct nouveau_context *); +extern int nouveau_surface_init_nv50(struct nouveau_context *); + +extern uint32_t *nouveau_pipe_dma_beginp(struct nouveau_grobj *, int, int); +extern void nouveau_pipe_dma_kickoff(struct nouveau_channel *); + +#endif diff --git a/src/gallium/winsys/drm/nouveau/nouveau_device.c b/src/gallium/winsys/drm/nouveau/nouveau_device.c new file mode 100644 index 0000000000..0b452fcd02 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_device.c @@ -0,0 +1,159 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <errno.h> + +#include "nouveau_drmif.h" + +int +nouveau_device_open_existing(struct nouveau_device **dev, int close, + int fd, drm_context_t ctx) +{ + struct nouveau_device_priv *nvdev; + int ret; + + if (!dev || *dev) + return -EINVAL; + + nvdev = calloc(1, sizeof(*nvdev)); + if (!nvdev) + return -ENOMEM; + nvdev->fd = fd; + nvdev->ctx = ctx; + nvdev->needs_close = close; + + drmCommandNone(nvdev->fd, DRM_NOUVEAU_CARD_INIT); + + if ((ret = nouveau_bo_init(&nvdev->base))) { + nouveau_device_close((void *)&nvdev); + return ret; + } + + { + uint64_t value; + + ret = nouveau_device_get_param(&nvdev->base, + NOUVEAU_GETPARAM_CHIPSET_ID, + &value); + if (ret) { + nouveau_device_close((void *)&nvdev); + return ret; + } + nvdev->base.chipset = value; + } + + *dev = &nvdev->base; + return 0; +} + +int +nouveau_device_open(struct nouveau_device **dev, const char *busid) +{ + drm_context_t ctx; + int fd, ret; + + if (!dev || *dev) + return -EINVAL; + + fd = drmOpen("nouveau", busid); + if (fd < 0) + return -EINVAL; + + ret = drmCreateContext(fd, &ctx); + if (ret) { + drmClose(fd); + return ret; + } + + ret = nouveau_device_open_existing(dev, 1, fd, ctx); + if (ret) { + drmDestroyContext(fd, ctx); + drmClose(fd); + return ret; + } + + return 0; +} + +void +nouveau_device_close(struct nouveau_device **dev) +{ + struct nouveau_device_priv *nvdev; + + if (dev || !*dev) + return; + nvdev = nouveau_device(*dev); + *dev = NULL; + + nouveau_bo_takedown(&nvdev->base); + + if (nvdev->needs_close) { + drmDestroyContext(nvdev->fd, nvdev->ctx); + drmClose(nvdev->fd); + } + free(nvdev); +} + +int +nouveau_device_get_param(struct nouveau_device *dev, + uint64_t param, uint64_t *value) +{ + struct nouveau_device_priv *nvdev = nouveau_device(dev); + struct drm_nouveau_getparam g; + int ret; + + if (!nvdev || !value) + return -EINVAL; + + g.param = param; + ret = drmCommandWriteRead(nvdev->fd, DRM_NOUVEAU_GETPARAM, + &g, sizeof(g)); + if (ret) + return ret; + + *value = g.value; + return 0; +} + +int +nouveau_device_set_param(struct nouveau_device *dev, + uint64_t param, uint64_t value) +{ + struct nouveau_device_priv *nvdev = nouveau_device(dev); + struct drm_nouveau_setparam s; + int ret; + + if (!nvdev) + return -EINVAL; + + s.param = param; + s.value = value; + ret = drmCommandWriteRead(nvdev->fd, DRM_NOUVEAU_SETPARAM, + &s, sizeof(s)); + if (ret) + return ret; + + return 0; +} + diff --git a/src/gallium/winsys/drm/nouveau/nouveau_dma.c b/src/gallium/winsys/drm/nouveau/nouveau_dma.c new file mode 100644 index 0000000000..f8a8ba04f6 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_dma.c @@ -0,0 +1,219 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdint.h> +#include <assert.h> +#include <errno.h> + +#include "nouveau_drmif.h" +#include "nouveau_dma.h" +#include "nouveau_local.h" + +static inline uint32_t +READ_GET(struct nouveau_channel_priv *nvchan) +{ + return *nvchan->get; +} + +static inline void +WRITE_PUT(struct nouveau_channel_priv *nvchan, uint32_t val) +{ + uint32_t put = ((val << 2) + nvchan->dma->base); + volatile int dum; + + NOUVEAU_DMA_BARRIER; + dum = READ_GET(nvchan); + + *nvchan->put = put; + nvchan->dma->put = val; +#ifdef NOUVEAU_DMA_TRACE + NOUVEAU_MSG("WRITE_PUT %d/0x%08x\n", nvchan->drm.channel, put); +#endif + + NOUVEAU_DMA_BARRIER; +} + +static inline int +LOCAL_GET(struct nouveau_dma_priv *dma, uint32_t *val) +{ + uint32_t get = *val; + + if (get >= dma->base && get <= (dma->base + (dma->max << 2))) { + *val = (get - dma->base) >> 2; + return 1; + } + + return 0; +} + +void +nouveau_dma_channel_init(struct nouveau_channel *chan) +{ + struct nouveau_channel_priv *nvchan = nouveau_channel(chan); + int i; + + nvchan->dma = &nvchan->dma_master; + nvchan->dma->base = nvchan->drm.put_base; + nvchan->dma->cur = nvchan->dma->put = 0; + nvchan->dma->max = (nvchan->drm.cmdbuf_size >> 2) - 2; + nvchan->dma->free = nvchan->dma->max - nvchan->dma->cur; + + RING_SPACE_CH(chan, RING_SKIPS); + for (i = 0; i < RING_SKIPS; i++) + OUT_RING_CH(chan, 0); +} + +#define CHECK_TIMEOUT() do { \ + if ((NOUVEAU_TIME_MSEC() - t_start) > NOUVEAU_DMA_TIMEOUT) \ + return - EBUSY; \ +} while(0) + +int +nouveau_dma_wait(struct nouveau_channel *chan, int size) +{ + struct nouveau_channel_priv *nvchan = nouveau_channel(chan); + struct nouveau_dma_priv *dma = nvchan->dma; + uint32_t get, t_start; + + FIRE_RING_CH(chan); + + t_start = NOUVEAU_TIME_MSEC(); + while (dma->free < size) { + CHECK_TIMEOUT(); + + get = READ_GET(nvchan); + if (!LOCAL_GET(dma, &get)) + continue; + + if (dma->put >= get) { + dma->free = dma->max - dma->cur; + + if (dma->free < size) { +#ifdef NOUVEAU_DMA_DEBUG + dma->push_free = 1; +#endif + OUT_RING_CH(chan, 0x20000000 | dma->base); + if (get <= RING_SKIPS) { + /*corner case - will be idle*/ + if (dma->put <= RING_SKIPS) + WRITE_PUT(nvchan, + RING_SKIPS + 1); + + do { + CHECK_TIMEOUT(); + get = READ_GET(nvchan); + if (!LOCAL_GET(dma, &get)) + get = 0; + } while (get <= RING_SKIPS); + } + + WRITE_PUT(nvchan, RING_SKIPS); + dma->cur = dma->put = RING_SKIPS; + dma->free = get - (RING_SKIPS + 1); + } + } else { + dma->free = get - dma->cur - 1; + } + } + + return 0; +} + +#ifdef NOUVEAU_DMA_DUMP_POSTRELOC_PUSHBUF +static void +nouveau_dma_parse_pushbuf(struct nouveau_channel *chan, int get, int put) +{ + struct nouveau_channel_priv *nvchan = nouveau_channel(chan); + unsigned mthd_count = 0; + + while (get != put) { + uint32_t gpuget = (get << 2) + nvchan->drm.put_base; + uint32_t data; + + if (get < 0 || get >= nvchan->drm.cmdbuf_size) { + NOUVEAU_ERR("DMA_PT 0x%08x\n", gpuget); + assert(0); + } + data = nvchan->pushbuf[get++]; + + if (mthd_count) { + NOUVEAU_MSG("0x%08x 0x%08x\n", gpuget, data); + mthd_count--; + continue; + } + + switch (data & 0x60000000) { + case 0x00000000: + mthd_count = (data >> 18) & 0x7ff; + NOUVEAU_MSG("0x%08x 0x%08x MTHD " + "Sc %d Mthd 0x%04x Size %d\n", + gpuget, data, (data>>13) & 7, data & 0x1ffc, + mthd_count); + break; + 
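+		/* 0x20000000 in bits 30:29 is a jump command: the pusher
+		 * continues at the pushbuf offset held in bits 28:2. */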
case 0x20000000: + get = (data & 0x1ffffffc) >> 2; + NOUVEAU_MSG("0x%08x 0x%08x JUMP 0x%08x\n", + gpuget, data, data & 0x1ffffffc); + continue; + case 0x40000000: + mthd_count = (data >> 18) & 0x7ff; + NOUVEAU_MSG("0x%08x 0x%08x NINC " + "Sc %d Mthd 0x%04x Size %d\n", + gpuget, data, (data>>13) & 7, data & 0x1ffc, + mthd_count); + break; + case 0x60000000: + /* DMA_OPCODE_CALL apparently, doesn't seem to work on + * my NV40 at least.. + */ + /* fall-through */ + default: + NOUVEAU_MSG("DMA_PUSHER 0x%08x 0x%08x\n", + gpuget, data); + assert(0); + } + } +} +#endif + +void +nouveau_dma_kickoff(struct nouveau_channel *chan) +{ + struct nouveau_channel_priv *nvchan = nouveau_channel(chan); + struct nouveau_dma_priv *dma = nvchan->dma; + + if (dma->cur == dma->put) + return; + +#ifdef NOUVEAU_DMA_DEBUG + if (dma->push_free) { + NOUVEAU_ERR("Packet incomplete: %d left\n", dma->push_free); + return; + } +#endif + +#ifdef NOUVEAU_DMA_DUMP_POSTRELOC_PUSHBUF + nouveau_dma_parse_pushbuf(chan, dma->put, dma->cur); +#endif + + WRITE_PUT(nvchan, dma->cur); +} diff --git a/src/gallium/winsys/drm/nouveau/nouveau_dma.h b/src/gallium/winsys/drm/nouveau/nouveau_dma.h new file mode 100644 index 0000000000..cfa6d26e82 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_dma.h @@ -0,0 +1,143 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __NOUVEAU_DMA_H__ +#define __NOUVEAU_DMA_H__ + +#include <string.h> +#include "nouveau_drmif.h" +#include "nouveau_local.h" + +#define RING_SKIPS 8 + +extern int nouveau_dma_wait(struct nouveau_channel *chan, int size); +extern void nouveau_dma_subc_bind(struct nouveau_grobj *); +extern void nouveau_dma_channel_init(struct nouveau_channel *); +extern void nouveau_dma_kickoff(struct nouveau_channel *); + +#ifdef NOUVEAU_DMA_DEBUG +static char faulty[1024]; +#endif + +static inline void +nouveau_dma_out(struct nouveau_channel *chan, uint32_t data) +{ + struct nouveau_channel_priv *nvchan = nouveau_channel(chan); + struct nouveau_dma_priv *dma = nvchan->dma; + +#ifdef NOUVEAU_DMA_DEBUG + if (dma->push_free == 0) { + NOUVEAU_ERR("No space left in packet at %s\n", faulty); + return; + } + dma->push_free--; +#endif +#ifdef NOUVEAU_DMA_TRACE + { + uint32_t offset = (dma->cur << 2) + dma->base; + NOUVEAU_MSG("\tOUT_RING %d/0x%08x -> 0x%08x\n", + nvchan->drm.channel, offset, data); + } +#endif + nvchan->pushbuf[dma->cur + (dma->base - nvchan->drm.put_base)/4] = data; + dma->cur++; +} + +static inline void +nouveau_dma_outp(struct nouveau_channel *chan, uint32_t *ptr, int size) +{ + struct nouveau_channel_priv *nvchan = nouveau_channel(chan); + struct nouveau_dma_priv *dma = nvchan->dma; + (void)dma; + +#ifdef NOUVEAU_DMA_DEBUG + if (dma->push_free < size) { + NOUVEAU_ERR("Packet too small. Free=%d, Need=%d\n", + dma->push_free, size); + return; + } +#endif +#ifdef NOUVEAU_DMA_TRACE + while (size--) { + nouveau_dma_out(chan, *ptr); + ptr++; + } +#else + memcpy(&nvchan->pushbuf[dma->cur], ptr, size << 2); +#ifdef NOUVEAU_DMA_DEBUG + dma->push_free -= size; +#endif + dma->cur += size; +#endif +} + +static inline void +nouveau_dma_space(struct nouveau_channel *chan, int size) +{ + struct nouveau_channel_priv *nvchan = nouveau_channel(chan); + struct nouveau_dma_priv *dma = nvchan->dma; + + if (dma->free < size) { + if (nouveau_dma_wait(chan, size) && chan->hang_notify) + chan->hang_notify(chan); + } + dma->free -= size; +#ifdef NOUVEAU_DMA_DEBUG + dma->push_free = size; +#endif +} + +static inline void +nouveau_dma_begin(struct nouveau_channel *chan, struct nouveau_grobj *grobj, + int method, int size, const char* file, int line) +{ + struct nouveau_channel_priv *nvchan = nouveau_channel(chan); + struct nouveau_dma_priv *dma = nvchan->dma; + (void)dma; + +#ifdef NOUVEAU_DMA_TRACE + NOUVEAU_MSG("BEGIN_RING %d/%08x/%d/0x%04x/%d\n", nvchan->drm.channel, + grobj->handle, grobj->subc, method, size); +#endif + +#ifdef NOUVEAU_DMA_DEBUG + if (dma->push_free) { + NOUVEAU_ERR("Previous packet incomplete: %d left at %s\n", + dma->push_free, faulty); + return; + } + sprintf(faulty,"%s:%d",file,line); +#endif + + nouveau_dma_space(chan, (size + 1)); + nouveau_dma_out(chan, (size << 18) | (grobj->subc << 13) | method); +} + +#define RING_SPACE_CH(ch,sz) nouveau_dma_space((ch), (sz)) +#define BEGIN_RING_CH(ch,gr,m,sz) nouveau_dma_begin((ch), (gr), (m), (sz), __FUNCTION__, __LINE__ ) +#define OUT_RING_CH(ch, data) nouveau_dma_out((ch), (data)) +#define OUT_RINGp_CH(ch,ptr,dwords) nouveau_dma_outp((ch), (void*)(ptr), \ + (dwords)) +#define FIRE_RING_CH(ch) nouveau_dma_kickoff((ch)) +#define WAIT_RING_CH(ch,sz) nouveau_dma_wait((ch), (sz)) + +#endif diff --git a/src/gallium/winsys/drm/nouveau/nouveau_dri.h b/src/gallium/winsys/drm/nouveau/nouveau_dri.h new file mode 100644 index 0000000000..1207c2d609 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_dri.h @@ -0,0 +1,28 @@ +#ifndef 
_NOUVEAU_DRI_ +#define _NOUVEAU_DRI_ + +#include "xf86drm.h" +#include "drm.h" +#include "nouveau_drm.h" + +struct nouveau_dri { + uint32_t device_id; /**< \brief PCI device ID */ + uint32_t width; /**< \brief width in pixels of display */ + uint32_t height; /**< \brief height in scanlines of display */ + uint32_t depth; /**< \brief depth of display (8, 15, 16, 24) */ + uint32_t bpp; /**< \brief bit depth of display (8, 16, 24, 32) */ + + uint32_t bus_type; /**< \brief ths bus type */ + uint32_t bus_mode; /**< \brief bus mode (used for AGP, maybe also for PCI-E ?) */ + + uint32_t front_offset; /**< \brief front buffer offset */ + uint32_t front_pitch; /**< \brief front buffer pitch */ + uint32_t back_offset; /**< \brief private back buffer offset */ + uint32_t back_pitch; /**< \brief private back buffer pitch */ + uint32_t depth_offset; /**< \brief private depth buffer offset */ + uint32_t depth_pitch; /**< \brief private depth buffer pitch */ + +}; + +#endif + diff --git a/src/gallium/winsys/drm/nouveau/nouveau_drmif.h b/src/gallium/winsys/drm/nouveau/nouveau_drmif.h new file mode 100644 index 0000000000..dcd6a5eb0a --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_drmif.h @@ -0,0 +1,310 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __NOUVEAU_DRMIF_H__ +#define __NOUVEAU_DRMIF_H__ + +#include <stdint.h> +#include <xf86drm.h> +#include <nouveau_drm.h> + +#include "nouveau/nouveau_device.h" +#include "nouveau/nouveau_channel.h" +#include "nouveau/nouveau_grobj.h" +#include "nouveau/nouveau_notifier.h" +#include "nouveau/nouveau_bo.h" +#include "nouveau/nouveau_resource.h" +#include "nouveau/nouveau_pushbuf.h" + +struct nouveau_device_priv { + struct nouveau_device base; + + int fd; + drm_context_t ctx; + drmLock *lock; + int needs_close; + + struct drm_nouveau_mem_alloc sa; + void *sa_map; + struct nouveau_resource *sa_heap; +}; +#define nouveau_device(n) ((struct nouveau_device_priv *)(n)) + +extern int +nouveau_device_open_existing(struct nouveau_device **, int close, + int fd, drm_context_t ctx); + +extern int +nouveau_device_open(struct nouveau_device **, const char *busid); + +extern void +nouveau_device_close(struct nouveau_device **); + +extern int +nouveau_device_get_param(struct nouveau_device *, uint64_t param, uint64_t *v); + +extern int +nouveau_device_set_param(struct nouveau_device *, uint64_t param, uint64_t val); + +struct nouveau_fence { + struct nouveau_channel *channel; +}; + +struct nouveau_fence_cb { + struct nouveau_fence_cb *next; + void (*func)(void *); + void *priv; +}; + +struct nouveau_fence_priv { + struct nouveau_fence base; + int refcount; + + struct nouveau_fence *next; + struct nouveau_fence_cb *signal_cb; + + uint32_t sequence; + int emitted; + int signalled; +}; +#define nouveau_fence(n) ((struct nouveau_fence_priv *)(n)) + +extern int +nouveau_fence_new(struct nouveau_channel *, struct nouveau_fence **); + +extern int +nouveau_fence_ref(struct nouveau_fence *, struct nouveau_fence **); + +extern int +nouveau_fence_signal_cb(struct nouveau_fence *, void (*)(void *), void *); + +extern void +nouveau_fence_emit(struct nouveau_fence *); + +extern int +nouveau_fence_wait(struct nouveau_fence **); + +extern void +nouveau_fence_flush(struct nouveau_channel *); + +struct nouveau_pushbuf_reloc { + struct nouveau_pushbuf_bo *pbbo; + uint32_t *ptr; + uint32_t flags; + uint32_t data; + uint32_t vor; + uint32_t tor; +}; + +struct nouveau_pushbuf_bo { + struct nouveau_channel *channel; + struct nouveau_bo *bo; + unsigned flags; + unsigned handled; +}; + +#define NOUVEAU_PUSHBUF_MAX_BUFFERS 1024 +#define NOUVEAU_PUSHBUF_MAX_RELOCS 1024 +struct nouveau_pushbuf_priv { + struct nouveau_pushbuf base; + + struct nouveau_fence *fence; + + unsigned nop_jump; + unsigned start; + unsigned size; + + struct nouveau_pushbuf_bo *buffers; + unsigned nr_buffers; + struct nouveau_pushbuf_reloc *relocs; + unsigned nr_relocs; +}; +#define nouveau_pushbuf(n) ((struct nouveau_pushbuf_priv *)(n)) + +#define pbbo_to_ptr(o) ((uint64_t)(unsigned long)(o)) +#define ptr_to_pbbo(h) ((struct nouveau_pushbuf_bo *)(unsigned long)(h)) +#define pbrel_to_ptr(o) ((uint64_t)(unsigned long)(o)) +#define ptr_to_pbrel(h) ((struct nouveau_pushbuf_reloc *)(unsigned long)(h)) +#define bo_to_ptr(o) ((uint64_t)(unsigned long)(o)) +#define ptr_to_bo(h) ((struct nouveau_bo_priv *)(unsigned long)(h)) + +extern int +nouveau_pushbuf_init(struct nouveau_channel *); + +extern int +nouveau_pushbuf_flush(struct nouveau_channel *, unsigned min); + +extern int +nouveau_pushbuf_emit_reloc(struct nouveau_channel *, void *ptr, + struct nouveau_bo *, uint32_t data, uint32_t flags, + uint32_t vor, uint32_t tor); + +struct nouveau_dma_priv { + uint32_t base; + uint32_t max; + uint32_t cur; + uint32_t put; + uint32_t free; + + int push_free; +} 
dma; + +struct nouveau_channel_priv { + struct nouveau_channel base; + + struct drm_nouveau_channel_alloc drm; + + uint32_t *pushbuf; + void *notifier_block; + + volatile uint32_t *user; + volatile uint32_t *put; + volatile uint32_t *get; + volatile uint32_t *ref_cnt; + + struct nouveau_dma_priv dma_master; + struct nouveau_dma_priv dma_bufmgr; + struct nouveau_dma_priv *dma; + + struct nouveau_fence *fence_head; + struct nouveau_fence *fence_tail; + uint32_t fence_sequence; + + struct nouveau_pushbuf_priv pb; + + unsigned user_charge; +}; +#define nouveau_channel(n) ((struct nouveau_channel_priv *)(n)) + +extern int +nouveau_channel_alloc(struct nouveau_device *, uint32_t fb, uint32_t tt, + struct nouveau_channel **); + +extern void +nouveau_channel_free(struct nouveau_channel **); + +struct nouveau_grobj_priv { + struct nouveau_grobj base; +}; +#define nouveau_grobj(n) ((struct nouveau_grobj_priv *)(n)) + +extern int nouveau_grobj_alloc(struct nouveau_channel *, uint32_t handle, + int class, struct nouveau_grobj **); +extern int nouveau_grobj_ref(struct nouveau_channel *, uint32_t handle, + struct nouveau_grobj **); +extern void nouveau_grobj_free(struct nouveau_grobj **); + + +struct nouveau_notifier_priv { + struct nouveau_notifier base; + + struct drm_nouveau_notifierobj_alloc drm; + volatile void *map; +}; +#define nouveau_notifier(n) ((struct nouveau_notifier_priv *)(n)) + +extern int +nouveau_notifier_alloc(struct nouveau_channel *, uint32_t handle, int count, + struct nouveau_notifier **); + +extern void +nouveau_notifier_free(struct nouveau_notifier **); + +extern void +nouveau_notifier_reset(struct nouveau_notifier *, int id); + +extern uint32_t +nouveau_notifier_status(struct nouveau_notifier *, int id); + +extern uint32_t +nouveau_notifier_return_val(struct nouveau_notifier *, int id); + +extern int +nouveau_notifier_wait_status(struct nouveau_notifier *, int id, int status, + int timeout); + +struct nouveau_bo_priv { + struct nouveau_bo base; + + struct nouveau_pushbuf_bo *pending; + struct nouveau_fence *fence; + struct nouveau_fence *wr_fence; + + struct drm_nouveau_mem_alloc drm; + void *map; + + void *sysmem; + int user; + + int refcount; + + uint64_t offset; + uint64_t flags; + int tiled; +}; +#define nouveau_bo(n) ((struct nouveau_bo_priv *)(n)) + +extern int +nouveau_bo_init(struct nouveau_device *); + +extern void +nouveau_bo_takedown(struct nouveau_device *); + +extern int +nouveau_bo_new(struct nouveau_device *, uint32_t flags, int align, int size, + struct nouveau_bo **); + +extern int +nouveau_bo_user(struct nouveau_device *, void *ptr, int size, + struct nouveau_bo **); + +extern int +nouveau_bo_ref(struct nouveau_device *, uint64_t handle, struct nouveau_bo **); + +extern int +nouveau_bo_set_status(struct nouveau_bo *, uint32_t flags); + +extern void +nouveau_bo_del(struct nouveau_bo **); + +extern int +nouveau_bo_map(struct nouveau_bo *, uint32_t flags); + +extern void +nouveau_bo_unmap(struct nouveau_bo *); + +extern int +nouveau_bo_validate(struct nouveau_channel *, struct nouveau_bo *, + uint32_t flags); + +extern int +nouveau_resource_init(struct nouveau_resource **heap, unsigned start, + unsigned size); + +extern int +nouveau_resource_alloc(struct nouveau_resource *heap, int size, void *priv, + struct nouveau_resource **); + +extern void +nouveau_resource_free(struct nouveau_resource **); + +#endif diff --git a/src/gallium/winsys/drm/nouveau/nouveau_fence.c b/src/gallium/winsys/drm/nouveau/nouveau_fence.c new file mode 100644 index 0000000000..e7b0b4ff07 
--- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_fence.c @@ -0,0 +1,214 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdlib.h> +#include <errno.h> +#include <assert.h> + +#include "nouveau_drmif.h" +#include "nouveau_dma.h" +#include "nouveau_local.h" + +static void +nouveau_fence_del_unsignalled(struct nouveau_fence *fence) +{ + struct nouveau_channel_priv *nvchan = nouveau_channel(fence->channel); + struct nouveau_fence *le; + + if (nvchan->fence_head == fence) { + nvchan->fence_head = nouveau_fence(fence)->next; + if (nvchan->fence_head == NULL) + nvchan->fence_tail = NULL; + return; + } + + le = nvchan->fence_head; + while (le && nouveau_fence(le)->next != fence) + le = nouveau_fence(le)->next; + assert(le && nouveau_fence(le)->next == fence); + nouveau_fence(le)->next = nouveau_fence(fence)->next; + if (nvchan->fence_tail == fence) + nvchan->fence_tail = le; +} + +static void +nouveau_fence_del(struct nouveau_fence **fence) +{ + struct nouveau_fence_priv *nvfence; + + if (!fence || !*fence) + return; + nvfence = nouveau_fence(*fence); + *fence = NULL; + + if (--nvfence->refcount) + return; + + if (nvfence->emitted && !nvfence->signalled) { + if (nvfence->signal_cb) { + nvfence->refcount++; + nouveau_fence_wait((void *)&nvfence); + return; + } + + nouveau_fence_del_unsignalled(&nvfence->base); + } + free(nvfence); +} + +int +nouveau_fence_new(struct nouveau_channel *chan, struct nouveau_fence **fence) +{ + struct nouveau_fence_priv *nvfence; + + if (!chan || !fence || *fence) + return -EINVAL; + + nvfence = calloc(1, sizeof(struct nouveau_fence_priv)); + if (!nvfence) + return -ENOMEM; + nvfence->base.channel = chan; + nvfence->refcount = 1; + + *fence = &nvfence->base; + return 0; +} + +int +nouveau_fence_ref(struct nouveau_fence *ref, struct nouveau_fence **fence) +{ + struct nouveau_fence_priv *nvfence; + + if (!fence) + return -EINVAL; + + if (*fence) { + nouveau_fence_del(fence); + *fence = NULL; + } + + if (ref) { + nvfence = nouveau_fence(ref); + nvfence->refcount++; + *fence = &nvfence->base; + } + + return 0; +} + +int +nouveau_fence_signal_cb(struct nouveau_fence *fence, void (*func)(void *), + void *priv) +{ + struct nouveau_fence_priv *nvfence = nouveau_fence(fence); + struct nouveau_fence_cb *cb; + + if (!nvfence || !func) + return -EINVAL; + + cb = malloc(sizeof(struct nouveau_fence_cb)); + if (!cb) + return -ENOMEM; + + cb->func = func; + cb->priv = priv; + cb->next = nvfence->signal_cb; + 
nvfence->signal_cb = cb; + return 0; +} + +void +nouveau_fence_emit(struct nouveau_fence *fence) +{ + struct nouveau_channel_priv *nvchan = nouveau_channel(fence->channel); + struct nouveau_fence_priv *nvfence = nouveau_fence(fence); + + nvfence->emitted = 1; + nvfence->sequence = ++nvchan->fence_sequence; + if (nvfence->sequence == 0xffffffff) + NOUVEAU_ERR("AII wrap unhandled\n"); + + /*XXX: assumes subc 0 is populated */ + RING_SPACE_CH(fence->channel, 2); + OUT_RING_CH (fence->channel, 0x00040050); + OUT_RING_CH (fence->channel, nvfence->sequence); + + if (nvchan->fence_tail) { + nouveau_fence(nvchan->fence_tail)->next = fence; + } else { + nvchan->fence_head = fence; + } + nvchan->fence_tail = fence; +} + +void +nouveau_fence_flush(struct nouveau_channel *chan) +{ + struct nouveau_channel_priv *nvchan = nouveau_channel(chan); + uint32_t sequence = *nvchan->ref_cnt; + + while (nvchan->fence_head) { + struct nouveau_fence_priv *nvfence; + + nvfence = nouveau_fence(nvchan->fence_head); + if (nvfence->sequence > sequence) + break; + nouveau_fence_del_unsignalled(&nvfence->base); + nvfence->signalled = 1; + + if (nvfence->signal_cb) { + struct nouveau_fence *fence = NULL; + + nouveau_fence_ref(&nvfence->base, &fence); + + while (nvfence->signal_cb) { + struct nouveau_fence_cb *cb; + + cb = nvfence->signal_cb; + nvfence->signal_cb = cb->next; + cb->func(cb->priv); + free(cb); + } + + nouveau_fence_ref(NULL, &fence); + } + } +} + +int +nouveau_fence_wait(struct nouveau_fence **fence) +{ + struct nouveau_fence_priv *nvfence; + + if (!fence || !*fence) + return -EINVAL; + nvfence = nouveau_fence(*fence); + + if (nvfence->emitted) { + while (!nvfence->signalled) + nouveau_fence_flush(nvfence->base.channel); + } + nouveau_fence_ref(NULL, fence); + + return 0; +} + diff --git a/src/gallium/winsys/drm/nouveau/nouveau_grobj.c b/src/gallium/winsys/drm/nouveau/nouveau_grobj.c new file mode 100644 index 0000000000..51523897d5 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_grobj.c @@ -0,0 +1,107 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <stdlib.h> +#include <errno.h> + +#include "nouveau_drmif.h" + +int +nouveau_grobj_alloc(struct nouveau_channel *chan, uint32_t handle, + int class, struct nouveau_grobj **grobj) +{ + struct nouveau_device_priv *nvdev = nouveau_device(chan->device); + struct nouveau_grobj_priv *nvgrobj; + struct drm_nouveau_grobj_alloc g; + int ret; + + if (!nvdev || !grobj || *grobj) + return -EINVAL; + + nvgrobj = calloc(1, sizeof(*nvgrobj)); + if (!nvgrobj) + return -ENOMEM; + nvgrobj->base.channel = chan; + nvgrobj->base.handle = handle; + nvgrobj->base.grclass = class; + + g.channel = chan->id; + g.handle = handle; + g.class = class; + ret = drmCommandWrite(nvdev->fd, DRM_NOUVEAU_GROBJ_ALLOC, + &g, sizeof(g)); + if (ret) { + nouveau_grobj_free((void *)&nvgrobj); + return ret; + } + + *grobj = &nvgrobj->base; + return 0; +} + +int +nouveau_grobj_ref(struct nouveau_channel *chan, uint32_t handle, + struct nouveau_grobj **grobj) +{ + struct nouveau_grobj_priv *nvgrobj; + + if (!chan || !grobj || *grobj) + return -EINVAL; + + nvgrobj = calloc(1, sizeof(struct nouveau_grobj_priv)); + if (!nvgrobj) + return -ENOMEM; + nvgrobj->base.channel = chan; + nvgrobj->base.handle = handle; + nvgrobj->base.grclass = 0; + + *grobj = &nvgrobj->base; + return 0; +} + +void +nouveau_grobj_free(struct nouveau_grobj **grobj) +{ + struct nouveau_device_priv *nvdev; + struct nouveau_channel_priv *chan; + struct nouveau_grobj_priv *nvgrobj; + + if (!grobj || !*grobj) + return; + nvgrobj = nouveau_grobj(*grobj); + *grobj = NULL; + + + chan = nouveau_channel(nvgrobj->base.channel); + nvdev = nouveau_device(chan->base.device); + + if (nvgrobj->base.grclass) { + struct drm_nouveau_gpuobj_free f; + + f.channel = chan->drm.channel; + f.handle = nvgrobj->base.handle; + drmCommandWrite(nvdev->fd, DRM_NOUVEAU_GPUOBJ_FREE, + &f, sizeof(f)); + } + free(nvgrobj); +} + diff --git a/src/gallium/winsys/drm/nouveau/nouveau_local.h b/src/gallium/winsys/drm/nouveau/nouveau_local.h new file mode 100644 index 0000000000..e878a40803 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_local.h @@ -0,0 +1,117 @@ +#ifndef __NOUVEAU_LOCAL_H__ +#define __NOUVEAU_LOCAL_H__ + +#include "pipe/p_compiler.h" +#include "nouveau_winsys_pipe.h" +#include <stdio.h> + +struct pipe_buffer; + +/* Debug output */ +#define NOUVEAU_MSG(fmt, args...) do { \ + fprintf(stdout, "nouveau: "fmt, ##args); \ + fflush(stdout); \ +} while(0) + +#define NOUVEAU_ERR(fmt, args...) 
do { \ + fprintf(stderr, "%s:%d - "fmt, __func__, __LINE__, ##args); \ + fflush(stderr); \ +} while(0) + +#define NOUVEAU_TIME_MSEC() 0 + +/* User FIFO control */ +//#define NOUVEAU_DMA_TRACE +//#define NOUVEAU_DMA_DEBUG +//#define NOUVEAU_DMA_DUMP_POSTRELOC_PUSHBUF +#define NOUVEAU_DMA_BARRIER +#define NOUVEAU_DMA_TIMEOUT 2000 + +/* Push buffer access macros */ +static INLINE void +OUT_RING(struct nouveau_channel *chan, unsigned data) +{ + *(chan->pushbuf->cur++) = (data); +} + +static INLINE void +OUT_RINGp(struct nouveau_channel *chan, uint32_t *data, unsigned size) +{ + memcpy(chan->pushbuf->cur, data, size * 4); + chan->pushbuf->cur += size; +} + +static INLINE void +OUT_RINGf(struct nouveau_channel *chan, float f) +{ + union { uint32_t i; float f; } c; + c.f = f; + OUT_RING(chan, c.i); +} + +static INLINE void +BEGIN_RING(struct nouveau_channel *chan, struct nouveau_grobj *gr, + unsigned mthd, unsigned size) +{ + if (chan->pushbuf->remaining < (size + 1)) + nouveau_pushbuf_flush(chan, (size + 1)); + OUT_RING(chan, (gr->subc << 13) | (size << 18) | mthd); + chan->pushbuf->remaining -= (size + 1); +} + +static INLINE void +FIRE_RING(struct nouveau_channel *chan) +{ + nouveau_pushbuf_flush(chan, 0); +} + +static INLINE void +BIND_RING(struct nouveau_channel *chan, struct nouveau_grobj *gr, unsigned subc) +{ + gr->subc = subc; + BEGIN_RING(chan, gr, 0x0000, 1); + OUT_RING (chan, gr->handle); +} + +static INLINE void +OUT_RELOC(struct nouveau_channel *chan, struct nouveau_bo *bo, + unsigned data, unsigned flags, unsigned vor, unsigned tor) +{ + nouveau_pushbuf_emit_reloc(chan, chan->pushbuf->cur++, bo, + data, flags, vor, tor); +} + +/* Raw data + flags depending on FB/TT buffer */ +static INLINE void +OUT_RELOCd(struct nouveau_channel *chan, struct nouveau_bo *bo, + unsigned data, unsigned flags, unsigned vor, unsigned tor) +{ + OUT_RELOC(chan, bo, data, flags | NOUVEAU_BO_OR, vor, tor); +} + +/* FB/TT object handle */ +static INLINE void +OUT_RELOCo(struct nouveau_channel *chan, struct nouveau_bo *bo, + unsigned flags) +{ + OUT_RELOC(chan, bo, 0, flags | NOUVEAU_BO_OR, + chan->vram->handle, chan->gart->handle); +} + +/* Low 32-bits of offset */ +static INLINE void +OUT_RELOCl(struct nouveau_channel *chan, struct nouveau_bo *bo, + unsigned delta, unsigned flags) +{ + OUT_RELOC(chan, bo, delta, flags | NOUVEAU_BO_LOW, 0, 0); +} + +/* High 32-bits of offset */ +static INLINE void +OUT_RELOCh(struct nouveau_channel *chan, struct nouveau_bo *bo, + unsigned delta, unsigned flags) +{ + OUT_RELOC(chan, bo, delta, flags | NOUVEAU_BO_HIGH, 0, 0); +} + +#endif diff --git a/src/gallium/winsys/drm/nouveau/nouveau_lock.c b/src/gallium/winsys/drm/nouveau/nouveau_lock.c new file mode 100644 index 0000000000..9adb9ac854 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_lock.c @@ -0,0 +1,94 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "main/glheader.h" +#include "glapi/glthread.h" +#include <GL/internal/glcore.h> + +#include "nouveau_context.h" +#include "nouveau_screen.h" + +_glthread_DECLARE_STATIC_MUTEX( lockMutex ); + +static void +nouveau_contended_lock(struct nouveau_context *nv, GLuint flags) +{ + __DRIdrawablePrivate *dPriv = nv->dri_drawable; + __DRIscreenPrivate *sPriv = nv->dri_screen; + struct nouveau_screen *nv_screen = nv->nv_screen; + struct nouveau_device *dev = nv_screen->device; + struct nouveau_device_priv *nvdev = nouveau_device(dev); + + drmGetLock(nvdev->fd, nvdev->ctx, flags); + + /* If the window moved, may need to set a new cliprect now. + * + * NOTE: This releases and regains the hw lock, so all state + * checking must be done *after* this call: + */ + if (dPriv) + DRI_VALIDATE_DRAWABLE_INFO(sPriv, dPriv); +} + +/* Lock the hardware and validate our state. 
+ */ +void +LOCK_HARDWARE(struct nouveau_context *nv) +{ + struct nouveau_screen *nv_screen = nv->nv_screen; + struct nouveau_device *dev = nv_screen->device; + struct nouveau_device_priv *nvdev = nouveau_device(dev); + char __ret=0; + + _glthread_LOCK_MUTEX(lockMutex); + assert(!nv->locked); + + DRM_CAS(nvdev->lock, nvdev->ctx, + (DRM_LOCK_HELD | nvdev->ctx), __ret); + + if (__ret) + nouveau_contended_lock(nv, 0); + nv->locked = GL_TRUE; +} + + + /* Unlock the hardware using the global current context + */ +void +UNLOCK_HARDWARE(struct nouveau_context *nv) +{ + struct nouveau_screen *nv_screen = nv->nv_screen; + struct nouveau_device *dev = nv_screen->device; + struct nouveau_device_priv *nvdev = nouveau_device(dev); + + assert(nv->locked); + nv->locked = GL_FALSE; + + DRM_UNLOCK(nvdev->fd, nvdev->lock, nvdev->ctx); + + _glthread_UNLOCK_MUTEX(lockMutex); +} diff --git a/src/gallium/winsys/drm/nouveau/nouveau_notifier.c b/src/gallium/winsys/drm/nouveau/nouveau_notifier.c new file mode 100644 index 0000000000..01e8f38440 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_notifier.c @@ -0,0 +1,137 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <stdlib.h> +#include <errno.h> + +#include "nouveau_drmif.h" +#include "nouveau_local.h" + +#define NOTIFIER(__v) \ + struct nouveau_notifier_priv *nvnotify = nouveau_notifier(notifier); \ + volatile uint32_t *__v = (void*)nvnotify->map + (id * 32) + +int +nouveau_notifier_alloc(struct nouveau_channel *chan, uint32_t handle, + int count, struct nouveau_notifier **notifier) +{ + struct nouveau_notifier_priv *nvnotify; + int ret; + + if (!chan || !notifier || *notifier) + return -EINVAL; + + nvnotify = calloc(1, sizeof(struct nouveau_notifier_priv)); + if (!nvnotify) + return -ENOMEM; + nvnotify->base.channel = chan; + nvnotify->base.handle = handle; + + nvnotify->drm.channel = chan->id; + nvnotify->drm.handle = handle; + nvnotify->drm.count = count; + if ((ret = drmCommandWriteRead(nouveau_device(chan->device)->fd, + DRM_NOUVEAU_NOTIFIEROBJ_ALLOC, + &nvnotify->drm, + sizeof(nvnotify->drm)))) { + nouveau_notifier_free((void *)&nvnotify); + return ret; + } + + nvnotify->map = (void *)nouveau_channel(chan)->notifier_block + + nvnotify->drm.offset; + *notifier = &nvnotify->base; + return 0; +} + +void +nouveau_notifier_free(struct nouveau_notifier **notifier) +{ + + struct nouveau_notifier_priv *nvnotify; + struct nouveau_channel_priv *nvchan; + struct nouveau_device_priv *nvdev; + struct drm_nouveau_gpuobj_free f; + + if (!notifier || !*notifier) + return; + nvnotify = nouveau_notifier(*notifier); + *notifier = NULL; + + nvchan = nouveau_channel(nvnotify->base.channel); + nvdev = nouveau_device(nvchan->base.device); + + f.channel = nvchan->drm.channel; + f.handle = nvnotify->base.handle; + drmCommandWrite(nvdev->fd, DRM_NOUVEAU_GPUOBJ_FREE, &f, sizeof(f)); + free(nvnotify); +} + +void +nouveau_notifier_reset(struct nouveau_notifier *notifier, int id) +{ + NOTIFIER(n); + + n[NV_NOTIFY_TIME_0 /4] = 0x00000000; + n[NV_NOTIFY_TIME_1 /4] = 0x00000000; + n[NV_NOTIFY_RETURN_VALUE/4] = 0x00000000; + n[NV_NOTIFY_STATE /4] = (NV_NOTIFY_STATE_STATUS_IN_PROCESS << + NV_NOTIFY_STATE_STATUS_SHIFT); +} + +uint32_t +nouveau_notifier_status(struct nouveau_notifier *notifier, int id) +{ + NOTIFIER(n); + + return n[NV_NOTIFY_STATE/4] >> NV_NOTIFY_STATE_STATUS_SHIFT; +} + +uint32_t +nouveau_notifier_return_val(struct nouveau_notifier *notifier, int id) +{ + NOTIFIER(n); + + return n[NV_NOTIFY_RETURN_VALUE/4]; +} + +int +nouveau_notifier_wait_status(struct nouveau_notifier *notifier, int id, + int status, int timeout) +{ + NOTIFIER(n); + uint32_t time = 0, t_start = NOUVEAU_TIME_MSEC(); + + while (time <= timeout) { + uint32_t v; + + v = n[NV_NOTIFY_STATE/4] >> NV_NOTIFY_STATE_STATUS_SHIFT; + if (v == status) + return 0; + + if (timeout) + time = NOUVEAU_TIME_MSEC() - t_start; + } + + return -EBUSY; +} + diff --git a/src/gallium/winsys/drm/nouveau/nouveau_pushbuf.c b/src/gallium/winsys/drm/nouveau/nouveau_pushbuf.c new file mode 100644 index 0000000000..815046ba85 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_pushbuf.c @@ -0,0 +1,271 @@ +/* + * Copyright 2007 Nouveau Project + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission 
notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdlib.h> +#include <errno.h> +#include <assert.h> + +#include "nouveau_drmif.h" +#include "nouveau_dma.h" + +#define PB_BUFMGR_DWORDS (4096 / 2) +#define PB_MIN_USER_DWORDS 2048 + +static int +nouveau_pushbuf_space(struct nouveau_channel *chan, unsigned min) +{ + struct nouveau_channel_priv *nvchan = nouveau_channel(chan); + struct nouveau_pushbuf_priv *nvpb = &nvchan->pb; + + assert((min + 1) <= nvchan->dma->max); + + /* Wait for enough space in push buffer */ + min = min < PB_MIN_USER_DWORDS ? PB_MIN_USER_DWORDS : min; + min += 1; /* a bit extra for the NOP */ + if (nvchan->dma->free < min) + WAIT_RING_CH(chan, min); + + /* Insert NOP, may turn into a jump later */ + RING_SPACE_CH(chan, 1); + nvpb->nop_jump = nvchan->dma->cur; + OUT_RING_CH(chan, 0); + + /* Any remaining space is available to the user */ + nvpb->start = nvchan->dma->cur; + nvpb->size = nvchan->dma->free; + nvpb->base.channel = chan; + nvpb->base.remaining = nvpb->size; + nvpb->base.cur = &nvchan->pushbuf[nvpb->start]; + + /* Create a new fence object for this "frame" */ + nouveau_fence_ref(NULL, &nvpb->fence); + nouveau_fence_new(chan, &nvpb->fence); + + return 0; +} + +int +nouveau_pushbuf_init(struct nouveau_channel *chan) +{ + struct nouveau_channel_priv *nvchan = nouveau_channel(chan); + struct nouveau_dma_priv *m = &nvchan->dma_master; + struct nouveau_dma_priv *b = &nvchan->dma_bufmgr; + int i; + + if (!nvchan) + return -EINVAL; + + /* Reassign last bit of push buffer for a "separate" bufmgr + * ring buffer + */ + m->max -= PB_BUFMGR_DWORDS; + m->free -= PB_BUFMGR_DWORDS; + + b->base = m->base + ((m->max + 2) << 2); + b->max = PB_BUFMGR_DWORDS - 2; + b->cur = b->put = 0; + b->free = b->max - b->cur; + + /* Some NOPs just to be safe + *XXX: RING_SKIPS + */ + nvchan->dma = b; + RING_SPACE_CH(chan, 8); + for (i = 0; i < 8; i++) + OUT_RING_CH(chan, 0); + nvchan->dma = m; + + nouveau_pushbuf_space(chan, 0); + chan->pushbuf = &nvchan->pb.base; + + nvchan->pb.buffers = calloc(NOUVEAU_PUSHBUF_MAX_BUFFERS, + sizeof(struct nouveau_pushbuf_bo)); + nvchan->pb.relocs = calloc(NOUVEAU_PUSHBUF_MAX_RELOCS, + sizeof(struct nouveau_pushbuf_reloc)); + return 0; +} + +static uint32_t +nouveau_pushbuf_calc_reloc(struct nouveau_bo *bo, + struct nouveau_pushbuf_reloc *r) +{ + uint32_t push; + + if (r->flags & NOUVEAU_BO_LOW) { + push = bo->offset + r->data; + } else + if (r->flags & NOUVEAU_BO_HIGH) { + push = (bo->offset + r->data) >> 32; + } else { + push = r->data; + } + + if (r->flags & NOUVEAU_BO_OR) { + if (bo->flags & NOUVEAU_BO_VRAM) + push |= r->vor; + else + push |= r->tor; + } + + return push; +} + +/* This would be our TTM "superioctl" */ +int +nouveau_pushbuf_flush(struct nouveau_channel *chan, unsigned min) +{ + struct nouveau_channel_priv *nvchan = nouveau_channel(chan); + struct nouveau_pushbuf_priv *nvpb = &nvchan->pb; + int ret, i; + + if (nvpb->base.remaining == nvpb->size) + return 0; + + nouveau_fence_flush(chan); + + nvpb->size -= nvpb->base.remaining; 
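/* A sketch of what the remainder of nouveau_pushbuf_flush() is doing:
 * account for the words the user actually wrote, switch to the private
 * bufmgr ring, turn the NOP reserved in front of the user segment into a
 * jump into that ring, validate every referenced BO and patch its
 * relocations in place, jump back to the user segment, then emit the
 * fence and kick the hardware before opening a fresh segment.
 */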
+ nvchan->dma->cur += nvpb->size; + nvchan->dma->free -= nvpb->size; + assert(nvchan->dma->cur <= nvchan->dma->max); + + nvchan->dma = &nvchan->dma_bufmgr; + nvchan->pushbuf[nvpb->nop_jump] = 0x20000000 | + (nvchan->dma->base + (nvchan->dma->cur << 2)); + + /* Validate buffers + apply relocations */ + nvchan->user_charge = 0; + for (i = 0; i < nvpb->nr_relocs; i++) { + struct nouveau_pushbuf_reloc *r = &nvpb->relocs[i]; + struct nouveau_pushbuf_bo *pbbo = r->pbbo; + struct nouveau_bo *bo = pbbo->bo; + + /* Validated, mem matches presumed, no relocation necessary */ + if (pbbo->handled & 2) { + if (!(pbbo->handled & 1)) + assert(0); + continue; + } + + /* Not yet validated, do it now */ + if (!(pbbo->handled & 1)) { + ret = nouveau_bo_validate(chan, bo, pbbo->flags); + if (ret) { + assert(0); + return ret; + } + pbbo->handled |= 1; + + if (bo->offset == nouveau_bo(bo)->offset && + bo->flags == nouveau_bo(bo)->flags) { + pbbo->handled |= 2; + continue; + } + bo->offset = nouveau_bo(bo)->offset; + bo->flags = nouveau_bo(bo)->flags; + } + + /* Apply the relocation */ + *r->ptr = nouveau_pushbuf_calc_reloc(bo, r); + } + nvpb->nr_relocs = 0; + + /* Dereference all buffers on validate list */ + for (i = 0; i < nvpb->nr_buffers; i++) { + struct nouveau_pushbuf_bo *pbbo = &nvpb->buffers[i]; + + nouveau_bo(pbbo->bo)->pending = NULL; + nouveau_bo_del(&pbbo->bo); + } + nvpb->nr_buffers = 0; + + /* Switch back to user's ring */ + RING_SPACE_CH(chan, 1); + OUT_RING_CH(chan, 0x20000000 | ((nvpb->start << 2) + + nvchan->dma_master.base)); + nvchan->dma = &nvchan->dma_master; + + /* Fence + kickoff */ + nouveau_fence_emit(nvpb->fence); + FIRE_RING_CH(chan); + + /* Allocate space for next push buffer */ + ret = nouveau_pushbuf_space(chan, min); + assert(!ret); + + return 0; +} + +static struct nouveau_pushbuf_bo * +nouveau_pushbuf_emit_buffer(struct nouveau_channel *chan, struct nouveau_bo *bo) +{ + struct nouveau_pushbuf_priv *nvpb = nouveau_pushbuf(chan->pushbuf); + struct nouveau_bo_priv *nvbo = nouveau_bo(bo); + struct nouveau_pushbuf_bo *pbbo; + + if (nvbo->pending) + return nvbo->pending; + + if (nvpb->nr_buffers >= NOUVEAU_PUSHBUF_MAX_BUFFERS) + return NULL; + pbbo = nvpb->buffers + nvpb->nr_buffers++; + nvbo->pending = pbbo; + + nouveau_bo_ref(bo->device, bo->handle, &pbbo->bo); + pbbo->channel = chan; + pbbo->flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_GART; + pbbo->handled = 0; + return pbbo; +} + +int +nouveau_pushbuf_emit_reloc(struct nouveau_channel *chan, void *ptr, + struct nouveau_bo *bo, uint32_t data, uint32_t flags, + uint32_t vor, uint32_t tor) +{ + struct nouveau_pushbuf_priv *nvpb = nouveau_pushbuf(chan->pushbuf); + struct nouveau_pushbuf_bo *pbbo; + struct nouveau_pushbuf_reloc *r; + + if (nvpb->nr_relocs >= NOUVEAU_PUSHBUF_MAX_RELOCS) + return -ENOMEM; + + pbbo = nouveau_pushbuf_emit_buffer(chan, bo); + if (!pbbo) + return -ENOMEM; + pbbo->flags |= (flags & NOUVEAU_BO_RDWR); + pbbo->flags &= (flags | NOUVEAU_BO_RDWR); + + r = nvpb->relocs + nvpb->nr_relocs++; + r->pbbo = pbbo; + r->ptr = ptr; + r->flags = flags; + r->data = data; + r->vor = vor; + r->tor = tor; + + if (flags & NOUVEAU_BO_DUMMY) + *(uint32_t *)ptr = 0; + else + *(uint32_t *)ptr = nouveau_pushbuf_calc_reloc(bo, r); + return 0; +} + diff --git a/src/gallium/winsys/drm/nouveau/nouveau_resource.c b/src/gallium/winsys/drm/nouveau/nouveau_resource.c new file mode 100644 index 0000000000..3bbcb5c45e --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_resource.c @@ -0,0 +1,116 @@ +/* + * Copyright 2007 Nouveau Project + * + * 
Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdlib.h> +#include <errno.h> + +#include "nouveau_drmif.h" +#include "nouveau_local.h" + +int +nouveau_resource_init(struct nouveau_resource **heap, + unsigned start, unsigned size) +{ + struct nouveau_resource *r; + + r = calloc(1, sizeof(struct nouveau_resource)); + if (!r) + return 1; + + r->start = start; + r->size = size; + *heap = r; + return 0; +} + +int +nouveau_resource_alloc(struct nouveau_resource *heap, int size, void *priv, + struct nouveau_resource **res) +{ + struct nouveau_resource *r; + + if (!heap || !size || !res || *res) + return 1; + + while (heap) { + if (!heap->in_use && heap->size >= size) { + r = calloc(1, sizeof(struct nouveau_resource)); + if (!r) + return 1; + + r->start = (heap->start + heap->size) - size; + r->size = size; + r->in_use = 1; + r->priv = priv; + + heap->size -= size; + + r->next = heap->next; + if (heap->next) + heap->next->prev = r; + r->prev = heap; + heap->next = r; + + *res = r; + return 0; + } + + heap = heap->next; + } + + return 1; +} + +void +nouveau_resource_free(struct nouveau_resource **res) +{ + struct nouveau_resource *r; + + if (!res || !*res) + return; + r = *res; + *res = NULL; + + r->in_use = 0; + + if (r->next && !r->next->in_use) { + struct nouveau_resource *new = r->next; + + new->prev = r->prev; + if (r->prev) + r->prev->next = new; + new->size += r->size; + new->start = r->start; + + free(r); + r = new; + } + + if (r->prev && !r->prev->in_use) { + r->prev->next = r->next; + if (r->next) + r->next->prev = r->prev; + r->prev->size += r->size; + free(r); + } + +} diff --git a/src/gallium/winsys/drm/nouveau/nouveau_screen.c b/src/gallium/winsys/drm/nouveau/nouveau_screen.c new file mode 100644 index 0000000000..c6d0c53588 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_screen.c @@ -0,0 +1,265 @@ +#include "utils.h" +#include "vblank.h" +#include "xmlpool.h" + +#include "pipe/p_context.h" +#include "state_tracker/st_public.h" +#include "state_tracker/st_cb_fbo.h" + +#include "nouveau_context.h" +#include "nouveau_drm.h" +#include "nouveau_dri.h" +#include "nouveau_local.h" +#include "nouveau_screen.h" +#include "nouveau_swapbuffers.h" + +#if NOUVEAU_DRM_HEADER_PATCHLEVEL != 11 +#error nouveau_drm.h version does not match expected version +#endif + +/* Extension stuff, enabling of extensions handled by Gallium's GL state + * tracker. But, we still need to define the entry points we want. 
+ */ +#define need_GL_ARB_fragment_program +#define need_GL_ARB_multisample +#define need_GL_ARB_occlusion_query +#define need_GL_ARB_point_parameters +#define need_GL_ARB_shader_objects +#define need_GL_ARB_texture_compression +#define need_GL_ARB_vertex_program +#define need_GL_ARB_vertex_shader +#define need_GL_ARB_vertex_buffer_object +#define need_GL_EXT_compiled_vertex_array +#define need_GL_EXT_fog_coord +#define need_GL_EXT_secondary_color +#define need_GL_EXT_framebuffer_object +#define need_GL_VERSION_2_0 +#define need_GL_VERSION_2_1 +#include "extension_helper.h" + +const struct dri_extension card_extensions[] = +{ + { "GL_ARB_multisample", GL_ARB_multisample_functions }, + { "GL_ARB_occlusion_query", GL_ARB_occlusion_query_functions }, + { "GL_ARB_point_parameters", GL_ARB_point_parameters_functions }, + { "GL_ARB_shader_objects", GL_ARB_shader_objects_functions }, + { "GL_ARB_shading_language_100", GL_VERSION_2_0_functions }, + { "GL_ARB_shading_language_120", GL_VERSION_2_1_functions }, + { "GL_ARB_texture_compression", GL_ARB_texture_compression_functions }, + { "GL_ARB_vertex_program", GL_ARB_vertex_program_functions }, + { "GL_ARB_vertex_shader", GL_ARB_vertex_shader_functions }, + { "GL_ARB_vertex_buffer_object", GL_ARB_vertex_buffer_object_functions }, + { "GL_EXT_compiled_vertex_array", GL_EXT_compiled_vertex_array_functions }, + { "GL_EXT_fog_coord", GL_EXT_fog_coord_functions }, + { "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions }, + { "GL_EXT_secondary_color", GL_EXT_secondary_color_functions }, + { NULL, 0 } +}; + +PUBLIC const char __driConfigOptions[] = +DRI_CONF_BEGIN +DRI_CONF_END; +static const GLuint __driNConfigOptions = 0; + +extern const struct dri_extension common_extensions[]; +extern const struct dri_extension nv40_extensions[]; + +static GLboolean +nouveau_create_buffer(__DRIscreenPrivate * driScrnPriv, + __DRIdrawablePrivate * driDrawPriv, + const __GLcontextModes *glVis, GLboolean pixmapBuffer) +{ + struct nouveau_framebuffer *nvfb; + enum pipe_format colour, depth, stencil; + + if (pixmapBuffer) + return GL_FALSE; + + nvfb = CALLOC_STRUCT(nouveau_framebuffer); + if (!nvfb) + return GL_FALSE; + + if (glVis->redBits == 5) + colour = PIPE_FORMAT_R5G6B5_UNORM; + else + colour = PIPE_FORMAT_A8R8G8B8_UNORM; + + if (glVis->depthBits == 16) + depth = PIPE_FORMAT_Z16_UNORM; + else if (glVis->depthBits == 24) + depth = PIPE_FORMAT_Z24S8_UNORM; + else + depth = PIPE_FORMAT_NONE; + + if (glVis->stencilBits == 8) + stencil = PIPE_FORMAT_Z24S8_UNORM; + else + stencil = PIPE_FORMAT_NONE; + + nvfb->stfb = st_create_framebuffer(glVis, colour, depth, stencil, + driDrawPriv->w, driDrawPriv->h, + (void*)nvfb); + if (!nvfb->stfb) { + free(nvfb); + return GL_FALSE; + } + + driDrawPriv->driverPrivate = (void *)nvfb; + return GL_TRUE; +} + +static void +nouveau_destroy_buffer(__DRIdrawablePrivate * driDrawPriv) +{ + struct nouveau_framebuffer *nvfb; + + nvfb = (struct nouveau_framebuffer *)driDrawPriv->driverPrivate; + st_unreference_framebuffer(nvfb->stfb); + free(nvfb); +} + +static __DRIconfig ** +nouveau_fill_in_modes(__DRIscreenPrivate *psp, + unsigned pixel_bits, unsigned depth_bits, + unsigned stencil_bits, GLboolean have_back_buffer) +{ + __DRIconfig **configs; + unsigned depth_buffer_factor; + unsigned back_buffer_factor; + GLenum fb_format; + GLenum fb_type; + + static const GLenum back_buffer_modes[] = { + GLX_NONE, GLX_SWAP_UNDEFINED_OML, + }; + + uint8_t depth_bits_array[3]; + uint8_t stencil_bits_array[3]; + uint8_t msaa_samples_array[1]; + + 
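/* Assumed reading of the driCreateConfigs() call below: the *_factor
 * values are how many entries of the depth/stencil and back-buffer
 * arrays it combines per visual, and the single zero entry in
 * msaa_samples_array means no multisample visuals are advertised.
 */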
depth_bits_array[0] = 0; + depth_bits_array[1] = depth_bits; + depth_bits_array[2] = depth_bits; + + /* Just like with the accumulation buffer, always provide some modes + * with a stencil buffer. It will be a sw fallback, but some apps won't + * care about that. + */ + stencil_bits_array[0] = 0; + stencil_bits_array[1] = 0; + if (depth_bits == 24) + stencil_bits_array[1] = (stencil_bits == 0) ? 8 : stencil_bits; + stencil_bits_array[2] = (stencil_bits == 0) ? 8 : stencil_bits; + + msaa_samples_array[0] = 0; + + depth_buffer_factor = + ((depth_bits != 0) || (stencil_bits != 0)) ? 3 : 1; + back_buffer_factor = (have_back_buffer) ? 3 : 1; + + if (pixel_bits == 16) { + fb_format = GL_RGB; + fb_type = GL_UNSIGNED_SHORT_5_6_5; + } + else { + fb_format = GL_BGRA; + fb_type = GL_UNSIGNED_INT_8_8_8_8_REV; + } + + configs = driCreateConfigs(fb_format, fb_type, + depth_bits_array, stencil_bits_array, + depth_buffer_factor, back_buffer_modes, + back_buffer_factor, msaa_samples_array, 1); + if (configs == NULL) { + fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", + __func__, __LINE__); + return NULL; + } + + return configs; +} + +static const __DRIconfig ** +nouveau_screen_create(__DRIscreenPrivate *psp) +{ + struct nouveau_dri *nv_dri = psp->pDevPriv; + struct nouveau_screen *nv_screen; + static const __DRIversion ddx_expected = + { 0, 0, NOUVEAU_DRM_HEADER_PATCHLEVEL }; + static const __DRIversion dri_expected = { 4, 0, 0 }; + static const __DRIversion drm_expected = + { 0, 0, NOUVEAU_DRM_HEADER_PATCHLEVEL }; + int ret; + + if (!driCheckDriDdxDrmVersions2("nouveau", + &psp->dri_version, &dri_expected, + &psp->ddx_version, &ddx_expected, + &psp->drm_version, &drm_expected)) { + return NULL; + } + + if (drm_expected.patch != psp->drm_version.patch) { + fprintf(stderr, "Incompatible DRM patch level.\n" + "Expected: %d\n" "Current : %d\n", + drm_expected.patch, psp->drm_version.patch); + return NULL; + } + + driInitExtensions(NULL, card_extensions, GL_FALSE); + + if (psp->devPrivSize != sizeof(struct nouveau_dri)) { + NOUVEAU_ERR("DRI struct mismatch between DDX/DRI\n"); + return GL_FALSE; + } + + nv_screen = CALLOC_STRUCT(nouveau_screen); + if (!nv_screen) + return GL_FALSE; + nv_screen->driScrnPriv = psp; + psp->private = (void *)nv_screen; + + driParseOptionInfo(&nv_screen->option_cache, + __driConfigOptions, __driNConfigOptions); + + if ((ret = nouveau_device_open_existing(&nv_screen->device, 0, + psp->fd, 0))) { + NOUVEAU_ERR("Failed opening nouveau device: %d\n", ret); + return GL_FALSE; + } + + nv_screen->front_offset = nv_dri->front_offset; + nv_screen->front_pitch = nv_dri->front_pitch * (nv_dri->bpp / 8); + nv_screen->front_cpp = nv_dri->bpp / 8; + nv_screen->front_height = nv_dri->height; + + return (const __DRIconfig **) + nouveau_fill_in_modes(psp, nv_dri->bpp, + (nv_dri->bpp == 16) ? 16 : 24, + (nv_dri->bpp == 16) ? 
0 : 8, 1); +} + +static void +nouveau_screen_destroy(__DRIscreenPrivate *driScrnPriv) +{ + struct nouveau_screen *nv_screen = driScrnPriv->private; + + driScrnPriv->private = NULL; + FREE(nv_screen); +} + +const struct __DriverAPIRec +driDriverAPI = { + .InitScreen = nouveau_screen_create, + .DestroyScreen = nouveau_screen_destroy, + .CreateContext = nouveau_context_create, + .DestroyContext = nouveau_context_destroy, + .CreateBuffer = nouveau_create_buffer, + .DestroyBuffer = nouveau_destroy_buffer, + .SwapBuffers = nouveau_swap_buffers, + .MakeCurrent = nouveau_context_bind, + .UnbindContext = nouveau_context_unbind, + .CopySubBuffer = nouveau_copy_sub_buffer, + + .InitScreen2 = NULL, /* one day, I promise! */ +}; + diff --git a/src/gallium/winsys/drm/nouveau/nouveau_screen.h b/src/gallium/winsys/drm/nouveau/nouveau_screen.h new file mode 100644 index 0000000000..388d6be9bb --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_screen.h @@ -0,0 +1,20 @@ +#ifndef __NOUVEAU_SCREEN_H__ +#define __NOUVEAU_SCREEN_H__ + +#include "xmlconfig.h" + +struct nouveau_screen { + __DRIscreenPrivate *driScrnPriv; + driOptionCache option_cache; + + struct nouveau_device *device; + + uint32_t front_offset; + uint32_t front_pitch; + uint32_t front_cpp; + uint32_t front_height; + + void *nvc; +}; + +#endif diff --git a/src/gallium/winsys/drm/nouveau/nouveau_swapbuffers.c b/src/gallium/winsys/drm/nouveau/nouveau_swapbuffers.c new file mode 100644 index 0000000000..70e0104e83 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_swapbuffers.c @@ -0,0 +1,86 @@ +#include "main/glheader.h" +#include "glapi/glthread.h" +#include <GL/internal/glcore.h> + +#include "pipe/p_context.h" +#include "state_tracker/st_public.h" +#include "state_tracker/st_context.h" +#include "state_tracker/st_cb_fbo.h" + +#include "nouveau_context.h" +#include "nouveau_local.h" +#include "nouveau_screen.h" +#include "nouveau_swapbuffers.h" + +void +nouveau_copy_buffer(__DRIdrawablePrivate *dPriv, struct pipe_surface *surf, + const drm_clip_rect_t *rect) +{ + struct nouveau_context *nv = dPriv->driContextPriv->driverPrivate; + drm_clip_rect_t *pbox; + int nbox, i; + + LOCK_HARDWARE(nv); + if (!dPriv->numClipRects) { + UNLOCK_HARDWARE(nv); + return; + } + pbox = dPriv->pClipRects; + nbox = dPriv->numClipRects; + + nv->surface_copy_prep(nv, nv->frontbuffer, surf); + for (i = 0; i < nbox; i++, pbox++) { + int sx, sy, dx, dy, w, h; + + sx = pbox->x1 - dPriv->x; + sy = pbox->y1 - dPriv->y; + dx = pbox->x1; + dy = pbox->y1; + w = pbox->x2 - pbox->x1; + h = pbox->y2 - pbox->y1; + + nv->surface_copy(nv, dx, dy, sx, sy, w, h); + } + + FIRE_RING(nv->nvc->channel); + UNLOCK_HARDWARE(nv); + + if (nv->last_stamp != dPriv->lastStamp) { + struct nouveau_framebuffer *nvfb = dPriv->driverPrivate; + st_resize_framebuffer(nvfb->stfb, dPriv->w, dPriv->h); + nv->last_stamp = dPriv->lastStamp; + } +} + +void +nouveau_copy_sub_buffer(__DRIdrawablePrivate *dPriv, int x, int y, int w, int h) +{ + struct nouveau_framebuffer *nvfb = dPriv->driverPrivate; + struct pipe_surface *surf; + + surf = st_get_framebuffer_surface(nvfb->stfb, ST_SURFACE_BACK_LEFT); + if (surf) { + drm_clip_rect_t rect; + rect.x1 = x; + rect.y1 = y; + rect.x2 = x + w; + rect.y2 = y + h; + + st_notify_swapbuffers(nvfb->stfb); + nouveau_copy_buffer(dPriv, surf, &rect); + } +} + +void +nouveau_swap_buffers(__DRIdrawablePrivate *dPriv) +{ + struct nouveau_framebuffer *nvfb = dPriv->driverPrivate; + struct pipe_surface *surf; + + surf = st_get_framebuffer_surface(nvfb->stfb, 
ST_SURFACE_BACK_LEFT); + if (surf) { + st_notify_swapbuffers(nvfb->stfb); + nouveau_copy_buffer(dPriv, surf, NULL); + } +} + diff --git a/src/gallium/winsys/drm/nouveau/nouveau_swapbuffers.h b/src/gallium/winsys/drm/nouveau/nouveau_swapbuffers.h new file mode 100644 index 0000000000..825d3da6da --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_swapbuffers.h @@ -0,0 +1,10 @@ +#ifndef __NOUVEAU_SWAPBUFFERS_H__ +#define __NOUVEAU_SWAPBUFFERS_H__ + +extern void nouveau_copy_buffer(__DRIdrawablePrivate *, struct pipe_surface *, + const drm_clip_rect_t *); +extern void nouveau_copy_sub_buffer(__DRIdrawablePrivate *, + int x, int y, int w, int h); +extern void nouveau_swap_buffers(__DRIdrawablePrivate *); + +#endif diff --git a/src/gallium/winsys/drm/nouveau/nouveau_winsys.c b/src/gallium/winsys/drm/nouveau/nouveau_winsys.c new file mode 100644 index 0000000000..364340e1d3 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_winsys.c @@ -0,0 +1,161 @@ +#include "util/u_memory.h" + +#include "nouveau_context.h" +#include "nouveau_screen.h" +#include "nouveau_winsys_pipe.h" + +#include "nouveau/nouveau_winsys.h" + +static int +nouveau_pipe_notifier_alloc(struct nouveau_winsys *nvws, int count, + struct nouveau_notifier **notify) +{ + struct nouveau_context *nv = nvws->nv; + + return nouveau_notifier_alloc(nv->nvc->channel, nv->nvc->next_handle++, + count, notify); +} + +static int +nouveau_pipe_grobj_alloc(struct nouveau_winsys *nvws, int grclass, + struct nouveau_grobj **grobj) +{ + struct nouveau_context *nv = nvws->nv; + struct nouveau_channel *chan = nv->nvc->channel; + int ret; + + ret = nouveau_grobj_alloc(chan, nv->nvc->next_handle++, + grclass, grobj); + if (ret) + return ret; + + assert(nv->nvc->next_subchannel < 7); + BIND_RING(chan, *grobj, nv->nvc->next_subchannel++); + return 0; +} + +static int +nouveau_pipe_surface_copy(struct nouveau_winsys *nvws, struct pipe_surface *dst, + unsigned dx, unsigned dy, struct pipe_surface *src, + unsigned sx, unsigned sy, unsigned w, unsigned h) +{ + struct nouveau_context *nv = nvws->nv; + + if (nv->surface_copy_prep(nv, dst, src)) + return 1; + nv->surface_copy(nv, dx, dy, sx, sy, w, h); + nv->surface_copy_done(nv); + + return 0; +} + +static int +nouveau_pipe_surface_fill(struct nouveau_winsys *nvws, struct pipe_surface *dst, + unsigned dx, unsigned dy, unsigned w, unsigned h, + unsigned value) +{ + if (nvws->nv->surface_fill(nvws->nv, dst, dx, dy, w, h, value)) + return 1; + return 0; +} + +static int +nouveau_pipe_push_reloc(struct nouveau_winsys *nvws, void *ptr, + struct pipe_buffer *buf, uint32_t data, + uint32_t flags, uint32_t vor, uint32_t tor) +{ + return nouveau_pushbuf_emit_reloc(nvws->channel, ptr, + nouveau_buffer(buf)->bo, + data, flags, vor, tor); +} + +static int +nouveau_pipe_push_flush(struct nouveau_winsys *nvws, unsigned size, + struct pipe_fence_handle **fence) +{ + if (fence) { + struct nouveau_pushbuf *pb = nvws->channel->pushbuf; + struct nouveau_pushbuf_priv *nvpb = nouveau_pushbuf(pb); + struct nouveau_fence *ref = NULL; + + nouveau_fence_ref(nvpb->fence, &ref); + *fence = (struct pipe_fence_handle *)ref; + } + + return nouveau_pushbuf_flush(nvws->channel, size); +} + +struct pipe_context * +nouveau_pipe_create(struct nouveau_context *nv) +{ + struct nouveau_channel_context *nvc = nv->nvc; + struct nouveau_winsys *nvws = CALLOC_STRUCT(nouveau_winsys); + struct pipe_screen *(*hws_create)(struct pipe_winsys *, + struct nouveau_winsys *); + struct pipe_context *(*hw_create)(struct pipe_screen *, unsigned); + 
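/* The switch on the chipset family below selects which hardware pipe
 * driver supplies these two constructors (nv10 through nv50); an
 * unrecognised family fails context creation rather than falling back
 * to the softpipe path provided elsewhere in this winsys.
 */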
struct pipe_winsys *ws; + unsigned chipset = nv->nv_screen->device->chipset; + + if (!nvws) + return NULL; + + switch (chipset & 0xf0) { + case 0x10: + hws_create = nv10_screen_create; + hw_create = nv10_create; + break; + case 0x20: + hws_create = nv20_screen_create; + hw_create = nv20_create; + break; + case 0x30: + hws_create = nv30_screen_create; + hw_create = nv30_create; + break; + case 0x40: + case 0x60: + hws_create = nv40_screen_create; + hw_create = nv40_create; + break; + case 0x50: + case 0x80: + case 0x90: + hws_create = nv50_screen_create; + hw_create = nv50_create; + break; + default: + NOUVEAU_ERR("Unknown chipset NV%02x\n", chipset); + return NULL; + } + + nvws->nv = nv; + nvws->channel = nv->nvc->channel; + + nvws->res_init = nouveau_resource_init; + nvws->res_alloc = nouveau_resource_alloc; + nvws->res_free = nouveau_resource_free; + + nvws->push_reloc = nouveau_pipe_push_reloc; + nvws->push_flush = nouveau_pipe_push_flush; + + nvws->grobj_alloc = nouveau_pipe_grobj_alloc; + nvws->grobj_free = nouveau_grobj_free; + + nvws->notifier_alloc = nouveau_pipe_notifier_alloc; + nvws->notifier_free = nouveau_notifier_free; + nvws->notifier_reset = nouveau_notifier_reset; + nvws->notifier_status = nouveau_notifier_status; + nvws->notifier_retval = nouveau_notifier_return_val; + nvws->notifier_wait = nouveau_notifier_wait_status; + + nvws->surface_copy = nouveau_pipe_surface_copy; + nvws->surface_fill = nouveau_pipe_surface_fill; + + ws = nouveau_create_pipe_winsys(nv); + + if (!nvc->pscreen) + nvc->pscreen = hws_create(ws, nvws); + nvc->pctx[nv->pctx_id] = hw_create(nvc->pscreen, nv->pctx_id); + return nvc->pctx[nv->pctx_id]; +} + diff --git a/src/gallium/winsys/drm/nouveau/nouveau_winsys_pipe.c b/src/gallium/winsys/drm/nouveau/nouveau_winsys_pipe.c new file mode 100644 index 0000000000..5276806de6 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_winsys_pipe.c @@ -0,0 +1,206 @@ +#include "pipe/p_winsys.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" + +#include "util/u_memory.h" + +#include "nouveau_context.h" +#include "nouveau_local.h" +#include "nouveau_screen.h" +#include "nouveau_swapbuffers.h" +#include "nouveau_winsys_pipe.h" + +static void +nouveau_flush_frontbuffer(struct pipe_winsys *pws, struct pipe_surface *surf, + void *context_private) +{ + struct nouveau_context *nv = context_private; + __DRIdrawablePrivate *dPriv = nv->dri_drawable; + + nouveau_copy_buffer(dPriv, surf, NULL); +} + +static const char * +nouveau_get_name(struct pipe_winsys *pws) +{ + return "Nouveau/DRI"; +} + +static struct pipe_buffer * +nouveau_pipe_bo_create(struct pipe_winsys *pws, unsigned alignment, + unsigned usage, unsigned size) +{ + struct nouveau_pipe_winsys *nvpws = (struct nouveau_pipe_winsys *)pws; + struct nouveau_context *nv = nvpws->nv; + struct nouveau_device *dev = nv->nv_screen->device; + struct nouveau_pipe_buffer *nvbuf; + uint32_t flags; + + nvbuf = calloc(1, sizeof(*nvbuf)); + if (!nvbuf) + return NULL; + nvbuf->base.refcount = 1; + nvbuf->base.alignment = alignment; + nvbuf->base.usage = usage; + nvbuf->base.size = size; + + flags = NOUVEAU_BO_LOCAL; + + if (usage & PIPE_BUFFER_USAGE_PIXEL) { + if (usage & NOUVEAU_BUFFER_USAGE_TEXTURE) + flags |= NOUVEAU_BO_GART; + flags |= NOUVEAU_BO_VRAM; + + switch (dev->chipset & 0xf0) { + case 0x50: + case 0x80: + case 0x90: + flags |= NOUVEAU_BO_TILED; + if (usage & NOUVEAU_BUFFER_USAGE_ZETA) + flags |= NOUVEAU_BO_ZTILE; + break; + default: + break; + } + } + + if (usage & PIPE_BUFFER_USAGE_VERTEX) { + if 
(nv->cap.hw_vertex_buffer) + flags |= NOUVEAU_BO_GART; + } + + if (usage & PIPE_BUFFER_USAGE_INDEX) { + if (nv->cap.hw_index_buffer) + flags |= NOUVEAU_BO_GART; + } + + if (nouveau_bo_new(dev, flags, alignment, size, &nvbuf->bo)) { + free(nvbuf); + return NULL; + } + + return &nvbuf->base; +} + +static struct pipe_buffer * +nouveau_pipe_bo_user_create(struct pipe_winsys *pws, void *ptr, unsigned bytes) +{ + struct nouveau_pipe_winsys *nvpws = (struct nouveau_pipe_winsys *)pws; + struct nouveau_device *dev = nvpws->nv->nv_screen->device; + struct nouveau_pipe_buffer *nvbuf; + + nvbuf = calloc(1, sizeof(*nvbuf)); + if (!nvbuf) + return NULL; + nvbuf->base.refcount = 1; + nvbuf->base.size = bytes; + + if (nouveau_bo_user(dev, ptr, bytes, &nvbuf->bo)) { + free(nvbuf); + return NULL; + } + + return &nvbuf->base; +} + +static void +nouveau_pipe_bo_del(struct pipe_winsys *ws, struct pipe_buffer *buf) +{ + struct nouveau_pipe_buffer *nvbuf = nouveau_buffer(buf); + + nouveau_bo_del(&nvbuf->bo); + free(nvbuf); +} + +static void * +nouveau_pipe_bo_map(struct pipe_winsys *pws, struct pipe_buffer *buf, + unsigned flags) +{ + struct nouveau_pipe_buffer *nvbuf = nouveau_buffer(buf); + uint32_t map_flags = 0; + + if (flags & PIPE_BUFFER_USAGE_CPU_READ) + map_flags |= NOUVEAU_BO_RD; + if (flags & PIPE_BUFFER_USAGE_CPU_WRITE) + map_flags |= NOUVEAU_BO_WR; + + if (nouveau_bo_map(nvbuf->bo, map_flags)) + return NULL; + return nvbuf->bo->map; +} + +static void +nouveau_pipe_bo_unmap(struct pipe_winsys *pws, struct pipe_buffer *buf) +{ + struct nouveau_pipe_buffer *nvbuf = nouveau_buffer(buf); + + nouveau_bo_unmap(nvbuf->bo); +} + +static INLINE struct nouveau_fence * +nouveau_pipe_fence(struct pipe_fence_handle *pfence) +{ + return (struct nouveau_fence *)pfence; +} + +static void +nouveau_pipe_fence_reference(struct pipe_winsys *ws, + struct pipe_fence_handle **ptr, + struct pipe_fence_handle *pfence) +{ + nouveau_fence_ref((void *)pfence, (void *)ptr); +} + +static int +nouveau_pipe_fence_signalled(struct pipe_winsys *ws, + struct pipe_fence_handle *pfence, unsigned flag) +{ + struct nouveau_pipe_winsys *nvpws = (struct nouveau_pipe_winsys *)ws; + struct nouveau_fence *fence = nouveau_pipe_fence(pfence); + + if (nouveau_fence(fence)->signalled == 0) + nouveau_fence_flush(nvpws->nv->nvc->channel); + + return !nouveau_fence(fence)->signalled; +} + +static int +nouveau_pipe_fence_finish(struct pipe_winsys *ws, + struct pipe_fence_handle *pfence, unsigned flag) +{ + struct nouveau_fence *fence = nouveau_pipe_fence(pfence); + struct nouveau_fence *ref = NULL; + + nouveau_fence_ref(fence, &ref); + return nouveau_fence_wait(&ref); +} + +struct pipe_winsys * +nouveau_create_pipe_winsys(struct nouveau_context *nv) +{ + struct nouveau_pipe_winsys *nvpws; + struct pipe_winsys *pws; + + nvpws = CALLOC_STRUCT(nouveau_pipe_winsys); + if (!nvpws) + return NULL; + nvpws->nv = nv; + pws = &nvpws->pws; + + pws->flush_frontbuffer = nouveau_flush_frontbuffer; + + pws->buffer_create = nouveau_pipe_bo_create; + pws->buffer_destroy = nouveau_pipe_bo_del; + pws->user_buffer_create = nouveau_pipe_bo_user_create; + pws->buffer_map = nouveau_pipe_bo_map; + pws->buffer_unmap = nouveau_pipe_bo_unmap; + + pws->fence_reference = nouveau_pipe_fence_reference; + pws->fence_signalled = nouveau_pipe_fence_signalled; + pws->fence_finish = nouveau_pipe_fence_finish; + + pws->get_name = nouveau_get_name; + + return &nvpws->pws; +} + diff --git a/src/gallium/winsys/drm/nouveau/nouveau_winsys_pipe.h 
b/src/gallium/winsys/drm/nouveau/nouveau_winsys_pipe.h new file mode 100644 index 0000000000..6a03ac0d77 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_winsys_pipe.h @@ -0,0 +1,34 @@ +#ifndef NOUVEAU_PIPE_WINSYS_H +#define NOUVEAU_PIPE_WINSYS_H + +#include "pipe/p_context.h" +#include "pipe/p_winsys.h" +#include "nouveau_context.h" + +struct nouveau_pipe_buffer { + struct pipe_buffer base; + struct nouveau_bo *bo; +}; + +static inline struct nouveau_pipe_buffer * +nouveau_buffer(struct pipe_buffer *buf) +{ + return (struct nouveau_pipe_buffer *)buf; +} + +struct nouveau_pipe_winsys { + struct pipe_winsys pws; + + struct nouveau_context *nv; +}; + +extern struct pipe_winsys * +nouveau_create_pipe_winsys(struct nouveau_context *nv); + +struct pipe_context * +nouveau_create_softpipe(struct nouveau_context *nv); + +struct pipe_context * +nouveau_pipe_create(struct nouveau_context *nv); + +#endif diff --git a/src/gallium/winsys/drm/nouveau/nouveau_winsys_softpipe.c b/src/gallium/winsys/drm/nouveau/nouveau_winsys_softpipe.c new file mode 100644 index 0000000000..68aade829d --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nouveau_winsys_softpipe.c @@ -0,0 +1,86 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * + **************************************************************************/ +/* + * Authors: Keith Whitwell <keithw-at-tungstengraphics-dot-com> + */ + +#include "imports.h" + +#include "pipe/p_defines.h" +#include "pipe/p_format.h" +#include "softpipe/sp_winsys.h" + +#include "nouveau_context.h" +#include "nouveau_winsys_pipe.h" + +struct nouveau_softpipe_winsys { + struct softpipe_winsys sws; + struct nouveau_context *nv; +}; + +/** + * Return list of surface formats supported by this driver. 
+ */ +static boolean +nouveau_is_format_supported(struct softpipe_winsys *sws, + enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + return TRUE; + default: + break; + }; + + return FALSE; +} + +struct pipe_context * +nouveau_create_softpipe(struct nouveau_context *nv) +{ + struct nouveau_softpipe_winsys *nvsws; + struct pipe_screen *pscreen; + struct pipe_winsys *ws; + + ws = nouveau_create_pipe_winsys(nv); + if (!ws) + return NULL; + pscreen = softpipe_create_screen(ws); + + nvsws = CALLOC_STRUCT(nouveau_softpipe_winsys); + if (!nvsws) + return NULL; + + nvsws->sws.is_format_supported = nouveau_is_format_supported; + nvsws->nv = nv; + + return softpipe_create(pscreen, ws, &nvsws->sws); +} + diff --git a/src/gallium/winsys/drm/nouveau/nv04_surface.c b/src/gallium/winsys/drm/nouveau/nv04_surface.c new file mode 100644 index 0000000000..68338eb814 --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nv04_surface.c @@ -0,0 +1,420 @@ +#include "pipe/p_context.h" +#include "pipe/p_format.h" + +#include "nouveau_context.h" + +static INLINE int log2i(int i) +{ + int r = 0; + + if (i & 0xffff0000) { + i >>= 16; + r += 16; + } + if (i & 0x0000ff00) { + i >>= 8; + r += 8; + } + if (i & 0x000000f0) { + i >>= 4; + r += 4; + } + if (i & 0x0000000c) { + i >>= 2; + r += 2; + } + if (i & 0x00000002) { + r += 1; + } + return r; +} + +static INLINE int +nv04_surface_format(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_A8_UNORM: + return NV04_CONTEXT_SURFACES_2D_FORMAT_Y8; + case PIPE_FORMAT_R16_SNORM: + case PIPE_FORMAT_R5G6B5_UNORM: + return NV04_CONTEXT_SURFACES_2D_FORMAT_R5G6B5; + case PIPE_FORMAT_X8R8G8B8_UNORM: + case PIPE_FORMAT_A8R8G8B8_UNORM: + return NV04_CONTEXT_SURFACES_2D_FORMAT_A8R8G8B8; + case PIPE_FORMAT_Z24S8_UNORM: + return NV04_CONTEXT_SURFACES_2D_FORMAT_Y32; + default: + return -1; + } +} + +static INLINE int +nv04_rect_format(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_A8_UNORM: + return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8; + case PIPE_FORMAT_R5G6B5_UNORM: + return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A16R5G6B5; + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + return NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT_A8R8G8B8; + default: + return -1; + } +} + +static INLINE int +nv04_scaled_image_format(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_A1R5G5B5_UNORM: + return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A1R5G5B5; + case PIPE_FORMAT_A8R8G8B8_UNORM: + return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_A8R8G8B8; + case PIPE_FORMAT_X8R8G8B8_UNORM: + return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_X8R8G8B8; + case PIPE_FORMAT_R5G6B5_UNORM: + case PIPE_FORMAT_R16_SNORM: + return NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_FORMAT_R5G6B5; + default: + return -1; + } +} + +static void +nv04_surface_copy_swizzle(struct nouveau_context *nv, unsigned dx, unsigned dy, + unsigned sx, unsigned sy, unsigned w, unsigned h) +{ + struct nouveau_channel *chan = nv->nvc->channel; + struct pipe_surface *dst = nv->surf_dst; + struct pipe_surface *src = nv->surf_src; + + /* POT or GTFO */ + assert(!(w & (w - 1)) && !(h & (h - 1))); + + BEGIN_RING(chan, nv->nvc->NvSwzSurf, NV04_SWIZZLED_SURFACE_DMA_IMAGE, 1); + OUT_RELOCo(chan, nouveau_buffer(dst->buffer)->bo, + NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + BEGIN_RING(chan, nv->nvc->NvSwzSurf, NV04_SWIZZLED_SURFACE_FORMAT, 2); + OUT_RING (chan, 
nv04_surface_format(dst->format) | + log2i(w) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_U_SHIFT | + log2i(h) << NV04_SWIZZLED_SURFACE_FORMAT_BASE_SIZE_V_SHIFT); + OUT_RELOCl(chan, nouveau_buffer(dst->buffer)->bo, dst->offset, + NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + BEGIN_RING(chan, nv->nvc->NvSIFM, NV04_SCALED_IMAGE_FROM_MEMORY_DMA_IMAGE, 1); + OUT_RELOCo(chan, nouveau_buffer(src->buffer)->bo, + NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); + BEGIN_RING(chan, nv->nvc->NvSIFM, NV04_SCALED_IMAGE_FROM_MEMORY_SURFACE, 1); + OUT_RING (chan, nv->nvc->NvSwzSurf->handle); + + BEGIN_RING(chan, nv->nvc->NvSIFM, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9); + OUT_RING (chan, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE); + OUT_RING (chan, nv04_scaled_image_format(src->format)); + OUT_RING (chan, NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY); + OUT_RING (chan, 0); + OUT_RING (chan, h << 16 | w); + OUT_RING (chan, 0); + OUT_RING (chan, h << 16 | w); + OUT_RING (chan, 1 << 20); + OUT_RING (chan, 1 << 20); + BEGIN_RING(chan, nv->nvc->NvSIFM, NV04_SCALED_IMAGE_FROM_MEMORY_SIZE, 4); + OUT_RING (chan, h << 16 | w); + OUT_RING (chan, src->stride | + NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_ORIGIN_CENTER | + NV04_SCALED_IMAGE_FROM_MEMORY_FORMAT_FILTER_POINT_SAMPLE); + OUT_RELOCl(chan, nouveau_buffer(src->buffer)->bo, src->offset, + NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); + OUT_RING (chan, 0); +} + +static void +nv04_surface_copy_m2mf(struct nouveau_context *nv, unsigned dx, unsigned dy, + unsigned sx, unsigned sy, unsigned w, unsigned h) +{ + struct nouveau_channel *chan = nv->nvc->channel; + struct pipe_surface *dst = nv->surf_dst; + struct pipe_surface *src = nv->surf_src; + unsigned dst_offset, src_offset; + + dst_offset = dst->offset + (dy * dst->stride) + (dx * dst->block.size); + src_offset = src->offset + (sy * src->stride) + (sx * src->block.size); + + while (h) { + int count = (h > 2047) ? 
2047 : h; + + BEGIN_RING(chan, nv->nvc->NvM2MF, + NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8); + OUT_RELOCl(chan, nouveau_buffer(src->buffer)->bo, src_offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD); + OUT_RELOCl(chan, nouveau_buffer(dst->buffer)->bo, dst_offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_WR); + OUT_RING (chan, src->stride); + OUT_RING (chan, dst->stride); + OUT_RING (chan, w * src->block.size); + OUT_RING (chan, count); + OUT_RING (chan, 0x0101); + OUT_RING (chan, 0); + + h -= count; + src_offset += src->stride * count; + dst_offset += dst->stride * count; + } +} + +static void +nv04_surface_copy_blit(struct nouveau_context *nv, unsigned dx, unsigned dy, + unsigned sx, unsigned sy, unsigned w, unsigned h) +{ + struct nouveau_channel *chan = nv->nvc->channel; + + BEGIN_RING(chan, nv->nvc->NvImageBlit, 0x0300, 3); + OUT_RING (chan, (sy << 16) | sx); + OUT_RING (chan, (dy << 16) | dx); + OUT_RING (chan, ( h << 16) | w); +} + +static int +nv04_surface_copy_prep(struct nouveau_context *nv, struct pipe_surface *dst, + struct pipe_surface *src) +{ + struct nouveau_channel *chan = nv->nvc->channel; + int format; + + if (src->format != dst->format) + return 1; + + /* Setup transfer to swizzle the texture to vram if needed */ + /* FIXME/TODO: check proper limits of this operation */ + if (src->texture && dst->texture) { + unsigned int src_linear = src->texture->tex_usage & + NOUVEAU_TEXTURE_USAGE_LINEAR; + unsigned int dst_linear = dst->texture->tex_usage & + NOUVEAU_TEXTURE_USAGE_LINEAR; + if (src_linear ^ dst_linear) { + nv->surface_copy = nv04_surface_copy_swizzle; + nv->surf_dst = dst; + nv->surf_src = src; + return 0; + } + } + + /* NV_CONTEXT_SURFACES_2D has buffer alignment restrictions, fallback + * to NV_MEMORY_TO_MEMORY_FORMAT in this case. 
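 * Offsets that are not 64-byte aligned take the M2MF path below.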
+ */ + if ((src->offset & 63) || (dst->offset & 63)) { + BEGIN_RING(nv->nvc->channel, nv->nvc->NvM2MF, + NV04_MEMORY_TO_MEMORY_FORMAT_DMA_BUFFER_IN, 2); + OUT_RELOCo(chan, nouveau_buffer(src->buffer)->bo, + NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); + OUT_RELOCo(chan, nouveau_buffer(dst->buffer)->bo, + NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + nv->surface_copy = nv04_surface_copy_m2mf; + nv->surf_dst = dst; + nv->surf_src = src; + return 0; + + } + + if ((format = nv04_surface_format(dst->format)) < 0) { + NOUVEAU_ERR("Bad surface format 0x%x\n", dst->format); + return 1; + } + nv->surface_copy = nv04_surface_copy_blit; + + BEGIN_RING(chan, nv->nvc->NvCtxSurf2D, + NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2); + OUT_RELOCo(chan, nouveau_buffer(src->buffer)->bo, + NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); + OUT_RELOCo(chan, nouveau_buffer(dst->buffer)->bo, + NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + BEGIN_RING(chan, nv->nvc->NvCtxSurf2D, + NV04_CONTEXT_SURFACES_2D_FORMAT, 4); + OUT_RING (chan, format); + OUT_RING (chan, (dst->stride << 16) | src->stride); + OUT_RELOCl(chan, nouveau_buffer(src->buffer)->bo, src->offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_RD); + OUT_RELOCl(chan, nouveau_buffer(dst->buffer)->bo, dst->offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + return 0; +} + +static void +nv04_surface_copy_done(struct nouveau_context *nv) +{ + FIRE_RING(nv->nvc->channel); +} + +static int +nv04_surface_fill(struct nouveau_context *nv, struct pipe_surface *dst, + unsigned dx, unsigned dy, unsigned w, unsigned h, + unsigned value) +{ + struct nouveau_channel *chan = nv->nvc->channel; + struct nouveau_grobj *surf2d = nv->nvc->NvCtxSurf2D; + struct nouveau_grobj *rect = nv->nvc->NvGdiRect; + int cs2d_format, gdirect_format; + + if ((cs2d_format = nv04_surface_format(dst->format)) < 0) { + NOUVEAU_ERR("Bad format = %d\n", dst->format); + return 1; + } + + if ((gdirect_format = nv04_rect_format(dst->format)) < 0) { + NOUVEAU_ERR("Bad format = %d\n", dst->format); + return 1; + } + + BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2); + OUT_RELOCo(chan, nouveau_buffer(dst->buffer)->bo, + NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCo(chan, nouveau_buffer(dst->buffer)->bo, + NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + BEGIN_RING(chan, surf2d, NV04_CONTEXT_SURFACES_2D_FORMAT, 4); + OUT_RING (chan, cs2d_format); + OUT_RING (chan, (dst->stride << 16) | dst->stride); + OUT_RELOCl(chan, nouveau_buffer(dst->buffer)->bo, dst->offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + OUT_RELOCl(chan, nouveau_buffer(dst->buffer)->bo, dst->offset, + NOUVEAU_BO_VRAM | NOUVEAU_BO_WR); + + BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR_FORMAT, 1); + OUT_RING (chan, gdirect_format); + BEGIN_RING(chan, rect, NV04_GDI_RECTANGLE_TEXT_COLOR1_A, 1); + OUT_RING (chan, value); + BEGIN_RING(chan, rect, + NV04_GDI_RECTANGLE_TEXT_UNCLIPPED_RECTANGLE_POINT(0), 2); + OUT_RING (chan, (dx << 16) | dy); + OUT_RING (chan, ( w << 16) | h); + + FIRE_RING(chan); + return 0; +} + +int +nouveau_surface_channel_create_nv04(struct nouveau_channel_context *nvc) +{ + struct nouveau_channel *chan = nvc->channel; + unsigned chipset = nvc->channel->device->chipset, class; + int ret; + + if ((ret = nouveau_grobj_alloc(chan, nvc->next_handle++, 0x39, + &nvc->NvM2MF))) { + NOUVEAU_ERR("Error creating m2mf object: %d\n", ret); + return 1; + } + BIND_RING (chan, nvc->NvM2MF, nvc->next_subchannel++); + BEGIN_RING(chan, nvc->NvM2MF, + NV04_MEMORY_TO_MEMORY_FORMAT_DMA_NOTIFY, 1); + OUT_RING (chan, nvc->sync_notifier->handle); + + 
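	/* Chips prior to NV10 use the original NV04 2D surface class; NV10 and
	 * later use the NV10 variant of the object.
	 */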
class = chipset < 0x10 ? NV04_CONTEXT_SURFACES_2D : + NV10_CONTEXT_SURFACES_2D; + if ((ret = nouveau_grobj_alloc(chan, nvc->next_handle++, class, + &nvc->NvCtxSurf2D))) { + NOUVEAU_ERR("Error creating 2D surface object: %d\n", ret); + return 1; + } + BIND_RING (chan, nvc->NvCtxSurf2D, nvc->next_subchannel++); + BEGIN_RING(chan, nvc->NvCtxSurf2D, + NV04_CONTEXT_SURFACES_2D_DMA_IMAGE_SOURCE, 2); + OUT_RING (chan, nvc->channel->vram->handle); + OUT_RING (chan, nvc->channel->vram->handle); + + class = chipset < 0x10 ? NV04_IMAGE_BLIT : NV12_IMAGE_BLIT; + if ((ret = nouveau_grobj_alloc(chan, nvc->next_handle++, class, + &nvc->NvImageBlit))) { + NOUVEAU_ERR("Error creating blit object: %d\n", ret); + return 1; + } + BIND_RING (chan, nvc->NvImageBlit, nvc->next_subchannel++); + BEGIN_RING(chan, nvc->NvImageBlit, NV04_IMAGE_BLIT_DMA_NOTIFY, 1); + OUT_RING (chan, nvc->sync_notifier->handle); + BEGIN_RING(chan, nvc->NvImageBlit, NV04_IMAGE_BLIT_SURFACE, 1); + OUT_RING (chan, nvc->NvCtxSurf2D->handle); + BEGIN_RING(chan, nvc->NvImageBlit, NV04_IMAGE_BLIT_OPERATION, 1); + OUT_RING (chan, NV04_IMAGE_BLIT_OPERATION_SRCCOPY); + + class = NV04_GDI_RECTANGLE_TEXT; + if ((ret = nouveau_grobj_alloc(chan, nvc->next_handle++, class, + &nvc->NvGdiRect))) { + NOUVEAU_ERR("Error creating rect object: %d\n", ret); + return 1; + } + BIND_RING (chan, nvc->NvGdiRect, nvc->next_subchannel++); + BEGIN_RING(chan, nvc->NvGdiRect, NV04_GDI_RECTANGLE_TEXT_DMA_NOTIFY, 1); + OUT_RING (chan, nvc->sync_notifier->handle); + BEGIN_RING(chan, nvc->NvGdiRect, NV04_GDI_RECTANGLE_TEXT_SURFACE, 1); + OUT_RING (chan, nvc->NvCtxSurf2D->handle); + BEGIN_RING(chan, nvc->NvGdiRect, NV04_GDI_RECTANGLE_TEXT_OPERATION, 1); + OUT_RING (chan, NV04_GDI_RECTANGLE_TEXT_OPERATION_SRCCOPY); + BEGIN_RING(chan, nvc->NvGdiRect, + NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT, 1); + OUT_RING (chan, NV04_GDI_RECTANGLE_TEXT_MONOCHROME_FORMAT_LE); + + switch (chipset & 0xf0) { + case 0x00: + case 0x10: + class = NV04_SWIZZLED_SURFACE; + break; + case 0x20: + class = NV20_SWIZZLED_SURFACE; + break; + case 0x30: + class = NV30_SWIZZLED_SURFACE; + break; + case 0x40: + case 0x60: + class = NV40_SWIZZLED_SURFACE; + break; + default: + /* Famous last words: this really can't happen.. 
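	 * All chipset families that can reach this code are handled above,
	 * so the default case should never be hit.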
*/ + assert(0); + break; + } + + ret = nouveau_grobj_alloc(chan, nvc->next_handle++, class, + &nvc->NvSwzSurf); + if (ret) { + NOUVEAU_ERR("Error creating swizzled surface: %d\n", ret); + return 1; + } + + BIND_RING (chan, nvc->NvSwzSurf, nvc->next_subchannel++); + + if (chipset < 0x10) { + class = NV04_SCALED_IMAGE_FROM_MEMORY; + } else + if (chipset < 0x40) { + class = NV10_SCALED_IMAGE_FROM_MEMORY; + } else { + class = NV40_SCALED_IMAGE_FROM_MEMORY; + } + + ret = nouveau_grobj_alloc(chan, nvc->next_handle++, class, + &nvc->NvSIFM); + if (ret) { + NOUVEAU_ERR("Error creating scaled image object: %d\n", ret); + return 1; + } + + BIND_RING (chan, nvc->NvSIFM, nvc->next_subchannel++); + + return 0; +} + +int +nouveau_surface_init_nv04(struct nouveau_context *nv) +{ + nv->surface_copy_prep = nv04_surface_copy_prep; + nv->surface_copy = nv04_surface_copy_blit; + nv->surface_copy_done = nv04_surface_copy_done; + nv->surface_fill = nv04_surface_fill; + return 0; +} + diff --git a/src/gallium/winsys/drm/nouveau/nv50_surface.c b/src/gallium/winsys/drm/nouveau/nv50_surface.c new file mode 100644 index 0000000000..c8ab7f690f --- /dev/null +++ b/src/gallium/winsys/drm/nouveau/nv50_surface.c @@ -0,0 +1,194 @@ +#include "pipe/p_context.h" +#include "pipe/p_format.h" + +#include "nouveau_context.h" + +static INLINE int +nv50_format(enum pipe_format format) +{ + switch (format) { + case PIPE_FORMAT_A8R8G8B8_UNORM: + case PIPE_FORMAT_Z24S8_UNORM: + return NV50_2D_DST_FORMAT_32BPP; + case PIPE_FORMAT_X8R8G8B8_UNORM: + return NV50_2D_DST_FORMAT_24BPP; + case PIPE_FORMAT_R5G6B5_UNORM: + return NV50_2D_DST_FORMAT_16BPP; + case PIPE_FORMAT_A8_UNORM: + return NV50_2D_DST_FORMAT_8BPP; + default: + return -1; + } +} + +static int +nv50_surface_set(struct nouveau_context *nv, struct pipe_surface *surf, int dst) +{ + struct nouveau_channel *chan = nv->nvc->channel; + struct nouveau_grobj *eng2d = nv->nvc->Nv2D; + struct nouveau_bo *bo = nouveau_buffer(surf->buffer)->bo; + int surf_format, mthd = dst ? NV50_2D_DST_FORMAT : NV50_2D_SRC_FORMAT; + int flags = NOUVEAU_BO_VRAM | (dst ? 
NOUVEAU_BO_WR : NOUVEAU_BO_RD); + + surf_format = nv50_format(surf->format); + if (surf_format < 0) + return 1; + + if (!nouveau_bo(bo)->tiled) { + BEGIN_RING(chan, eng2d, mthd, 2); + OUT_RING (chan, surf_format); + OUT_RING (chan, 1); + BEGIN_RING(chan, eng2d, mthd + 0x14, 5); + OUT_RING (chan, surf->stride); + OUT_RING (chan, surf->width); + OUT_RING (chan, surf->height); + OUT_RELOCh(chan, bo, surf->offset, flags); + OUT_RELOCl(chan, bo, surf->offset, flags); + } else { + BEGIN_RING(chan, eng2d, mthd, 5); + OUT_RING (chan, surf_format); + OUT_RING (chan, 0); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + BEGIN_RING(chan, eng2d, mthd + 0x18, 4); + OUT_RING (chan, surf->width); + OUT_RING (chan, surf->height); + OUT_RELOCh(chan, bo, surf->offset, flags); + OUT_RELOCl(chan, bo, surf->offset, flags); + } + +#if 0 + if (dst) { + BEGIN_RING(chan, eng2d, NV50_2D_CLIP_X, 4); + OUT_RING (chan, 0); + OUT_RING (chan, 0); + OUT_RING (chan, surf->width); + OUT_RING (chan, surf->height); + } +#endif + + return 0; +} + +static int +nv50_surface_copy_prep(struct nouveau_context *nv, + struct pipe_surface *dst, struct pipe_surface *src) +{ + int ret; + + assert(src->format == dst->format); + + ret = nv50_surface_set(nv, dst, 1); + if (ret) + return ret; + + ret = nv50_surface_set(nv, src, 0); + if (ret) + return ret; + + return 0; +} + +static void +nv50_surface_copy(struct nouveau_context *nv, unsigned dx, unsigned dy, + unsigned sx, unsigned sy, unsigned w, unsigned h) +{ + struct nouveau_channel *chan = nv->nvc->channel; + struct nouveau_grobj *eng2d = nv->nvc->Nv2D; + + BEGIN_RING(chan, eng2d, 0x088c, 1); + OUT_RING (chan, 0); + BEGIN_RING(chan, eng2d, NV50_2D_BLIT_DST_X, 4); + OUT_RING (chan, dx); + OUT_RING (chan, dy); + OUT_RING (chan, w); + OUT_RING (chan, h); + BEGIN_RING(chan, eng2d, 0x08c0, 4); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + OUT_RING (chan, 0); + OUT_RING (chan, 1); + BEGIN_RING(chan, eng2d, 0x08d0, 4); + OUT_RING (chan, 0); + OUT_RING (chan, sx); + OUT_RING (chan, 0); + OUT_RING (chan, sy); +} + +static void +nv50_surface_copy_done(struct nouveau_context *nv) +{ + FIRE_RING(nv->nvc->channel); +} + +static int +nv50_surface_fill(struct nouveau_context *nv, struct pipe_surface *dst, + unsigned dx, unsigned dy, unsigned w, unsigned h, + unsigned value) +{ + struct nouveau_channel *chan = nv->nvc->channel; + struct nouveau_grobj *eng2d = nv->nvc->Nv2D; + int rect_format, ret; + + rect_format = nv50_format(dst->format); + if (rect_format < 0) + return 1; + + ret = nv50_surface_set(nv, dst, 1); + if (ret) + return ret; + + BEGIN_RING(chan, eng2d, 0x0580, 3); + OUT_RING (chan, 4); + OUT_RING (chan, rect_format); + OUT_RING (chan, value); + + BEGIN_RING(chan, eng2d, NV50_2D_RECT_X1, 4); + OUT_RING (chan, dx); + OUT_RING (chan, dy); + OUT_RING (chan, dx + w); + OUT_RING (chan, dy + h); + + FIRE_RING(chan); + return 0; +} + +int +nouveau_surface_channel_create_nv50(struct nouveau_channel_context *nvc) +{ + struct nouveau_channel *chan = nvc->channel; + struct nouveau_grobj *eng2d = NULL; + int ret; + + ret = nouveau_grobj_alloc(chan, nvc->next_handle++, NV50_2D, &eng2d); + if (ret) + return ret; + nvc->Nv2D = eng2d; + + BIND_RING (chan, eng2d, nvc->next_subchannel++); + BEGIN_RING(chan, eng2d, NV50_2D_DMA_NOTIFY, 4); + OUT_RING (chan, nvc->sync_notifier->handle); + OUT_RING (chan, chan->vram->handle); + OUT_RING (chan, chan->vram->handle); + OUT_RING (chan, chan->vram->handle); + BEGIN_RING(chan, eng2d, NV50_2D_OPERATION, 1); + OUT_RING (chan, 
NV50_2D_OPERATION_SRCCOPY); + BEGIN_RING(chan, eng2d, 0x0290, 1); + OUT_RING (chan, 0); + BEGIN_RING(chan, eng2d, 0x0888, 1); + OUT_RING (chan, 1); + + return 0; +} + +int +nouveau_surface_init_nv50(struct nouveau_context *nv) +{ + nv->surface_copy_prep = nv50_surface_copy_prep; + nv->surface_copy = nv50_surface_copy; + nv->surface_copy_done = nv50_surface_copy_done; + nv->surface_fill = nv50_surface_fill; + return 0; +} + diff --git a/src/gallium/winsys/egl_xlib/Makefile b/src/gallium/winsys/egl_xlib/Makefile new file mode 100644 index 0000000000..76f1b56da4 --- /dev/null +++ b/src/gallium/winsys/egl_xlib/Makefile @@ -0,0 +1,89 @@ +# src/gallium/winsys/egl_xlib/Makefile + +# Build softpipe/xlib/EGL driver library/object: "egl_softpipe.so" + + +TOP = ../../../.. +include $(TOP)/configs/current + + +DRIVER_NAME = egl_softpipe.so + + +INCLUDE_DIRS = \ + -I$(TOP)/include \ + -I$(TOP)/src/egl/main \ + -I$(TOP)/src/mesa \ + -I$(TOP)/src/mesa/main \ + -I$(TOP)/src/gallium/include \ + -I$(TOP)/src/gallium/drivers \ + -I$(TOP)/src/gallium/auxiliary + +WINSYS_SOURCES = \ + egl_xlib.c \ + sw_winsys.c + +WINSYS_OBJECTS = $(WINSYS_SOURCES:.c=.o) + + +LIBS = \ + $(GALLIUM_DRIVERS) \ + $(GALLIUM_AUXILIARIES) + +# XXX temporary (should create a separate lib with the GL API funcs and +# mesa code, as done for ES 1.x, 2.x, OpenVG, etc) +UNUSED_LIBS = \ + $(TOP)/src/mesa/libglapi.a \ + $(TOP)/src/mesa/libmesa.a \ + + +LOCAL_CFLAGS = -D_EGL_PLATFORM_X=1 + + +.c.o: + $(CC) -c $(INCLUDE_DIRS) $(CFLAGS) $(LOCAL_CFLAGS) $< -o $@ + + +.PHONY: library + + +default: depend library Makefile + + +library: $(TOP)/$(LIB_DIR)/$(DRIVER_NAME) + + +# Make the egl_softpipe.so library +$(TOP)/$(LIB_DIR)/$(DRIVER_NAME): $(WINSYS_OBJECTS) $(LIBS) + $(TOP)/bin/mklib -o $(DRIVER_NAME) \ + -linker "$(CC)" \ + -noprefix \ + -install $(TOP)/$(LIB_DIR) \ + $(MKLIB_OPTIONS) $(WINSYS_OBJECTS) \ + --whole-archive $(LIBS) --no-whole-archive + + +depend: $(ALL_SOURCES) + @ echo "running $(MKDEP)" + @ rm -f depend # workaround oops on gutsy?!? + @ touch depend + @ $(MKDEP) $(MKDEP_OPTIONS) $(DEFINES) $(INCLUDE_DIRS) $(ALL_SOURCES) \ + > /dev/null 2>/dev/null + + +install: default + $(INSTALL) -d $(INSTALL_DIR)/$(LIB_DIR) + @if [ -e $(TOP)/$(LIB_DIR) ]; then \ + $(INSTALL) $(TOP)/$(LIB_DIR)/$(DRIVER_NAME) $(INSTALL_DIR)/$(LIB_DIR); \ + fi + + +# Emacs tags +tags: + etags `find . -name \*.[ch]` $(TOP)/include/GL/*.h + +clean: + -rm -f *.o *~ *.bak + + +include depend diff --git a/src/gallium/winsys/egl_xlib/egl_xlib.c b/src/gallium/winsys/egl_xlib/egl_xlib.c new file mode 100644 index 0000000000..477d766925 --- /dev/null +++ b/src/gallium/winsys/egl_xlib/egl_xlib.c @@ -0,0 +1,655 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * EGL / softpipe / xlib winsys module + * + * Authors: Brian Paul + */ + + +#include <dlfcn.h> +#include <X11/Xutil.h> + +#include "pipe/p_compiler.h" +#include "pipe/p_format.h" +#include "pipe/p_state.h" +#include "pipe/p_winsys.h" +#include "util/u_memory.h" +#include "softpipe/sp_winsys.h" + +#include "eglconfig.h" +#include "eglconfigutil.h" +#include "eglcontext.h" +#include "egldisplay.h" +#include "egldriver.h" +#include "eglglobals.h" +#include "egllog.h" +#include "eglsurface.h" + +#include "state_tracker/st_public.h" + +#include "sw_winsys.h" + + +/** subclass of _EGLDriver */ +struct xlib_egl_driver +{ + _EGLDriver Base; /**< base class */ + + struct pipe_winsys *winsys; + struct pipe_screen *screen; +}; + + +/** subclass of _EGLContext */ +struct xlib_egl_context +{ + _EGLContext Base; /**< base class */ + + struct pipe_context *pipe; /**< Gallium driver context */ + struct st_context *Context; /**< Mesa/gallium state tracker context */ +}; + + +/** subclass of _EGLSurface */ +struct xlib_egl_surface +{ + _EGLSurface Base; /**< base class */ + + Display *Dpy; /**< The X Display of the window */ + Window Win; /**< The user-created window ID */ + GC Gc; + XVisualInfo VisInfo; + + struct pipe_winsys *winsys; + + struct st_framebuffer *Framebuffer; +}; + + +/** cast wrapper */ +static INLINE struct xlib_egl_driver * +xlib_egl_driver(_EGLDriver *drv) +{ + return (struct xlib_egl_driver *) drv; +} + + +static struct xlib_egl_surface * +lookup_surface(EGLSurface surf) +{ + _EGLSurface *surface = _eglLookupSurface(surf); + return (struct xlib_egl_surface *) surface; +} + + +static struct xlib_egl_context * +lookup_context(EGLContext surf) +{ + _EGLContext *context = _eglLookupContext(surf); + return (struct xlib_egl_context *) context; +} + + +static unsigned int +bitcount(unsigned int n) +{ + unsigned int bits; + for (bits = 0; n > 0; n = n >> 1) { + bits += (n & 1); + } + return bits; +} + + +/** + * Create the EGLConfigs. (one per X visual) + */ +static void +create_configs(_EGLDriver *drv, EGLDisplay dpy) +{ + static const EGLint all_apis = (EGL_OPENGL_ES_BIT | + EGL_OPENGL_ES2_BIT | + EGL_OPENVG_BIT | + EGL_OPENGL_BIT); + _EGLDisplay *disp = _eglLookupDisplay(dpy); + XVisualInfo *visInfo, visTemplate; + int num_visuals, i; + + /* get list of all X visuals, create an EGL config for each */ + visTemplate.screen = DefaultScreen(disp->Xdpy); + visInfo = XGetVisualInfo(disp->Xdpy, VisualScreenMask, + &visTemplate, &num_visuals); + if (!visInfo) { + printf("egl_xlib.c: couldn't get any X visuals\n"); + abort(); + } + + for (i = 0; i < num_visuals; i++) { + _EGLConfig *config = calloc(1, sizeof(_EGLConfig)); + int id = i + 1; + int rbits = bitcount(visInfo[i].red_mask); + int gbits = bitcount(visInfo[i].green_mask); + int bbits = bitcount(visInfo[i].blue_mask); + int abits = bbits == 8 ? 
8 : 0; + int zbits = 24; + int sbits = 8; + int visid = visInfo[i].visualid; +#if defined(__cplusplus) || defined(c_plusplus) + int vistype = visInfo[i].c_class; +#else + int vistype = visInfo[i].class; +#endif + + _eglInitConfig(config, id); + SET_CONFIG_ATTRIB(config, EGL_BUFFER_SIZE, rbits + gbits + bbits + abits); + SET_CONFIG_ATTRIB(config, EGL_RED_SIZE, rbits); + SET_CONFIG_ATTRIB(config, EGL_GREEN_SIZE, gbits); + SET_CONFIG_ATTRIB(config, EGL_BLUE_SIZE, bbits); + SET_CONFIG_ATTRIB(config, EGL_ALPHA_SIZE, abits); + SET_CONFIG_ATTRIB(config, EGL_DEPTH_SIZE, zbits); + SET_CONFIG_ATTRIB(config, EGL_STENCIL_SIZE, sbits); + SET_CONFIG_ATTRIB(config, EGL_NATIVE_VISUAL_ID, visid); + SET_CONFIG_ATTRIB(config, EGL_NATIVE_VISUAL_TYPE, vistype); + SET_CONFIG_ATTRIB(config, EGL_NATIVE_RENDERABLE, EGL_FALSE); + SET_CONFIG_ATTRIB(config, EGL_CONFORMANT, all_apis); + SET_CONFIG_ATTRIB(config, EGL_RENDERABLE_TYPE, all_apis); + SET_CONFIG_ATTRIB(config, EGL_SURFACE_TYPE, EGL_WINDOW_BIT); + + _eglAddConfig(disp, config); + } +} + + +/** + * Called via eglInitialize(), drv->API.Initialize(). + */ +static EGLBoolean +xlib_eglInitialize(_EGLDriver *drv, EGLDisplay dpy, + EGLint *minor, EGLint *major) +{ + create_configs(drv, dpy); + + drv->Initialized = EGL_TRUE; + + /* we're supporting EGL 1.4 */ + *minor = 1; + *major = 4; + + return EGL_TRUE; +} + + +/** + * Called via eglTerminate(), drv->API.Terminate(). + */ +static EGLBoolean +xlib_eglTerminate(_EGLDriver *drv, EGLDisplay dpy) +{ + return EGL_TRUE; +} + + +static _EGLProc +xlib_eglGetProcAddress(const char *procname) +{ + return (_EGLProc) st_get_proc_address(procname); +} + + +static void +get_drawable_visual_info(Display *dpy, Drawable d, XVisualInfo *visInfo) +{ + XWindowAttributes attr; + XVisualInfo visTemp, *vis; + int num_visuals; + + XGetWindowAttributes(dpy, d, &attr); + + visTemp.screen = DefaultScreen(dpy); + visTemp.visualid = attr.visual->visualid; + vis = XGetVisualInfo(dpy, + (VisualScreenMask | VisualIDMask), + &visTemp, &num_visuals); + if (vis) + *visInfo = *vis; + + XFree(vis); +} + + + +/** Get size of given window */ +static Status +get_drawable_size(Display *dpy, Drawable d, uint *width, uint *height) +{ + Window root; + Status stat; + int xpos, ypos; + unsigned int w, h, bw, depth; + stat = XGetGeometry(dpy, d, &root, &xpos, &ypos, &w, &h, &bw, &depth); + *width = w; + *height = h; + return stat; +} + + +static void +check_and_update_buffer_size(struct xlib_egl_surface *surface) +{ + uint width, height; + get_drawable_size(surface->Dpy, surface->Win, &width, &height); + st_resize_framebuffer(surface->Framebuffer, width, height); + surface->Base.Width = width; + surface->Base.Height = height; +} + + + +static void +display_surface(struct pipe_winsys *pws, + struct pipe_surface *psurf, + struct xlib_egl_surface *xsurf) +{ + XImage *ximage; + void *data; + + ximage = XCreateImage(xsurf->Dpy, + xsurf->VisInfo.visual, + xsurf->VisInfo.depth, + ZPixmap, 0, /* format, offset */ + NULL, /* data */ + 0, 0, /* size */ + 32, /* bitmap_pad */ + 0); /* bytes_per_line */ + + + assert(ximage->format); + assert(ximage->bitmap_unit); + + data = pws->buffer_map(pws, psurf->buffer, 0); + + /* update XImage's fields */ + ximage->data = data; + ximage->width = psurf->width; + ximage->height = psurf->height; + ximage->bytes_per_line = psurf->stride; + + XPutImage(xsurf->Dpy, xsurf->Win, xsurf->Gc, + ximage, 0, 0, 0, 0, psurf->width, psurf->height); + + XSync(xsurf->Dpy, 0); + + ximage->data = NULL; + XDestroyImage(ximage); + + 
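	/* The pixel data has been pushed to the window; release the buffer
	 * mapping acquired above.
	 */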
pws->buffer_unmap(pws, psurf->buffer); +} + + + +/** Display gallium surface in X window */ +static void +flush_frontbuffer(struct pipe_winsys *pws, + struct pipe_surface *psurf, + void *context_private) +{ + struct xlib_egl_surface *xsurf = (struct xlib_egl_surface *) context_private; + display_surface(pws, psurf, xsurf); +} + + + +/** + * Called via eglCreateContext(), drv->API.CreateContext(). + */ +static EGLContext +xlib_eglCreateContext(_EGLDriver *drv, EGLDisplay dpy, EGLConfig config, + EGLContext share_list, const EGLint *attrib_list) +{ + struct xlib_egl_driver *xdrv = xlib_egl_driver(drv); + _EGLConfig *conf = _eglLookupConfig(drv, dpy, config); + struct xlib_egl_context *ctx; + struct st_context *share_ctx = NULL; /* XXX fix */ + __GLcontextModes visual; + + ctx = CALLOC_STRUCT(xlib_egl_context); + if (!ctx) + return EGL_NO_CONTEXT; + + /* let EGL lib init the common stuff */ + if (!_eglInitContext(drv, dpy, &ctx->Base, config, attrib_list)) { + free(ctx); + return EGL_NO_CONTEXT; + } + + /* API-dependent context creation */ + switch (ctx->Base.ClientAPI) { + case EGL_OPENVG_API: + case EGL_OPENGL_ES_API: + _eglLog(_EGL_DEBUG, "Create Context for ES version %d\n", + ctx->Base.ClientVersion); + /* fall-through */ + case EGL_OPENGL_API: + /* create a softpipe context */ + ctx->pipe = softpipe_create(xdrv->screen, xdrv->winsys, NULL); + /* Now do xlib / state tracker inits here */ + _eglConfigToContextModesRec(conf, &visual); + ctx->Context = st_create_context(ctx->pipe, &visual, share_ctx); + break; + default: + _eglError(EGL_BAD_MATCH, "eglCreateContext(unsupported API)"); + free(ctx); + return EGL_NO_CONTEXT; + } + + _eglSaveContext(&ctx->Base); + + return _eglGetContextHandle(&ctx->Base); +} + + +static EGLBoolean +xlib_eglDestroyContext(_EGLDriver *drv, EGLDisplay dpy, EGLContext ctx) +{ + struct xlib_egl_context *context = lookup_context(ctx); + if (context) { + if (context->Base.IsBound) { + context->Base.DeletePending = EGL_TRUE; + } + else { + /* API-dependent clean-up */ + switch (context->Base.ClientAPI) { + case EGL_OPENGL_ES_API: + /* fall-through */ + case EGL_OPENGL_API: + st_destroy_context(context->Context); + break; + default: + assert(0); + } + free(context); + } + return EGL_TRUE; + } + else { + _eglError(EGL_BAD_CONTEXT, "eglDestroyContext"); + return EGL_TRUE; + } +} + + +/** + * Called via eglMakeCurrent(), drv->API.MakeCurrent(). + */ +static EGLBoolean +xlib_eglMakeCurrent(_EGLDriver *drv, EGLDisplay dpy, + EGLSurface draw, EGLSurface read, EGLContext ctx) +{ + struct xlib_egl_context *context = lookup_context(ctx); + struct xlib_egl_surface *draw_surf = lookup_surface(draw); + struct xlib_egl_surface *read_surf = lookup_surface(read); + + if (!_eglMakeCurrent(drv, dpy, draw, read, context)) + return EGL_FALSE; + + st_make_current((context ? context->Context : NULL), + (draw_surf ? draw_surf->Framebuffer : NULL), + (read_surf ? 
read_surf->Framebuffer : NULL)); + + if (draw_surf) + check_and_update_buffer_size(draw_surf); + if (read_surf && read_surf != draw_surf) + check_and_update_buffer_size(draw_surf); + + return EGL_TRUE; +} + + +static enum pipe_format +choose_color_format(const __GLcontextModes *visual) +{ + if (visual->redBits == 8 && + visual->greenBits == 8 && + visual->blueBits == 8 && + visual->alphaBits == 8) { + /* XXX this really also depends on the ordering of R,G,B,A */ + return PIPE_FORMAT_A8R8G8B8_UNORM; + } + else { + assert(0); + return PIPE_FORMAT_NONE; + } +} + + +static enum pipe_format +choose_depth_format(const __GLcontextModes *visual) +{ + if (visual->depthBits > 0) + return PIPE_FORMAT_S8Z24_UNORM; + else + return PIPE_FORMAT_NONE; +} + + +static enum pipe_format +choose_stencil_format(const __GLcontextModes *visual) +{ + if (visual->stencilBits > 0) + return PIPE_FORMAT_S8Z24_UNORM; + else + return PIPE_FORMAT_NONE; +} + + +/** + * Called via eglCreateWindowSurface(), drv->API.CreateWindowSurface(). + */ +static EGLSurface +xlib_eglCreateWindowSurface(_EGLDriver *drv, EGLDisplay dpy, EGLConfig config, + NativeWindowType window, const EGLint *attrib_list) +{ + struct xlib_egl_driver *xdrv = xlib_egl_driver(drv); + _EGLDisplay *disp = _eglLookupDisplay(dpy); + _EGLConfig *conf = _eglLookupConfig(drv, dpy, config); + + struct xlib_egl_surface *surf; + __GLcontextModes visual; + uint width, height; + + surf = CALLOC_STRUCT(xlib_egl_surface); + if (!surf) + return EGL_NO_SURFACE; + + /* Let EGL lib init the common stuff */ + if (!_eglInitSurface(drv, dpy, &surf->Base, EGL_WINDOW_BIT, + config, attrib_list)) { + free(surf); + return EGL_NO_SURFACE; + } + + _eglSaveSurface(&surf->Base); + + /* + * Now init the Xlib and gallium stuff + */ + surf->Win = (Window) window; /* The X window ID */ + surf->Dpy = disp->Xdpy; /* The X display */ + surf->Gc = XCreateGC(surf->Dpy, surf->Win, 0, NULL); + + surf->winsys = xdrv->winsys; + + _eglConfigToContextModesRec(conf, &visual); + get_drawable_size(surf->Dpy, surf->Win, &width, &height); + get_drawable_visual_info(surf->Dpy, surf->Win, &surf->VisInfo); + + surf->Base.Width = width; + surf->Base.Height = height; + + /* Create GL statetracker framebuffer */ + surf->Framebuffer = st_create_framebuffer(&visual, + choose_color_format(&visual), + choose_depth_format(&visual), + choose_stencil_format(&visual), + width, height, + (void *) surf); + + st_resize_framebuffer(surf->Framebuffer, width, height); + + return _eglGetSurfaceHandle(&surf->Base); +} + + +static EGLBoolean +xlib_eglDestroySurface(_EGLDriver *drv, EGLDisplay dpy, EGLSurface surface) +{ + struct xlib_egl_surface *surf = lookup_surface(surface); + if (surf) { + _eglHashRemove(_eglGlobal.Surfaces, (EGLuint) surface); + if (surf->Base.IsBound) { + surf->Base.DeletePending = EGL_TRUE; + } + else { + XFreeGC(surf->Dpy, surf->Gc); + st_unreference_framebuffer(surf->Framebuffer); + free(surf); + } + return EGL_TRUE; + } + else { + _eglError(EGL_BAD_SURFACE, "eglDestroySurface"); + return EGL_FALSE; + } +} + + +static EGLBoolean +xlib_eglSwapBuffers(_EGLDriver *drv, EGLDisplay dpy, EGLSurface draw) +{ + /* error checking step: */ + if (!_eglSwapBuffers(drv, dpy, draw)) + return EGL_FALSE; + + { + struct xlib_egl_surface *xsurf = lookup_surface(draw); + struct pipe_winsys *pws = xsurf->winsys; + struct pipe_surface *psurf = + st_get_framebuffer_surface(xsurf->Framebuffer, ST_SURFACE_BACK_LEFT); + + st_notify_swapbuffers(xsurf->Framebuffer); + + display_surface(pws, psurf, xsurf); + + 
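	/* Pick up any window resize so the framebuffer matches the drawable
	 * before the next frame is rendered.
	 */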
check_and_update_buffer_size(xsurf); + } + + return EGL_TRUE; +} + + +/** + * Determine which API(s) is(are) present by looking for some specific + * global symbols. + */ +static EGLint +find_supported_apis(void) +{ + EGLint mask = 0; + void *handle; + + handle = dlopen(NULL, 0); + + if (dlsym(handle, "st_api_OpenGL_ES1")) + mask |= EGL_OPENGL_ES_BIT; + + if (dlsym(handle, "st_api_OpenGL_ES2")) + mask |= EGL_OPENGL_ES2_BIT; + + if (dlsym(handle, "st_api_OpenGL")) + mask |= EGL_OPENGL_BIT; + + if (dlsym(handle, "st_api_OpenVG")) + mask |= EGL_OPENVG_BIT; + + dlclose(handle); + + return mask; +} + + +/** + * This is the main entrypoint into the driver. + * Called by libEGL to instantiate an _EGLDriver object. + */ +_EGLDriver * +_eglMain(_EGLDisplay *dpy, const char *args) +{ + struct xlib_egl_driver *xdrv; + + _eglLog(_EGL_INFO, "Entering EGL/Xlib _eglMain(%s)", args); + + xdrv = CALLOC_STRUCT(xlib_egl_driver); + if (!xdrv) + return NULL; + + if (!dpy->Xdpy) { + dpy->Xdpy = XOpenDisplay(NULL); + } + + _eglInitDriverFallbacks(&xdrv->Base); + xdrv->Base.API.Initialize = xlib_eglInitialize; + xdrv->Base.API.Terminate = xlib_eglTerminate; + xdrv->Base.API.GetProcAddress = xlib_eglGetProcAddress; + xdrv->Base.API.CreateContext = xlib_eglCreateContext; + xdrv->Base.API.DestroyContext = xlib_eglDestroyContext; + xdrv->Base.API.CreateWindowSurface = xlib_eglCreateWindowSurface; + xdrv->Base.API.DestroySurface = xlib_eglDestroySurface; + xdrv->Base.API.MakeCurrent = xlib_eglMakeCurrent; + xdrv->Base.API.SwapBuffers = xlib_eglSwapBuffers; + + xdrv->Base.ClientAPIsMask = find_supported_apis(); + if (xdrv->Base.ClientAPIsMask == 0x0) { + /* the app isn't directly linked with any EGL-supprted APIs + * (such as libGLESv2.so) so use an EGL utility to see what + * APIs might be loaded dynamically on this system. + */ + xdrv->Base.ClientAPIsMask = _eglFindAPIs(); + } + + xdrv->Base.Name = "Xlib/softpipe"; + + /* create one winsys and use it for all contexts/surfaces */ + xdrv->winsys = create_sw_winsys(); + xdrv->winsys->flush_frontbuffer = flush_frontbuffer; + + xdrv->screen = softpipe_create_screen(xdrv->winsys); + + return &xdrv->Base; +} + diff --git a/src/gallium/winsys/egl_xlib/sw_winsys.c b/src/gallium/winsys/egl_xlib/sw_winsys.c new file mode 100644 index 0000000000..2fd190da52 --- /dev/null +++ b/src/gallium/winsys/egl_xlib/sw_winsys.c @@ -0,0 +1,284 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/** + * Totally software-based winsys layer. + * Note that the one winsys function that we can't implement here + * is flush_frontbuffer(). + * Whoever uses this code will have to provide that. + * + * Authors: Brian Paul + */ + + +#include "pipe/p_winsys.h" +#include "pipe/p_state.h" +#include "pipe/p_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" + +#include "sw_winsys.h" + + + +/** Subclass of pipe_winsys */ +struct sw_pipe_winsys +{ + struct pipe_winsys Base; + /* no extra fields for now */ +}; + + +/** subclass of pipe_buffer */ +struct sw_pipe_buffer +{ + struct pipe_buffer Base; + boolean UserBuffer; /** Is this a user-space buffer? */ + void *Data; + void *Mapped; +}; + + +/** cast wrapper */ +static INLINE struct sw_pipe_buffer * +sw_pipe_buffer(struct pipe_buffer *b) +{ + return (struct sw_pipe_buffer *) b; +} + + +/** + * Round n up to next multiple. + */ +static INLINE unsigned +round_up(unsigned n, unsigned multiple) +{ + return (n + multiple - 1) & ~(multiple - 1); +} + + +static const char * +get_name(struct pipe_winsys *pws) +{ + return "software"; +} + + +/** Create new pipe_buffer and allocate storage of given size */ +static struct pipe_buffer * +buffer_create(struct pipe_winsys *pws, + unsigned alignment, + unsigned usage, + unsigned size) +{ + struct sw_pipe_buffer *buffer = CALLOC_STRUCT(sw_pipe_buffer); + if (!buffer) + return NULL; + + buffer->Base.refcount = 1; + buffer->Base.alignment = alignment; + buffer->Base.usage = usage; + buffer->Base.size = size; + + /* align to 16-byte multiple for Cell */ + buffer->Data = align_malloc(size, MAX2(alignment, 16)); + + return &buffer->Base; +} + + +/** + * Create buffer which wraps user-space data. + */ +static struct pipe_buffer * +user_buffer_create(struct pipe_winsys *pws, void *ptr, unsigned bytes) +{ + struct sw_pipe_buffer *buffer = CALLOC_STRUCT(sw_pipe_buffer); + if (!buffer) + return NULL; + + buffer->Base.refcount = 1; + buffer->Base.size = bytes; + buffer->UserBuffer = TRUE; + buffer->Data = ptr; + + return &buffer->Base; +} + + +static void * +buffer_map(struct pipe_winsys *pws, struct pipe_buffer *buf, unsigned flags) +{ + struct sw_pipe_buffer *buffer = sw_pipe_buffer(buf); + buffer->Mapped = buffer->Data; + return buffer->Mapped; +} + + +static void +buffer_unmap(struct pipe_winsys *pws, struct pipe_buffer *buf) +{ + struct sw_pipe_buffer *buffer = sw_pipe_buffer(buf); + buffer->Mapped = NULL; +} + + +static void +buffer_destroy(struct pipe_winsys *pws, struct pipe_buffer *buf) +{ + struct sw_pipe_buffer *buffer = sw_pipe_buffer(buf); + + if (buffer->Data && !buffer->UserBuffer) { + align_free(buffer->Data); + buffer->Data = NULL; + } + + free(buffer); +} + + +/** + * Called via winsys->surface_alloc() to create new surfaces. 
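 * The new surface has no storage attached; surface_alloc_storage() below
 * allocates the backing buffer.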
+ */ +static struct pipe_surface * +surface_alloc(struct pipe_winsys *ws) +{ + struct pipe_surface *surf = CALLOC_STRUCT(pipe_surface); + if (!surf) + return NULL; + + surf->refcount = 1; + surf->winsys = ws; + + return surf; +} + + +static int +surface_alloc_storage(struct pipe_winsys *winsys, + struct pipe_surface *surf, + unsigned width, unsigned height, + enum pipe_format format, + unsigned flags, + unsigned tex_usage) +{ + const unsigned alignment = 64; + + surf->width = width; + surf->height = height; + surf->format = format; + pf_get_block(surf->format, &surf->block); + surf->nblocksx = pf_get_nblocksx(&surf->block, width); + surf->nblocksy = pf_get_nblocksy(&surf->block, height); + surf->stride = round_up(surf->nblocksx * surf->block.size, alignment); + surf->usage = flags; + + assert(!surf->buffer); + surf->buffer = winsys->buffer_create(winsys, alignment, + PIPE_BUFFER_USAGE_PIXEL, + surf->stride * height); + if(!surf->buffer) + return -1; + + return 0; +} + + +static void +surface_release(struct pipe_winsys *winsys, struct pipe_surface **s) +{ + struct pipe_surface *surf = *s; + assert(!surf->texture); + surf->refcount--; + if (surf->refcount == 0) { + if (surf->buffer) + winsys_buffer_reference(winsys, &surf->buffer, NULL); + free(surf); + } + *s = NULL; +} + + +static void +fence_reference(struct pipe_winsys *sws, struct pipe_fence_handle **ptr, + struct pipe_fence_handle *fence) +{ + /* no-op */ +} + + +static int +fence_signalled(struct pipe_winsys *sws, struct pipe_fence_handle *fence, + unsigned flag) +{ + /* no-op */ + return 0; +} + + +static int +fence_finish(struct pipe_winsys *sws, struct pipe_fence_handle *fence, + unsigned flag) +{ + /* no-op */ + return 0; +} + + +/** + * Create/return a new pipe_winsys object. + */ +struct pipe_winsys * +create_sw_winsys(void) +{ + struct sw_pipe_winsys *ws = CALLOC_STRUCT(sw_pipe_winsys); + if (!ws) + return NULL; + + /* Fill in this struct with callbacks that pipe will need to + * communicate with the window system, buffer manager, etc. + */ + ws->Base.buffer_create = buffer_create; + ws->Base.user_buffer_create = user_buffer_create; + ws->Base.buffer_map = buffer_map; + ws->Base.buffer_unmap = buffer_unmap; + ws->Base.buffer_destroy = buffer_destroy; + + ws->Base.surface_alloc = surface_alloc; + ws->Base.surface_alloc_storage = surface_alloc_storage; + ws->Base.surface_release = surface_release; + + ws->Base.fence_reference = fence_reference; + ws->Base.fence_signalled = fence_signalled; + ws->Base.fence_finish = fence_finish; + + ws->Base.flush_frontbuffer = NULL; /* not implemented here! */ + + ws->Base.get_name = get_name; + + return &ws->Base; +} diff --git a/src/gallium/winsys/egl_xlib/sw_winsys.h b/src/gallium/winsys/egl_xlib/sw_winsys.h new file mode 100644 index 0000000000..f96c5a14b0 --- /dev/null +++ b/src/gallium/winsys/egl_xlib/sw_winsys.h @@ -0,0 +1,40 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + + +#ifndef SW_WINSYS_H +#define SW_WINSYS_H + + +struct pipe_winsys; + + +extern struct pipe_winsys * +create_sw_winsys(void); + + +#endif /* SW_WINSYS_H */ diff --git a/src/gallium/winsys/g3dvl/nouveau/Makefile b/src/gallium/winsys/g3dvl/nouveau/Makefile new file mode 100644 index 0000000000..ff43327778 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/Makefile @@ -0,0 +1,50 @@ +TARGET = libnouveau_dri.so +GALLIUMDIR = ../../.. +DRMDIR ?= /usr +DRIDIR = ../../../../driclient + +OBJECTS = nouveau_bo.o nouveau_fence.o nouveau_swapbuffers.o nouveau_channel.o \ + nouveau_grobj.o nouveau_context.o nouveau_winsys.o nouveau_lock.o \ + nouveau_winsys_pipe.o nouveau_device.o nouveau_notifier.o nouveau_dma.o \ + nouveau_pushbuf.o nouveau_resource.o nouveau_screen.o nv04_surface.o \ + nv50_surface.o #nouveau_winsys_softpipe.o + +CFLAGS += -g -Wall -fPIC \ + -I${GALLIUMDIR}/include \ + -I${GALLIUMDIR}/winsys/g3dvl \ + -I${DRMDIR}/include \ + -I${DRMDIR}/include/drm \ + -I${GALLIUMDIR}/drivers \ + -I${GALLIUMDIR}/auxiliary \ + -I${DRIDIR}/include + +LDFLAGS += -L${DRMDIR}/lib \ + -L${DRIDIR}/lib \ + -L${GALLIUMDIR}/auxiliary/draw \ + -L${GALLIUMDIR}/auxiliary/tgsi \ + -L${GALLIUMDIR}/auxiliary/translate \ + -L${GALLIUMDIR}/auxiliary/rtasm \ + -L${GALLIUMDIR}/auxiliary/cso_cache \ + -L${GALLIUMDIR}/drivers/nv10 \ + -L${GALLIUMDIR}/drivers/nv20 \ + -L${GALLIUMDIR}/drivers/nv30 \ + -L${GALLIUMDIR}/drivers/nv40 \ + -L${GALLIUMDIR}/drivers/nv50 + +LIBS += -ldriclient -ldrm -lnv10 -lnv20 -lnv30 -lnv40 -lnv50 -ldraw -ltgsi -ltranslate -lrtasm -lcso_cache -lm + +############################################# + +.PHONY = all clean libdriclient + +all: ${TARGET} + +${TARGET}: ${OBJECTS} libdriclient + $(CC) ${LDFLAGS} -shared -o $@ ${OBJECTS} ${LIBS} + +libdriclient: + cd ${DRIDIR}/src; ${MAKE} + +clean: + cd ${DRIDIR}/src; ${MAKE} clean + rm -rf ${OBJECTS} ${TARGET} diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_bo.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_bo.c new file mode 120000 index 0000000000..1005282dd4 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_bo.c @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_bo.c
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_channel.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_channel.c new file mode 120000 index 0000000000..5af8202950 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_channel.c @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_channel.c
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_context.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_context.c new file mode 100644 index 0000000000..06a61fcda3 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_context.c @@ -0,0 +1,370 @@ +#include "pipe/p_defines.h" +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "util/u_memory.h" + +#include "nouveau_context.h" +#include "nouveau_dri.h" +#include "nouveau_local.h" +#include "nouveau_screen.h" +#include "nouveau_winsys_pipe.h" + +/* +#ifdef DEBUG +static const struct dri_debug_control debug_control[] = { + { "bo", DEBUG_BO }, + { NULL, 0 } +}; +int __nouveau_debug = 0; +#endif +*/ + +/* + * TODO: Re-examine dri_screen, dri_context, nouveau_screen, nouveau_context + * relationships, seems like there is a lot of room for simplification there. + */ + +static void +nouveau_channel_context_destroy(struct nouveau_channel_context *nvc) +{ + nouveau_grobj_free(&nvc->NvCtxSurf2D); + nouveau_grobj_free(&nvc->NvImageBlit); + nouveau_grobj_free(&nvc->NvGdiRect); + nouveau_grobj_free(&nvc->NvM2MF); + nouveau_grobj_free(&nvc->Nv2D); + nouveau_grobj_free(&nvc->NvSwzSurf); + nouveau_grobj_free(&nvc->NvSIFM); + + nouveau_notifier_free(&nvc->sync_notifier); + + nouveau_channel_free(&nvc->channel); + + FREE(nvc); +} + +static struct nouveau_channel_context * +nouveau_channel_context_create(struct nouveau_device *dev) +{ + struct nouveau_channel_context *nvc; + int ret; + + nvc = CALLOC_STRUCT(nouveau_channel_context); + if (!nvc) + return NULL; + + if ((ret = nouveau_channel_alloc(dev, 0x8003d001, 0x8003d002, + &nvc->channel))) { + NOUVEAU_ERR("Error creating GPU channel: %d\n", ret); + nouveau_channel_context_destroy(nvc); + return NULL; + } + + nvc->next_handle = 0x80000000; + + if ((ret = nouveau_notifier_alloc(nvc->channel, nvc->next_handle++, 1, + &nvc->sync_notifier))) { + NOUVEAU_ERR("Error creating channel sync notifier: %d\n", ret); + nouveau_channel_context_destroy(nvc); + return NULL; + } + + switch (dev->chipset & 0xf0) { + case 0x50: + case 0x80: + case 0x90: + ret = nouveau_surface_channel_create_nv50(nvc); + break; + default: + ret = nouveau_surface_channel_create_nv04(nvc); + break; + } + + if (ret) { + NOUVEAU_ERR("Error initialising surface objects: %d\n", ret); + nouveau_channel_context_destroy(nvc); + return NULL; + } + + return nvc; +} + +int +nouveau_context_create(dri_context_t *dri_context) +{ + dri_screen_t *dri_screen = dri_context->dri_screen; + struct nouveau_screen *nv_screen = dri_screen->private; + struct nouveau_context *nv = CALLOC_STRUCT(nouveau_context); + struct pipe_context *pipe = NULL; + struct nouveau_channel_context *nvc = NULL; + struct nouveau_device *dev = nv_screen->device; + int i; + + switch (dev->chipset & 0xf0) { + case 0x10: + case 0x20: + /* NV10 */ + case 0x30: + /* NV30 */ + case 0x40: + case 0x60: + /* NV40 */ + case 0x50: + case 0x80: + case 0x90: + /* G80 */ + break; + default: + NOUVEAU_ERR("Unsupported chipset: NV%02x\n", dev->chipset); + return 1; + } + + dri_context->private = (void*)nv; + nv->dri_context = dri_context; + nv->nv_screen = nv_screen; + + { + struct nouveau_device_priv *nvdev = nouveau_device(dev); + + nvdev->ctx = dri_context->drm_context; + nvdev->lock = (drmLock*)&dri_screen->sarea->lock; + } + + /* + driParseConfigFiles(&nv->dri_option_cache, &nv_screen->option_cache, + nv->dri_screen->myNum, "nouveau"); +#ifdef DEBUG + __nouveau_debug = driParseDebugString(getenv("NOUVEAU_DEBUG"), + debug_control); +#endif + */ + + 
/*XXX: Hack up a fake region and buffer object for front buffer. + * This will go away with TTM, replaced with a simple reference + * of the front buffer handle passed to us by the DDX. + */ + { + struct pipe_surface *fb_surf; + struct nouveau_pipe_buffer *fb_buf; + struct nouveau_bo_priv *fb_bo; + + fb_bo = calloc(1, sizeof(struct nouveau_bo_priv)); + fb_bo->drm.offset = nv_screen->front_offset; + fb_bo->drm.flags = NOUVEAU_MEM_FB; + fb_bo->drm.size = nv_screen->front_pitch * + nv_screen->front_height; + fb_bo->refcount = 1; + fb_bo->base.flags = NOUVEAU_BO_PIN | NOUVEAU_BO_VRAM; + fb_bo->base.offset = fb_bo->drm.offset; + fb_bo->base.handle = (unsigned long)fb_bo; + fb_bo->base.size = fb_bo->drm.size; + fb_bo->base.device = nv_screen->device; + + fb_buf = calloc(1, sizeof(struct nouveau_pipe_buffer)); + fb_buf->bo = &fb_bo->base; + + fb_surf = calloc(1, sizeof(struct pipe_surface)); + if (nv_screen->front_cpp == 2) + fb_surf->format = PIPE_FORMAT_R5G6B5_UNORM; + else + fb_surf->format = PIPE_FORMAT_A8R8G8B8_UNORM; + pf_get_block(fb_surf->format, &fb_surf->block); + fb_surf->width = nv_screen->front_pitch / nv_screen->front_cpp; + fb_surf->height = nv_screen->front_height; + fb_surf->stride = fb_surf->width * fb_surf->block.size; + fb_surf->refcount = 1; + fb_surf->buffer = &fb_buf->base; + + nv->frontbuffer = fb_surf; + } + + nvc = nv_screen->nvc; + + if (!nvc) { + nvc = nouveau_channel_context_create(dev); + if (!nvc) { + NOUVEAU_ERR("Failed initialising GPU context\n"); + return 1; + } + nv_screen->nvc = nvc; + } + + nvc->refcount++; + nv->nvc = nvc; + + /* Find a free slot for a pipe context, allocate a new one if needed */ + nv->pctx_id = -1; + for (i = 0; i < nvc->nr_pctx; i++) { + if (nvc->pctx[i] == NULL) { + nv->pctx_id = i; + break; + } + } + + if (nv->pctx_id < 0) { + nv->pctx_id = nvc->nr_pctx++; + nvc->pctx = + realloc(nvc->pctx, + sizeof(struct pipe_context *) * nvc->nr_pctx); + } + + /* Create pipe */ + switch (dev->chipset & 0xf0) { + case 0x50: + case 0x80: + case 0x90: + if (nouveau_surface_init_nv50(nv)) + return 1; + break; + default: + if (nouveau_surface_init_nv04(nv)) + return 1; + break; + } + + if (!getenv("NOUVEAU_FORCE_SOFTPIPE")) { + struct pipe_screen *pscreen; + + pipe = nouveau_pipe_create(nv); + if (!pipe) + NOUVEAU_ERR("Couldn't create hw pipe\n"); + pscreen = nvc->pscreen; + + nv->cap.hw_vertex_buffer = + pscreen->get_param(pscreen, NOUVEAU_CAP_HW_VTXBUF); + nv->cap.hw_index_buffer = + pscreen->get_param(pscreen, NOUVEAU_CAP_HW_IDXBUF); + } + + /* XXX: nouveau_winsys_softpipe needs a mesa header removed before we can compile it. 
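	 * Until that is resolved the softpipe fallback below stays commented
	 * out, and failing to create a hardware pipe is treated as an error.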
*/ + /* + if (!pipe) { + NOUVEAU_MSG("Using softpipe\n"); + pipe = nouveau_create_softpipe(nv); + if (!pipe) { + NOUVEAU_ERR("Error creating pipe, bailing\n"); + return 1; + } + } + */ + if (!pipe) { + NOUVEAU_ERR("Error creating pipe, bailing\n"); + return 1; + } + + pipe->priv = nv; + + return 0; +} + +void +nouveau_context_destroy(dri_context_t *dri_context) +{ + struct nouveau_context *nv = dri_context->private; + struct nouveau_channel_context *nvc = nv->nvc; + + assert(nv); + + if (nv->pctx_id >= 0) { + nvc->pctx[nv->pctx_id] = NULL; + if (--nvc->refcount <= 0) { + nouveau_channel_context_destroy(nvc); + nv->nv_screen->nvc = NULL; + } + } + + free(nv); +} + +int +nouveau_context_bind(struct nouveau_context *nv, dri_drawable_t *dri_drawable) +{ + assert(nv); + assert(dri_drawable); + + if (nv->dri_drawable != dri_drawable) + { + nv->dri_drawable = dri_drawable; + dri_drawable->private = nv; + } + + return 0; +} + +int +nouveau_context_unbind(struct nouveau_context *nv) +{ + assert(nv); + + nv->dri_drawable = NULL; + + return 0; +} + +/* Show starts here */ + +int bind_pipe_drawable(struct pipe_context *pipe, Drawable drawable) +{ + struct nouveau_context *nv; + dri_drawable_t *dri_drawable; + + nv = pipe->priv; + + driCreateDrawable(nv->nv_screen->dri_screen, drawable, &dri_drawable); + + nouveau_context_bind(nv, dri_drawable); + + return 0; +} + +int unbind_pipe_drawable(struct pipe_context *pipe) +{ + nouveau_context_unbind(pipe->priv); + + return 0; +} + +struct pipe_context* create_pipe_context(Display *display, int screen) +{ + dri_screen_t *dri_screen; + dri_framebuffer_t dri_framebuf; + dri_context_t *dri_context; + struct nouveau_context *nv; + + driCreateScreen(display, screen, &dri_screen, &dri_framebuf); + driCreateContext(dri_screen, XDefaultVisual(display, screen), &dri_context); + + nouveau_screen_create(dri_screen, &dri_framebuf); + nouveau_context_create(dri_context); + + nv = dri_context->private; + + return nv->nvc->pctx[nv->pctx_id]; +} + +int destroy_pipe_context(struct pipe_context *pipe) +{ + struct pipe_screen *screen; + struct pipe_winsys *winsys; + struct nouveau_context *nv; + dri_screen_t *dri_screen; + dri_context_t *dri_context; + + assert(pipe); + + screen = pipe->screen; + winsys = pipe->winsys; + nv = pipe->priv; + dri_context = nv->dri_context; + dri_screen = dri_context->dri_screen; + + pipe->destroy(pipe); + screen->destroy(screen); + free(winsys); + + nouveau_context_destroy(dri_context); + nouveau_screen_destroy(dri_screen); + driDestroyContext(dri_context); + driDestroyScreen(dri_screen); + + return 0; +} diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_context.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_context.h new file mode 100644 index 0000000000..395a3ab790 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_context.h @@ -0,0 +1,105 @@ +#ifndef __NOUVEAU_CONTEXT_H__ +#define __NOUVEAU_CONTEXT_H__ + +/*#include "xmlconfig.h"*/ + +#include <driclient.h> +#include "nouveau/nouveau_winsys.h" +#include "nouveau_drmif.h" +#include "nouveau_dma.h" + +struct nouveau_channel_context { + struct pipe_screen *pscreen; + int refcount; + + unsigned cur_pctx; + unsigned nr_pctx; + struct pipe_context **pctx; + + struct nouveau_channel *channel; + + struct nouveau_notifier *sync_notifier; + + /* Common */ + struct nouveau_grobj *NvM2MF; + /* NV04-NV40 */ + struct nouveau_grobj *NvCtxSurf2D; + struct nouveau_grobj *NvSwzSurf; + struct nouveau_grobj *NvImageBlit; + struct nouveau_grobj *NvGdiRect; + struct nouveau_grobj *NvSIFM; + /* G80 */ + 
struct nouveau_grobj *Nv2D; + + uint32_t next_handle; + uint32_t next_subchannel; + uint32_t next_sequence; +}; + +struct nouveau_context { + /* DRI stuff */ + dri_context_t *dri_context; + dri_drawable_t *dri_drawable; + unsigned int last_stamp; + /*driOptionCache dri_option_cache;*/ + drm_context_t drm_context; + drmLock drm_lock; + int locked; + struct nouveau_screen *nv_screen; + struct pipe_surface *frontbuffer; + + struct { + int hw_vertex_buffer; + int hw_index_buffer; + } cap; + + /* Hardware context */ + struct nouveau_channel_context *nvc; + int pctx_id; + + /* pipe_surface accel */ + struct pipe_surface *surf_src, *surf_dst; + unsigned surf_src_offset, surf_dst_offset; + + int (*surface_copy_prep)(struct nouveau_context *, + struct pipe_surface *dst, + struct pipe_surface *src); + void (*surface_copy)(struct nouveau_context *, unsigned dx, unsigned dy, + unsigned sx, unsigned sy, unsigned w, unsigned h); + void (*surface_copy_done)(struct nouveau_context *); + int (*surface_fill)(struct nouveau_context *, struct pipe_surface *, + unsigned, unsigned, unsigned, unsigned, unsigned); +}; + +extern int nouveau_context_create(dri_context_t *); +extern void nouveau_context_destroy(dri_context_t *); +extern int nouveau_context_bind(struct nouveau_context *, dri_drawable_t *); +extern int nouveau_context_unbind(struct nouveau_context *); + +#ifdef DEBUG +extern int __nouveau_debug; + +#define DEBUG_BO (1 << 0) + +#define DBG(flag, ...) do { \ + if (__nouveau_debug & (DEBUG_##flag)) \ + NOUVEAU_ERR(__VA_ARGS__); \ +} while(0) +#else +#define DBG(flag, ...) +#endif + +extern void LOCK_HARDWARE(struct nouveau_context *); +extern void UNLOCK_HARDWARE(struct nouveau_context *); + +extern int +nouveau_surface_channel_create_nv04(struct nouveau_channel_context *); +extern int +nouveau_surface_channel_create_nv50(struct nouveau_channel_context *); +extern int nouveau_surface_init_nv04(struct nouveau_context *); +extern int nouveau_surface_init_nv50(struct nouveau_context *); + +extern uint32_t *nouveau_pipe_dma_beginp(struct nouveau_grobj *, int, int); +extern void nouveau_pipe_dma_kickoff(struct nouveau_channel *); + +#endif diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_device.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_device.c new file mode 120000 index 0000000000..63f1fa040c --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_device.c @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_device.c
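For illustration only (not part of the diff): a minimal g3dvl client driving the entry points defined in nouveau_context.c above (create_pipe_context, bind_pipe_drawable, unbind_pipe_drawable, destroy_pipe_context) and declared in the vl_winsys.h header added later in this patch. The Xlib window setup, its dimensions, and the function name example_main are assumptions of the sketch, not code from the patch.

#include <X11/Xlib.h>
#include "vl_winsys.h"   /* g3dvl winsys interface added later in this patch */

int example_main(void)
{
        Display *dpy = XOpenDisplay(NULL);
        Window win;
        struct pipe_context *pipe;

        if (!dpy)
                return 1;

        win = XCreateSimpleWindow(dpy, XDefaultRootWindow(dpy),
                                  0, 0, 320, 240, 0, 0, 0);
        XMapWindow(dpy, win);

        /* Sets up the DRI screen/context and returns the hardware pipe,
         * failing if the DRI/DRM/DDX version checks do not pass. */
        pipe = create_pipe_context(dpy, XDefaultScreen(dpy));
        if (!pipe)
                return 1;

        bind_pipe_drawable(pipe, win);   /* presentation target for flushes */

        /* ... render through the pipe context ... */

        unbind_pipe_drawable(pipe);
        destroy_pipe_context(pipe);
        XCloseDisplay(dpy);
        return 0;
}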
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_dma.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_dma.c new file mode 120000 index 0000000000..cd0d32e537 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_dma.c @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_dma.c
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_dma.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_dma.h new file mode 120000 index 0000000000..e6c7d4bc96 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_dma.h @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_dma.h
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_dri.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_dri.h new file mode 120000 index 0000000000..c8f9dbdc3a --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_dri.h @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_dri.h
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_drmif.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_drmif.h new file mode 120000 index 0000000000..27082c9d34 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_drmif.h @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_drmif.h
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_fence.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_fence.c new file mode 120000 index 0000000000..51a50527e6 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_fence.c @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_fence.c
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_grobj.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_grobj.c new file mode 120000 index 0000000000..db17c72e6d --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_grobj.c @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_grobj.c
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_local.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_local.h new file mode 120000 index 0000000000..4e9d3042cb --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_local.h @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_local.h
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_lock.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_lock.c new file mode 100644 index 0000000000..375634bd05 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_lock.c @@ -0,0 +1,92 @@ +/************************************************************************** + * + * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include <pthread.h> +#include <driclient.h> +#include "nouveau_context.h" +#include "nouveau_screen.h" + +static pthread_mutex_t lockMutex = PTHREAD_MUTEX_INITIALIZER; + +static void +nouveau_contended_lock(struct nouveau_context *nv, unsigned int flags) +{ + dri_drawable_t *dri_drawable = nv->dri_drawable; + dri_screen_t *dri_screen = nv->dri_context->dri_screen; + struct nouveau_screen *nv_screen = nv->nv_screen; + struct nouveau_device *dev = nv_screen->device; + struct nouveau_device_priv *nvdev = nouveau_device(dev); + + drmGetLock(nvdev->fd, nvdev->ctx, flags); + + /* If the window moved, may need to set a new cliprect now. + * + * NOTE: This releases and regains the hw lock, so all state + * checking must be done *after* this call: + */ + if (dri_drawable) + DRI_VALIDATE_DRAWABLE_INFO(dri_screen, dri_drawable); +} + +/* Lock the hardware and validate our state. 
+ */ +void +LOCK_HARDWARE(struct nouveau_context *nv) +{ + struct nouveau_screen *nv_screen = nv->nv_screen; + struct nouveau_device *dev = nv_screen->device; + struct nouveau_device_priv *nvdev = nouveau_device(dev); + char __ret=0; + + pthread_mutex_lock(&lockMutex); + assert(!nv->locked); + + DRM_CAS(nvdev->lock, nvdev->ctx, + (DRM_LOCK_HELD | nvdev->ctx), __ret); + + if (__ret) + nouveau_contended_lock(nv, 0); + nv->locked = 1; +} + + +/* Unlock the hardware using the global current context + */ +void +UNLOCK_HARDWARE(struct nouveau_context *nv) +{ + struct nouveau_screen *nv_screen = nv->nv_screen; + struct nouveau_device *dev = nv_screen->device; + struct nouveau_device_priv *nvdev = nouveau_device(dev); + + assert(nv->locked); + nv->locked = 0; + + DRM_UNLOCK(nvdev->fd, nvdev->lock, nvdev->ctx); + + pthread_mutex_unlock(&lockMutex); +} diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_notifier.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_notifier.c new file mode 120000 index 0000000000..703bc3ce4a --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_notifier.c @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_notifier.c
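For illustration only (not part of the diff): the expected bracketing of command submission with the locking helpers in nouveau_lock.c above, following the same pattern nouveau_copy_buffer uses later in nouveau_swapbuffers.c. The function name and surface arguments are assumptions of the sketch; FIRE_RING is the pushbuf kick-off macro from nouveau_local.h.

static void example_present(struct nouveau_context *nv,
                            struct pipe_surface *back)
{
        /* Take the heavyweight lock: DRM_CAS fast path, drmGetLock plus
         * cliprect revalidation on contention. */
        LOCK_HARDWARE(nv);

        nv->surface_copy_prep(nv, nv->frontbuffer, back);
        nv->surface_copy(nv, 0, 0, 0, 0, back->width, back->height);

        /* Kick the pushbuf while the lock is still held, then drop it. */
        FIRE_RING(nv->nvc->channel);
        UNLOCK_HARDWARE(nv);
}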
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_pushbuf.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_pushbuf.c new file mode 120000 index 0000000000..4ac137cada --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_pushbuf.c @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_pushbuf.c
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_resource.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_resource.c new file mode 120000 index 0000000000..2241af328f --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_resource.c @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_resource.c
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_screen.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_screen.c new file mode 100644 index 0000000000..f80d00050c --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_screen.c @@ -0,0 +1,91 @@ +#include "pipe/p_context.h" +#include "util/u_memory.h" +#include "nouveau_context.h" +#include <nouveau_drm.h> +#include "nouveau_dri.h" +#include "nouveau_local.h" +#include "nouveau_screen.h" +#include "nouveau_swapbuffers.h" + +#if NOUVEAU_DRM_HEADER_PATCHLEVEL != 11 +#error nouveau_drm.h version does not match expected version +#endif + +/* +PUBLIC const char __driConfigOptions[] = +DRI_CONF_BEGIN +DRI_CONF_END; +static const GLuint __driNConfigOptions = 0; +*/ + +int nouveau_check_dri_drm_ddx(dri_version_t *dri, dri_version_t *drm, dri_version_t *ddx) +{ + static const dri_version_t ddx_expected = {0, 0, NOUVEAU_DRM_HEADER_PATCHLEVEL}; + static const dri_version_t dri_expected = {4, 0, 0}; + static const dri_version_t drm_expected = {0, 0, NOUVEAU_DRM_HEADER_PATCHLEVEL}; + + assert(dri); + assert(drm); + assert(ddx); + + if (dri->major != dri_expected.major || dri->minor < dri_expected.minor) + { + NOUVEAU_ERR("Unexpected DRI version.\n"); + return 1; + } + if (drm->major != drm_expected.major || drm->minor < drm_expected.minor) + { + NOUVEAU_ERR("Unexpected DRM version.\n"); + return 1; + } + if (ddx->major != ddx_expected.major || ddx->minor < ddx_expected.minor) + { + NOUVEAU_ERR("Unexpected DDX version.\n"); + return 1; + } + + return 0; +} + +int +nouveau_screen_create(dri_screen_t *dri_screen, dri_framebuffer_t *dri_framebuf) +{ + struct nouveau_dri *nv_dri = dri_framebuf->private; + struct nouveau_screen *nv_screen; + int ret; + + if (nouveau_check_dri_drm_ddx(&dri_screen->dri, &dri_screen->drm, &dri_screen->ddx)) + return 1; + + nv_screen = CALLOC_STRUCT(nouveau_screen); + if (!nv_screen) + return 1; + nv_screen->dri_screen = dri_screen; + dri_screen->private = (void*)nv_screen; + + /* + driParseOptionInfo(&nv_screen->option_cache, + __driConfigOptions, __driNConfigOptions); + */ + + if ((ret = nouveau_device_open_existing(&nv_screen->device, 0, + dri_screen->fd, 0))) { + NOUVEAU_ERR("Failed opening nouveau device: %d.\n", ret); + return 1; + } + + nv_screen->front_offset = nv_dri->front_offset; + nv_screen->front_pitch = nv_dri->front_pitch * (nv_dri->bpp / 8); + nv_screen->front_cpp = nv_dri->bpp / 8; + nv_screen->front_height = nv_dri->height; + + return 0; +} + +void +nouveau_screen_destroy(dri_screen_t *dri_screen) +{ + struct nouveau_screen *nv_screen = dri_screen->private; + + FREE(nv_screen); +} diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_screen.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_screen.h new file mode 100644 index 0000000000..8a58bb7556 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_screen.h @@ -0,0 +1,24 @@ +#ifndef __NOUVEAU_SCREEN_H__ +#define __NOUVEAU_SCREEN_H__ + +/* TODO: Investigate using DRI options for interesting things */ +/*#include "xmlconfig.h"*/ + +struct nouveau_screen { + dri_screen_t *dri_screen; + struct nouveau_device *device; + struct nouveau_channel_context *nvc; + + uint32_t front_offset; + uint32_t front_pitch; + uint32_t front_cpp; + uint32_t front_height; + + /*driOptionCache option_cache;*/ +}; + +int nouveau_screen_create(dri_screen_t *dri_screen, dri_framebuffer_t *dri_framebuf); +void nouveau_screen_destroy(dri_screen_t *dri_screen); + +#endif + diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_swapbuffers.c 
b/src/gallium/winsys/g3dvl/nouveau/nouveau_swapbuffers.c new file mode 100644 index 0000000000..7916c80615 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_swapbuffers.c @@ -0,0 +1,64 @@ +#include "pipe/p_context.h" +#include "nouveau_context.h" +#include "nouveau_local.h" +#include "nouveau_screen.h" +#include "nouveau_swapbuffers.h" + +void +nouveau_copy_buffer(dri_drawable_t *dri_drawable, struct pipe_surface *surf, + const drm_clip_rect_t *rect) +{ + struct nouveau_context *nv = dri_drawable->private; + drm_clip_rect_t *pbox; + int nbox, i; + + LOCK_HARDWARE(nv); + if (!dri_drawable->num_cliprects) { + UNLOCK_HARDWARE(nv); + return; + } + pbox = dri_drawable->cliprects; + nbox = dri_drawable->num_cliprects; + + nv->surface_copy_prep(nv, nv->frontbuffer, surf); + for (i = 0; i < nbox; i++, pbox++) { + int sx, sy, dx, dy, w, h; + + sx = pbox->x1 - dri_drawable->x; + sy = pbox->y1 - dri_drawable->y; + dx = pbox->x1; + dy = pbox->y1; + w = pbox->x2 - pbox->x1; + h = pbox->y2 - pbox->y1; + + nv->surface_copy(nv, dx, dy, sx, sy, w, h); + } + + FIRE_RING(nv->nvc->channel); + UNLOCK_HARDWARE(nv); + + //if (nv->last_stamp != dri_drawable->last_sarea_stamp) + //nv->last_stamp = dri_drawable->last_sarea_stamp; +} + +void +nouveau_copy_sub_buffer(dri_drawable_t *dri_drawable, struct pipe_surface *surf, int x, int y, int w, int h) +{ + if (surf) { + drm_clip_rect_t rect; + rect.x1 = x; + rect.y1 = y; + rect.x2 = x + w; + rect.y2 = y + h; + + nouveau_copy_buffer(dri_drawable, surf, &rect); + } +} + +void +nouveau_swap_buffers(dri_drawable_t *dri_drawable, struct pipe_surface *surf) +{ + if (surf) + nouveau_copy_buffer(dri_drawable, surf, NULL); +} + diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_swapbuffers.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_swapbuffers.h new file mode 100644 index 0000000000..35e934adba --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_swapbuffers.h @@ -0,0 +1,10 @@ +#ifndef __NOUVEAU_SWAPBUFFERS_H__ +#define __NOUVEAU_SWAPBUFFERS_H__ + +extern void nouveau_copy_buffer(dri_drawable_t *, struct pipe_surface *, + const drm_clip_rect_t *); +extern void nouveau_copy_sub_buffer(dri_drawable_t *, struct pipe_surface *, + int x, int y, int w, int h); +extern void nouveau_swap_buffers(dri_drawable_t *, struct pipe_surface *); + +#endif diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys.c new file mode 120000 index 0000000000..ce4052d557 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys.c @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_winsys.c
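Worked example with illustrative numbers (not part of the diff): in the cliprect loop of nouveau_copy_buffer above, a drawable at screen position (100, 50) with a visible rectangle (120, 60)-(220, 160) gives sx = 20, sy = 10 (the offset into the surface being presented), dx = 120, dy = 60 (absolute front-buffer coordinates) and w = h = 100, so only the unoccluded 100x100 region is blitted to the front buffer; fully occluded regions are never touched.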
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_pipe.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_pipe.c new file mode 100644 index 0000000000..4f6ac9cad0 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_pipe.c @@ -0,0 +1,260 @@ +#include "pipe/p_winsys.h" +#include "pipe/p_defines.h" +#include "pipe/p_inlines.h" +#include "util/u_memory.h" + +#include "nouveau_context.h" +#include "nouveau_local.h" +#include "nouveau_screen.h" +#include "nouveau_swapbuffers.h" +#include "nouveau_winsys_pipe.h" + +static void +nouveau_flush_frontbuffer(struct pipe_winsys *pws, struct pipe_surface *surf, + void *context_private) +{ + struct nouveau_context *nv = context_private; + dri_drawable_t *dri_drawable = nv->dri_drawable; + + nouveau_copy_buffer(dri_drawable, surf, NULL); +} + +static const char * +nouveau_get_name(struct pipe_winsys *pws) +{ + return "Nouveau/DRI"; +} + +static struct pipe_surface * +nouveau_surface_alloc(struct pipe_winsys *ws) +{ + struct pipe_surface *surf; + + surf = CALLOC_STRUCT(pipe_surface); + if (!surf) + return NULL; + + surf->refcount = 1; + surf->winsys = ws; + return surf; +} + +/* Borrowed from Mesa's xm_winsys */ +static unsigned int +round_up(unsigned n, unsigned multiple) +{ + return (n + multiple - 1) & ~(multiple - 1); +} + +static int +nouveau_surface_alloc_storage +( + struct pipe_winsys *pws, + struct pipe_surface *surface, + unsigned width, + unsigned height, + enum pipe_format format, + unsigned flags, + unsigned tex_usage +) +{ + const unsigned int ALIGNMENT = 256; + + assert(pws); + assert(surface); + + surface->width = width; + surface->height = height; + surface->format = format; + pf_get_block(format, &surface->block); + surface->nblocksx = pf_get_nblocksx(&surface->block, width); + surface->nblocksy = pf_get_nblocksy(&surface->block, height); + surface->stride = round_up(surface->nblocksx * surface->block.size, ALIGNMENT); + surface->usage = flags; + surface->buffer = pws->buffer_create(pws, ALIGNMENT, PIPE_BUFFER_USAGE_PIXEL, surface->stride * surface->nblocksy); + + return 0; +} + +static void +nouveau_surface_release(struct pipe_winsys *ws, struct pipe_surface **s) +{ + struct pipe_surface *surf = *s; + + *s = NULL; + if (--surf->refcount <= 0) { + if (surf->buffer) + winsys_buffer_reference(ws, &surf->buffer, NULL); + free(surf); + } +} + +static struct pipe_buffer * +nouveau_pipe_bo_create(struct pipe_winsys *pws, unsigned alignment, + unsigned usage, unsigned size) +{ + struct nouveau_pipe_winsys *nvpws = (struct nouveau_pipe_winsys *)pws; + struct nouveau_context *nv = nvpws->nv; + struct nouveau_device *dev = nv->nv_screen->device; + struct nouveau_pipe_buffer *nvbuf; + uint32_t flags; + + nvbuf = calloc(1, sizeof(*nvbuf)); + if (!nvbuf) + return NULL; + nvbuf->base.refcount = 1; + nvbuf->base.alignment = alignment; + nvbuf->base.usage = usage; + nvbuf->base.size = size; + + flags = NOUVEAU_BO_LOCAL; + + if (usage & PIPE_BUFFER_USAGE_PIXEL) { + if (usage & NOUVEAU_BUFFER_USAGE_TEXTURE) + flags |= NOUVEAU_BO_GART; + flags |= NOUVEAU_BO_VRAM; + } + + if (usage & PIPE_BUFFER_USAGE_VERTEX) { + if (nv->cap.hw_vertex_buffer) + flags |= NOUVEAU_BO_GART; + } + + if (usage & PIPE_BUFFER_USAGE_INDEX) { + if (nv->cap.hw_index_buffer) + flags |= NOUVEAU_BO_GART; + } + + if (nouveau_bo_new(dev, flags, alignment, size, &nvbuf->bo)) { + free(nvbuf); + return NULL; + } + + return &nvbuf->base; +} + +static struct pipe_buffer * +nouveau_pipe_bo_user_create(struct pipe_winsys *pws, void *ptr, 
unsigned bytes) +{ + struct nouveau_pipe_winsys *nvpws = (struct nouveau_pipe_winsys *)pws; + struct nouveau_device *dev = nvpws->nv->nv_screen->device; + struct nouveau_pipe_buffer *nvbuf; + + nvbuf = calloc(1, sizeof(*nvbuf)); + if (!nvbuf) + return NULL; + nvbuf->base.refcount = 1; + nvbuf->base.size = bytes; + + if (nouveau_bo_user(dev, ptr, bytes, &nvbuf->bo)) { + free(nvbuf); + return NULL; + } + + return &nvbuf->base; +} + +static void +nouveau_pipe_bo_del(struct pipe_winsys *ws, struct pipe_buffer *buf) +{ + struct nouveau_pipe_buffer *nvbuf = nouveau_buffer(buf); + + nouveau_bo_del(&nvbuf->bo); + free(nvbuf); +} + +static void * +nouveau_pipe_bo_map(struct pipe_winsys *pws, struct pipe_buffer *buf, + unsigned flags) +{ + struct nouveau_pipe_buffer *nvbuf = nouveau_buffer(buf); + uint32_t map_flags = 0; + + if (flags & PIPE_BUFFER_USAGE_CPU_READ) + map_flags |= NOUVEAU_BO_RD; + if (flags & PIPE_BUFFER_USAGE_CPU_WRITE) + map_flags |= NOUVEAU_BO_WR; + + if (nouveau_bo_map(nvbuf->bo, map_flags)) + return NULL; + return nvbuf->bo->map; +} + +static void +nouveau_pipe_bo_unmap(struct pipe_winsys *pws, struct pipe_buffer *buf) +{ + struct nouveau_pipe_buffer *nvbuf = nouveau_buffer(buf); + + nouveau_bo_unmap(nvbuf->bo); +} + +static INLINE struct nouveau_fence * +nouveau_pipe_fence(struct pipe_fence_handle *pfence) +{ + return (struct nouveau_fence *)pfence; +} + +static void +nouveau_pipe_fence_reference(struct pipe_winsys *ws, + struct pipe_fence_handle **ptr, + struct pipe_fence_handle *pfence) +{ + nouveau_fence_ref((void *)pfence, (void *)ptr); +} + +static int +nouveau_pipe_fence_signalled(struct pipe_winsys *ws, + struct pipe_fence_handle *pfence, unsigned flag) +{ + struct nouveau_pipe_winsys *nvpws = (struct nouveau_pipe_winsys *)ws; + struct nouveau_fence *fence = nouveau_pipe_fence(pfence); + + if (nouveau_fence(fence)->signalled == 0) + nouveau_fence_flush(nvpws->nv->nvc->channel); + + return !nouveau_fence(fence)->signalled; +} + +static int +nouveau_pipe_fence_finish(struct pipe_winsys *ws, + struct pipe_fence_handle *pfence, unsigned flag) +{ + struct nouveau_fence *fence = nouveau_pipe_fence(pfence); + struct nouveau_fence *ref = NULL; + + nouveau_fence_ref(fence, &ref); + return nouveau_fence_wait(&ref); +} + +struct pipe_winsys * +nouveau_create_pipe_winsys(struct nouveau_context *nv) +{ + struct nouveau_pipe_winsys *nvpws; + struct pipe_winsys *pws; + + nvpws = CALLOC_STRUCT(nouveau_pipe_winsys); + if (!nvpws) + return NULL; + nvpws->nv = nv; + pws = &nvpws->pws; + + pws->flush_frontbuffer = nouveau_flush_frontbuffer; + + pws->surface_alloc = nouveau_surface_alloc; + pws->surface_alloc_storage = nouveau_surface_alloc_storage; + pws->surface_release = nouveau_surface_release; + + pws->buffer_create = nouveau_pipe_bo_create; + pws->buffer_destroy = nouveau_pipe_bo_del; + pws->user_buffer_create = nouveau_pipe_bo_user_create; + pws->buffer_map = nouveau_pipe_bo_map; + pws->buffer_unmap = nouveau_pipe_bo_unmap; + + pws->fence_reference = nouveau_pipe_fence_reference; + pws->fence_signalled = nouveau_pipe_fence_signalled; + pws->fence_finish = nouveau_pipe_fence_finish; + + pws->get_name = nouveau_get_name; + + return &nvpws->pws; +} diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_pipe.h b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_pipe.h new file mode 120000 index 0000000000..9d9460883e --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_pipe.h @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_winsys_pipe.h
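For illustration only (not part of the diff): how a caller would drive the buffer functions wired up by nouveau_create_pipe_winsys above to upload user data into a new buffer. The alignment value, the PIPE_BUFFER_USAGE_VERTEX usage, and the helper name example_upload are assumptions of the sketch.

#include <string.h>

static struct pipe_buffer *
example_upload(struct pipe_winsys *pws, const void *data, unsigned size)
{
        struct pipe_buffer *buf;
        void *map;

        /* nouveau_pipe_bo_create() chooses VRAM/GART placement from the
         * usage bits and the hw_vertex_buffer/hw_index_buffer caps. */
        buf = pws->buffer_create(pws, 256, PIPE_BUFFER_USAGE_VERTEX, size);
        if (!buf)
                return NULL;

        /* Maps through nouveau_bo_map() with NOUVEAU_BO_WR. */
        map = pws->buffer_map(pws, buf, PIPE_BUFFER_USAGE_CPU_WRITE);
        if (!map) {
                pws->buffer_destroy(pws, buf);
                return NULL;
        }
        memcpy(map, data, size);
        pws->buffer_unmap(pws, buf);

        return buf;
}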
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_softpipe.c b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_softpipe.c new file mode 120000 index 0000000000..ec613ecbf6 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nouveau_winsys_softpipe.c @@ -0,0 +1 @@ +../../drm/nouveau/nouveau_winsys_softpipe.c
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nv04_surface.c b/src/gallium/winsys/g3dvl/nouveau/nv04_surface.c new file mode 120000 index 0000000000..4455d8f924 --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nv04_surface.c @@ -0,0 +1 @@ +../../drm/nouveau/nv04_surface.c
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/nouveau/nv50_surface.c b/src/gallium/winsys/g3dvl/nouveau/nv50_surface.c new file mode 120000 index 0000000000..19f102001e --- /dev/null +++ b/src/gallium/winsys/g3dvl/nouveau/nv50_surface.c @@ -0,0 +1 @@ +../../drm/nouveau/nv50_surface.c
\ No newline at end of file diff --git a/src/gallium/winsys/g3dvl/vl_winsys.h b/src/gallium/winsys/g3dvl/vl_winsys.h new file mode 100644 index 0000000000..c83db28dd9 --- /dev/null +++ b/src/gallium/winsys/g3dvl/vl_winsys.h @@ -0,0 +1,14 @@ +#ifndef vl_winsys_h +#define vl_winsys_h + +#include <X11/Xlib.h> + +struct pipe_context; + +struct pipe_context* create_pipe_context(Display *display, int screen); +int destroy_pipe_context(struct pipe_context *pipe); +int bind_pipe_drawable(struct pipe_context *pipe, Drawable drawable); +int unbind_pipe_drawable(struct pipe_context *pipe); + +#endif + diff --git a/src/gallium/winsys/g3dvl/xsp_winsys.c b/src/gallium/winsys/g3dvl/xsp_winsys.c new file mode 100644 index 0000000000..68be2c2ea3 --- /dev/null +++ b/src/gallium/winsys/g3dvl/xsp_winsys.c @@ -0,0 +1,336 @@ +#include "vl_winsys.h" +#include <X11/Xutil.h> +#include <pipe/p_winsys.h> +#include <pipe/p_state.h> +#include <pipe/p_inlines.h> +#include <util/u_memory.h> +#include <softpipe/sp_winsys.h> + +/* pipe_winsys implementation */ + +struct xsp_pipe_winsys +{ + struct pipe_winsys base; + XImage fbimage; +}; + +struct xsp_context +{ + Display *display; + int screen; + Drawable drawable; + int drawable_bound; +}; + +struct xsp_buffer +{ + struct pipe_buffer base; + boolean is_user_buffer; + void *data; + void *mapped_data; +}; + +static struct pipe_buffer* xsp_buffer_create(struct pipe_winsys *pws, unsigned alignment, unsigned usage, unsigned size) +{ + struct xsp_buffer *buffer; + + assert(pws); + + buffer = calloc(1, sizeof(struct xsp_buffer)); + buffer->base.refcount = 1; + buffer->base.alignment = alignment; + buffer->base.usage = usage; + buffer->base.size = size; + buffer->data = align_malloc(size, alignment); + + return (struct pipe_buffer*)buffer; +} + +static struct pipe_buffer* xsp_user_buffer_create(struct pipe_winsys *pws, void *data, unsigned size) +{ + struct xsp_buffer *buffer; + + assert(pws); + + buffer = calloc(1, sizeof(struct xsp_buffer)); + buffer->base.refcount = 1; + buffer->base.size = size; + buffer->is_user_buffer = TRUE; + buffer->data = data; + + return (struct pipe_buffer*)buffer; +} + +static void* xsp_buffer_map(struct pipe_winsys *pws, struct pipe_buffer *buffer, unsigned flags) +{ + struct xsp_buffer *xsp_buf = (struct xsp_buffer*)buffer; + + assert(pws); + assert(buffer); + + xsp_buf->mapped_data = xsp_buf->data; + + return xsp_buf->mapped_data; +} + +static void xsp_buffer_unmap(struct pipe_winsys *pws, struct pipe_buffer *buffer) +{ + struct xsp_buffer *xsp_buf = (struct xsp_buffer*)buffer; + + assert(pws); + assert(buffer); + + xsp_buf->mapped_data = NULL; +} + +static void xsp_buffer_destroy(struct pipe_winsys *pws, struct pipe_buffer *buffer) +{ + struct xsp_buffer *xsp_buf = (struct xsp_buffer*)buffer; + + assert(pws); + assert(buffer); + + if (!xsp_buf->is_user_buffer) + align_free(xsp_buf->data); + + free(xsp_buf); +} + +static struct pipe_surface* xsp_surface_alloc(struct pipe_winsys *pws) +{ + struct pipe_surface *surface; + + assert(pws); + + surface = calloc(1, sizeof(struct pipe_surface)); + surface->refcount = 1; + surface->winsys = pws; + + return surface; +} + +/* Borrowed from Mesa's xm_winsys */ +static unsigned int round_up(unsigned n, unsigned multiple) +{ + return (n + multiple - 1) & ~(multiple - 1); +} + +static int xsp_surface_alloc_storage +( + struct pipe_winsys *pws, + struct pipe_surface *surface, + unsigned width, + unsigned height, + enum pipe_format format, + unsigned flags, + unsigned tex_usage +) +{ + const unsigned int 
ALIGNMENT = 1; + + assert(pws); + assert(surface); + + surface->width = width; + surface->height = height; + surface->format = format; + pf_get_block(format, &surface->block); + surface->nblocksx = pf_get_nblocksx(&surface->block, width); + surface->nblocksy = pf_get_nblocksy(&surface->block, height); + surface->stride = round_up(surface->nblocksx * surface->block.size, ALIGNMENT); + surface->usage = flags; + surface->buffer = pws->buffer_create(pws, ALIGNMENT, PIPE_BUFFER_USAGE_PIXEL, surface->stride * surface->nblocksy); + + return 0; +} + +static void xsp_surface_release(struct pipe_winsys *pws, struct pipe_surface **surface) +{ + struct pipe_surface *s; + + assert(pws); + assert(surface); + assert(*surface); + + s = *surface; + + s->refcount--; + + if (s->refcount == 0) + { + winsys_buffer_reference(pws, &s->buffer, NULL); + free(s); + } + + *surface = NULL; +} + +static void xsp_fence_reference(struct pipe_winsys *pws, struct pipe_fence_handle **ptr, struct pipe_fence_handle *fence) +{ + assert(pws); + assert(ptr); + assert(fence); +} + +static int xsp_fence_signalled(struct pipe_winsys *pws, struct pipe_fence_handle *fence, unsigned flag) +{ + assert(pws); + assert(fence); + + return 0; +} + +static int xsp_fence_finish(struct pipe_winsys *pws, struct pipe_fence_handle *fence, unsigned flag) +{ + assert(pws); + assert(fence); + + return 0; +} + +static void xsp_flush_frontbuffer(struct pipe_winsys *pws, struct pipe_surface *surface, void *context_private) +{ + struct xsp_pipe_winsys *xsp_winsys; + struct xsp_context *xsp_context; + + assert(pws); + assert(surface); + assert(context_private); + + xsp_winsys = (struct xsp_pipe_winsys*)pws; + xsp_context = (struct xsp_context*)context_private; + + if (!xsp_context->drawable_bound) + return; + + xsp_winsys->fbimage.width = surface->width; + xsp_winsys->fbimage.height = surface->height; + xsp_winsys->fbimage.bytes_per_line = surface->width * (xsp_winsys->fbimage.bits_per_pixel >> 3); + xsp_winsys->fbimage.data = pipe_surface_map(surface, 0); + + XPutImage + ( + xsp_context->display, + xsp_context->drawable, + XDefaultGC(xsp_context->display, xsp_context->screen), + &xsp_winsys->fbimage, + 0, + 0, + 0, + 0, + surface->width, + surface->height + ); + XFlush(xsp_context->display); + pipe_surface_unmap(surface); +} + +static const char* xsp_get_name(struct pipe_winsys *pws) +{ + assert(pws); + return "X11 SoftPipe"; +} + +/* Show starts here */ + +int bind_pipe_drawable(struct pipe_context *pipe, Drawable drawable) +{ + struct xsp_context *xsp_context; + + assert(pipe); + + xsp_context = pipe->priv; + xsp_context->drawable = drawable; + xsp_context->drawable_bound = 1; + + return 0; +} + +int unbind_pipe_drawable(struct pipe_context *pipe) +{ + struct xsp_context *xsp_context; + + assert(pipe); + + xsp_context = pipe->priv; + xsp_context->drawable_bound = 0; + + return 0; +} + +struct pipe_context* create_pipe_context(Display *display, int screen) +{ + struct xsp_pipe_winsys *xsp_winsys; + struct xsp_context *xsp_context; + struct pipe_screen *sp_screen; + struct pipe_context *sp_pipe; + + assert(display); + + xsp_winsys = calloc(1, sizeof(struct xsp_pipe_winsys)); + xsp_winsys->base.buffer_create = xsp_buffer_create; + xsp_winsys->base.user_buffer_create = xsp_user_buffer_create; + xsp_winsys->base.buffer_map = xsp_buffer_map; + xsp_winsys->base.buffer_unmap = xsp_buffer_unmap; + xsp_winsys->base.buffer_destroy = xsp_buffer_destroy; + xsp_winsys->base.surface_alloc = xsp_surface_alloc; + xsp_winsys->base.surface_alloc_storage = 
xsp_surface_alloc_storage; + xsp_winsys->base.surface_release = xsp_surface_release; + xsp_winsys->base.fence_reference = xsp_fence_reference; + xsp_winsys->base.fence_signalled = xsp_fence_signalled; + xsp_winsys->base.fence_finish = xsp_fence_finish; + xsp_winsys->base.flush_frontbuffer = xsp_flush_frontbuffer; + xsp_winsys->base.get_name = xsp_get_name; + + { + /* XXX: Can't use the returned XImage* directly, + since we don't have control over winsys destruction + and we wouldn't be able to free it */ + XImage *template = XCreateImage + ( + display, + XDefaultVisual(display, XDefaultScreen(display)), + XDefaultDepth(display, XDefaultScreen(display)), + ZPixmap, + 0, + NULL, + 0, /* Don't know the width and height until flush_frontbuffer */ + 0, + 32, + 0 + ); + + memcpy(&xsp_winsys->fbimage, template, sizeof(XImage)); + XInitImage(&xsp_winsys->fbimage); + + XDestroyImage(template); + } + + sp_screen = softpipe_create_screen((struct pipe_winsys*)xsp_winsys); + sp_pipe = softpipe_create(sp_screen, (struct pipe_winsys*)xsp_winsys, NULL); + + xsp_context = calloc(1, sizeof(struct xsp_context)); + xsp_context->display = display; + xsp_context->screen = screen; + + sp_pipe->priv = xsp_context; + + return sp_pipe; +} + +int destroy_pipe_context(struct pipe_context *pipe) +{ + struct pipe_screen *screen; + struct pipe_winsys *winsys; + + assert(pipe); + + screen = pipe->screen; + winsys = pipe->winsys; + free(pipe->priv); + pipe->destroy(pipe); + screen->destroy(screen); + free(winsys); + + return 0; +} diff --git a/src/gallium/winsys/gdi/SConscript b/src/gallium/winsys/gdi/SConscript new file mode 100644 index 0000000000..cc6aa6634f --- /dev/null +++ b/src/gallium/winsys/gdi/SConscript @@ -0,0 +1,41 @@ +####################################################################### +# SConscript for gdi winsys + +Import('*') + +if env['platform'] == 'windows': + + env = env.Clone() + + env.Append(CPPPATH = [ + '#src/mesa/state_tracker/wgl', + ]) + + env.Append(CPPDEFINES = [ + ]) + + env.Append(CPPDEFINES = [ + '__GL_EXPORTS', + 'BUILD_GL32', + '_GNU_H_WINDOWS32_DEFINES', + ]) + + sources = [ + '#src/mesa/state_tracker/wgl/opengl32.def', + 'gdi_softpipe_winsys.c', + ] + + drivers = [ + softpipe, + ] + + env.Append(LIBS = [ + 'gdi32', + 'user32' + ]) + + env.SharedLibrary( + target ='opengl32', + source = sources, + LIBS = wgl + glapi + mesa + drivers + auxiliaries + env['LIBS'], + ) diff --git a/src/gallium/winsys/gdi/gdi_softpipe_winsys.c b/src/gallium/winsys/gdi/gdi_softpipe_winsys.c new file mode 100644 index 0000000000..e66ce48f2d --- /dev/null +++ b/src/gallium/winsys/gdi/gdi_softpipe_winsys.c @@ -0,0 +1,344 @@ +/************************************************************************** + * + * Copyright 2008 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * + **************************************************************************/ + +/** + * @file + * Softpipe support. + * + * @author Keith Whitwell + * @author Brian Paul + * @author Jose Fonseca + */ + + +#include <windows.h> + +#include "pipe/p_winsys.h" +#include "pipe/p_format.h" +#include "pipe/p_context.h" +#include "pipe/p_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "softpipe/sp_winsys.h" +#include "stw_winsys.h" + + +struct gdi_softpipe_buffer +{ + struct pipe_buffer base; + boolean userBuffer; /** Is this a user-space buffer? */ + void *data; + void *mapped; +}; + + +/** Cast wrapper */ +static INLINE struct gdi_softpipe_buffer * +gdi_softpipe_buffer( struct pipe_buffer *buf ) +{ + return (struct gdi_softpipe_buffer *)buf; +} + + +static void * +gdi_softpipe_buffer_map(struct pipe_winsys *winsys, + struct pipe_buffer *buf, + unsigned flags) +{ + struct gdi_softpipe_buffer *gdi_softpipe_buf = gdi_softpipe_buffer(buf); + gdi_softpipe_buf->mapped = gdi_softpipe_buf->data; + return gdi_softpipe_buf->mapped; +} + + +static void +gdi_softpipe_buffer_unmap(struct pipe_winsys *winsys, + struct pipe_buffer *buf) +{ + struct gdi_softpipe_buffer *gdi_softpipe_buf = gdi_softpipe_buffer(buf); + gdi_softpipe_buf->mapped = NULL; +} + + +static void +gdi_softpipe_buffer_destroy(struct pipe_winsys *winsys, + struct pipe_buffer *buf) +{ + struct gdi_softpipe_buffer *oldBuf = gdi_softpipe_buffer(buf); + + if (oldBuf->data) { + if (!oldBuf->userBuffer) + align_free(oldBuf->data); + + oldBuf->data = NULL; + } + + FREE(oldBuf); +} + + +static const char * +gdi_softpipe_get_name(struct pipe_winsys *winsys) +{ + return "softpipe"; +} + + +static struct pipe_buffer * +gdi_softpipe_buffer_create(struct pipe_winsys *winsys, + unsigned alignment, + unsigned usage, + unsigned size) +{ + struct gdi_softpipe_buffer *buffer = CALLOC_STRUCT(gdi_softpipe_buffer); + + buffer->base.refcount = 1; + buffer->base.alignment = alignment; + buffer->base.usage = usage; + buffer->base.size = size; + + buffer->data = align_malloc(size, alignment); + + return &buffer->base; +} + + +/** + * Create buffer which wraps user-space data. + */ +static struct pipe_buffer * +gdi_softpipe_user_buffer_create(struct pipe_winsys *winsys, + void *ptr, + unsigned bytes) +{ + struct gdi_softpipe_buffer *buffer; + + buffer = CALLOC_STRUCT(gdi_softpipe_buffer); + if(!buffer) + return NULL; + + buffer->base.refcount = 1; + buffer->base.size = bytes; + buffer->userBuffer = TRUE; + buffer->data = ptr; + + return &buffer->base; +} + + +/** + * Round n up to next multiple. 
+ */ +static INLINE unsigned +round_up(unsigned n, unsigned multiple) +{ + return (n + multiple - 1) & ~(multiple - 1); +} + + +static int +gdi_softpipe_surface_alloc_storage(struct pipe_winsys *winsys, + struct pipe_surface *surf, + unsigned width, unsigned height, + enum pipe_format format, + unsigned flags, + unsigned tex_usage) +{ + const unsigned alignment = 64; + + surf->width = width; + surf->height = height; + surf->format = format; + pf_get_block(format, &surf->block); + surf->nblocksx = pf_get_nblocksx(&surf->block, width); + surf->nblocksy = pf_get_nblocksy(&surf->block, height); + surf->stride = round_up(surf->nblocksx * surf->block.size, alignment); + surf->usage = flags; + + assert(!surf->buffer); + surf->buffer = winsys->buffer_create(winsys, alignment, + PIPE_BUFFER_USAGE_PIXEL, + surf->stride * surf->nblocksy); + if(!surf->buffer) + return -1; + + return 0; +} + + +static struct pipe_surface * +gdi_softpipe_surface_alloc(struct pipe_winsys *winsys) +{ + struct pipe_surface *surface = CALLOC_STRUCT(pipe_surface); + + assert(winsys); + + surface->refcount = 1; + surface->winsys = winsys; + + return surface; +} + + +static void +gdi_softpipe_surface_release(struct pipe_winsys *winsys, + struct pipe_surface **s) +{ + struct pipe_surface *surf = *s; + assert(!surf->texture); + surf->refcount--; + if (surf->refcount == 0) { + if (surf->buffer) + winsys_buffer_reference(winsys, &surf->buffer, NULL); + free(surf); + } + *s = NULL; +} + + +static void +gdi_softpipe_dummy_flush_frontbuffer(struct pipe_winsys *winsys, + struct pipe_surface *surface, + void *context_private) +{ + assert(0); +} + + +static void +gdi_softpipe_fence_reference(struct pipe_winsys *winsys, + struct pipe_fence_handle **ptr, + struct pipe_fence_handle *fence) +{ +} + + +static int +gdi_softpipe_fence_signalled(struct pipe_winsys *winsys, + struct pipe_fence_handle *fence, + unsigned flag) +{ + return 0; +} + + +static int +gdi_softpipe_fence_finish(struct pipe_winsys *winsys, + struct pipe_fence_handle *fence, + unsigned flag) +{ + return 0; +} + + +static void +gdi_softpipe_destroy(struct pipe_winsys *winsys) +{ + FREE(winsys); +} + + +static struct pipe_screen * +gdi_softpipe_screen_create(void) +{ + static struct pipe_winsys *winsys; + struct pipe_screen *screen; + + winsys = CALLOC_STRUCT(pipe_winsys); + if(!winsys) + return NULL; + + winsys->destroy = gdi_softpipe_destroy; + + winsys->buffer_create = gdi_softpipe_buffer_create; + winsys->user_buffer_create = gdi_softpipe_user_buffer_create; + winsys->buffer_map = gdi_softpipe_buffer_map; + winsys->buffer_unmap = gdi_softpipe_buffer_unmap; + winsys->buffer_destroy = gdi_softpipe_buffer_destroy; + + winsys->surface_alloc = gdi_softpipe_surface_alloc; + winsys->surface_alloc_storage = gdi_softpipe_surface_alloc_storage; + winsys->surface_release = gdi_softpipe_surface_release; + + winsys->fence_reference = gdi_softpipe_fence_reference; + winsys->fence_signalled = gdi_softpipe_fence_signalled; + winsys->fence_finish = gdi_softpipe_fence_finish; + + winsys->flush_frontbuffer = gdi_softpipe_dummy_flush_frontbuffer; + winsys->get_name = gdi_softpipe_get_name; + + screen = softpipe_create_screen(winsys); + if(!screen) + gdi_softpipe_destroy(winsys); + + return screen; +} + + +static struct pipe_context * +gdi_softpipe_context_create(struct pipe_screen *screen) +{ + return softpipe_create(screen, screen->winsys, NULL); +} + + +static void +gdi_softpipe_flush_frontbuffer(struct pipe_winsys *winsys, + struct pipe_surface *surface, + HDC hDC) +{ + struct 
gdi_softpipe_buffer *buffer; + BITMAPINFO bmi; + + buffer = gdi_softpipe_buffer(surface->buffer); + + memset(&bmi, 0, sizeof(BITMAPINFO)); + bmi.bmiHeader.biSize = sizeof(BITMAPINFOHEADER); + bmi.bmiHeader.biWidth = surface->stride / pf_get_size(surface->format); + bmi.bmiHeader.biHeight= -surface->height; + bmi.bmiHeader.biPlanes = 1; + bmi.bmiHeader.biBitCount = pf_get_bits(surface->format); + bmi.bmiHeader.biCompression = BI_RGB; + bmi.bmiHeader.biSizeImage = 0; + bmi.bmiHeader.biXPelsPerMeter = 0; + bmi.bmiHeader.biYPelsPerMeter = 0; + bmi.bmiHeader.biClrUsed = 0; + bmi.bmiHeader.biClrImportant = 0; + + StretchDIBits(hDC, + 0, 0, surface->width, surface->height, + 0, 0, surface->width, surface->height, + buffer->data, &bmi, 0, SRCCOPY); +} + + +const struct stw_winsys stw_winsys = { + &gdi_softpipe_screen_create, + &gdi_softpipe_context_create, + &gdi_softpipe_flush_frontbuffer +}; diff --git a/src/gallium/winsys/xlib/Makefile b/src/gallium/winsys/xlib/Makefile new file mode 100644 index 0000000000..11c7632411 --- /dev/null +++ b/src/gallium/winsys/xlib/Makefile @@ -0,0 +1,94 @@ +# src/gallium/winsys/xlib/Makefile + +# This makefile produces a "stand-alone" libGL.so which is based on +# Xlib (no DRI HW acceleration) + + +TOP = ../../../.. +include $(TOP)/configs/current + + +GL_MAJOR = 1 +GL_MINOR = 5 +GL_TINY = 0$(MESA_MAJOR)0$(MESA_MINOR)0$(MESA_TINY) + + +INCLUDE_DIRS = \ + -I$(TOP)/include \ + -I$(TOP)/src/mesa \ + -I$(TOP)/src/mesa/main \ + -I$(TOP)/src/gallium/include \ + -I$(TOP)/src/gallium/drivers \ + -I$(TOP)/src/gallium/auxiliary + +XLIB_WINSYS_SOURCES = \ + glxapi.c \ + fakeglx.c \ + xfonts.c \ + xm_api.c \ + xm_winsys.c \ + xm_winsys_aub.c \ + brw_aub.c + +XLIB_WINSYS_OBJECTS = $(XLIB_WINSYS_SOURCES:.c=.o) + + +# Note: CELL_SPU_LIB is only defined for cell configs + +LIBS = \ + $(GALLIUM_DRIVERS) \ + $(TOP)/src/mesa/libglapi.a \ + $(TOP)/src/mesa/libmesa.a \ + $(GALLIUM_AUXILIARIES) \ + $(CELL_SPU_LIB) \ + + +.SUFFIXES : .cpp + +.c.o: + $(CC) -c $(INCLUDE_DIRS) $(CFLAGS) $< -o $@ + +.cpp.o: + $(CXX) -c $(INCLUDE_DIRS) $(CXXFLAGS) $< -o $@ + + + +default: $(TOP)/$(LIB_DIR)/$(GL_LIB_NAME) + + +# Make the libGL.so library +$(TOP)/$(LIB_DIR)/$(GL_LIB_NAME): $(XLIB_WINSYS_OBJECTS) $(LIBS) + $(TOP)/bin/mklib -o $(GL_LIB) \ + -linker "$(CC)" \ + -major $(GL_MAJOR) -minor $(GL_MINOR) -patch $(GL_TINY) \ + -install $(TOP)/$(LIB_DIR) \ + $(MKLIB_OPTIONS) $(XLIB_WINSYS_OBJECTS) \ + --start-group $(LIBS) --end-group $(GL_LIB_DEPS) + + +depend: $(ALL_SOURCES) + @ echo "running $(MKDEP)" + @ rm -f depend # workaround oops on gutsy?!? + @ touch depend + @ $(MKDEP) $(MKDEP_OPTIONS) $(DEFINES) $(INCLUDE_DIRS) $(ALL_SOURCES) \ + > /dev/null 2>/dev/null + + +install: default + $(INSTALL) -d $(INSTALL_DIR)/include/GL + $(INSTALL) -d $(INSTALL_DIR)/$(LIB_DIR) + $(INSTALL) -m 644 $(TOP)/include/GL/*.h $(INSTALL_DIR)/include/GL + @if [ -e $(TOP)/$(LIB_DIR)/$(GL_LIB_NAME) ]; then \ + $(INSTALL) $(TOP)/$(LIB_DIR)/libGL* $(INSTALL_DIR)/$(LIB_DIR); \ + fi + + +# Emacs tags +tags: + etags `find . 
-name \*.[ch]` $(TOP)/include/GL/*.h + +clean: + -rm -f *.o + + +include depend diff --git a/src/gallium/winsys/xlib/SConscript b/src/gallium/winsys/xlib/SConscript new file mode 100644 index 0000000000..3aef3b6ced --- /dev/null +++ b/src/gallium/winsys/xlib/SConscript @@ -0,0 +1,49 @@ +####################################################################### +# SConscript for xlib winsys + +Import('*') + +if env['platform'] == 'linux' \ + and 'mesa' in env['statetrackers'] \ + and ('softpipe' or 'i915simple' or 'trace') in env['drivers'] \ + and not env['dri']: + + env = env.Clone() + + env.Append(CPPPATH = [ + '#/src/mesa', + '#/src/mesa/main', + ]) + + sources = [ + 'glxapi.c', + 'fakeglx.c', + 'xfonts.c', + 'xm_api.c', + 'xm_winsys.c', + ] + + drivers = []; + + if 'softpipe' in env['drivers']: + drivers += [softpipe] + + if 'i965simple' in env['drivers']: + drivers += [i965simple] + sources += [ + 'brw_aub.c', + 'xm_winsys_aub.c', + ] + + if 'trace' in env['drivers']: + env.Append(CPPDEFINES = 'GALLIUM_TRACE') + drivers += [trace] + + # TODO: write a wrapper function http://www.scons.org/wiki/WrapperFunctions + libgl = env.SharedLibrary( + target ='GL', + source = sources, + LIBS = glapi + mesa + drivers + auxiliaries + env['LIBS'], + ) + + env.InstallSharedLibrary(libgl, version=(1, 5)) diff --git a/src/gallium/winsys/xlib/brw_aub.c b/src/gallium/winsys/xlib/brw_aub.c new file mode 100644 index 0000000000..9e96efaa53 --- /dev/null +++ b/src/gallium/winsys/xlib/brw_aub.c @@ -0,0 +1,397 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#include <stdio.h> +#include <stdlib.h> +#include "brw_aub.h" +#include "pipe/p_context.h" +#include "pipe/p_state.h" +#include "pipe/p_debug.h" +#include "util/u_memory.h" + + +struct brw_aubfile { + FILE *file; + unsigned next_free_page; +}; + + +extern char *__progname; + + +struct aub_file_header { + unsigned int instruction_type; + unsigned int pad0:16; + unsigned int minor:8; + unsigned int major:8; + unsigned char application[8*4]; + unsigned int day:8; + unsigned int month:8; + unsigned int year:16; + unsigned int timezone:8; + unsigned int second:8; + unsigned int minute:8; + unsigned int hour:8; + unsigned int comment_length:16; + unsigned int pad1:16; +}; + +struct aub_block_header { + unsigned int instruction_type; + unsigned int operation:8; + unsigned int type:8; + unsigned int address_space:8; + unsigned int pad0:8; + unsigned int general_state_type:8; + unsigned int surface_state_type:8; + unsigned int pad1:16; + unsigned int address; + unsigned int length; +}; + +struct aub_dump_bmp { + unsigned int instruction_type; + unsigned int xmin:16; + unsigned int ymin:16; + unsigned int pitch:16; + unsigned int bpp:8; + unsigned int format:8; + unsigned int xsize:16; + unsigned int ysize:16; + unsigned int addr; + unsigned int unknown; +}; + +enum bh_operation { + BH_COMMENT, + BH_DATA_WRITE, + BH_COMMAND_WRITE, + BH_MMI0_WRITE32, + BH_END_SCENE, + BH_CONFIG_MEMORY_MAP, + BH_MAX_OPERATION +}; + +enum command_write_type { + CW_HWB_RING = 1, + CW_PRIMARY_RING_A, + CW_PRIMARY_RING_B, /* XXX - disagreement with listaub! */ + CW_PRIMARY_RING_C, + CW_MAX_TYPE +}; + +enum memory_map_type { + MM_DEFAULT, + MM_DYNAMIC, + MM_MAX_TYPE +}; + +enum address_space { + ADDR_GTT, + ADDR_LOCAL, + ADDR_MAIN, + ADDR_MAX +}; + + +#define AUB_FILE_HEADER 0xe085000b +#define AUB_BLOCK_HEADER 0xe0c10003 +#define AUB_DUMP_BMP 0xe09e0004 + +/* Registers to control page table + */ +#define PGETBL_CTL 0x2020 +#define PGETBL_ENABLED 0x1 + +#define NR_GTT_ENTRIES 65536 /* 256 mb */ + +#define FAIL \ +do { \ + fprintf(stderr, "failed to write aub data at %s/%d\n", __FUNCTION__, __LINE__); \ + exit(1); \ +} while (0) + + +/* Emit the headers at the top of each aubfile. Initialize the GTT. + */ +static void init_aubfile( FILE *aub_file ) +{ + struct aub_file_header fh; + struct aub_block_header bh; + unsigned int data; + + static int nr; + + nr++; + + /* Emit the aub header: + */ + memset(&fh, 0, sizeof(fh)); + + fh.instruction_type = AUB_FILE_HEADER; + fh.minor = 0x0; + fh.major = 0x7; + memcpy(fh.application, __progname, sizeof(fh.application)); + fh.day = (nr>>24) & 0xff; + fh.month = 0x0; + fh.year = 0x0; + fh.timezone = 0x0; + fh.second = nr & 0xff; + fh.minute = (nr>>8) & 0xff; + fh.hour = (nr>>16) & 0xff; + fh.comment_length = 0x0; + + if (fwrite(&fh, sizeof(fh), 1, aub_file) < 0) + FAIL; + + /* Setup the GTT starting at main memory address zero (!): + */ + memset(&bh, 0, sizeof(bh)); + + bh.instruction_type = AUB_BLOCK_HEADER; + bh.operation = BH_MMI0_WRITE32; + bh.type = 0x0; + bh.address_space = ADDR_GTT; /* ??? 
*/ + bh.general_state_type = 0x0; + bh.surface_state_type = 0x0; + bh.address = PGETBL_CTL; + bh.length = 0x4; + + if (fwrite(&bh, sizeof(bh), 1, aub_file) < 0) + FAIL; + + data = 0x0 | PGETBL_ENABLED; + + if (fwrite(&data, sizeof(data), 1, aub_file) < 0) + FAIL; +} + + +static void init_aub_gtt( struct brw_aubfile *aubfile, + unsigned start_offset, + unsigned size ) +{ + FILE *aub_file = aubfile->file; + struct aub_block_header bh; + unsigned int i; + + assert(start_offset + size < NR_GTT_ENTRIES * 4096); + + + memset(&bh, 0, sizeof(bh)); + + bh.instruction_type = AUB_BLOCK_HEADER; + bh.operation = BH_DATA_WRITE; + bh.type = 0x0; + bh.address_space = ADDR_MAIN; + bh.general_state_type = 0x0; + bh.surface_state_type = 0x0; + bh.address = start_offset / 4096 * 4; + bh.length = size / 4096 * 4; + + if (fwrite(&bh, sizeof(bh), 1, aub_file) < 0) + FAIL; + + for (i = 0; i < size / 4096; i++) { + unsigned data = aubfile->next_free_page | 1; + + aubfile->next_free_page += 4096; + + if (fwrite(&data, sizeof(data), 1, aub_file) < 0) + FAIL; + } + +} + +static void write_block_header( FILE *aub_file, + struct aub_block_header *bh, + const unsigned *data, + unsigned sz ) +{ + sz = (sz + 3) & ~3; + + if (fwrite(bh, sizeof(*bh), 1, aub_file) < 0) + FAIL; + + if (fwrite(data, sz, 1, aub_file) < 0) + FAIL; + + fflush(aub_file); +} + + +static void write_dump_bmp( FILE *aub_file, + struct aub_dump_bmp *db ) +{ + if (fwrite(db, sizeof(*db), 1, aub_file) < 0) + FAIL; + + fflush(aub_file); +} + + + +void brw_aub_gtt_data( struct brw_aubfile *aubfile, + unsigned offset, + const void *data, + unsigned sz, + unsigned type, + unsigned state_type ) +{ + struct aub_block_header bh; + + bh.instruction_type = AUB_BLOCK_HEADER; + bh.operation = BH_DATA_WRITE; + bh.type = type; + bh.address_space = ADDR_GTT; + bh.pad0 = 0; + + if (type == DW_GENERAL_STATE) { + bh.general_state_type = state_type; + bh.surface_state_type = 0; + } + else { + bh.general_state_type = 0; + bh.surface_state_type = state_type; + } + + bh.pad1 = 0; + bh.address = offset; + bh.length = sz; + + write_block_header(aubfile->file, &bh, data, sz); +} + + + +void brw_aub_gtt_cmds( struct brw_aubfile *aubfile, + unsigned offset, + const void *data, + unsigned sz ) +{ + struct aub_block_header bh; + unsigned type = CW_PRIMARY_RING_A; + + + bh.instruction_type = AUB_BLOCK_HEADER; + bh.operation = BH_COMMAND_WRITE; + bh.type = type; + bh.address_space = ADDR_GTT; + bh.pad0 = 0; + bh.general_state_type = 0; + bh.surface_state_type = 0; + bh.pad1 = 0; + bh.address = offset; + bh.length = sz; + + write_block_header(aubfile->file, &bh, data, sz); +} + +void brw_aub_dump_bmp( struct brw_aubfile *aubfile, + struct pipe_surface *surface, + unsigned gtt_offset ) +{ + struct aub_dump_bmp db; + unsigned format; + + assert(surface->block.width == 1); + assert(surface->block.height == 1); + + if (surface->block.size == 4) + format = 0x7; + else + format = 0x3; + + db.instruction_type = AUB_DUMP_BMP; + db.xmin = 0; + db.ymin = 0; + db.format = format; + db.bpp = surface->block.size * 8; + db.pitch = surface->stride/surface->block.size; + db.xsize = surface->width; + db.ysize = surface->height; + db.addr = gtt_offset; + db.unknown = /* surface->tiled ? 
0x4 : */ 0x0; + + write_dump_bmp(aubfile->file, &db); +} + + + +struct brw_aubfile *brw_aubfile_create( void ) +{ + struct brw_aubfile *aubfile = CALLOC_STRUCT(brw_aubfile); + char filename[80]; + int val; + static int i = 0; + + i++; + + if (getenv("INTEL_AUBFILE")) { + val = snprintf(filename, sizeof(filename), "%s%d.aub", getenv("INTEL_AUBFILE"), i%4); + debug_printf("--> Aub file: %s\n", filename); + aubfile->file = fopen(filename, "w"); + } + else { + val = snprintf(filename, sizeof(filename), "%s.aub", __progname); + if (val < 0 || val > sizeof(filename)) + strcpy(filename, "default.aub"); + + debug_printf("--> Aub file: %s\n", filename); + aubfile->file = fopen(filename, "w"); + } + + if (!aubfile->file) { + debug_printf("couldn't open aubfile\n"); + exit(1); + } + + init_aubfile(aubfile->file); + + /* The GTT is located starting address zero in main memory. Pages + * to populate the gtt start after this point. + */ + aubfile->next_free_page = (NR_GTT_ENTRIES * 4 + 4095) & ~4095; + + /* More or less correspond with all the agp regions mapped by the + * driver: + */ + init_aub_gtt(aubfile, 0, 4096*4); + init_aub_gtt(aubfile, AUB_BUF_START, AUB_BUF_SIZE); + + return aubfile; +} + +void brw_aub_destroy( struct brw_aubfile *aubfile ) +{ + fclose(aubfile->file); + FREE(aubfile); +} diff --git a/src/gallium/winsys/xlib/brw_aub.h b/src/gallium/winsys/xlib/brw_aub.h new file mode 100644 index 0000000000..f5c60c7be2 --- /dev/null +++ b/src/gallium/winsys/xlib/brw_aub.h @@ -0,0 +1,114 @@ +/* + Copyright (C) Intel Corp. 2006. All Rights Reserved. + Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to + develop this 3D driver. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice (including the + next paragraph) shall be included in all copies or substantial + portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + **********************************************************************/ + /* + * Authors: + * Keith Whitwell <keith@tungstengraphics.com> + */ + +#ifndef BRW_AUB_H +#define BRW_AUB_H + +/* We set up this region, buffers may be allocated here: + */ +#define AUB_BUF_START (4096*4) +#define AUB_BUF_SIZE (8*1024*1024) + +struct intel_context; +struct pipe_surface; + +struct brw_aubfile *brw_aubfile_create( void ); + +void brw_aub_destroy( struct brw_aubfile *aubfile ); + +void brw_aub_gtt_data( struct brw_aubfile *aubfile, + unsigned offset, + const void *data, + unsigned sz, + unsigned type, + unsigned state_type ); + +void brw_aub_gtt_cmds( struct brw_aubfile *aubfile, + unsigned offset, + const void *data, + unsigned sz ); + +void brw_aub_dump_bmp( struct brw_aubfile *aubfile, + struct pipe_surface *surface, + unsigned gtt_offset ); + + +enum data_write_type { + DW_NOTYPE, + DW_BATCH_BUFFER, + DW_BIN_BUFFER, + DW_BIN_POINTER_LIST, + DW_SLOW_STATE_BUFFER, + DW_VERTEX_BUFFER, + DW_2D_MAP, + DW_CUBE_MAP, + DW_INDIRECT_STATE_BUFFER, + DW_VOLUME_MAP, + DW_1D_MAP, + DW_CONSTANT_BUFFER, + DW_CONSTANT_URB_ENTRY, + DW_INDEX_BUFFER, + DW_GENERAL_STATE, + DW_SURFACE_STATE, + DW_MEDIA_OBJECT_INDIRECT_DATA, + DW_MAX_TYPE +}; + +enum data_write_general_state_type { + DWGS_NOTYPE, + DWGS_VERTEX_SHADER_STATE, + DWGS_GEOMETRY_SHADER_STATE , + DWGS_CLIPPER_STATE, + DWGS_STRIPS_FANS_STATE, + DWGS_WINDOWER_IZ_STATE, + DWGS_COLOR_CALC_STATE, + DWGS_CLIPPER_VIEWPORT_STATE, /* was 0x7 */ + DWGS_STRIPS_FANS_VIEWPORT_STATE, + DWGS_COLOR_CALC_VIEWPORT_STATE, /* was 0x9 */ + DWGS_SAMPLER_STATE, + DWGS_KERNEL_INSTRUCTIONS, + DWGS_SCRATCH_SPACE, + DWGS_SAMPLER_DEFAULT_COLOR, + DWGS_INTERFACE_DESCRIPTOR, + DWGS_VLD_STATE, + DWGS_VFE_STATE, + DWGS_MAX_TYPE +}; + +enum data_write_surface_state_type { + DWSS_NOTYPE, + DWSS_BINDING_TABLE_STATE, + DWSS_SURFACE_STATE, + DWSS_MAX_TYPE +}; + + +#endif diff --git a/src/gallium/winsys/xlib/fakeglx.c b/src/gallium/winsys/xlib/fakeglx.c new file mode 100644 index 0000000000..a56e63572a --- /dev/null +++ b/src/gallium/winsys/xlib/fakeglx.c @@ -0,0 +1,3212 @@ +/* + * Mesa 3-D graphics library + * Version: 7.1 + * + * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +/* + * This is an emulation of the GLX API which allows Mesa/GLX-based programs + * to run on X servers which do not have the real GLX extension. 
+ * + * Thanks to the contributors: + * + * Initial version: Philip Brown (phil@bolthole.com) + * Better glXGetConfig() support: Armin Liebchen (liebchen@asylum.cs.utah.edu) + * Further visual-handling refinements: Wolfram Gloger + * (wmglo@Dent.MED.Uni-Muenchen.DE). + * + * Notes: + * Don't be fooled, stereo isn't supported yet. + */ + + + +#include "glxheader.h" +#include "glxapi.h" +#include "GL/xmesa.h" +#include "context.h" +#include "config.h" +#include "macros.h" +#include "imports.h" +#include "mtypes.h" +#include "version.h" +#include "xfonts.h" +#include "xmesaP.h" +#include "state_tracker/st_context.h" +#include "state_tracker/st_public.h" + + +#ifdef __VMS +#define _mesa_sprintf sprintf +#endif + +/* This indicates the client-side GLX API and GLX encoder version. */ +#define CLIENT_MAJOR_VERSION 1 +#define CLIENT_MINOR_VERSION 4 /* but don't have 1.3's pbuffers, etc yet */ + +/* This indicates the server-side GLX decoder version. + * GLX 1.4 indicates OpenGL 1.3 support + */ +#define SERVER_MAJOR_VERSION 1 +#define SERVER_MINOR_VERSION 4 + +/* This is appended onto the glXGetClient/ServerString version strings. */ +#define MESA_GLX_VERSION "Mesa " MESA_VERSION_STRING + +/* Who implemented this GLX? */ +#define VENDOR "Brian Paul" + +#define EXTENSIONS \ + "GLX_MESA_set_3dfx_mode " \ + "GLX_MESA_copy_sub_buffer " \ + "GLX_MESA_pixmap_colormap " \ + "GLX_MESA_release_buffers " \ + "GLX_ARB_get_proc_address " \ + "GLX_EXT_texture_from_pixmap " \ + "GLX_EXT_visual_info " \ + "GLX_EXT_visual_rating " \ + /*"GLX_SGI_video_sync "*/ \ + "GLX_SGIX_fbconfig " \ + "GLX_SGIX_pbuffer " + +/* + * Our fake GLX context will contain a "real" GLX context and an XMesa context. + * + * Note that a pointer to a __GLXcontext is a pointer to a fake_glx_context, + * and vice versa. + * + * We really just need this structure in order to make the libGL functions + * glXGetCurrentContext(), glXGetCurrentDrawable() and glXGetCurrentDisplay() + * work correctly. + */ +struct fake_glx_context { + __GLXcontext glxContext; /* this MUST be first! */ + XMesaContext xmesaContext; +}; + + + +/**********************************************************************/ +/*** GLX Visual Code ***/ +/**********************************************************************/ + +#define DONT_CARE -1 + + +static XMesaVisual *VisualTable = NULL; +static int NumVisuals = 0; + + +/* + * This struct and some code fragments borrowed + * from Mark Kilgard's GLUT library. + */ +typedef struct _OverlayInfo { + /* Avoid 64-bit portability problems by being careful to use + longs due to the way XGetWindowProperty is specified. Note + that these parameters are passed as CARD32s over X + protocol. */ + unsigned long overlay_visual; + long transparent_type; + long value; + long layer; +} OverlayInfo; + + + +/* Macro to handle c_class vs class field name in XVisualInfo struct */ +#if defined(__cplusplus) || defined(c_plusplus) +#define CLASS c_class +#else +#define CLASS class +#endif + + + +/* + * Test if the given XVisualInfo is usable for Mesa rendering. 
+ */ +static GLboolean +is_usable_visual( XVisualInfo *vinfo ) +{ + switch (vinfo->CLASS) { + case StaticGray: + case GrayScale: + /* Any StaticGray/GrayScale visual works in RGB or CI mode */ + return GL_TRUE; + case StaticColor: + case PseudoColor: + /* Any StaticColor/PseudoColor visual of at least 4 bits */ + if (vinfo->depth>=4) { + return GL_TRUE; + } + else { + return GL_FALSE; + } + case TrueColor: + case DirectColor: + /* Any depth of TrueColor or DirectColor works in RGB mode */ + return GL_TRUE; + default: + /* This should never happen */ + return GL_FALSE; + } +} + + + +/** + * Get an array OverlayInfo records for specified screen. + * \param dpy the display + * \param screen screen number + * \param numOverlays returns numver of OverlayInfo records + * \return pointer to OverlayInfo array, free with XFree() + */ +static OverlayInfo * +GetOverlayInfo(Display *dpy, int screen, int *numOverlays) +{ + Atom overlayVisualsAtom; + Atom actualType; + Status status; + unsigned char *ovInfo; + unsigned long sizeData, bytesLeft; + int actualFormat; + + /* + * The SERVER_OVERLAY_VISUALS property on the root window contains + * a list of overlay visuals. Get that list now. + */ + overlayVisualsAtom = XInternAtom(dpy,"SERVER_OVERLAY_VISUALS", True); + if (overlayVisualsAtom == None) { + return 0; + } + + status = XGetWindowProperty(dpy, RootWindow(dpy, screen), + overlayVisualsAtom, 0L, (long) 10000, False, + overlayVisualsAtom, &actualType, &actualFormat, + &sizeData, &bytesLeft, + &ovInfo); + + if (status != Success || actualType != overlayVisualsAtom || + actualFormat != 32 || sizeData < 4) { + /* something went wrong */ + XFree((void *) ovInfo); + *numOverlays = 0; + return NULL; + } + + *numOverlays = sizeData / 4; + return (OverlayInfo *) ovInfo; +} + + + +/** + * Return the level (overlay, normal, underlay) of a given XVisualInfo. + * Input: dpy - the X display + * vinfo - the XVisualInfo to test + * Return: level of the visual: + * 0 = normal planes + * >0 = overlay planes + * <0 = underlay planes + */ +static int +level_of_visual( Display *dpy, XVisualInfo *vinfo ) +{ + OverlayInfo *overlay_info; + int numOverlaysPerScreen, i; + + overlay_info = GetOverlayInfo(dpy, vinfo->screen, &numOverlaysPerScreen); + if (!overlay_info) { + return 0; + } + + /* search the overlay visual list for the visual ID of interest */ + for (i = 0; i < numOverlaysPerScreen; i++) { + const OverlayInfo *ov = overlay_info + i; + if (ov->overlay_visual == vinfo->visualid) { + /* found the visual */ + if (/*ov->transparent_type==1 &&*/ ov->layer!=0) { + int level = ov->layer; + XFree((void *) overlay_info); + return level; + } + else { + XFree((void *) overlay_info); + return 0; + } + } + } + + /* The visual ID was not found in the overlay list. */ + XFree((void *) overlay_info); + return 0; +} + + + + +/* + * Given an XVisualInfo and RGB, Double, and Depth buffer flags, save the + * configuration in our list of GLX visuals. 
+ */ +static XMesaVisual +save_glx_visual( Display *dpy, XVisualInfo *vinfo, + GLboolean rgbFlag, GLboolean alphaFlag, GLboolean dbFlag, + GLboolean stereoFlag, + GLint depth_size, GLint stencil_size, + GLint accumRedSize, GLint accumGreenSize, + GLint accumBlueSize, GLint accumAlphaSize, + GLint level, GLint numAuxBuffers ) +{ + GLboolean ximageFlag = GL_TRUE; + XMesaVisual xmvis; + GLint i; + GLboolean comparePointers; + + if (dbFlag) { + /* Check if the MESA_BACK_BUFFER env var is set */ + char *backbuffer = _mesa_getenv("MESA_BACK_BUFFER"); + if (backbuffer) { + if (backbuffer[0]=='p' || backbuffer[0]=='P') { + ximageFlag = GL_FALSE; + } + else if (backbuffer[0]=='x' || backbuffer[0]=='X') { + ximageFlag = GL_TRUE; + } + else { + _mesa_warning(NULL, "Mesa: invalid value for MESA_BACK_BUFFER environment variable, using an XImage."); + } + } + } + + if (stereoFlag) { + /* stereo not supported */ + return NULL; + } + + /* Comparing IDs uses less memory but sometimes fails. */ + /* XXX revisit this after 3.0 is finished. */ + if (_mesa_getenv("MESA_GLX_VISUAL_HACK")) + comparePointers = GL_TRUE; + else + comparePointers = GL_FALSE; + + /* Force the visual to have an alpha channel */ + if (rgbFlag && _mesa_getenv("MESA_GLX_FORCE_ALPHA")) + alphaFlag = GL_TRUE; + + /* First check if a matching visual is already in the list */ + for (i=0; i<NumVisuals; i++) { + XMesaVisual v = VisualTable[i]; + if (v->display == dpy + && v->mesa_visual.level == level + && v->mesa_visual.numAuxBuffers == numAuxBuffers + && v->ximage_flag == ximageFlag + && v->mesa_visual.rgbMode == rgbFlag + && v->mesa_visual.doubleBufferMode == dbFlag + && v->mesa_visual.stereoMode == stereoFlag + && (v->mesa_visual.alphaBits > 0) == alphaFlag + && (v->mesa_visual.depthBits >= depth_size || depth_size == 0) + && (v->mesa_visual.stencilBits >= stencil_size || stencil_size == 0) + && (v->mesa_visual.accumRedBits >= accumRedSize || accumRedSize == 0) + && (v->mesa_visual.accumGreenBits >= accumGreenSize || accumGreenSize == 0) + && (v->mesa_visual.accumBlueBits >= accumBlueSize || accumBlueSize == 0) + && (v->mesa_visual.accumAlphaBits >= accumAlphaSize || accumAlphaSize == 0)) { + /* now either compare XVisualInfo pointers or visual IDs */ + if ((!comparePointers && v->visinfo->visualid == vinfo->visualid) + || (comparePointers && v->vishandle == vinfo)) { + return v; + } + } + } + + /* Create a new visual and add it to the list. */ + + xmvis = XMesaCreateVisual( dpy, vinfo, rgbFlag, alphaFlag, dbFlag, + stereoFlag, ximageFlag, + depth_size, stencil_size, + accumRedSize, accumBlueSize, + accumBlueSize, accumAlphaSize, 0, level, + GLX_NONE_EXT ); + if (xmvis) { + /* Save a copy of the pointer now so we can find this visual again + * if we need to search for it in find_glx_visual(). + */ + xmvis->vishandle = vinfo; + /* Allocate more space for additional visual */ + VisualTable = (XMesaVisual *) _mesa_realloc( VisualTable, + sizeof(XMesaVisual) * NumVisuals, + sizeof(XMesaVisual) * (NumVisuals + 1)); + /* add xmvis to the list */ + VisualTable[NumVisuals] = xmvis; + NumVisuals++; + /* XXX minor hack, because XMesaCreateVisual doesn't support an + * aux buffers parameter. + */ + xmvis->mesa_visual.numAuxBuffers = numAuxBuffers; + } + return xmvis; +} + + +/** + * Return the default number of bits for the Z buffer. + * If defined, use the MESA_GLX_DEPTH_BITS env var value. + * Otherwise, use the DEFAULT_SOFTWARE_DEPTH_BITS constant. + * XXX probably do the same thing for stencil, accum, etc. 
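 *
 * Illustrative override (the value is an example only, not taken from this
 * file): setting
 *
 *   MESA_GLX_DEPTH_BITS=32
 *
 * in the environment makes default_depth_bits() below return 32 instead of
 * DEFAULT_SOFTWARE_DEPTH_BITS.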
+ */ +static GLint +default_depth_bits(void) +{ + int zBits; + const char *zEnv = _mesa_getenv("MESA_GLX_DEPTH_BITS"); + if (zEnv) + zBits = _mesa_atoi(zEnv); + else + zBits = DEFAULT_SOFTWARE_DEPTH_BITS; + return zBits; +} + +static GLint +default_alpha_bits(void) +{ + int aBits; + const char *aEnv = _mesa_getenv("MESA_GLX_ALPHA_BITS"); + if (aEnv) + aBits = _mesa_atoi(aEnv); + else + aBits = 0; + return aBits; +} + +static GLint +default_accum_bits(void) +{ + return 16; +} + + + +/* + * Create a GLX visual from a regular XVisualInfo. + * This is called when Fake GLX is given an XVisualInfo which wasn't + * returned by glXChooseVisual. Since this is the first time we're + * considering this visual we'll take a guess at reasonable values + * for depth buffer size, stencil size, accum size, etc. + * This is the best we can do with a client-side emulation of GLX. + */ +static XMesaVisual +create_glx_visual( Display *dpy, XVisualInfo *visinfo ) +{ + int vislevel; + GLint zBits = default_depth_bits(); + GLint accBits = default_accum_bits(); + GLboolean alphaFlag = default_alpha_bits() > 0; + + vislevel = level_of_visual( dpy, visinfo ); + if (vislevel) { + /* Configure this visual as a CI, single-buffered overlay */ + return save_glx_visual( dpy, visinfo, + GL_FALSE, /* rgb */ + GL_FALSE, /* alpha */ + GL_FALSE, /* double */ + GL_FALSE, /* stereo */ + 0, /* depth bits */ + 0, /* stencil bits */ + 0,0,0,0, /* accum bits */ + vislevel, /* level */ + 0 /* numAux */ + ); + } + else if (is_usable_visual( visinfo )) { + if (_mesa_getenv("MESA_GLX_FORCE_CI")) { + /* Configure this visual as a COLOR INDEX visual. */ + return save_glx_visual( dpy, visinfo, + GL_FALSE, /* rgb */ + GL_FALSE, /* alpha */ + GL_TRUE, /* double */ + GL_FALSE, /* stereo */ + zBits, + STENCIL_BITS, + 0, 0, 0, 0, /* accum bits */ + 0, /* level */ + 0 /* numAux */ + ); + } + else { + /* Configure this visual as RGB, double-buffered, depth-buffered. */ + /* This is surely wrong for some people's needs but what else */ + /* can be done? They should use glXChooseVisual(). */ + return save_glx_visual( dpy, visinfo, + GL_TRUE, /* rgb */ + alphaFlag, /* alpha */ + GL_TRUE, /* double */ + GL_FALSE, /* stereo */ + zBits, + STENCIL_BITS, + accBits, /* r */ + accBits, /* g */ + accBits, /* b */ + accBits, /* a */ + 0, /* level */ + 0 /* numAux */ + ); + } + } + else { + _mesa_warning(NULL, "Mesa: error in glXCreateContext: bad visual\n"); + return NULL; + } +} + + + +/* + * Find the GLX visual associated with an XVisualInfo. + */ +static XMesaVisual +find_glx_visual( Display *dpy, XVisualInfo *vinfo ) +{ + int i; + + /* try to match visual id */ + for (i=0;i<NumVisuals;i++) { + if (VisualTable[i]->display==dpy + && VisualTable[i]->visinfo->visualid == vinfo->visualid) { + return VisualTable[i]; + } + } + + /* if that fails, try to match pointers */ + for (i=0;i<NumVisuals;i++) { + if (VisualTable[i]->display==dpy && VisualTable[i]->vishandle==vinfo) { + return VisualTable[i]; + } + } + + return NULL; +} + + + +/** + * Return the transparent pixel value for a GLX visual. 
+ * Input: glxvis - the glx_visual + * Return: a pixel value or -1 if no transparent pixel + */ +static int +transparent_pixel( XMesaVisual glxvis ) +{ + Display *dpy = glxvis->display; + XVisualInfo *vinfo = glxvis->visinfo; + OverlayInfo *overlay_info; + int numOverlaysPerScreen, i; + + overlay_info = GetOverlayInfo(dpy, vinfo->screen, &numOverlaysPerScreen); + if (!overlay_info) { + return -1; + } + + for (i = 0; i < numOverlaysPerScreen; i++) { + const OverlayInfo *ov = overlay_info + i; + if (ov->overlay_visual == vinfo->visualid) { + /* found it! */ + if (ov->transparent_type == 0) { + /* type 0 indicates no transparency */ + XFree((void *) overlay_info); + return -1; + } + else { + /* ov->value is the transparent pixel */ + XFree((void *) overlay_info); + return ov->value; + } + } + } + + /* The visual ID was not found in the overlay list. */ + XFree((void *) overlay_info); + return -1; +} + + + +/** + * Try to get an X visual which matches the given arguments. + */ +static XVisualInfo * +get_visual( Display *dpy, int scr, unsigned int depth, int xclass ) +{ + XVisualInfo temp, *vis; + long mask; + int n; + unsigned int default_depth; + int default_class; + + mask = VisualScreenMask | VisualDepthMask | VisualClassMask; + temp.screen = scr; + temp.depth = depth; + temp.CLASS = xclass; + + default_depth = DefaultDepth(dpy,scr); + default_class = DefaultVisual(dpy,scr)->CLASS; + + if (depth==default_depth && xclass==default_class) { + /* try to get root window's visual */ + temp.visualid = DefaultVisual(dpy,scr)->visualid; + mask |= VisualIDMask; + } + + vis = XGetVisualInfo( dpy, mask, &temp, &n ); + + /* In case bits/pixel > 24, make sure color channels are still <=8 bits. + * An SGI Infinite Reality system, for example, can have 30bpp pixels: + * 10 bits per color channel. Mesa's limited to a max of 8 bits/channel. + */ + if (vis && depth > 24 && (xclass==TrueColor || xclass==DirectColor)) { + if (_mesa_bitcount((GLuint) vis->red_mask ) <= 8 && + _mesa_bitcount((GLuint) vis->green_mask) <= 8 && + _mesa_bitcount((GLuint) vis->blue_mask ) <= 8) { + return vis; + } + else { + XFree((void *) vis); + return NULL; + } + } + + return vis; +} + + + +/* + * Retrieve the value of the given environment variable and find + * the X visual which matches it. + * Input: dpy - the display + * screen - the screen number + * varname - the name of the environment variable + * Return: an XVisualInfo pointer to NULL if error. + */ +static XVisualInfo * +get_env_visual(Display *dpy, int scr, const char *varname) +{ + char value[100], type[100]; + int depth, xclass = -1; + XVisualInfo *vis; + + if (!_mesa_getenv( varname )) { + return NULL; + } + + _mesa_strncpy( value, _mesa_getenv(varname), 100 ); + value[99] = 0; + + sscanf( value, "%s %d", type, &depth ); + + if (_mesa_strcmp(type,"TrueColor")==0) xclass = TrueColor; + else if (_mesa_strcmp(type,"DirectColor")==0) xclass = DirectColor; + else if (_mesa_strcmp(type,"PseudoColor")==0) xclass = PseudoColor; + else if (_mesa_strcmp(type,"StaticColor")==0) xclass = StaticColor; + else if (_mesa_strcmp(type,"GrayScale")==0) xclass = GrayScale; + else if (_mesa_strcmp(type,"StaticGray")==0) xclass = StaticGray; + + if (xclass>-1 && depth>0) { + vis = get_visual( dpy, scr, depth, xclass ); + if (vis) { + return vis; + } + } + + _mesa_warning(NULL, "GLX unable to find visual class=%s, depth=%d.", + type, depth); + + return NULL; +} + + + +/* + * Select an X visual which satisfies the RGBA/CI flag and minimum depth. 
+ * Input: dpy, screen - X display and screen number + * rgba - GL_TRUE = RGBA mode, GL_FALSE = CI mode + * min_depth - minimum visual depth + * preferred_class - preferred GLX visual class or DONT_CARE + * Return: pointer to an XVisualInfo or NULL. + */ +static XVisualInfo * +choose_x_visual( Display *dpy, int screen, GLboolean rgba, int min_depth, + int preferred_class ) +{ + XVisualInfo *vis; + int xclass, visclass = 0; + int depth; + + if (rgba) { + Atom hp_cr_maps = XInternAtom(dpy, "_HP_RGB_SMOOTH_MAP_LIST", True); + /* First see if the MESA_RGB_VISUAL env var is defined */ + vis = get_env_visual( dpy, screen, "MESA_RGB_VISUAL" ); + if (vis) { + return vis; + } + /* Otherwise, search for a suitable visual */ + if (preferred_class==DONT_CARE) { + for (xclass=0;xclass<6;xclass++) { + switch (xclass) { + case 0: visclass = TrueColor; break; + case 1: visclass = DirectColor; break; + case 2: visclass = PseudoColor; break; + case 3: visclass = StaticColor; break; + case 4: visclass = GrayScale; break; + case 5: visclass = StaticGray; break; + } + if (min_depth==0) { + /* start with shallowest */ + for (depth=0;depth<=32;depth++) { + if (visclass==TrueColor && depth==8 && !hp_cr_maps) { + /* Special case: try to get 8-bit PseudoColor before */ + /* 8-bit TrueColor */ + vis = get_visual( dpy, screen, 8, PseudoColor ); + if (vis) { + return vis; + } + } + vis = get_visual( dpy, screen, depth, visclass ); + if (vis) { + return vis; + } + } + } + else { + /* start with deepest */ + for (depth=32;depth>=min_depth;depth--) { + if (visclass==TrueColor && depth==8 && !hp_cr_maps) { + /* Special case: try to get 8-bit PseudoColor before */ + /* 8-bit TrueColor */ + vis = get_visual( dpy, screen, 8, PseudoColor ); + if (vis) { + return vis; + } + } + vis = get_visual( dpy, screen, depth, visclass ); + if (vis) { + return vis; + } + } + } + } + } + else { + /* search for a specific visual class */ + switch (preferred_class) { + case GLX_TRUE_COLOR_EXT: visclass = TrueColor; break; + case GLX_DIRECT_COLOR_EXT: visclass = DirectColor; break; + case GLX_PSEUDO_COLOR_EXT: visclass = PseudoColor; break; + case GLX_STATIC_COLOR_EXT: visclass = StaticColor; break; + case GLX_GRAY_SCALE_EXT: visclass = GrayScale; break; + case GLX_STATIC_GRAY_EXT: visclass = StaticGray; break; + default: return NULL; + } + if (min_depth==0) { + /* start with shallowest */ + for (depth=0;depth<=32;depth++) { + vis = get_visual( dpy, screen, depth, visclass ); + if (vis) { + return vis; + } + } + } + else { + /* start with deepest */ + for (depth=32;depth>=min_depth;depth--) { + vis = get_visual( dpy, screen, depth, visclass ); + if (vis) { + return vis; + } + } + } + } + } + else { + /* First see if the MESA_CI_VISUAL env var is defined */ + vis = get_env_visual( dpy, screen, "MESA_CI_VISUAL" ); + if (vis) { + return vis; + } + /* Otherwise, search for a suitable visual, starting with shallowest */ + if (preferred_class==DONT_CARE) { + for (xclass=0;xclass<4;xclass++) { + switch (xclass) { + case 0: visclass = PseudoColor; break; + case 1: visclass = StaticColor; break; + case 2: visclass = GrayScale; break; + case 3: visclass = StaticGray; break; + } + /* try 8-bit up through 16-bit */ + for (depth=8;depth<=16;depth++) { + vis = get_visual( dpy, screen, depth, visclass ); + if (vis) { + return vis; + } + } + /* try min_depth up to 8-bit */ + for (depth=min_depth;depth<8;depth++) { + vis = get_visual( dpy, screen, depth, visclass ); + if (vis) { + return vis; + } + } + } + } + else { + /* search for a specific visual class */ 
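A minimal sketch of where a preferred_class value such as TrueColor comes from (the helper name and attribute values are illustrative, not part of this file; the GLX_EXT_visual_info tokens are assumed to be available from the GLX headers, since they are used elsewhere in fakeglx.c): an application can request a specific visual class in its glXChooseVisual() attribute list, which choose_visual() parses into visual_type and passes down to this search as preferred_class.

#include <GL/glx.h>

/* Request an RGBA, TrueColor visual with at least a 16-bit depth buffer. */
static XVisualInfo *
pick_truecolor_visual(Display *dpy, int screen)
{
   int attribs[] = {
      GLX_RGBA,
      GLX_X_VISUAL_TYPE_EXT, GLX_TRUE_COLOR_EXT,  /* ends up as preferred_class */
      GLX_DEPTH_SIZE, 16,
      None                                        /* attribute list terminator */
   };
   return glXChooseVisual(dpy, screen, attribs);
}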
+ switch (preferred_class) { + case GLX_TRUE_COLOR_EXT: visclass = TrueColor; break; + case GLX_DIRECT_COLOR_EXT: visclass = DirectColor; break; + case GLX_PSEUDO_COLOR_EXT: visclass = PseudoColor; break; + case GLX_STATIC_COLOR_EXT: visclass = StaticColor; break; + case GLX_GRAY_SCALE_EXT: visclass = GrayScale; break; + case GLX_STATIC_GRAY_EXT: visclass = StaticGray; break; + default: return NULL; + } + /* try 8-bit up through 16-bit */ + for (depth=8;depth<=16;depth++) { + vis = get_visual( dpy, screen, depth, visclass ); + if (vis) { + return vis; + } + } + /* try min_depth up to 8-bit */ + for (depth=min_depth;depth<8;depth++) { + vis = get_visual( dpy, screen, depth, visclass ); + if (vis) { + return vis; + } + } + } + } + + /* didn't find a visual */ + return NULL; +} + + + +/* + * Find the deepest X over/underlay visual of at least min_depth. + * Input: dpy, screen - X display and screen number + * level - the over/underlay level + * trans_type - transparent pixel type: GLX_NONE_EXT, + * GLX_TRANSPARENT_RGB_EXT, GLX_TRANSPARENT_INDEX_EXT, + * or DONT_CARE + * trans_value - transparent pixel value or DONT_CARE + * min_depth - minimum visual depth + * preferred_class - preferred GLX visual class or DONT_CARE + * Return: pointer to an XVisualInfo or NULL. + */ +static XVisualInfo * +choose_x_overlay_visual( Display *dpy, int scr, GLboolean rgbFlag, + int level, int trans_type, int trans_value, + int min_depth, int preferred_class ) +{ + OverlayInfo *overlay_info; + int numOverlaysPerScreen; + int i; + XVisualInfo *deepvis; + int deepest; + + /*DEBUG int tt, tv; */ + + switch (preferred_class) { + case GLX_TRUE_COLOR_EXT: preferred_class = TrueColor; break; + case GLX_DIRECT_COLOR_EXT: preferred_class = DirectColor; break; + case GLX_PSEUDO_COLOR_EXT: preferred_class = PseudoColor; break; + case GLX_STATIC_COLOR_EXT: preferred_class = StaticColor; break; + case GLX_GRAY_SCALE_EXT: preferred_class = GrayScale; break; + case GLX_STATIC_GRAY_EXT: preferred_class = StaticGray; break; + default: preferred_class = DONT_CARE; + } + + overlay_info = GetOverlayInfo(dpy, scr, &numOverlaysPerScreen); + if (!overlay_info) { + return NULL; + } + + /* Search for the deepest overlay which satisifies all criteria. 
*/ + deepest = min_depth; + deepvis = NULL; + + for (i = 0; i < numOverlaysPerScreen; i++) { + const OverlayInfo *ov = overlay_info + i; + XVisualInfo *vislist, vistemplate; + int count; + + if (ov->layer!=level) { + /* failed overlay level criteria */ + continue; + } + if (!(trans_type==DONT_CARE + || (trans_type==GLX_TRANSPARENT_INDEX_EXT + && ov->transparent_type>0) + || (trans_type==GLX_NONE_EXT && ov->transparent_type==0))) { + /* failed transparent pixel type criteria */ + continue; + } + if (trans_value!=DONT_CARE && trans_value!=ov->value) { + /* failed transparent pixel value criteria */ + continue; + } + + /* get XVisualInfo and check the depth */ + vistemplate.visualid = ov->overlay_visual; + vistemplate.screen = scr; + vislist = XGetVisualInfo( dpy, VisualIDMask | VisualScreenMask, + &vistemplate, &count ); + + if (count!=1) { + /* something went wrong */ + continue; + } + if (preferred_class!=DONT_CARE && preferred_class!=vislist->CLASS) { + /* wrong visual class */ + continue; + } + + /* if RGB was requested, make sure we have True/DirectColor */ + if (rgbFlag && vislist->CLASS != TrueColor + && vislist->CLASS != DirectColor) + continue; + + /* if CI was requested, make sure we have a color indexed visual */ + if (!rgbFlag + && (vislist->CLASS == TrueColor || vislist->CLASS == DirectColor)) + continue; + + if (deepvis==NULL || vislist->depth > deepest) { + /* YES! found a satisfactory visual */ + if (deepvis) { + XFree( deepvis ); + } + deepest = vislist->depth; + deepvis = vislist; + /* DEBUG tt = ov->transparent_type;*/ + /* DEBUG tv = ov->value; */ + } + } + +/*DEBUG + if (deepvis) { + printf("chose 0x%x: layer=%d depth=%d trans_type=%d trans_value=%d\n", + deepvis->visualid, level, deepvis->depth, tt, tv ); + } +*/ + return deepvis; +} + + +/**********************************************************************/ +/*** Display-related functions ***/ +/**********************************************************************/ + + +/** + * Free all XMesaVisuals which are associated with the given display. + */ +static void +destroy_visuals_on_display(Display *dpy) +{ + int i; + for (i = 0; i < NumVisuals; i++) { + if (VisualTable[i]->display == dpy) { + /* remove this visual */ + int j; + free(VisualTable[i]); + for (j = i; j < NumVisuals - 1; j++) + VisualTable[j] = VisualTable[j + 1]; + NumVisuals--; + } + } +} + + +/** + * Called from XCloseDisplay() to let us free our display-related data. + */ +static int +close_display_callback(Display *dpy, XExtCodes *codes) +{ + destroy_visuals_on_display(dpy); + xmesa_destroy_buffers_on_display(dpy); + return 0; +} + + +/** + * Look for the named extension on given display and return a pointer + * to the _XExtension data, or NULL if extension not found. + */ +static _XExtension * +lookup_extension(Display *dpy, const char *extName) +{ + _XExtension *ext; + for (ext = dpy->ext_procs; ext; ext = ext->next) { + if (ext->name && strcmp(ext->name, extName) == 0) { + return ext; + } + } + return NULL; +} + + +/** + * Whenever we're given a new Display pointer, call this function to + * register our close_display_callback function. 
+ */ +static void +register_with_display(Display *dpy) +{ + const char *extName = "MesaGLX"; + _XExtension *ext; + + ext = lookup_extension(dpy, extName); + if (!ext) { + XExtCodes *c = XAddExtension(dpy); + ext = dpy->ext_procs; /* new extension is at head of list */ + assert(c->extension == ext->codes.extension); + ext->name = _mesa_strdup(extName); + ext->close_display = close_display_callback; + } +} + + +/**********************************************************************/ +/*** Begin Fake GLX API Functions ***/ +/**********************************************************************/ + + +/** + * Helper used by glXChooseVisual and glXChooseFBConfig. + * The fbConfig parameter must be GL_FALSE for the former and GL_TRUE for + * the later. + * In either case, the attribute list is terminated with the value 'None'. + */ +static XMesaVisual +choose_visual( Display *dpy, int screen, const int *list, GLboolean fbConfig ) +{ + const GLboolean rgbModeDefault = fbConfig; + const int *parselist; + XVisualInfo *vis; + int min_ci = 0; + int min_red=0, min_green=0, min_blue=0; + GLboolean rgb_flag = rgbModeDefault; + GLboolean alpha_flag = GL_FALSE; + GLboolean double_flag = GL_FALSE; + GLboolean stereo_flag = GL_FALSE; + GLint depth_size = 0; + GLint stencil_size = 0; + GLint accumRedSize = 0; + GLint accumGreenSize = 0; + GLint accumBlueSize = 0; + GLint accumAlphaSize = 0; + int level = 0; + int visual_type = DONT_CARE; + int trans_type = DONT_CARE; + int trans_value = DONT_CARE; + GLint caveat = DONT_CARE; + XMesaVisual xmvis = NULL; + int desiredVisualID = -1; + int numAux = 0; + + parselist = list; + + while (*parselist) { + + switch (*parselist) { + case GLX_USE_GL: + if (fbConfig) { + /* invalid token */ + return NULL; + } + else { + /* skip */ + parselist++; + } + break; + case GLX_BUFFER_SIZE: + parselist++; + min_ci = *parselist++; + break; + case GLX_LEVEL: + parselist++; + level = *parselist++; + break; + case GLX_RGBA: + if (fbConfig) { + /* invalid token */ + return NULL; + } + else { + rgb_flag = GL_TRUE; + parselist++; + } + break; + case GLX_DOUBLEBUFFER: + parselist++; + if (fbConfig) { + double_flag = *parselist++; + } + else { + double_flag = GL_TRUE; + } + break; + case GLX_STEREO: + parselist++; + if (fbConfig) { + stereo_flag = *parselist++; + } + else { + stereo_flag = GL_TRUE; + } + break; + case GLX_AUX_BUFFERS: + parselist++; + numAux = *parselist++; + if (numAux > MAX_AUX_BUFFERS) + return NULL; + break; + case GLX_RED_SIZE: + parselist++; + min_red = *parselist++; + break; + case GLX_GREEN_SIZE: + parselist++; + min_green = *parselist++; + break; + case GLX_BLUE_SIZE: + parselist++; + min_blue = *parselist++; + break; + case GLX_ALPHA_SIZE: + parselist++; + { + GLint size = *parselist++; + alpha_flag = size ? 
GL_TRUE : GL_FALSE; + } + break; + case GLX_DEPTH_SIZE: + parselist++; + depth_size = *parselist++; + break; + case GLX_STENCIL_SIZE: + parselist++; + stencil_size = *parselist++; + break; + case GLX_ACCUM_RED_SIZE: + parselist++; + { + GLint size = *parselist++; + accumRedSize = MAX2( accumRedSize, size ); + } + break; + case GLX_ACCUM_GREEN_SIZE: + parselist++; + { + GLint size = *parselist++; + accumGreenSize = MAX2( accumGreenSize, size ); + } + break; + case GLX_ACCUM_BLUE_SIZE: + parselist++; + { + GLint size = *parselist++; + accumBlueSize = MAX2( accumBlueSize, size ); + } + break; + case GLX_ACCUM_ALPHA_SIZE: + parselist++; + { + GLint size = *parselist++; + accumAlphaSize = MAX2( accumAlphaSize, size ); + } + break; + + /* + * GLX_EXT_visual_info extension + */ + case GLX_X_VISUAL_TYPE_EXT: + parselist++; + visual_type = *parselist++; + break; + case GLX_TRANSPARENT_TYPE_EXT: + parselist++; + trans_type = *parselist++; + break; + case GLX_TRANSPARENT_INDEX_VALUE_EXT: + parselist++; + trans_value = *parselist++; + break; + case GLX_TRANSPARENT_RED_VALUE_EXT: + case GLX_TRANSPARENT_GREEN_VALUE_EXT: + case GLX_TRANSPARENT_BLUE_VALUE_EXT: + case GLX_TRANSPARENT_ALPHA_VALUE_EXT: + /* ignore */ + parselist++; + parselist++; + break; + + /* + * GLX_EXT_visual_info extension + */ + case GLX_VISUAL_CAVEAT_EXT: + parselist++; + caveat = *parselist++; /* ignored for now */ + break; + + /* + * GLX_ARB_multisample + */ + case GLX_SAMPLE_BUFFERS_ARB: + /* ms not supported */ + return NULL; + case GLX_SAMPLES_ARB: + /* ms not supported */ + return NULL; + + /* + * FBConfig attribs. + */ + case GLX_RENDER_TYPE: + if (!fbConfig) + return NULL; + parselist++; + if (*parselist == GLX_RGBA_BIT) { + rgb_flag = GL_TRUE; + } + else if (*parselist == GLX_COLOR_INDEX_BIT) { + rgb_flag = GL_FALSE; + } + else if (*parselist == 0) { + rgb_flag = GL_TRUE; + } + parselist++; + break; + case GLX_DRAWABLE_TYPE: + if (!fbConfig) + return NULL; + parselist++; + if (*parselist & ~(GLX_WINDOW_BIT | GLX_PIXMAP_BIT | GLX_PBUFFER_BIT)) { + return NULL; /* bad bit */ + } + parselist++; + break; + case GLX_FBCONFIG_ID: + if (!fbConfig) + return NULL; + parselist++; + desiredVisualID = *parselist++; + break; + case GLX_X_RENDERABLE: + if (!fbConfig) + return NULL; + parselist += 2; + /* ignore */ + break; + +#ifdef GLX_EXT_texture_from_pixmap + case GLX_BIND_TO_TEXTURE_RGB_EXT: + parselist++; /*skip*/ + break; + case GLX_BIND_TO_TEXTURE_RGBA_EXT: + parselist++; /*skip*/ + break; + case GLX_BIND_TO_MIPMAP_TEXTURE_EXT: + parselist++; /*skip*/ + break; + case GLX_BIND_TO_TEXTURE_TARGETS_EXT: + parselist++; + if (*parselist & ~(GLX_TEXTURE_1D_BIT_EXT | + GLX_TEXTURE_2D_BIT_EXT | + GLX_TEXTURE_RECTANGLE_BIT_EXT)) { + /* invalid bit */ + return NULL; + } + break; + case GLX_Y_INVERTED_EXT: + parselist++; /*skip*/ + break; +#endif + + case None: + /* end of list */ + break; + + default: + /* undefined attribute */ + _mesa_warning(NULL, "unexpected attrib 0x%x in choose_visual()", + *parselist); + return NULL; + } + } + + (void) caveat; + + /* + * Since we're only simulating the GLX extension this function will never + * find any real GL visuals. Instead, all we can do is try to find an RGB + * or CI visual of appropriate depth. Other requested attributes such as + * double buffering, depth buffer, etc. will be associated with the X + * visual and stored in the VisualTable[]. 
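 *
 * Illustrative request (not taken from this file): an application call such as
 *
 *   int attribs[] = { GLX_RGBA, GLX_DOUBLEBUFFER, GLX_DEPTH_SIZE, 16, None };
 *   XVisualInfo *vi = glXChooseVisual(dpy, screen, attribs);
 *
 * reaches this point with rgb_flag and double_flag set and depth_size == 16,
 * and is then satisfied by whatever X visual choose_x_visual() can supply.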
+ */ + if (desiredVisualID != -1) { + /* try to get a specific visual, by visualID */ + XVisualInfo temp; + int n; + temp.visualid = desiredVisualID; + temp.screen = screen; + vis = XGetVisualInfo(dpy, VisualIDMask | VisualScreenMask, &temp, &n); + if (vis) { + /* give the visual some useful GLX attributes */ + double_flag = GL_TRUE; + if (vis->depth > 8) + rgb_flag = GL_TRUE; + depth_size = default_depth_bits(); + stencil_size = STENCIL_BITS; + /* XXX accum??? */ + } + } + else if (level==0) { + /* normal color planes */ + if (rgb_flag) { + /* Get an RGB visual */ + int min_rgb = min_red + min_green + min_blue; + if (min_rgb>1 && min_rgb<8) { + /* a special case to be sure we can get a monochrome visual */ + min_rgb = 1; + } + vis = choose_x_visual( dpy, screen, rgb_flag, min_rgb, visual_type ); + } + else { + /* Get a color index visual */ + vis = choose_x_visual( dpy, screen, rgb_flag, min_ci, visual_type ); + accumRedSize = accumGreenSize = accumBlueSize = accumAlphaSize = 0; + } + } + else { + /* over/underlay planes */ + if (rgb_flag) { + /* rgba overlay */ + int min_rgb = min_red + min_green + min_blue; + if (min_rgb>1 && min_rgb<8) { + /* a special case to be sure we can get a monochrome visual */ + min_rgb = 1; + } + vis = choose_x_overlay_visual( dpy, screen, rgb_flag, level, + trans_type, trans_value, min_rgb, visual_type ); + } + else { + /* color index overlay */ + vis = choose_x_overlay_visual( dpy, screen, rgb_flag, level, + trans_type, trans_value, min_ci, visual_type ); + } + } + + if (vis) { + /* Note: we're not exactly obeying the glXChooseVisual rules here. + * When GLX_DEPTH_SIZE = 1 is specified we're supposed to choose the + * largest depth buffer size, which is 32bits/value. Instead, we + * return 16 to maintain performance with earlier versions of Mesa. + */ + if (stencil_size > 0) + depth_size = 24; /* if Z and stencil, always use 24+8 format */ + else if (depth_size > 24) + depth_size = 32; + else if (depth_size > 16) + depth_size = 24; + else if (depth_size > 0) { + depth_size = default_depth_bits(); + } + + if (!alpha_flag) { + alpha_flag = default_alpha_bits() > 0; + } + + /* we only support one size of stencil and accum buffers. */ + if (stencil_size > 0) + stencil_size = STENCIL_BITS; + if (accumRedSize > 0 || accumGreenSize > 0 || accumBlueSize > 0 || + accumAlphaSize > 0) { + accumRedSize = + accumGreenSize = + accumBlueSize = default_accum_bits(); + accumAlphaSize = alpha_flag ? 
accumRedSize : 0; + } + + xmvis = save_glx_visual( dpy, vis, rgb_flag, alpha_flag, double_flag, + stereo_flag, depth_size, stencil_size, + accumRedSize, accumGreenSize, + accumBlueSize, accumAlphaSize, level, numAux ); + } + + return xmvis; +} + + +static XVisualInfo * +Fake_glXChooseVisual( Display *dpy, int screen, int *list ) +{ + XMesaVisual xmvis; + + /* register ourselves as an extension on this display */ + register_with_display(dpy); + + xmvis = choose_visual(dpy, screen, list, GL_FALSE); + if (xmvis) { +#if 0 + return xmvis->vishandle; +#else + /* create a new vishandle - the cached one may be stale */ + xmvis->vishandle = (XVisualInfo *) _mesa_malloc(sizeof(XVisualInfo)); + if (xmvis->vishandle) { + _mesa_memcpy(xmvis->vishandle, xmvis->visinfo, sizeof(XVisualInfo)); + } + return xmvis->vishandle; +#endif + } + else + return NULL; +} + + +static GLXContext +Fake_glXCreateContext( Display *dpy, XVisualInfo *visinfo, + GLXContext share_list, Bool direct ) +{ + XMesaVisual xmvis; + struct fake_glx_context *glxCtx; + struct fake_glx_context *shareCtx = (struct fake_glx_context *) share_list; + + if (!dpy || !visinfo) + return 0; + + glxCtx = CALLOC_STRUCT(fake_glx_context); + if (!glxCtx) + return 0; + + /* deallocate unused windows/buffers */ +#if 0 + XMesaGarbageCollect(); +#endif + + xmvis = find_glx_visual( dpy, visinfo ); + if (!xmvis) { + /* This visual wasn't found with glXChooseVisual() */ + xmvis = create_glx_visual( dpy, visinfo ); + if (!xmvis) { + /* unusable visual */ + _mesa_free(glxCtx); + return NULL; + } + } + + glxCtx->xmesaContext = XMesaCreateContext(xmvis, + shareCtx ? shareCtx->xmesaContext : NULL); + if (!glxCtx->xmesaContext) { + _mesa_free(glxCtx); + return NULL; + } + + glxCtx->glxContext.isDirect = GL_FALSE; + glxCtx->glxContext.currentDpy = dpy; + glxCtx->glxContext.xid = (XID) glxCtx; /* self pointer */ + + assert((void *) glxCtx == (void *) &(glxCtx->glxContext)); + + return (GLXContext) glxCtx; +} + + +/* XXX these may have to be removed due to thread-safety issues. */ +static GLXContext MakeCurrent_PrevContext = 0; +static GLXDrawable MakeCurrent_PrevDrawable = 0; +static GLXDrawable MakeCurrent_PrevReadable = 0; +static XMesaBuffer MakeCurrent_PrevDrawBuffer = 0; +static XMesaBuffer MakeCurrent_PrevReadBuffer = 0; + + +/* GLX 1.3 and later */ +static Bool +Fake_glXMakeContextCurrent( Display *dpy, GLXDrawable draw, + GLXDrawable read, GLXContext ctx ) +{ + struct fake_glx_context *glxCtx = (struct fake_glx_context *) ctx; + static boolean firsttime = 1, no_rast = 0; + + if (firsttime) { + no_rast = getenv("SP_NO_RAST") != NULL; + firsttime = 0; + } + + + if (ctx && draw && read) { + XMesaBuffer drawBuffer, readBuffer; + XMesaContext xmctx = glxCtx->xmesaContext; + + /* Find the XMesaBuffer which corresponds to the GLXDrawable 'draw' */ + if (ctx == MakeCurrent_PrevContext + && draw == MakeCurrent_PrevDrawable) { + drawBuffer = MakeCurrent_PrevDrawBuffer; + } + else { + drawBuffer = XMesaFindBuffer( dpy, draw ); + } + if (!drawBuffer) { + /* drawable must be a new window! 
*/ + drawBuffer = XMesaCreateWindowBuffer( xmctx->xm_visual, draw ); + if (!drawBuffer) { + /* Out of memory, or context/drawable depth mismatch */ + return False; + } +#ifdef FX + FXcreateContext( xmctx->xm_visual, draw, xmctx, drawBuffer ); +#endif + } + + /* Find the XMesaBuffer which corresponds to the GLXDrawable 'read' */ + if (ctx == MakeCurrent_PrevContext + && read == MakeCurrent_PrevReadable) { + readBuffer = MakeCurrent_PrevReadBuffer; + } + else { + readBuffer = XMesaFindBuffer( dpy, read ); + } + if (!readBuffer) { + /* drawable must be a new window! */ + readBuffer = XMesaCreateWindowBuffer( xmctx->xm_visual, read ); + if (!readBuffer) { + /* Out of memory, or context/drawable depth mismatch */ + return False; + } +#ifdef FX + FXcreateContext( xmctx->xm_visual, read, xmctx, readBuffer ); +#endif + } + + if (no_rast && + MakeCurrent_PrevContext == ctx && + MakeCurrent_PrevDrawable == draw && + MakeCurrent_PrevReadable == read && + MakeCurrent_PrevDrawBuffer == drawBuffer && + MakeCurrent_PrevReadBuffer == readBuffer) + return True; + + MakeCurrent_PrevContext = ctx; + MakeCurrent_PrevDrawable = draw; + MakeCurrent_PrevReadable = read; + MakeCurrent_PrevDrawBuffer = drawBuffer; + MakeCurrent_PrevReadBuffer = readBuffer; + + /* Now make current! */ + if (XMesaMakeCurrent2(xmctx, drawBuffer, readBuffer)) { + ((__GLXcontext *) ctx)->currentDpy = dpy; + ((__GLXcontext *) ctx)->currentDrawable = draw; + ((__GLXcontext *) ctx)->currentReadable = read; + return True; + } + else { + return False; + } + } + else if (!ctx && !draw && !read) { + /* release current context w/out assigning new one. */ + XMesaMakeCurrent( NULL, NULL ); + MakeCurrent_PrevContext = 0; + MakeCurrent_PrevDrawable = 0; + MakeCurrent_PrevReadable = 0; + MakeCurrent_PrevDrawBuffer = 0; + MakeCurrent_PrevReadBuffer = 0; + return True; + } + else { + /* The args must either all be non-zero or all zero. + * This is an error. 
+ */ + return False; + } +} + + +static Bool +Fake_glXMakeCurrent( Display *dpy, GLXDrawable drawable, GLXContext ctx ) +{ + return Fake_glXMakeContextCurrent( dpy, drawable, drawable, ctx ); +} + + +static GLXPixmap +Fake_glXCreateGLXPixmap( Display *dpy, XVisualInfo *visinfo, Pixmap pixmap ) +{ + XMesaVisual v; + XMesaBuffer b; + + v = find_glx_visual( dpy, visinfo ); + if (!v) { + v = create_glx_visual( dpy, visinfo ); + if (!v) { + /* unusable visual */ + return 0; + } + } + + b = XMesaCreatePixmapBuffer( v, pixmap, 0 ); + if (!b) { + return 0; + } + return b->drawable; +} + + +/*** GLX_MESA_pixmap_colormap ***/ + +static GLXPixmap +Fake_glXCreateGLXPixmapMESA( Display *dpy, XVisualInfo *visinfo, + Pixmap pixmap, Colormap cmap ) +{ + XMesaVisual v; + XMesaBuffer b; + + v = find_glx_visual( dpy, visinfo ); + if (!v) { + v = create_glx_visual( dpy, visinfo ); + if (!v) { + /* unusable visual */ + return 0; + } + } + + b = XMesaCreatePixmapBuffer( v, pixmap, cmap ); + if (!b) { + return 0; + } + return b->drawable; +} + + +static void +Fake_glXDestroyGLXPixmap( Display *dpy, GLXPixmap pixmap ) +{ + XMesaBuffer b = XMesaFindBuffer(dpy, pixmap); + if (b) { + XMesaDestroyBuffer(b); + } + else if (_mesa_getenv("MESA_DEBUG")) { + _mesa_warning(NULL, "Mesa: glXDestroyGLXPixmap: invalid pixmap\n"); + } +} + + +static void +Fake_glXCopyContext( Display *dpy, GLXContext src, GLXContext dst, + unsigned long mask ) +{ + struct fake_glx_context *fakeSrc = (struct fake_glx_context *) src; + struct fake_glx_context *fakeDst = (struct fake_glx_context *) dst; + XMesaContext xm_src = fakeSrc->xmesaContext; + XMesaContext xm_dst = fakeDst->xmesaContext; + (void) dpy; + if (MakeCurrent_PrevContext == src) { + _mesa_Flush(); + } + st_copy_context_state( xm_src->st, xm_dst->st, (GLuint) mask ); +} + + +static Bool +Fake_glXQueryExtension( Display *dpy, int *errorb, int *event ) +{ + /* Mesa's GLX isn't really an X extension but we try to act like one. 
*/ + (void) dpy; + (void) errorb; + (void) event; + return True; +} + + +extern void _kw_ungrab_all( Display *dpy ); +void _kw_ungrab_all( Display *dpy ) +{ + XUngrabPointer( dpy, CurrentTime ); + XUngrabKeyboard( dpy, CurrentTime ); +} + + +static void +Fake_glXDestroyContext( Display *dpy, GLXContext ctx ) +{ + struct fake_glx_context *glxCtx = (struct fake_glx_context *) ctx; + (void) dpy; + MakeCurrent_PrevContext = 0; + MakeCurrent_PrevDrawable = 0; + MakeCurrent_PrevReadable = 0; + MakeCurrent_PrevDrawBuffer = 0; + MakeCurrent_PrevReadBuffer = 0; + XMesaDestroyContext( glxCtx->xmesaContext ); + XMesaGarbageCollect(); + _mesa_free(glxCtx); +} + + +static Bool +Fake_glXIsDirect( Display *dpy, GLXContext ctx ) +{ + (void) dpy; + (void) ctx; + return False; +} + + + +static void +Fake_glXSwapBuffers( Display *dpy, GLXDrawable drawable ) +{ + XMesaBuffer buffer = XMesaFindBuffer( dpy, drawable ); + static boolean firsttime = 1, no_rast = 0; + + if (firsttime) { + no_rast = getenv("SP_NO_RAST") != NULL; + firsttime = 0; + } + + if (no_rast) + return; + + if (buffer) { + XMesaSwapBuffers(buffer); + } + else if (_mesa_getenv("MESA_DEBUG")) { + _mesa_warning(NULL, "glXSwapBuffers: invalid drawable 0x%x\n", + (int) drawable); + } +} + + + +/*** GLX_MESA_copy_sub_buffer ***/ + +static void +Fake_glXCopySubBufferMESA( Display *dpy, GLXDrawable drawable, + int x, int y, int width, int height ) +{ + XMesaBuffer buffer = XMesaFindBuffer( dpy, drawable ); + if (buffer) { + XMesaCopySubBuffer(buffer, x, y, width, height); + } + else if (_mesa_getenv("MESA_DEBUG")) { + _mesa_warning(NULL, "Mesa: glXCopySubBufferMESA: invalid drawable\n"); + } +} + + +static Bool +Fake_glXQueryVersion( Display *dpy, int *maj, int *min ) +{ + (void) dpy; + /* Return GLX version, not Mesa version */ + assert(CLIENT_MAJOR_VERSION == SERVER_MAJOR_VERSION); + *maj = CLIENT_MAJOR_VERSION; + *min = MIN2( CLIENT_MINOR_VERSION, SERVER_MINOR_VERSION ); + return True; +} + + +/* + * Query the GLX attributes of the given XVisualInfo. 
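 *
 * Illustrative usage (assumed typical, not taken from this file):
 *
 *   int zbits = 0, dbl = 0;
 *   glXGetConfig(dpy, visinfo, GLX_DEPTH_SIZE, &zbits);
 *   glXGetConfig(dpy, visinfo, GLX_DOUBLEBUFFER, &dbl);
 *
 * Both calls arrive here through Fake_glXGetConfig() with fbconfig == GL_FALSE.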
+ */ +static int +get_config( XMesaVisual xmvis, int attrib, int *value, GLboolean fbconfig ) +{ + ASSERT(xmvis); + switch(attrib) { + case GLX_USE_GL: + if (fbconfig) + return GLX_BAD_ATTRIBUTE; + *value = (int) True; + return 0; + case GLX_BUFFER_SIZE: + *value = xmvis->visinfo->depth; + return 0; + case GLX_LEVEL: + *value = xmvis->mesa_visual.level; + return 0; + case GLX_RGBA: + if (fbconfig) + return GLX_BAD_ATTRIBUTE; + if (xmvis->mesa_visual.rgbMode) { + *value = True; + } + else { + *value = False; + } + return 0; + case GLX_DOUBLEBUFFER: + *value = (int) xmvis->mesa_visual.doubleBufferMode; + return 0; + case GLX_STEREO: + *value = (int) xmvis->mesa_visual.stereoMode; + return 0; + case GLX_AUX_BUFFERS: + *value = xmvis->mesa_visual.numAuxBuffers; + return 0; + case GLX_RED_SIZE: + *value = xmvis->mesa_visual.redBits; + return 0; + case GLX_GREEN_SIZE: + *value = xmvis->mesa_visual.greenBits; + return 0; + case GLX_BLUE_SIZE: + *value = xmvis->mesa_visual.blueBits; + return 0; + case GLX_ALPHA_SIZE: + *value = xmvis->mesa_visual.alphaBits; + return 0; + case GLX_DEPTH_SIZE: + *value = xmvis->mesa_visual.depthBits; + return 0; + case GLX_STENCIL_SIZE: + *value = xmvis->mesa_visual.stencilBits; + return 0; + case GLX_ACCUM_RED_SIZE: + *value = xmvis->mesa_visual.accumRedBits; + return 0; + case GLX_ACCUM_GREEN_SIZE: + *value = xmvis->mesa_visual.accumGreenBits; + return 0; + case GLX_ACCUM_BLUE_SIZE: + *value = xmvis->mesa_visual.accumBlueBits; + return 0; + case GLX_ACCUM_ALPHA_SIZE: + *value = xmvis->mesa_visual.accumAlphaBits; + return 0; + + /* + * GLX_EXT_visual_info extension + */ + case GLX_X_VISUAL_TYPE_EXT: + switch (xmvis->visinfo->CLASS) { + case StaticGray: *value = GLX_STATIC_GRAY_EXT; return 0; + case GrayScale: *value = GLX_GRAY_SCALE_EXT; return 0; + case StaticColor: *value = GLX_STATIC_GRAY_EXT; return 0; + case PseudoColor: *value = GLX_PSEUDO_COLOR_EXT; return 0; + case TrueColor: *value = GLX_TRUE_COLOR_EXT; return 0; + case DirectColor: *value = GLX_DIRECT_COLOR_EXT; return 0; + } + return 0; + case GLX_TRANSPARENT_TYPE_EXT: + if (xmvis->mesa_visual.level==0) { + /* normal planes */ + *value = GLX_NONE_EXT; + } + else if (xmvis->mesa_visual.level>0) { + /* overlay */ + if (xmvis->mesa_visual.rgbMode) { + *value = GLX_TRANSPARENT_RGB_EXT; + } + else { + *value = GLX_TRANSPARENT_INDEX_EXT; + } + } + else if (xmvis->mesa_visual.level<0) { + /* underlay */ + *value = GLX_NONE_EXT; + } + return 0; + case GLX_TRANSPARENT_INDEX_VALUE_EXT: + { + int pixel = transparent_pixel( xmvis ); + if (pixel>=0) { + *value = pixel; + } + /* else undefined */ + } + return 0; + case GLX_TRANSPARENT_RED_VALUE_EXT: + /* undefined */ + return 0; + case GLX_TRANSPARENT_GREEN_VALUE_EXT: + /* undefined */ + return 0; + case GLX_TRANSPARENT_BLUE_VALUE_EXT: + /* undefined */ + return 0; + case GLX_TRANSPARENT_ALPHA_VALUE_EXT: + /* undefined */ + return 0; + + /* + * GLX_EXT_visual_info extension + */ + case GLX_VISUAL_CAVEAT_EXT: + /* test for zero, just in case */ + if (xmvis->mesa_visual.visualRating > 0) + *value = xmvis->mesa_visual.visualRating; + else + *value = GLX_NONE_EXT; + return 0; + + /* + * GLX_ARB_multisample + */ + case GLX_SAMPLE_BUFFERS_ARB: + *value = 0; + return 0; + case GLX_SAMPLES_ARB: + *value = 0; + return 0; + + /* + * For FBConfigs: + */ + case GLX_SCREEN_EXT: + if (!fbconfig) + return GLX_BAD_ATTRIBUTE; + *value = xmvis->visinfo->screen; + break; + case GLX_DRAWABLE_TYPE: /*SGIX too */ + if (!fbconfig) + return GLX_BAD_ATTRIBUTE; + *value = GLX_WINDOW_BIT | 
GLX_PIXMAP_BIT | GLX_PBUFFER_BIT; + break; + case GLX_RENDER_TYPE_SGIX: + if (!fbconfig) + return GLX_BAD_ATTRIBUTE; + if (xmvis->mesa_visual.rgbMode) + *value = GLX_RGBA_BIT; + else + *value = GLX_COLOR_INDEX_BIT; + break; + case GLX_X_RENDERABLE_SGIX: + if (!fbconfig) + return GLX_BAD_ATTRIBUTE; + *value = True; /* XXX really? */ + break; + case GLX_FBCONFIG_ID_SGIX: + if (!fbconfig) + return GLX_BAD_ATTRIBUTE; + *value = xmvis->visinfo->visualid; + break; + case GLX_MAX_PBUFFER_WIDTH: + if (!fbconfig) + return GLX_BAD_ATTRIBUTE; + /* XXX or MAX_WIDTH? */ + *value = DisplayWidth(xmvis->display, xmvis->visinfo->screen); + break; + case GLX_MAX_PBUFFER_HEIGHT: + if (!fbconfig) + return GLX_BAD_ATTRIBUTE; + *value = DisplayHeight(xmvis->display, xmvis->visinfo->screen); + break; + case GLX_MAX_PBUFFER_PIXELS: + if (!fbconfig) + return GLX_BAD_ATTRIBUTE; + *value = DisplayWidth(xmvis->display, xmvis->visinfo->screen) * + DisplayHeight(xmvis->display, xmvis->visinfo->screen); + break; + case GLX_VISUAL_ID: + if (!fbconfig) + return GLX_BAD_ATTRIBUTE; + *value = xmvis->visinfo->visualid; + break; + +#ifdef GLX_EXT_texture_from_pixmap + case GLX_BIND_TO_TEXTURE_RGB_EXT: + *value = True; /*XXX*/ + break; + case GLX_BIND_TO_TEXTURE_RGBA_EXT: + /* XXX review */ + *value = xmvis->mesa_visual.alphaBits > 0 ? True : False; + break; + case GLX_BIND_TO_MIPMAP_TEXTURE_EXT: + *value = True; /*XXX*/ + break; + case GLX_BIND_TO_TEXTURE_TARGETS_EXT: + *value = (GLX_TEXTURE_1D_BIT_EXT | + GLX_TEXTURE_2D_BIT_EXT | + GLX_TEXTURE_RECTANGLE_BIT_EXT); /*XXX*/ + break; + case GLX_Y_INVERTED_EXT: + *value = True; /*XXX*/ + break; +#endif + + default: + return GLX_BAD_ATTRIBUTE; + } + return Success; +} + + +static int +Fake_glXGetConfig( Display *dpy, XVisualInfo *visinfo, + int attrib, int *value ) +{ + XMesaVisual xmvis; + int k; + if (!dpy || !visinfo) + return GLX_BAD_ATTRIBUTE; + + xmvis = find_glx_visual( dpy, visinfo ); + if (!xmvis) { + /* this visual wasn't obtained with glXChooseVisual */ + xmvis = create_glx_visual( dpy, visinfo ); + if (!xmvis) { + /* this visual can't be used for GL rendering */ + if (attrib==GLX_USE_GL) { + *value = (int) False; + return 0; + } + else { + return GLX_BAD_VISUAL; + } + } + } + + k = get_config(xmvis, attrib, value, GL_FALSE); + return k; +} + + +static void +Fake_glXWaitGL( void ) +{ + XMesaContext xmesa = XMesaGetCurrentContext(); + XMesaFlush( xmesa ); +} + + + +static void +Fake_glXWaitX( void ) +{ + XMesaContext xmesa = XMesaGetCurrentContext(); + XMesaFlush( xmesa ); +} + + +static const char * +get_extensions( void ) +{ +#ifdef FX + const char *fx = _mesa_getenv("MESA_GLX_FX"); + if (fx && fx[0] != 'd') { + return EXTENSIONS; + } +#endif + return EXTENSIONS + 23; /* skip "GLX_MESA_set_3dfx_mode" */ +} + + + +/* GLX 1.1 and later */ +static const char * +Fake_glXQueryExtensionsString( Display *dpy, int screen ) +{ + (void) dpy; + (void) screen; + return get_extensions(); +} + + + +/* GLX 1.1 and later */ +static const char * +Fake_glXQueryServerString( Display *dpy, int screen, int name ) +{ + static char version[1000]; + _mesa_sprintf(version, "%d.%d %s", + SERVER_MAJOR_VERSION, SERVER_MINOR_VERSION, MESA_GLX_VERSION); + + (void) dpy; + (void) screen; + + switch (name) { + case GLX_EXTENSIONS: + return get_extensions(); + case GLX_VENDOR: + return VENDOR; + case GLX_VERSION: + return version; + default: + return NULL; + } +} + + + +/* GLX 1.1 and later */ +static const char * +Fake_glXGetClientString( Display *dpy, int name ) +{ + static char version[1000]; + 
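The extension string assembled by get_extensions() is what applications receive from glXQueryExtensionsString(); a minimal sketch of the usual client-side check follows (the helper name and the simplistic substring match are illustrative, not part of this file).

#include <string.h>
#include <GL/glx.h>

/* Return non-zero if the named GLX extension is advertised for the screen. */
static int
has_glx_extension(Display *dpy, int screen, const char *name)
{
   const char *exts = glXQueryExtensionsString(dpy, screen);
   /* Naive check: a robust version would match whole space-separated tokens. */
   return exts != NULL && strstr(exts, name) != NULL;
}

/* e.g. has_glx_extension(dpy, DefaultScreen(dpy), "GLX_MESA_copy_sub_buffer") */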
_mesa_sprintf(version, "%d.%d %s", CLIENT_MAJOR_VERSION, + CLIENT_MINOR_VERSION, MESA_GLX_VERSION); + + (void) dpy; + + switch (name) { + case GLX_EXTENSIONS: + return get_extensions(); + case GLX_VENDOR: + return VENDOR; + case GLX_VERSION: + return version; + default: + return NULL; + } +} + + + +/* + * GLX 1.3 and later + */ + + +static int +Fake_glXGetFBConfigAttrib( Display *dpy, GLXFBConfig config, + int attribute, int *value ) +{ + XMesaVisual v = (XMesaVisual) config; + (void) dpy; + (void) config; + + if (!dpy || !config || !value) + return -1; + + return get_config(v, attribute, value, GL_TRUE); +} + + +static GLXFBConfig * +Fake_glXGetFBConfigs( Display *dpy, int screen, int *nelements ) +{ + XVisualInfo *visuals, visTemplate; + const long visMask = VisualScreenMask; + int i; + + /* Get list of all X visuals */ + visTemplate.screen = screen; + visuals = XGetVisualInfo(dpy, visMask, &visTemplate, nelements); + if (*nelements > 0) { + XMesaVisual *results; + results = (XMesaVisual *) _mesa_malloc(*nelements * sizeof(XMesaVisual)); + if (!results) { + *nelements = 0; + return NULL; + } + for (i = 0; i < *nelements; i++) { + results[i] = create_glx_visual(dpy, visuals + i); + } + return (GLXFBConfig *) results; + } + return NULL; +} + + +static GLXFBConfig * +Fake_glXChooseFBConfig( Display *dpy, int screen, + const int *attribList, int *nitems ) +{ + XMesaVisual xmvis; + + if (!attribList || !attribList[0]) { + /* return list of all configs (per GLX_SGIX_fbconfig spec) */ + return Fake_glXGetFBConfigs(dpy, screen, nitems); + } + + xmvis = choose_visual(dpy, screen, attribList, GL_TRUE); + if (xmvis) { + GLXFBConfig *config = (GLXFBConfig *) _mesa_malloc(sizeof(XMesaVisual)); + if (!config) { + *nitems = 0; + return NULL; + } + *nitems = 1; + config[0] = (GLXFBConfig) xmvis; + return (GLXFBConfig *) config; + } + else { + *nitems = 0; + return NULL; + } +} + + +static XVisualInfo * +Fake_glXGetVisualFromFBConfig( Display *dpy, GLXFBConfig config ) +{ + if (dpy && config) { + XMesaVisual xmvis = (XMesaVisual) config; +#if 0 + return xmvis->vishandle; +#else + /* create a new vishandle - the cached one may be stale */ + xmvis->vishandle = (XVisualInfo *) _mesa_malloc(sizeof(XVisualInfo)); + if (xmvis->vishandle) { + _mesa_memcpy(xmvis->vishandle, xmvis->visinfo, sizeof(XVisualInfo)); + } + return xmvis->vishandle; +#endif + } + else { + return NULL; + } +} + + +static GLXWindow +Fake_glXCreateWindow( Display *dpy, GLXFBConfig config, Window win, + const int *attribList ) +{ + XMesaVisual xmvis = (XMesaVisual) config; + XMesaBuffer xmbuf; + if (!xmvis) + return 0; + + xmbuf = XMesaCreateWindowBuffer(xmvis, win); + if (!xmbuf) + return 0; + +#ifdef FX + /* XXX this will segfault if actually called */ + FXcreateContext(xmvis, win, NULL, xmbuf); +#endif + + (void) dpy; + (void) attribList; /* Ignored in GLX 1.3 */ + + return win; /* A hack for now */ +} + + +static void +Fake_glXDestroyWindow( Display *dpy, GLXWindow window ) +{ + XMesaBuffer b = XMesaFindBuffer(dpy, (XMesaDrawable) window); + if (b) + XMesaDestroyBuffer(b); + /* don't destroy X window */ +} + + +/* XXX untested */ +static GLXPixmap +Fake_glXCreatePixmap( Display *dpy, GLXFBConfig config, Pixmap pixmap, + const int *attribList ) +{ + XMesaVisual v = (XMesaVisual) config; + XMesaBuffer b; + const int *attr; + int target = 0, format = 0, mipmap = 0; + int value; + + if (!dpy || !config || !pixmap) + return 0; + + for (attr = attribList; *attr; attr++) { + switch (*attr) { + case GLX_TEXTURE_FORMAT_EXT: + attr++; + switch 
(*attr) { + case GLX_TEXTURE_FORMAT_NONE_EXT: + case GLX_TEXTURE_FORMAT_RGB_EXT: + case GLX_TEXTURE_FORMAT_RGBA_EXT: + format = *attr; + break; + default: + /* error */ + return 0; + } + break; + case GLX_TEXTURE_TARGET_EXT: + attr++; + switch (*attr) { + case GLX_TEXTURE_1D_EXT: + case GLX_TEXTURE_2D_EXT: + case GLX_TEXTURE_RECTANGLE_EXT: + target = *attr; + break; + default: + /* error */ + return 0; + } + break; + case GLX_MIPMAP_TEXTURE_EXT: + attr++; + if (*attr) + mipmap = 1; + break; + default: + /* error */ + return 0; + } + } + + if (format == GLX_TEXTURE_FORMAT_RGB_EXT) { + if (get_config(v, GLX_BIND_TO_TEXTURE_RGB_EXT, + &value, GL_TRUE) != Success + || !value) { + return 0; /* error! */ + } + } + else if (format == GLX_TEXTURE_FORMAT_RGBA_EXT) { + if (get_config(v, GLX_BIND_TO_TEXTURE_RGBA_EXT, + &value, GL_TRUE) != Success + || !value) { + return 0; /* error! */ + } + } + if (mipmap) { + if (get_config(v, GLX_BIND_TO_MIPMAP_TEXTURE_EXT, + &value, GL_TRUE) != Success + || !value) { + return 0; /* error! */ + } + } + if (target == GLX_TEXTURE_1D_EXT) { + if (get_config(v, GLX_BIND_TO_TEXTURE_TARGETS_EXT, + &value, GL_TRUE) != Success + || (value & GLX_TEXTURE_1D_BIT_EXT) == 0) { + return 0; /* error! */ + } + } + else if (target == GLX_TEXTURE_2D_EXT) { + if (get_config(v, GLX_BIND_TO_TEXTURE_TARGETS_EXT, + &value, GL_TRUE) != Success + || (value & GLX_TEXTURE_2D_BIT_EXT) == 0) { + return 0; /* error! */ + } + } + if (target == GLX_TEXTURE_RECTANGLE_EXT) { + if (get_config(v, GLX_BIND_TO_TEXTURE_TARGETS_EXT, + &value, GL_TRUE) != Success + || (value & GLX_TEXTURE_RECTANGLE_BIT_EXT) == 0) { + return 0; /* error! */ + } + } + + if (format || target || mipmap) { + /* texture from pixmap */ + b = XMesaCreatePixmapTextureBuffer(v, pixmap, 0, format, target, mipmap); + } + else { + b = XMesaCreatePixmapBuffer( v, pixmap, 0 ); + } + if (!b) { + return 0; + } + + return pixmap; +} + + +static void +Fake_glXDestroyPixmap( Display *dpy, GLXPixmap pixmap ) +{ + XMesaBuffer b = XMesaFindBuffer(dpy, (XMesaDrawable)pixmap); + if (b) + XMesaDestroyBuffer(b); + /* don't destroy X pixmap */ +} + + +static GLXPbuffer +Fake_glXCreatePbuffer( Display *dpy, GLXFBConfig config, + const int *attribList ) +{ + XMesaVisual xmvis = (XMesaVisual) config; + XMesaBuffer xmbuf; + const int *attrib; + int width = 0, height = 0; + GLboolean useLargest = GL_FALSE, preserveContents = GL_FALSE; + + (void) dpy; + + for (attrib = attribList; *attrib; attrib++) { + switch (*attrib) { + case GLX_PBUFFER_WIDTH: + attrib++; + width = *attrib; + break; + case GLX_PBUFFER_HEIGHT: + attrib++; + height = *attrib; + break; + case GLX_PRESERVED_CONTENTS: + attrib++; + preserveContents = *attrib; /* ignored */ + break; + case GLX_LARGEST_PBUFFER: + attrib++; + useLargest = *attrib; /* ignored */ + break; + default: + return 0; + } + } + + /* not used at this time */ + (void) useLargest; + (void) preserveContents; + + if (width == 0 || height == 0) + return 0; + + xmbuf = XMesaCreatePBuffer( xmvis, 0, width, height); + /* A GLXPbuffer handle must be an X Drawable because that's what + * glXMakeCurrent takes. 
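 *
 * Illustrative request (sizes are examples only, not taken from this file):
 *
 *   int attribs[] = { GLX_PBUFFER_WIDTH, 512, GLX_PBUFFER_HEIGHT, 512, None };
 *   GLXPbuffer pb = glXCreatePbuffer(dpy, config, attribs);
 *
 * Width and height are required by this implementation; GLX_PRESERVED_CONTENTS
 * and GLX_LARGEST_PBUFFER are accepted but ignored.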
+ */ + if (xmbuf) + return (GLXPbuffer) xmbuf->drawable; + else + return 0; +} + + +static void +Fake_glXDestroyPbuffer( Display *dpy, GLXPbuffer pbuf ) +{ + XMesaBuffer b = XMesaFindBuffer(dpy, pbuf); + if (b) { + XMesaDestroyBuffer(b); + } +} + + +static void +Fake_glXQueryDrawable( Display *dpy, GLXDrawable draw, int attribute, + unsigned int *value ) +{ + XMesaBuffer xmbuf = XMesaFindBuffer(dpy, draw); + if (!xmbuf) + return; + + switch (attribute) { + case GLX_WIDTH: + *value = xmesa_buffer_width(xmbuf); + break; + case GLX_HEIGHT: + *value = xmesa_buffer_width(xmbuf); + break; + case GLX_PRESERVED_CONTENTS: + *value = True; + break; + case GLX_LARGEST_PBUFFER: + *value = xmesa_buffer_width(xmbuf) * xmesa_buffer_height(xmbuf); + break; + case GLX_FBCONFIG_ID: + *value = xmbuf->xm_visual->visinfo->visualid; + return; +#ifdef GLX_EXT_texture_from_pixmap + case GLX_TEXTURE_FORMAT_EXT: + *value = xmbuf->TextureFormat; + break; + case GLX_TEXTURE_TARGET_EXT: + *value = xmbuf->TextureTarget; + break; + case GLX_MIPMAP_TEXTURE_EXT: + *value = xmbuf->TextureMipmap; + break; +#endif + + default: + return; /* raise BadValue error */ + } +} + + +static GLXContext +Fake_glXCreateNewContext( Display *dpy, GLXFBConfig config, + int renderType, GLXContext shareList, Bool direct ) +{ + struct fake_glx_context *glxCtx; + struct fake_glx_context *shareCtx = (struct fake_glx_context *) shareList; + XMesaVisual xmvis = (XMesaVisual) config; + + if (!dpy || !config || + (renderType != GLX_RGBA_TYPE && renderType != GLX_COLOR_INDEX_TYPE)) + return 0; + + glxCtx = CALLOC_STRUCT(fake_glx_context); + if (!glxCtx) + return 0; + + /* deallocate unused windows/buffers */ + XMesaGarbageCollect(); + + glxCtx->xmesaContext = XMesaCreateContext(xmvis, + shareCtx ? shareCtx->xmesaContext : NULL); + if (!glxCtx->xmesaContext) { + _mesa_free(glxCtx); + return NULL; + } + + glxCtx->glxContext.isDirect = GL_FALSE; + glxCtx->glxContext.currentDpy = dpy; + glxCtx->glxContext.xid = (XID) glxCtx; /* self pointer */ + + assert((void *) glxCtx == (void *) &(glxCtx->glxContext)); + + return (GLXContext) glxCtx; +} + + +static int +Fake_glXQueryContext( Display *dpy, GLXContext ctx, int attribute, int *value ) +{ + struct fake_glx_context *glxCtx = (struct fake_glx_context *) ctx; + XMesaContext xmctx = glxCtx->xmesaContext; + + (void) dpy; + (void) ctx; + + switch (attribute) { + case GLX_FBCONFIG_ID: + *value = xmctx->xm_visual->visinfo->visualid; + break; + case GLX_RENDER_TYPE: + if (xmctx->xm_visual->mesa_visual.rgbMode) + *value = GLX_RGBA_BIT; + else + *value = GLX_COLOR_INDEX_BIT; + break; + case GLX_SCREEN: + *value = 0; + return Success; + default: + return GLX_BAD_ATTRIBUTE; + } + return 0; +} + + +static void +Fake_glXSelectEvent( Display *dpy, GLXDrawable drawable, unsigned long mask ) +{ + XMesaBuffer xmbuf = XMesaFindBuffer(dpy, drawable); + if (xmbuf) + xmbuf->selectedEvents = mask; +} + + +static void +Fake_glXGetSelectedEvent( Display *dpy, GLXDrawable drawable, + unsigned long *mask ) +{ + XMesaBuffer xmbuf = XMesaFindBuffer(dpy, drawable); + if (xmbuf) + *mask = xmbuf->selectedEvents; + else + *mask = 0; +} + + + +/*** GLX_SGI_swap_control ***/ + +static int +Fake_glXSwapIntervalSGI(int interval) +{ + (void) interval; + return 0; +} + + + +/*** GLX_SGI_video_sync ***/ + +static unsigned int FrameCounter = 0; + +static int +Fake_glXGetVideoSyncSGI(unsigned int *count) +{ + /* this is a bogus implementation */ + *count = FrameCounter++; + return 0; +} + +static int +Fake_glXWaitVideoSyncSGI(int divisor, int 
remainder, unsigned int *count) +{ + if (divisor <= 0 || remainder < 0) + return GLX_BAD_VALUE; + /* this is a bogus implementation */ + FrameCounter++; + while (FrameCounter % divisor != remainder) + FrameCounter++; + *count = FrameCounter; + return 0; +} + + + +/*** GLX_SGI_make_current_read ***/ + +static Bool +Fake_glXMakeCurrentReadSGI(Display *dpy, GLXDrawable draw, GLXDrawable read, GLXContext ctx) +{ + return Fake_glXMakeContextCurrent( dpy, draw, read, ctx ); +} + +/* not used +static GLXDrawable +Fake_glXGetCurrentReadDrawableSGI(void) +{ + return 0; +} +*/ + + +/*** GLX_SGIX_video_source ***/ +#if defined(_VL_H) + +static GLXVideoSourceSGIX +Fake_glXCreateGLXVideoSourceSGIX(Display *dpy, int screen, VLServer server, VLPath path, int nodeClass, VLNode drainNode) +{ + (void) dpy; + (void) screen; + (void) server; + (void) path; + (void) nodeClass; + (void) drainNode; + return 0; +} + +static void +Fake_glXDestroyGLXVideoSourceSGIX(Display *dpy, GLXVideoSourceSGIX src) +{ + (void) dpy; + (void) src; +} + +#endif + + +/*** GLX_EXT_import_context ***/ + +static void +Fake_glXFreeContextEXT(Display *dpy, GLXContext context) +{ + (void) dpy; + (void) context; +} + +static GLXContextID +Fake_glXGetContextIDEXT(const GLXContext context) +{ + (void) context; + return 0; +} + +static GLXContext +Fake_glXImportContextEXT(Display *dpy, GLXContextID contextID) +{ + (void) dpy; + (void) contextID; + return 0; +} + +static int +Fake_glXQueryContextInfoEXT(Display *dpy, GLXContext context, int attribute, int *value) +{ + (void) dpy; + (void) context; + (void) attribute; + (void) value; + return 0; +} + + + +/*** GLX_SGIX_fbconfig ***/ + +static int +Fake_glXGetFBConfigAttribSGIX(Display *dpy, GLXFBConfigSGIX config, int attribute, int *value) +{ + return Fake_glXGetFBConfigAttrib(dpy, config, attribute, value); +} + +static GLXFBConfigSGIX * +Fake_glXChooseFBConfigSGIX(Display *dpy, int screen, int *attrib_list, int *nelements) +{ + return (GLXFBConfig *) Fake_glXChooseFBConfig(dpy, screen, attrib_list, nelements); +} + + +static GLXPixmap +Fake_glXCreateGLXPixmapWithConfigSGIX(Display *dpy, GLXFBConfigSGIX config, Pixmap pixmap) +{ + XMesaVisual xmvis = (XMesaVisual) config; + XMesaBuffer xmbuf = XMesaCreatePixmapBuffer(xmvis, pixmap, 0); + return xmbuf->drawable; /* need to return an X ID */ +} + + +static GLXContext +Fake_glXCreateContextWithConfigSGIX(Display *dpy, GLXFBConfigSGIX config, int render_type, GLXContext share_list, Bool direct) +{ + XMesaVisual xmvis = (XMesaVisual) config; + struct fake_glx_context *glxCtx; + struct fake_glx_context *shareCtx = (struct fake_glx_context *) share_list; + + glxCtx = CALLOC_STRUCT(fake_glx_context); + if (!glxCtx) + return 0; + + /* deallocate unused windows/buffers */ + XMesaGarbageCollect(); + + glxCtx->xmesaContext = XMesaCreateContext(xmvis, + shareCtx ? 
shareCtx->xmesaContext : NULL); + if (!glxCtx->xmesaContext) { + _mesa_free(glxCtx); + return NULL; + } + + glxCtx->glxContext.isDirect = GL_FALSE; + glxCtx->glxContext.currentDpy = dpy; + glxCtx->glxContext.xid = (XID) glxCtx; /* self pointer */ + + assert((void *) glxCtx == (void *) &(glxCtx->glxContext)); + + return (GLXContext) glxCtx; +} + + +static XVisualInfo * +Fake_glXGetVisualFromFBConfigSGIX(Display *dpy, GLXFBConfigSGIX config) +{ + return Fake_glXGetVisualFromFBConfig(dpy, config); +} + + +static GLXFBConfigSGIX +Fake_glXGetFBConfigFromVisualSGIX(Display *dpy, XVisualInfo *vis) +{ + XMesaVisual xmvis = find_glx_visual(dpy, vis); + if (!xmvis) { + /* This visual wasn't found with glXChooseVisual() */ + xmvis = create_glx_visual(dpy, vis); + } + + return (GLXFBConfigSGIX) xmvis; +} + + + +/*** GLX_SGIX_pbuffer ***/ + +static GLXPbufferSGIX +Fake_glXCreateGLXPbufferSGIX(Display *dpy, GLXFBConfigSGIX config, + unsigned int width, unsigned int height, + int *attribList) +{ + XMesaVisual xmvis = (XMesaVisual) config; + XMesaBuffer xmbuf; + const int *attrib; + GLboolean useLargest = GL_FALSE, preserveContents = GL_FALSE; + + (void) dpy; + + for (attrib = attribList; attrib && *attrib; attrib++) { + switch (*attrib) { + case GLX_PRESERVED_CONTENTS_SGIX: + attrib++; + preserveContents = *attrib; /* ignored */ + break; + case GLX_LARGEST_PBUFFER_SGIX: + attrib++; + useLargest = *attrib; /* ignored */ + break; + default: + return 0; + } + } + + /* not used at this time */ + (void) useLargest; + (void) preserveContents; + + xmbuf = XMesaCreatePBuffer( xmvis, 0, width, height); + /* A GLXPbuffer handle must be an X Drawable because that's what + * glXMakeCurrent takes. + */ + return (GLXPbuffer) xmbuf->drawable; +} + + +static void +Fake_glXDestroyGLXPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuf) +{ + XMesaBuffer xmbuf = XMesaFindBuffer(dpy, pbuf); + if (xmbuf) { + XMesaDestroyBuffer(xmbuf); + } +} + + +static int +Fake_glXQueryGLXPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuf, int attribute, unsigned int *value) +{ + const XMesaBuffer xmbuf = XMesaFindBuffer(dpy, pbuf); + + if (!xmbuf) { + /* Generate GLXBadPbufferSGIX for bad pbuffer */ + return 0; + } + + switch (attribute) { + case GLX_PRESERVED_CONTENTS_SGIX: + *value = True; + break; + case GLX_LARGEST_PBUFFER_SGIX: + *value = xmesa_buffer_width(xmbuf) * xmesa_buffer_height(xmbuf); + break; + case GLX_WIDTH_SGIX: + *value = xmesa_buffer_width(xmbuf); + break; + case GLX_HEIGHT_SGIX: + *value = xmesa_buffer_height(xmbuf); + break; + case GLX_EVENT_MASK_SGIX: + *value = 0; /* XXX might be wrong */ + break; + default: + *value = 0; + } + return 0; +} + + +static void +Fake_glXSelectEventSGIX(Display *dpy, GLXDrawable drawable, unsigned long mask) +{ + XMesaBuffer xmbuf = XMesaFindBuffer(dpy, drawable); + if (xmbuf) { + /* Note: we'll never generate clobber events */ + xmbuf->selectedEvents = mask; + } +} + + +static void +Fake_glXGetSelectedEventSGIX(Display *dpy, GLXDrawable drawable, unsigned long *mask) +{ + XMesaBuffer xmbuf = XMesaFindBuffer(dpy, drawable); + if (xmbuf) { + *mask = xmbuf->selectedEvents; + } + else { + *mask = 0; + } +} + + + +/*** GLX_SGI_cushion ***/ + +static void +Fake_glXCushionSGI(Display *dpy, Window win, float cushion) +{ + (void) dpy; + (void) win; + (void) cushion; +} + + + +/*** GLX_SGIX_video_resize ***/ + +static int +Fake_glXBindChannelToWindowSGIX(Display *dpy, int screen, int channel , Window window) +{ + (void) dpy; + (void) screen; + (void) channel; + (void) window; + return 0; +} + +static int 
+Fake_glXChannelRectSGIX(Display *dpy, int screen, int channel, int x, int y, int w, int h) +{ + (void) dpy; + (void) screen; + (void) channel; + (void) x; + (void) y; + (void) w; + (void) h; + return 0; +} + +static int +Fake_glXQueryChannelRectSGIX(Display *dpy, int screen, int channel, int *x, int *y, int *w, int *h) +{ + (void) dpy; + (void) screen; + (void) channel; + (void) x; + (void) y; + (void) w; + (void) h; + return 0; +} + +static int +Fake_glXQueryChannelDeltasSGIX(Display *dpy, int screen, int channel, int *dx, int *dy, int *dw, int *dh) +{ + (void) dpy; + (void) screen; + (void) channel; + (void) dx; + (void) dy; + (void) dw; + (void) dh; + return 0; +} + +static int +Fake_glXChannelRectSyncSGIX(Display *dpy, int screen, int channel, GLenum synctype) +{ + (void) dpy; + (void) screen; + (void) channel; + (void) synctype; + return 0; +} + + + +/*** GLX_SGIX_dmbuffer **/ + +#if defined(_DM_BUFFER_H_) +static Bool +Fake_glXAssociateDMPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuffer, DMparams *params, DMbuffer dmbuffer) +{ + (void) dpy; + (void) pbuffer; + (void) params; + (void) dmbuffer; + return False; +} +#endif + + +/*** GLX_SGIX_swap_group ***/ + +static void +Fake_glXJoinSwapGroupSGIX(Display *dpy, GLXDrawable drawable, GLXDrawable member) +{ + (void) dpy; + (void) drawable; + (void) member; +} + + + +/*** GLX_SGIX_swap_barrier ***/ + +static void +Fake_glXBindSwapBarrierSGIX(Display *dpy, GLXDrawable drawable, int barrier) +{ + (void) dpy; + (void) drawable; + (void) barrier; +} + +static Bool +Fake_glXQueryMaxSwapBarriersSGIX(Display *dpy, int screen, int *max) +{ + (void) dpy; + (void) screen; + (void) max; + return False; +} + + + +/*** GLX_SUN_get_transparent_index ***/ + +static Status +Fake_glXGetTransparentIndexSUN(Display *dpy, Window overlay, Window underlay, long *pTransparent) +{ + (void) dpy; + (void) overlay; + (void) underlay; + (void) pTransparent; + return 0; +} + + + +/*** GLX_MESA_release_buffers ***/ + +/* + * Release the depth, stencil, accum buffers attached to a GLXDrawable + * (a window or pixmap) prior to destroying the GLXDrawable. 
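Illustrative aside, not part of the patch: the usual call sequence for the extension described in this comment is to release the ancillary buffers just before the X window itself is destroyed. The helper name below is made up, and glXReleaseBuffersMESA() is assumed to be prototyped via GL/glxext.h when GLX_GLXEXT_PROTOTYPES is defined.

#define GLX_GLXEXT_PROTOTYPES
#include <GL/glx.h>
#include <X11/Xlib.h>

static void
example_destroy_gl_window(Display *dpy, Window win)
{
   /* let the library free depth/stencil/accum buffers eagerly */
   glXReleaseBuffersMESA(dpy, win);
   XDestroyWindow(dpy, win);
}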
+ */ +static Bool +Fake_glXReleaseBuffersMESA( Display *dpy, GLXDrawable d ) +{ + XMesaBuffer b = XMesaFindBuffer(dpy, d); + if (b) { + XMesaDestroyBuffer(b); + return True; + } + return False; +} + + + +/*** GLX_MESA_set_3dfx_mode ***/ + +static Bool +Fake_glXSet3DfxModeMESA( int mode ) +{ + return XMesaSetFXmode( mode ); +} + + + +/*** GLX_NV_vertex_array range ***/ +static void * +Fake_glXAllocateMemoryNV( GLsizei size, + GLfloat readFrequency, + GLfloat writeFrequency, + GLfloat priority ) +{ + (void) size; + (void) readFrequency; + (void) writeFrequency; + (void) priority; + return NULL; +} + + +static void +Fake_glXFreeMemoryNV( GLvoid *pointer ) +{ + (void) pointer; +} + + +/*** GLX_MESA_agp_offset ***/ + +static GLuint +Fake_glXGetAGPOffsetMESA( const GLvoid *pointer ) +{ + (void) pointer; + return ~0; +} + + +/*** GLX_EXT_texture_from_pixmap ***/ + +static void +Fake_glXBindTexImageEXT(Display *dpy, GLXDrawable drawable, int buffer, + const int *attrib_list) +{ + XMesaBuffer b = XMesaFindBuffer(dpy, drawable); + if (b) + XMesaBindTexImage(dpy, b, buffer, attrib_list); +} + +static void +Fake_glXReleaseTexImageEXT(Display *dpy, GLXDrawable drawable, int buffer) +{ + XMesaBuffer b = XMesaFindBuffer(dpy, drawable); + if (b) + XMesaReleaseTexImage(dpy, b, buffer); +} + + +/* silence warning */ +extern struct _glxapi_table *_mesa_GetGLXDispatchTable(void); + + +/** + * Create a new GLX API dispatch table with its function pointers + * initialized to point to Mesa's "fake" GLX API functions. + * Note: there's a similar function (_real_GetGLXDispatchTable) that + * returns a new dispatch table with all pointers initalized to point + * to "real" GLX functions (which understand GLX wire protocol, etc). + */ +struct _glxapi_table * +_mesa_GetGLXDispatchTable(void) +{ + static struct _glxapi_table glx; + + /* be sure our dispatch table size <= libGL's table */ + { + GLuint size = sizeof(struct _glxapi_table) / sizeof(void *); + (void) size; + assert(_glxapi_get_dispatch_table_size() >= size); + } + + /* initialize the whole table to no-ops */ + _glxapi_set_no_op_table(&glx); + + /* now initialize the table with the functions I implement */ + glx.ChooseVisual = Fake_glXChooseVisual; + glx.CopyContext = Fake_glXCopyContext; + glx.CreateContext = Fake_glXCreateContext; + glx.CreateGLXPixmap = Fake_glXCreateGLXPixmap; + glx.DestroyContext = Fake_glXDestroyContext; + glx.DestroyGLXPixmap = Fake_glXDestroyGLXPixmap; + glx.GetConfig = Fake_glXGetConfig; + /*glx.GetCurrentContext = Fake_glXGetCurrentContext;*/ + /*glx.GetCurrentDrawable = Fake_glXGetCurrentDrawable;*/ + glx.IsDirect = Fake_glXIsDirect; + glx.MakeCurrent = Fake_glXMakeCurrent; + glx.QueryExtension = Fake_glXQueryExtension; + glx.QueryVersion = Fake_glXQueryVersion; + glx.SwapBuffers = Fake_glXSwapBuffers; + glx.UseXFont = Fake_glXUseXFont; + glx.WaitGL = Fake_glXWaitGL; + glx.WaitX = Fake_glXWaitX; + + /*** GLX_VERSION_1_1 ***/ + glx.GetClientString = Fake_glXGetClientString; + glx.QueryExtensionsString = Fake_glXQueryExtensionsString; + glx.QueryServerString = Fake_glXQueryServerString; + + /*** GLX_VERSION_1_2 ***/ + /*glx.GetCurrentDisplay = Fake_glXGetCurrentDisplay;*/ + + /*** GLX_VERSION_1_3 ***/ + glx.ChooseFBConfig = Fake_glXChooseFBConfig; + glx.CreateNewContext = Fake_glXCreateNewContext; + glx.CreatePbuffer = Fake_glXCreatePbuffer; + glx.CreatePixmap = Fake_glXCreatePixmap; + glx.CreateWindow = Fake_glXCreateWindow; + glx.DestroyPbuffer = Fake_glXDestroyPbuffer; + glx.DestroyPixmap = Fake_glXDestroyPixmap; + 
glx.DestroyWindow = Fake_glXDestroyWindow; + /*glx.GetCurrentReadDrawable = Fake_glXGetCurrentReadDrawable;*/ + glx.GetFBConfigAttrib = Fake_glXGetFBConfigAttrib; + glx.GetFBConfigs = Fake_glXGetFBConfigs; + glx.GetSelectedEvent = Fake_glXGetSelectedEvent; + glx.GetVisualFromFBConfig = Fake_glXGetVisualFromFBConfig; + glx.MakeContextCurrent = Fake_glXMakeContextCurrent; + glx.QueryContext = Fake_glXQueryContext; + glx.QueryDrawable = Fake_glXQueryDrawable; + glx.SelectEvent = Fake_glXSelectEvent; + + /*** GLX_SGI_swap_control ***/ + glx.SwapIntervalSGI = Fake_glXSwapIntervalSGI; + + /*** GLX_SGI_video_sync ***/ + glx.GetVideoSyncSGI = Fake_glXGetVideoSyncSGI; + glx.WaitVideoSyncSGI = Fake_glXWaitVideoSyncSGI; + + /*** GLX_SGI_make_current_read ***/ + glx.MakeCurrentReadSGI = Fake_glXMakeCurrentReadSGI; + /*glx.GetCurrentReadDrawableSGI = Fake_glXGetCurrentReadDrawableSGI;*/ + +/*** GLX_SGIX_video_source ***/ +#if defined(_VL_H) + glx.CreateGLXVideoSourceSGIX = Fake_glXCreateGLXVideoSourceSGIX; + glx.DestroyGLXVideoSourceSGIX = Fake_glXDestroyGLXVideoSourceSGIX; +#endif + + /*** GLX_EXT_import_context ***/ + glx.FreeContextEXT = Fake_glXFreeContextEXT; + glx.GetContextIDEXT = Fake_glXGetContextIDEXT; + /*glx.GetCurrentDisplayEXT = Fake_glXGetCurrentDisplayEXT;*/ + glx.ImportContextEXT = Fake_glXImportContextEXT; + glx.QueryContextInfoEXT = Fake_glXQueryContextInfoEXT; + + /*** GLX_SGIX_fbconfig ***/ + glx.GetFBConfigAttribSGIX = Fake_glXGetFBConfigAttribSGIX; + glx.ChooseFBConfigSGIX = Fake_glXChooseFBConfigSGIX; + glx.CreateGLXPixmapWithConfigSGIX = Fake_glXCreateGLXPixmapWithConfigSGIX; + glx.CreateContextWithConfigSGIX = Fake_glXCreateContextWithConfigSGIX; + glx.GetVisualFromFBConfigSGIX = Fake_glXGetVisualFromFBConfigSGIX; + glx.GetFBConfigFromVisualSGIX = Fake_glXGetFBConfigFromVisualSGIX; + + /*** GLX_SGIX_pbuffer ***/ + glx.CreateGLXPbufferSGIX = Fake_glXCreateGLXPbufferSGIX; + glx.DestroyGLXPbufferSGIX = Fake_glXDestroyGLXPbufferSGIX; + glx.QueryGLXPbufferSGIX = Fake_glXQueryGLXPbufferSGIX; + glx.SelectEventSGIX = Fake_glXSelectEventSGIX; + glx.GetSelectedEventSGIX = Fake_glXGetSelectedEventSGIX; + + /*** GLX_SGI_cushion ***/ + glx.CushionSGI = Fake_glXCushionSGI; + + /*** GLX_SGIX_video_resize ***/ + glx.BindChannelToWindowSGIX = Fake_glXBindChannelToWindowSGIX; + glx.ChannelRectSGIX = Fake_glXChannelRectSGIX; + glx.QueryChannelRectSGIX = Fake_glXQueryChannelRectSGIX; + glx.QueryChannelDeltasSGIX = Fake_glXQueryChannelDeltasSGIX; + glx.ChannelRectSyncSGIX = Fake_glXChannelRectSyncSGIX; + + /*** GLX_SGIX_dmbuffer **/ +#if defined(_DM_BUFFER_H_) + glx.AssociateDMPbufferSGIX = NULL; +#endif + + /*** GLX_SGIX_swap_group ***/ + glx.JoinSwapGroupSGIX = Fake_glXJoinSwapGroupSGIX; + + /*** GLX_SGIX_swap_barrier ***/ + glx.BindSwapBarrierSGIX = Fake_glXBindSwapBarrierSGIX; + glx.QueryMaxSwapBarriersSGIX = Fake_glXQueryMaxSwapBarriersSGIX; + + /*** GLX_SUN_get_transparent_index ***/ + glx.GetTransparentIndexSUN = Fake_glXGetTransparentIndexSUN; + + /*** GLX_MESA_copy_sub_buffer ***/ + glx.CopySubBufferMESA = Fake_glXCopySubBufferMESA; + + /*** GLX_MESA_release_buffers ***/ + glx.ReleaseBuffersMESA = Fake_glXReleaseBuffersMESA; + + /*** GLX_MESA_pixmap_colormap ***/ + glx.CreateGLXPixmapMESA = Fake_glXCreateGLXPixmapMESA; + + /*** GLX_MESA_set_3dfx_mode ***/ + glx.Set3DfxModeMESA = Fake_glXSet3DfxModeMESA; + + /*** GLX_NV_vertex_array_range ***/ + glx.AllocateMemoryNV = Fake_glXAllocateMemoryNV; + glx.FreeMemoryNV = Fake_glXFreeMemoryNV; + + /*** GLX_MESA_agp_offset ***/ + 
glx.GetAGPOffsetMESA = Fake_glXGetAGPOffsetMESA; + + /*** GLX_EXT_texture_from_pixmap ***/ + glx.BindTexImageEXT = Fake_glXBindTexImageEXT; + glx.ReleaseTexImageEXT = Fake_glXReleaseTexImageEXT; + + return &glx; +} diff --git a/src/gallium/winsys/xlib/glxapi.c b/src/gallium/winsys/xlib/glxapi.c new file mode 100644 index 0000000000..c059fc3edb --- /dev/null +++ b/src/gallium/winsys/xlib/glxapi.c @@ -0,0 +1,1390 @@ +/* + * Mesa 3-D graphics library + * Version: 7.1 + * + * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +/* + * This is the GLX API dispatcher. Calls to the glX* functions are + * either routed to the real GLX encoders or to Mesa's pseudo-GLX functions. + * See the glxapi.h file for more details. + */ + + +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include "main/glheader.h" +#include "glapi/glapi.h" +#include "glxapi.h" +#include "pipe/p_thread.h" + + +extern struct _glxapi_table *_real_GetGLXDispatchTable(void); +extern struct _glxapi_table *_mesa_GetGLXDispatchTable(void); + + +struct display_dispatch { + Display *Dpy; + struct _glxapi_table *Table; + struct display_dispatch *Next; +}; + +static struct display_dispatch *DispatchList = NULL; + + +/* Display -> Dispatch caching */ +static Display *prevDisplay = NULL; +static struct _glxapi_table *prevTable = NULL; + + +static struct _glxapi_table * +get_dispatch(Display *dpy) +{ + if (!dpy) + return NULL; + + /* search list of display/dispatch pairs for this display */ + { + const struct display_dispatch *d = DispatchList; + while (d) { + if (d->Dpy == dpy) { + prevDisplay = dpy; + prevTable = d->Table; + return d->Table; /* done! */ + } + d = d->Next; + } + } + + /* A new display, determine if we should use real GLX + * or Mesa's pseudo-GLX. + */ + { + struct _glxapi_table *t = _mesa_GetGLXDispatchTable(); + + if (t) { + struct display_dispatch *d; + d = (struct display_dispatch *) malloc(sizeof(struct display_dispatch)); + if (d) { + d->Dpy = dpy; + d->Table = t; + /* insert at head of list */ + d->Next = DispatchList; + DispatchList = d; + /* update cache */ + prevDisplay = dpy; + prevTable = t; + return t; + } + } + } + + /* If we get here that means we can't use real GLX on this display + * and the Mesa pseudo-GLX software renderer wasn't compiled in. + * Or, we ran out of memory! 
+ */ + return NULL; +} + + +/* Don't use the GET_DISPATCH defined in glthread.h */ +#undef GET_DISPATCH + +#define GET_DISPATCH(DPY, TABLE) \ + if (DPY == prevDisplay) { \ + TABLE = prevTable; \ + } \ + else if (!DPY) { \ + TABLE = NULL; \ + } \ + else { \ + TABLE = get_dispatch(DPY); \ + } + + + + +/** + * GLX API current context. + */ +pipe_tsd ContextTSD; + + +static void +SetCurrentContext(GLXContext c) +{ + pipe_tsd_set(&ContextTSD, c); +} + + +/* + * GLX API entrypoints + */ + +/*** GLX_VERSION_1_0 ***/ + +XVisualInfo PUBLIC * +glXChooseVisual(Display *dpy, int screen, int *list) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return NULL; + return (t->ChooseVisual)(dpy, screen, list); +} + + +void PUBLIC +glXCopyContext(Display *dpy, GLXContext src, GLXContext dst, unsigned long mask) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->CopyContext)(dpy, src, dst, mask); +} + + +GLXContext PUBLIC +glXCreateContext(Display *dpy, XVisualInfo *visinfo, GLXContext shareList, Bool direct) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->CreateContext)(dpy, visinfo, shareList, direct); +} + + +GLXPixmap PUBLIC +glXCreateGLXPixmap(Display *dpy, XVisualInfo *visinfo, Pixmap pixmap) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->CreateGLXPixmap)(dpy, visinfo, pixmap); +} + + +void PUBLIC +glXDestroyContext(Display *dpy, GLXContext ctx) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + if (glXGetCurrentContext() == ctx) + SetCurrentContext(NULL); + (t->DestroyContext)(dpy, ctx); +} + + +void PUBLIC +glXDestroyGLXPixmap(Display *dpy, GLXPixmap pixmap) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->DestroyGLXPixmap)(dpy, pixmap); +} + + +int PUBLIC +glXGetConfig(Display *dpy, XVisualInfo *visinfo, int attrib, int *value) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return GLX_NO_EXTENSION; + return (t->GetConfig)(dpy, visinfo, attrib, value); +} + + +GLXContext PUBLIC +glXGetCurrentContext(void) +{ + return (GLXContext) pipe_tsd_get(&ContextTSD); +} + + +GLXDrawable PUBLIC +glXGetCurrentDrawable(void) +{ + __GLXcontext *gc = (__GLXcontext *) glXGetCurrentContext(); + return gc ? 
gc->currentDrawable : 0; +} + + +Bool PUBLIC +glXIsDirect(Display *dpy, GLXContext ctx) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return False; + return (t->IsDirect)(dpy, ctx); +} + + +Bool PUBLIC +glXMakeCurrent(Display *dpy, GLXDrawable drawable, GLXContext ctx) +{ + Bool b; + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) { + return False; + } + b = (*t->MakeCurrent)(dpy, drawable, ctx); + if (b) { + SetCurrentContext(ctx); + } + return b; +} + + +Bool PUBLIC +glXQueryExtension(Display *dpy, int *errorb, int *event) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return False; + return (t->QueryExtension)(dpy, errorb, event); +} + + +Bool PUBLIC +glXQueryVersion(Display *dpy, int *maj, int *min) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return False; + return (t->QueryVersion)(dpy, maj, min); +} + + +void PUBLIC +glXSwapBuffers(Display *dpy, GLXDrawable drawable) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->SwapBuffers)(dpy, drawable); +} + + +void PUBLIC +glXUseXFont(Font font, int first, int count, int listBase) +{ + struct _glxapi_table *t; + Display *dpy = glXGetCurrentDisplay(); + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->UseXFont)(font, first, count, listBase); +} + + +void PUBLIC +glXWaitGL(void) +{ + struct _glxapi_table *t; + Display *dpy = glXGetCurrentDisplay(); + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->WaitGL)(); +} + + +void PUBLIC +glXWaitX(void) +{ + struct _glxapi_table *t; + Display *dpy = glXGetCurrentDisplay(); + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->WaitX)(); +} + + + +/*** GLX_VERSION_1_1 ***/ + +const char PUBLIC * +glXGetClientString(Display *dpy, int name) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return NULL; + return (t->GetClientString)(dpy, name); +} + + +const char PUBLIC * +glXQueryExtensionsString(Display *dpy, int screen) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return NULL; + return (t->QueryExtensionsString)(dpy, screen); +} + + +const char PUBLIC * +glXQueryServerString(Display *dpy, int screen, int name) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return NULL; + return (t->QueryServerString)(dpy, screen, name); +} + + +/*** GLX_VERSION_1_2 ***/ + +Display PUBLIC * +glXGetCurrentDisplay(void) +{ + /* Same code as in libGL's glxext.c */ + __GLXcontext *gc = (__GLXcontext *) glXGetCurrentContext(); + if (NULL == gc) return NULL; + return gc->currentDpy; +} + + + +/*** GLX_VERSION_1_3 ***/ + +GLXFBConfig PUBLIC * +glXChooseFBConfig(Display *dpy, int screen, const int *attribList, int *nitems) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->ChooseFBConfig)(dpy, screen, attribList, nitems); +} + + +GLXContext PUBLIC +glXCreateNewContext(Display *dpy, GLXFBConfig config, int renderType, GLXContext shareList, Bool direct) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->CreateNewContext)(dpy, config, renderType, shareList, direct); +} + + +GLXPbuffer PUBLIC +glXCreatePbuffer(Display *dpy, GLXFBConfig config, const int *attribList) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->CreatePbuffer)(dpy, config, attribList); +} + + +GLXPixmap PUBLIC +glXCreatePixmap(Display *dpy, GLXFBConfig config, Pixmap pixmap, const int *attribList) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return 
(t->CreatePixmap)(dpy, config, pixmap, attribList); +} + + +GLXWindow PUBLIC +glXCreateWindow(Display *dpy, GLXFBConfig config, Window win, const int *attribList) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->CreateWindow)(dpy, config, win, attribList); +} + + +void PUBLIC +glXDestroyPbuffer(Display *dpy, GLXPbuffer pbuf) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->DestroyPbuffer)(dpy, pbuf); +} + + +void PUBLIC +glXDestroyPixmap(Display *dpy, GLXPixmap pixmap) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->DestroyPixmap)(dpy, pixmap); +} + + +void PUBLIC +glXDestroyWindow(Display *dpy, GLXWindow window) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->DestroyWindow)(dpy, window); +} + + +GLXDrawable PUBLIC +glXGetCurrentReadDrawable(void) +{ + __GLXcontext *gc = (__GLXcontext *) glXGetCurrentContext(); + return gc ? gc->currentReadable : 0; +} + + +int PUBLIC +glXGetFBConfigAttrib(Display *dpy, GLXFBConfig config, int attribute, int *value) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return GLX_NO_EXTENSION; + return (t->GetFBConfigAttrib)(dpy, config, attribute, value); +} + + +GLXFBConfig PUBLIC * +glXGetFBConfigs(Display *dpy, int screen, int *nelements) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->GetFBConfigs)(dpy, screen, nelements); +} + +void PUBLIC +glXGetSelectedEvent(Display *dpy, GLXDrawable drawable, unsigned long *mask) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->GetSelectedEvent)(dpy, drawable, mask); +} + + +XVisualInfo PUBLIC * +glXGetVisualFromFBConfig(Display *dpy, GLXFBConfig config) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return NULL; + return (t->GetVisualFromFBConfig)(dpy, config); +} + + +Bool PUBLIC +glXMakeContextCurrent(Display *dpy, GLXDrawable draw, GLXDrawable read, GLXContext ctx) +{ + Bool b; + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return False; + b = (t->MakeContextCurrent)(dpy, draw, read, ctx); + if (b) { + SetCurrentContext(ctx); + } + return b; +} + + +int PUBLIC +glXQueryContext(Display *dpy, GLXContext ctx, int attribute, int *value) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + assert(t); + if (!t) + return 0; /* XXX correct? 
*/ + return (t->QueryContext)(dpy, ctx, attribute, value); +} + + +void PUBLIC +glXQueryDrawable(Display *dpy, GLXDrawable draw, int attribute, unsigned int *value) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->QueryDrawable)(dpy, draw, attribute, value); +} + + +void PUBLIC +glXSelectEvent(Display *dpy, GLXDrawable drawable, unsigned long mask) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->SelectEvent)(dpy, drawable, mask); +} + + + +/*** GLX_SGI_swap_control ***/ + +int PUBLIC +glXSwapIntervalSGI(int interval) +{ + struct _glxapi_table *t; + Display *dpy = glXGetCurrentDisplay(); + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->SwapIntervalSGI)(interval); +} + + + +/*** GLX_SGI_video_sync ***/ + +int PUBLIC +glXGetVideoSyncSGI(unsigned int *count) +{ + struct _glxapi_table *t; + Display *dpy = glXGetCurrentDisplay(); + GET_DISPATCH(dpy, t); + if (!t || !glXGetCurrentContext()) + return GLX_BAD_CONTEXT; + return (t->GetVideoSyncSGI)(count); +} + +int PUBLIC +glXWaitVideoSyncSGI(int divisor, int remainder, unsigned int *count) +{ + struct _glxapi_table *t; + Display *dpy = glXGetCurrentDisplay(); + GET_DISPATCH(dpy, t); + if (!t || !glXGetCurrentContext()) + return GLX_BAD_CONTEXT; + return (t->WaitVideoSyncSGI)(divisor, remainder, count); +} + + + +/*** GLX_SGI_make_current_read ***/ + +Bool PUBLIC +glXMakeCurrentReadSGI(Display *dpy, GLXDrawable draw, GLXDrawable read, GLXContext ctx) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return False; + return (t->MakeCurrentReadSGI)(dpy, draw, read, ctx); +} + +GLXDrawable PUBLIC +glXGetCurrentReadDrawableSGI(void) +{ + return glXGetCurrentReadDrawable(); +} + + +#if defined(_VL_H) + +GLXVideoSourceSGIX PUBLIC +glXCreateGLXVideoSourceSGIX(Display *dpy, int screen, VLServer server, VLPath path, int nodeClass, VLNode drainNode) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->CreateGLXVideoSourceSGIX)(dpy, screen, server, path, nodeClass, drainNode); +} + +void PUBLIC +glXDestroyGLXVideoSourceSGIX(Display *dpy, GLXVideoSourceSGIX src) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->DestroyGLXVideoSourceSGIX)(dpy, src); +} + +#endif + + +/*** GLX_EXT_import_context ***/ + +void PUBLIC +glXFreeContextEXT(Display *dpy, GLXContext context) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->FreeContextEXT)(dpy, context); +} + +GLXContextID PUBLIC +glXGetContextIDEXT(const GLXContext context) +{ + return ((__GLXcontext *) context)->xid; +} + +Display PUBLIC * +glXGetCurrentDisplayEXT(void) +{ + return glXGetCurrentDisplay(); +} + +GLXContext PUBLIC +glXImportContextEXT(Display *dpy, GLXContextID contextID) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->ImportContextEXT)(dpy, contextID); +} + +int PUBLIC +glXQueryContextInfoEXT(Display *dpy, GLXContext context, int attribute,int *value) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; /* XXX ok? 
*/ + return (t->QueryContextInfoEXT)(dpy, context, attribute, value); +} + + + +/*** GLX_SGIX_fbconfig ***/ + +int PUBLIC +glXGetFBConfigAttribSGIX(Display *dpy, GLXFBConfigSGIX config, int attribute, int *value) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->GetFBConfigAttribSGIX)(dpy, config, attribute, value); +} + +GLXFBConfigSGIX PUBLIC * +glXChooseFBConfigSGIX(Display *dpy, int screen, int *attrib_list, int *nelements) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->ChooseFBConfigSGIX)(dpy, screen, attrib_list, nelements); +} + +GLXPixmap PUBLIC +glXCreateGLXPixmapWithConfigSGIX(Display *dpy, GLXFBConfigSGIX config, Pixmap pixmap) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->CreateGLXPixmapWithConfigSGIX)(dpy, config, pixmap); +} + +GLXContext PUBLIC +glXCreateContextWithConfigSGIX(Display *dpy, GLXFBConfigSGIX config, int render_type, GLXContext share_list, Bool direct) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->CreateContextWithConfigSGIX)(dpy, config, render_type, share_list, direct); +} + +XVisualInfo PUBLIC * +glXGetVisualFromFBConfigSGIX(Display *dpy, GLXFBConfigSGIX config) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->GetVisualFromFBConfigSGIX)(dpy, config); +} + +GLXFBConfigSGIX PUBLIC +glXGetFBConfigFromVisualSGIX(Display *dpy, XVisualInfo *vis) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->GetFBConfigFromVisualSGIX)(dpy, vis); +} + + + +/*** GLX_SGIX_pbuffer ***/ + +GLXPbufferSGIX PUBLIC +glXCreateGLXPbufferSGIX(Display *dpy, GLXFBConfigSGIX config, unsigned int width, unsigned int height, int *attrib_list) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->CreateGLXPbufferSGIX)(dpy, config, width, height, attrib_list); +} + +void PUBLIC +glXDestroyGLXPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuf) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->DestroyGLXPbufferSGIX)(dpy, pbuf); +} + +int PUBLIC +glXQueryGLXPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuf, int attribute, unsigned int *value) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->QueryGLXPbufferSGIX)(dpy, pbuf, attribute, value); +} + +void PUBLIC +glXSelectEventSGIX(Display *dpy, GLXDrawable drawable, unsigned long mask) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->SelectEventSGIX)(dpy, drawable, mask); +} + +void PUBLIC +glXGetSelectedEventSGIX(Display *dpy, GLXDrawable drawable, unsigned long *mask) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->GetSelectedEventSGIX)(dpy, drawable, mask); +} + + + +/*** GLX_SGI_cushion ***/ + +void PUBLIC +glXCushionSGI(Display *dpy, Window win, float cushion) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->CushionSGI)(dpy, win, cushion); +} + + + +/*** GLX_SGIX_video_resize ***/ + +int PUBLIC +glXBindChannelToWindowSGIX(Display *dpy, int screen, int channel , Window window) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->BindChannelToWindowSGIX)(dpy, screen, channel, window); +} + +int PUBLIC +glXChannelRectSGIX(Display *dpy, int screen, int channel, int x, int y, int w, int h) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return 
(t->ChannelRectSGIX)(dpy, screen, channel, x, y, w, h); +} + +int PUBLIC +glXQueryChannelRectSGIX(Display *dpy, int screen, int channel, int *x, int *y, int *w, int *h) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->QueryChannelRectSGIX)(dpy, screen, channel, x, y, w, h); +} + +int PUBLIC +glXQueryChannelDeltasSGIX(Display *dpy, int screen, int channel, int *dx, int *dy, int *dw, int *dh) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->QueryChannelDeltasSGIX)(dpy, screen, channel, dx, dy, dw, dh); +} + +int PUBLIC +glXChannelRectSyncSGIX(Display *dpy, int screen, int channel, GLenum synctype) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->ChannelRectSyncSGIX)(dpy, screen, channel, synctype); +} + + + +#if defined(_DM_BUFFER_H_) + +Bool PUBLIC +glXAssociateDMPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuffer, DMparams *params, DMbuffer dmbuffer) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return False; + return (t->AssociateDMPbufferSGIX)(dpy, pbuffer, params, dmbuffer); +} + +#endif + + +/*** GLX_SGIX_swap_group ***/ + +void PUBLIC +glXJoinSwapGroupSGIX(Display *dpy, GLXDrawable drawable, GLXDrawable member) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (*t->JoinSwapGroupSGIX)(dpy, drawable, member); +} + + +/*** GLX_SGIX_swap_barrier ***/ + +void PUBLIC +glXBindSwapBarrierSGIX(Display *dpy, GLXDrawable drawable, int barrier) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (*t->BindSwapBarrierSGIX)(dpy, drawable, barrier); +} + +Bool PUBLIC +glXQueryMaxSwapBarriersSGIX(Display *dpy, int screen, int *max) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return False; + return (*t->QueryMaxSwapBarriersSGIX)(dpy, screen, max); +} + + + +/*** GLX_SUN_get_transparent_index ***/ + +Status PUBLIC +glXGetTransparentIndexSUN(Display *dpy, Window overlay, Window underlay, long *pTransparent) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return False; + return (*t->GetTransparentIndexSUN)(dpy, overlay, underlay, pTransparent); +} + + + +/*** GLX_MESA_copy_sub_buffer ***/ + +void PUBLIC +glXCopySubBufferMESA(Display *dpy, GLXDrawable drawable, int x, int y, int width, int height) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->CopySubBufferMESA)(dpy, drawable, x, y, width, height); +} + + + +/*** GLX_MESA_release_buffers ***/ + +Bool PUBLIC +glXReleaseBuffersMESA(Display *dpy, Window w) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return False; + return (t->ReleaseBuffersMESA)(dpy, w); +} + + + +/*** GLX_MESA_pixmap_colormap ***/ + +GLXPixmap PUBLIC +glXCreateGLXPixmapMESA(Display *dpy, XVisualInfo *visinfo, Pixmap pixmap, Colormap cmap) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->CreateGLXPixmapMESA)(dpy, visinfo, pixmap, cmap); +} + + + +/*** GLX_MESA_set_3dfx_mode ***/ + +Bool PUBLIC +glXSet3DfxModeMESA(int mode) +{ + struct _glxapi_table *t; + Display *dpy = glXGetCurrentDisplay(); + GET_DISPATCH(dpy, t); + if (!t) + return False; + return (t->Set3DfxModeMESA)(mode); +} + + + +/*** GLX_NV_vertex_array_range ***/ + +void PUBLIC * +glXAllocateMemoryNV( GLsizei size, + GLfloat readFrequency, + GLfloat writeFrequency, + GLfloat priority ) +{ + struct _glxapi_table *t; + Display *dpy = glXGetCurrentDisplay(); + GET_DISPATCH(dpy, t); + if (!t) + return NULL; + return 
(t->AllocateMemoryNV)(size, readFrequency, writeFrequency, priority); +} + + +void PUBLIC +glXFreeMemoryNV( GLvoid *pointer ) +{ + struct _glxapi_table *t; + Display *dpy = glXGetCurrentDisplay(); + GET_DISPATCH(dpy, t); + if (!t) + return; + (t->FreeMemoryNV)(pointer); +} + + + + +/*** GLX_MESA_agp_offset */ + +GLuint PUBLIC +glXGetAGPOffsetMESA( const GLvoid *pointer ) +{ + struct _glxapi_table *t; + Display *dpy = glXGetCurrentDisplay(); + GET_DISPATCH(dpy, t); + if (!t) + return ~0; + return (t->GetAGPOffsetMESA)(pointer); +} + + +/*** GLX_MESA_allocate_memory */ + +void * +glXAllocateMemoryMESA(Display *dpy, int scrn, size_t size, + float readfreq, float writefreq, float priority) +{ + /* dummy */ + return NULL; +} + +void +glXFreeMemoryMESA(Display *dpy, int scrn, void *pointer) +{ + /* dummy */ +} + + +GLuint +glXGetMemoryOffsetMESA(Display *dpy, int scrn, const void *pointer) +{ + /* dummy */ + return 0; +} + + +/*** GLX_EXT_texture_from_pixmap */ + +void +glXBindTexImageEXT(Display *dpy, GLXDrawable drawable, int buffer, + const int *attrib_list) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (t) + t->BindTexImageEXT(dpy, drawable, buffer, attrib_list); +} + +void +glXReleaseTexImageEXT(Display *dpy, GLXDrawable drawable, int buffer) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (t) + t->ReleaseTexImageEXT(dpy, drawable, buffer); +} + + +/**********************************************************************/ +/* GLX API management functions */ +/**********************************************************************/ + + +const char * +_glxapi_get_version(void) +{ + return "1.3"; +} + + +/* + * Return array of extension strings. + */ +const char ** +_glxapi_get_extensions(void) +{ + static const char *extensions[] = { +#ifdef GLX_EXT_import_context + "GLX_EXT_import_context", +#endif +#ifdef GLX_SGI_video_sync + "GLX_SGI_video_sync", +#endif +#ifdef GLX_MESA_copy_sub_buffer + "GLX_MESA_copy_sub_buffer", +#endif +#ifdef GLX_MESA_release_buffers + "GLX_MESA_release_buffers", +#endif +#ifdef GLX_MESA_pixmap_colormap + "GLX_MESA_pixmap_colormap", +#endif +#ifdef GLX_MESA_set_3dfx_mode + "GLX_MESA_set_3dfx_mode", +#endif +#ifdef GLX_SGIX_fbconfig + "GLX_SGIX_fbconfig", +#endif +#ifdef GLX_SGIX_pbuffer + "GLX_SGIX_pbuffer", +#endif +#ifdef GLX_EXT_texture_from_pixmap + "GLX_EXT_texture_from_pixmap", +#endif + NULL + }; + return extensions; +} + + +/* + * Return size of the GLX dispatch table, in entries, not bytes. 
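Illustrative aside, not part of the patch: _glxapi_get_dispatch_table_size() and _glxapi_set_no_op_table() below treat the struct of function pointers as a flat array, on the assumption that every function pointer has the same size as void *. The tiny_table type and names here are invented only to show that pattern in isolation.

#include <stdio.h>

struct tiny_table {
   int (*Foo)(void);
   int (*Bar)(void);
};

static int no_op(void) { return 0; }

static void
fill_with_no_ops(struct tiny_table *t)
{
   typedef int (*nop_func)(void);
   nop_func *slot = (nop_func *) t;                /* view the struct as an array */
   const unsigned n = sizeof(*t) / sizeof(void *); /* entries, not bytes */
   unsigned i;
   for (i = 0; i < n; i++)
      slot[i] = no_op;
}

int
main(void)
{
   struct tiny_table t;
   fill_with_no_ops(&t);
   printf("%d %d\n", t.Foo(), t.Bar());   /* prints "0 0" */
   return 0;
}

The same trick is what lets both _mesa_GetGLXDispatchTable() and _real_GetGLXDispatchTable() start from one generic no-op initializer and then fill in only the entry points they actually implement.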
+ */ +GLuint +_glxapi_get_dispatch_table_size(void) +{ + return sizeof(struct _glxapi_table) / sizeof(void *); +} + + +static int +generic_no_op_func(void) +{ + return 0; +} + + +/* + * Initialize all functions in given dispatch table to be no-ops + */ +void +_glxapi_set_no_op_table(struct _glxapi_table *t) +{ + typedef int (*nop_func)(void); + nop_func *dispatch = (nop_func *) t; + GLuint n = _glxapi_get_dispatch_table_size(); + GLuint i; + for (i = 0; i < n; i++) { + dispatch[i] = generic_no_op_func; + } +} + + +struct name_address_pair { + const char *Name; + __GLXextFuncPtr Address; +}; + +static struct name_address_pair GLX_functions[] = { + /*** GLX_VERSION_1_0 ***/ + { "glXChooseVisual", (__GLXextFuncPtr) glXChooseVisual }, + { "glXCopyContext", (__GLXextFuncPtr) glXCopyContext }, + { "glXCreateContext", (__GLXextFuncPtr) glXCreateContext }, + { "glXCreateGLXPixmap", (__GLXextFuncPtr) glXCreateGLXPixmap }, + { "glXDestroyContext", (__GLXextFuncPtr) glXDestroyContext }, + { "glXDestroyGLXPixmap", (__GLXextFuncPtr) glXDestroyGLXPixmap }, + { "glXGetConfig", (__GLXextFuncPtr) glXGetConfig }, + { "glXGetCurrentContext", (__GLXextFuncPtr) glXGetCurrentContext }, + { "glXGetCurrentDrawable", (__GLXextFuncPtr) glXGetCurrentDrawable }, + { "glXIsDirect", (__GLXextFuncPtr) glXIsDirect }, + { "glXMakeCurrent", (__GLXextFuncPtr) glXMakeCurrent }, + { "glXQueryExtension", (__GLXextFuncPtr) glXQueryExtension }, + { "glXQueryVersion", (__GLXextFuncPtr) glXQueryVersion }, + { "glXSwapBuffers", (__GLXextFuncPtr) glXSwapBuffers }, + { "glXUseXFont", (__GLXextFuncPtr) glXUseXFont }, + { "glXWaitGL", (__GLXextFuncPtr) glXWaitGL }, + { "glXWaitX", (__GLXextFuncPtr) glXWaitX }, + + /*** GLX_VERSION_1_1 ***/ + { "glXGetClientString", (__GLXextFuncPtr) glXGetClientString }, + { "glXQueryExtensionsString", (__GLXextFuncPtr) glXQueryExtensionsString }, + { "glXQueryServerString", (__GLXextFuncPtr) glXQueryServerString }, + + /*** GLX_VERSION_1_2 ***/ + { "glXGetCurrentDisplay", (__GLXextFuncPtr) glXGetCurrentDisplay }, + + /*** GLX_VERSION_1_3 ***/ + { "glXChooseFBConfig", (__GLXextFuncPtr) glXChooseFBConfig }, + { "glXCreateNewContext", (__GLXextFuncPtr) glXCreateNewContext }, + { "glXCreatePbuffer", (__GLXextFuncPtr) glXCreatePbuffer }, + { "glXCreatePixmap", (__GLXextFuncPtr) glXCreatePixmap }, + { "glXCreateWindow", (__GLXextFuncPtr) glXCreateWindow }, + { "glXDestroyPbuffer", (__GLXextFuncPtr) glXDestroyPbuffer }, + { "glXDestroyPixmap", (__GLXextFuncPtr) glXDestroyPixmap }, + { "glXDestroyWindow", (__GLXextFuncPtr) glXDestroyWindow }, + { "glXGetCurrentReadDrawable", (__GLXextFuncPtr) glXGetCurrentReadDrawable }, + { "glXGetFBConfigAttrib", (__GLXextFuncPtr) glXGetFBConfigAttrib }, + { "glXGetFBConfigs", (__GLXextFuncPtr) glXGetFBConfigs }, + { "glXGetSelectedEvent", (__GLXextFuncPtr) glXGetSelectedEvent }, + { "glXGetVisualFromFBConfig", (__GLXextFuncPtr) glXGetVisualFromFBConfig }, + { "glXMakeContextCurrent", (__GLXextFuncPtr) glXMakeContextCurrent }, + { "glXQueryContext", (__GLXextFuncPtr) glXQueryContext }, + { "glXQueryDrawable", (__GLXextFuncPtr) glXQueryDrawable }, + { "glXSelectEvent", (__GLXextFuncPtr) glXSelectEvent }, + + /*** GLX_VERSION_1_4 ***/ + { "glXGetProcAddress", (__GLXextFuncPtr) glXGetProcAddress }, + + /*** GLX_SGI_swap_control ***/ + { "glXSwapIntervalSGI", (__GLXextFuncPtr) glXSwapIntervalSGI }, + + /*** GLX_SGI_video_sync ***/ + { "glXGetVideoSyncSGI", (__GLXextFuncPtr) glXGetVideoSyncSGI }, + { "glXWaitVideoSyncSGI", (__GLXextFuncPtr) glXWaitVideoSyncSGI }, + + /*** 
GLX_SGI_make_current_read ***/ + { "glXMakeCurrentReadSGI", (__GLXextFuncPtr) glXMakeCurrentReadSGI }, + { "glXGetCurrentReadDrawableSGI", (__GLXextFuncPtr) glXGetCurrentReadDrawableSGI }, + + /*** GLX_SGIX_video_source ***/ +#if defined(_VL_H) + { "glXCreateGLXVideoSourceSGIX", (__GLXextFuncPtr) glXCreateGLXVideoSourceSGIX }, + { "glXDestroyGLXVideoSourceSGIX", (__GLXextFuncPtr) glXDestroyGLXVideoSourceSGIX }, +#endif + + /*** GLX_EXT_import_context ***/ + { "glXFreeContextEXT", (__GLXextFuncPtr) glXFreeContextEXT }, + { "glXGetContextIDEXT", (__GLXextFuncPtr) glXGetContextIDEXT }, + { "glXGetCurrentDisplayEXT", (__GLXextFuncPtr) glXGetCurrentDisplayEXT }, + { "glXImportContextEXT", (__GLXextFuncPtr) glXImportContextEXT }, + { "glXQueryContextInfoEXT", (__GLXextFuncPtr) glXQueryContextInfoEXT }, + + /*** GLX_SGIX_fbconfig ***/ + { "glXGetFBConfigAttribSGIX", (__GLXextFuncPtr) glXGetFBConfigAttribSGIX }, + { "glXChooseFBConfigSGIX", (__GLXextFuncPtr) glXChooseFBConfigSGIX }, + { "glXCreateGLXPixmapWithConfigSGIX", (__GLXextFuncPtr) glXCreateGLXPixmapWithConfigSGIX }, + { "glXCreateContextWithConfigSGIX", (__GLXextFuncPtr) glXCreateContextWithConfigSGIX }, + { "glXGetVisualFromFBConfigSGIX", (__GLXextFuncPtr) glXGetVisualFromFBConfigSGIX }, + { "glXGetFBConfigFromVisualSGIX", (__GLXextFuncPtr) glXGetFBConfigFromVisualSGIX }, + + /*** GLX_SGIX_pbuffer ***/ + { "glXCreateGLXPbufferSGIX", (__GLXextFuncPtr) glXCreateGLXPbufferSGIX }, + { "glXDestroyGLXPbufferSGIX", (__GLXextFuncPtr) glXDestroyGLXPbufferSGIX }, + { "glXQueryGLXPbufferSGIX", (__GLXextFuncPtr) glXQueryGLXPbufferSGIX }, + { "glXSelectEventSGIX", (__GLXextFuncPtr) glXSelectEventSGIX }, + { "glXGetSelectedEventSGIX", (__GLXextFuncPtr) glXGetSelectedEventSGIX }, + + /*** GLX_SGI_cushion ***/ + { "glXCushionSGI", (__GLXextFuncPtr) glXCushionSGI }, + + /*** GLX_SGIX_video_resize ***/ + { "glXBindChannelToWindowSGIX", (__GLXextFuncPtr) glXBindChannelToWindowSGIX }, + { "glXChannelRectSGIX", (__GLXextFuncPtr) glXChannelRectSGIX }, + { "glXQueryChannelRectSGIX", (__GLXextFuncPtr) glXQueryChannelRectSGIX }, + { "glXQueryChannelDeltasSGIX", (__GLXextFuncPtr) glXQueryChannelDeltasSGIX }, + { "glXChannelRectSyncSGIX", (__GLXextFuncPtr) glXChannelRectSyncSGIX }, + + /*** GLX_SGIX_dmbuffer **/ +#if defined(_DM_BUFFER_H_) + { "glXAssociateDMPbufferSGIX", (__GLXextFuncPtr) glXAssociateDMPbufferSGIX }, +#endif + + /*** GLX_SGIX_swap_group ***/ + { "glXJoinSwapGroupSGIX", (__GLXextFuncPtr) glXJoinSwapGroupSGIX }, + + /*** GLX_SGIX_swap_barrier ***/ + { "glXBindSwapBarrierSGIX", (__GLXextFuncPtr) glXBindSwapBarrierSGIX }, + { "glXQueryMaxSwapBarriersSGIX", (__GLXextFuncPtr) glXQueryMaxSwapBarriersSGIX }, + + /*** GLX_SUN_get_transparent_index ***/ + { "glXGetTransparentIndexSUN", (__GLXextFuncPtr) glXGetTransparentIndexSUN }, + + /*** GLX_MESA_copy_sub_buffer ***/ + { "glXCopySubBufferMESA", (__GLXextFuncPtr) glXCopySubBufferMESA }, + + /*** GLX_MESA_pixmap_colormap ***/ + { "glXCreateGLXPixmapMESA", (__GLXextFuncPtr) glXCreateGLXPixmapMESA }, + + /*** GLX_MESA_release_buffers ***/ + { "glXReleaseBuffersMESA", (__GLXextFuncPtr) glXReleaseBuffersMESA }, + + /*** GLX_MESA_set_3dfx_mode ***/ + { "glXSet3DfxModeMESA", (__GLXextFuncPtr) glXSet3DfxModeMESA }, + + /*** GLX_ARB_get_proc_address ***/ + { "glXGetProcAddressARB", (__GLXextFuncPtr) glXGetProcAddressARB }, + + /*** GLX_NV_vertex_array_range ***/ + { "glXAllocateMemoryNV", (__GLXextFuncPtr) glXAllocateMemoryNV }, + { "glXFreeMemoryNV", (__GLXextFuncPtr) glXFreeMemoryNV }, + + /*** 
GLX_MESA_agp_offset ***/ + { "glXGetAGPOffsetMESA", (__GLXextFuncPtr) glXGetAGPOffsetMESA }, + + /*** GLX_MESA_allocate_memory ***/ + { "glXAllocateMemoryMESA", (__GLXextFuncPtr) glXAllocateMemoryMESA }, + { "glXFreeMemoryMESA", (__GLXextFuncPtr) glXFreeMemoryMESA }, + { "glXGetMemoryOffsetMESA", (__GLXextFuncPtr) glXGetMemoryOffsetMESA }, + + /*** GLX_EXT_texture_from_pixmap ***/ + { "glXBindTexImageEXT", (__GLXextFuncPtr) glXBindTexImageEXT }, + { "glXReleaseTexImageEXT", (__GLXextFuncPtr) glXReleaseTexImageEXT }, + + { NULL, NULL } /* end of list */ +}; + + + +/* + * Return address of named glX function, or NULL if not found. + */ +__GLXextFuncPtr +_glxapi_get_proc_address(const char *funcName) +{ + GLuint i; + for (i = 0; GLX_functions[i].Name; i++) { + if (strcmp(GLX_functions[i].Name, funcName) == 0) + return GLX_functions[i].Address; + } + return NULL; +} + + + +/* + * This function does not get dispatched through the dispatch table + * since it's really a "meta" function. + */ +__GLXextFuncPtr +glXGetProcAddressARB(const GLubyte *procName) +{ + __GLXextFuncPtr f; + + f = _glxapi_get_proc_address((const char *) procName); + if (f) { + return f; + } + + f = (__GLXextFuncPtr) _glapi_get_proc_address((const char *) procName); + return f; +} + + +/* GLX 1.4 */ +void (*glXGetProcAddress(const GLubyte *procName))() +{ + return glXGetProcAddressARB(procName); +} diff --git a/src/gallium/winsys/xlib/glxapi.h b/src/gallium/winsys/xlib/glxapi.h new file mode 100644 index 0000000000..37de81e55a --- /dev/null +++ b/src/gallium/winsys/xlib/glxapi.h @@ -0,0 +1,228 @@ +/* + * Mesa 3-D graphics library + * Version: 6.3 + * + * Copyright (C) 1999-2004 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +#ifndef _glxapi_h_ +#define _glxapi_h_ + + +#define GLX_GLXEXT_PROTOTYPES +#include "GL/glx.h" + + +/* The GLX API dispatcher (i.e. this code) is being built into stand-alone + * Mesa. We don't know anything about XFree86 or real GLX so we define a + * minimal __GLXContextRec here so some of the functions in this file can + * work properly. + */ +typedef struct __GLXcontextRec { + Display *currentDpy; + GLboolean isDirect; + GLXDrawable currentDrawable; + GLXDrawable currentReadable; + XID xid; +} __GLXcontext; + + +/* + * Almost all the GLX API functions get routed through this dispatch table. + * The exceptions are the glXGetCurrentXXX() functions. + * + * This dispatch table allows multiple GLX client-side modules to coexist. 
+ * Specifically, a real GLX library (like SGI's or the Utah GLX) and Mesa's + * pseudo-GLX can be present at the same time. The former being used on + * GLX-enabled X servers and the later on non-GLX X servers. + * + * Red Hat has been using this since Red Hat Linux 7.0 (I think). + * This'll be a standard feature in XFree86 4.3. It basically allows one + * libGL to do both DRI-rendering and "fake GLX" rendering to X displays + * that lack the GLX extension. + */ +struct _glxapi_table { + /*** GLX_VERSION_1_0 ***/ + XVisualInfo *(*ChooseVisual)(Display *dpy, int screen, int *list); + void (*CopyContext)(Display *dpy, GLXContext src, GLXContext dst, unsigned long mask); + GLXContext (*CreateContext)(Display *dpy, XVisualInfo *visinfo, GLXContext shareList, Bool direct); + GLXPixmap (*CreateGLXPixmap)(Display *dpy, XVisualInfo *visinfo, Pixmap pixmap); + void (*DestroyContext)(Display *dpy, GLXContext ctx); + void (*DestroyGLXPixmap)(Display *dpy, GLXPixmap pixmap); + int (*GetConfig)(Display *dpy, XVisualInfo *visinfo, int attrib, int *value); + /*GLXContext (*GetCurrentContext)(void);*/ + /*GLXDrawable (*GetCurrentDrawable)(void);*/ + Bool (*IsDirect)(Display *dpy, GLXContext ctx); + Bool (*MakeCurrent)(Display *dpy, GLXDrawable drawable, GLXContext ctx); + Bool (*QueryExtension)(Display *dpy, int *errorb, int *event); + Bool (*QueryVersion)(Display *dpy, int *maj, int *min); + void (*SwapBuffers)(Display *dpy, GLXDrawable drawable); + void (*UseXFont)(Font font, int first, int count, int listBase); + void (*WaitGL)(void); + void (*WaitX)(void); + + /*** GLX_VERSION_1_1 ***/ + const char *(*GetClientString)(Display *dpy, int name); + const char *(*QueryExtensionsString)(Display *dpy, int screen); + const char *(*QueryServerString)(Display *dpy, int screen, int name); + + /*** GLX_VERSION_1_2 ***/ + /*Display *(*GetCurrentDisplay)(void);*/ + + /*** GLX_VERSION_1_3 ***/ + GLXFBConfig *(*ChooseFBConfig)(Display *dpy, int screen, const int *attribList, int *nitems); + GLXContext (*CreateNewContext)(Display *dpy, GLXFBConfig config, int renderType, GLXContext shareList, Bool direct); + GLXPbuffer (*CreatePbuffer)(Display *dpy, GLXFBConfig config, const int *attribList); + GLXPixmap (*CreatePixmap)(Display *dpy, GLXFBConfig config, Pixmap pixmap, const int *attribList); + GLXWindow (*CreateWindow)(Display *dpy, GLXFBConfig config, Window win, const int *attribList); + void (*DestroyPbuffer)(Display *dpy, GLXPbuffer pbuf); + void (*DestroyPixmap)(Display *dpy, GLXPixmap pixmap); + void (*DestroyWindow)(Display *dpy, GLXWindow window); + /*GLXDrawable (*GetCurrentReadDrawable)(void);*/ + int (*GetFBConfigAttrib)(Display *dpy, GLXFBConfig config, int attribute, int *value); + GLXFBConfig *(*GetFBConfigs)(Display *dpy, int screen, int *nelements); + void (*GetSelectedEvent)(Display *dpy, GLXDrawable drawable, unsigned long *mask); + XVisualInfo *(*GetVisualFromFBConfig)(Display *dpy, GLXFBConfig config); + Bool (*MakeContextCurrent)(Display *dpy, GLXDrawable draw, GLXDrawable read, GLXContext ctx); + int (*QueryContext)(Display *dpy, GLXContext ctx, int attribute, int *value); + void (*QueryDrawable)(Display *dpy, GLXDrawable draw, int attribute, unsigned int *value); + void (*SelectEvent)(Display *dpy, GLXDrawable drawable, unsigned long mask); + + /*** GLX_SGI_swap_control ***/ + int (*SwapIntervalSGI)(int); + + /*** GLX_SGI_video_sync ***/ + int (*GetVideoSyncSGI)(unsigned int *count); + int (*WaitVideoSyncSGI)(int divisor, int remainder, unsigned int *count); + + /*** GLX_SGI_make_current_read 
***/ + Bool (*MakeCurrentReadSGI)(Display *, GLXDrawable, GLXDrawable, GLXContext); + /*GLXDrawable (*GetCurrentReadDrawableSGI)(void);*/ + + /*** GLX_SGIX_video_source (needs video library) ***/ +#if defined(_VL_H_) + GLXVideoSourceSGIX (*CreateGLXVideoSourceSGIX)(Display *, int, VLServer, VLPath, int, VLNode); + void (*DestroyGLXVideoSourceSGIX)(Display *, GLXVideoSourceSGIX); +#else + void *CreateGLXVideoSourceSGIX; + void *DestroyGLXVideoSourceSGIX; +#endif + + /*** GLX_EXT_import_context ***/ + void (*FreeContextEXT)(Display *dpy, GLXContext context); + GLXContextID (*GetContextIDEXT)(const GLXContext context); + /*Display *(*GetCurrentDisplayEXT)(void);*/ + GLXContext (*ImportContextEXT)(Display *dpy, GLXContextID contextID); + int (*QueryContextInfoEXT)(Display *dpy, GLXContext context, int attribute,int *value); + + /*** GLX_SGIX_fbconfig ***/ + int (*GetFBConfigAttribSGIX)(Display *, GLXFBConfigSGIX, int, int *); + GLXFBConfigSGIX * (*ChooseFBConfigSGIX)(Display *, int, int *, int *); + GLXPixmap (*CreateGLXPixmapWithConfigSGIX)(Display *, GLXFBConfigSGIX, Pixmap); + GLXContext (*CreateContextWithConfigSGIX)(Display *, GLXFBConfigSGIX, int, GLXContext, Bool); + XVisualInfo * (*GetVisualFromFBConfigSGIX)(Display *, GLXFBConfigSGIX); + GLXFBConfigSGIX (*GetFBConfigFromVisualSGIX)(Display *, XVisualInfo *); + + /*** GLX_SGIX_pbuffer ***/ + GLXPbufferSGIX (*CreateGLXPbufferSGIX)(Display *, GLXFBConfigSGIX, unsigned int, unsigned int, int *); + void (*DestroyGLXPbufferSGIX)(Display *, GLXPbufferSGIX); + int (*QueryGLXPbufferSGIX)(Display *, GLXPbufferSGIX, int, unsigned int *); + void (*SelectEventSGIX)(Display *, GLXDrawable, unsigned long); + void (*GetSelectedEventSGIX)(Display *, GLXDrawable, unsigned long *); + + /*** GLX_SGI_cushion ***/ + void (*CushionSGI)(Display *, Window, float); + + /*** GLX_SGIX_video_resize ***/ + int (*BindChannelToWindowSGIX)(Display *, int, int, Window); + int (*ChannelRectSGIX)(Display *, int, int, int, int, int, int); + int (*QueryChannelRectSGIX)(Display *, int, int, int *, int *, int *, int *); + int (*QueryChannelDeltasSGIX)(Display *, int, int, int *, int *, int *, int *); + int (*ChannelRectSyncSGIX)(Display *, int, int, GLenum); + + /*** GLX_SGIX_dmbuffer (needs dmedia library) ***/ +#if defined (_DM_BUFFER_H_) + Bool (*AssociateDMPbufferSGIX)(Display *, GLXPbufferSGIX, DMparams *, DMbuffer); +#else + void *AssociciateDMPbufferSGIX; +#endif + + /*** GLX_SGIX_swap_group ***/ + void (*JoinSwapGroupSGIX)(Display *, GLXDrawable, GLXDrawable); + + /*** GLX_SGIX_swap_barrier ***/ + void (*BindSwapBarrierSGIX)(Display *, GLXDrawable, int); + Bool (*QueryMaxSwapBarriersSGIX)(Display *, int, int *); + + /*** GLX_SUN_get_transparent_index ***/ + Status (*GetTransparentIndexSUN)(Display *, Window, Window, long *); + + /*** GLX_MESA_copy_sub_buffer ***/ + void (*CopySubBufferMESA)(Display *dpy, GLXDrawable drawable, int x, int y, int width, int height); + + /*** GLX_MESA_release_buffers ***/ + Bool (*ReleaseBuffersMESA)(Display *dpy, Window w); + + /*** GLX_MESA_pixmap_colormap ***/ + GLXPixmap (*CreateGLXPixmapMESA)(Display *dpy, XVisualInfo *visinfo, Pixmap pixmap, Colormap cmap); + + /*** GLX_MESA_set_3dfx_mode ***/ + Bool (*Set3DfxModeMESA)(int mode); + + /*** GLX_NV_vertex_array_range ***/ + void * (*AllocateMemoryNV)( GLsizei size, + GLfloat readFrequency, + GLfloat writeFrequency, + GLfloat priority ); + void (*FreeMemoryNV)( GLvoid *pointer ); + + /*** GLX_MESA_agp_offset ***/ + GLuint (*GetAGPOffsetMESA)( const GLvoid *pointer ); + + /*** 
GLX_EXT_texture_from_pixmap ***/ + void (*BindTexImageEXT)(Display *dpy, GLXDrawable drawable, int buffer, + const int *attrib_list); + void (*ReleaseTexImageEXT)(Display *dpy, GLXDrawable drawable, int buffer); +}; + + + +extern const char * +_glxapi_get_version(void); + + +extern const char ** +_glxapi_get_extensions(void); + + +extern GLuint +_glxapi_get_dispatch_table_size(void); + + +extern void +_glxapi_set_no_op_table(struct _glxapi_table *t); + + +extern __GLXextFuncPtr +_glxapi_get_proc_address(const char *funcName); + + +#endif diff --git a/src/gallium/winsys/xlib/glxheader.h b/src/gallium/winsys/xlib/glxheader.h new file mode 100644 index 0000000000..a402191f13 --- /dev/null +++ b/src/gallium/winsys/xlib/glxheader.h @@ -0,0 +1,62 @@ +/* + * Mesa 3-D graphics library + * Version: 6.5.1 + * + * Copyright (C) 1999-2006 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +#ifndef GLX_HEADER_H +#define GLX_HEADER_H + +#ifdef __VMS +#include <GL/vms_x_fix.h> +#endif + +#include "glheader.h" + +#ifdef XFree86Server + +# include "resource.h" +# include "windowstr.h" + +#else + +# include <X11/Xlib.h> +# include <X11/Xlibint.h> +# include <X11/Xutil.h> +# ifdef USE_XSHM /* was SHM */ +# include <sys/ipc.h> +# include <sys/shm.h> +# include <X11/extensions/XShm.h> +# endif +# include <GL/glx.h> +# include <sys/time.h> + +#endif + + + +/* this silences a compiler warning on several systems */ +struct timespec; +struct itimerspec; + + +#endif /*GLX_HEADER*/ diff --git a/src/gallium/winsys/xlib/realglx.c b/src/gallium/winsys/xlib/realglx.c new file mode 100644 index 0000000000..30adb7465b --- /dev/null +++ b/src/gallium/winsys/xlib/realglx.c @@ -0,0 +1,180 @@ + +/* + * Mesa 3-D graphics library + * Version: 5.1 + * + * Copyright (C) 1999-2002 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
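The dispatch table declared in glxapi.h above is plain C: every GLX entry point gets a function-pointer slot, the table is first filled with no-ops, and a backend (fake GLX or real libGL) overrides the slots it implements. The following stand-alone sketch only imitates that pattern; the two-entry table and the demo_* / noop_* names are invented for illustration and are not part of glxapi.h.

/* Reduced, hypothetical imitation of the glxapi.h dispatch pattern. */
#include <stdio.h>

struct demo_dispatch {
   int  (*QueryVersion)(int *maj, int *min);
   void (*SwapBuffers)(int drawable);
};

/* no-op fallbacks, analogous to _glxapi_set_no_op_table() */
static int  noop_query(int *maj, int *min) { (void) maj; (void) min; return 0; }
static void noop_swap(int drawable) { (void) drawable; }

static void demo_set_no_op_table(struct demo_dispatch *t)
{
   t->QueryVersion = noop_query;
   t->SwapBuffers  = noop_swap;
}

/* a backend overrides only the slots it actually implements */
static int real_query(int *maj, int *min) { *maj = 1; *min = 3; return 1; }

int main(void)
{
   struct demo_dispatch t;
   int maj = 0, min = 0;

   demo_set_no_op_table(&t);
   t.QueryVersion = real_query;   /* override one slot */

   t.QueryVersion(&maj, &min);    /* dispatches to the backend */
   t.SwapBuffers(0);              /* still a harmless no-op */
   printf("GLX %d.%d\n", maj, min);
   return 0;
}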
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +#include <assert.h> +#include <GL/glx.h> +#include "realglx.h" +#include "glxapi.h" + + +struct _glxapi_table * +_real_GetGLXDispatchTable(void) +{ + static struct _glxapi_table glx; + + /* be sure our dispatch table size <= libGL's table */ + { + GLuint size = sizeof(struct _glxapi_table) / sizeof(void *); + (void) size; + assert(_glxapi_get_dispatch_table_size() >= size); + } + + /* initialize the whole table to no-ops */ + _glxapi_set_no_op_table(&glx); + + /* now initialize the table with the functions I implement */ + + /*** GLX_VERSION_1_0 ***/ + glx.ChooseVisual = _real_glXChooseVisual; + glx.CopyContext = _real_glXCopyContext; + glx.CreateContext = _real_glXCreateContext; + glx.CreateGLXPixmap = _real_glXCreateGLXPixmap; + glx.DestroyContext = _real_glXDestroyContext; + glx.DestroyGLXPixmap = _real_glXDestroyGLXPixmap; + glx.GetConfig = _real_glXGetConfig; + /*glx.GetCurrentContext = _real_glXGetCurrentContext;*/ + /*glx.GetCurrentDrawable = _real_glXGetCurrentDrawable;*/ + glx.IsDirect = _real_glXIsDirect; + glx.MakeCurrent = _real_glXMakeCurrent; + glx.QueryExtension = _real_glXQueryExtension; + glx.QueryVersion = _real_glXQueryVersion; + glx.SwapBuffers = _real_glXSwapBuffers; + glx.UseXFont = _real_glXUseXFont; + glx.WaitGL = _real_glXWaitGL; + glx.WaitX = _real_glXWaitX; + + /*** GLX_VERSION_1_1 ***/ + glx.GetClientString = _real_glXGetClientString; + glx.QueryExtensionsString = _real_glXQueryExtensionsString; + glx.QueryServerString = _real_glXQueryServerString; + + /*** GLX_VERSION_1_2 ***/ + /*glx.GetCurrentDisplay = _real_glXGetCurrentDisplay;*/ + + /*** GLX_VERSION_1_3 ***/ + glx.ChooseFBConfig = _real_glXChooseFBConfig; + glx.CreateNewContext = _real_glXCreateNewContext; + glx.CreatePbuffer = _real_glXCreatePbuffer; + glx.CreatePixmap = _real_glXCreatePixmap; + glx.CreateWindow = _real_glXCreateWindow; + glx.DestroyPbuffer = _real_glXDestroyPbuffer; + glx.DestroyPixmap = _real_glXDestroyPixmap; + glx.DestroyWindow = _real_glXDestroyWindow; + /*glx.GetCurrentReadDrawable = _real_glXGetCurrentReadDrawable;*/ + glx.GetFBConfigAttrib = _real_glXGetFBConfigAttrib; + glx.GetFBConfigs = _real_glXGetFBConfigs; + glx.GetSelectedEvent = _real_glXGetSelectedEvent; + glx.GetVisualFromFBConfig = _real_glXGetVisualFromFBConfig; + glx.MakeContextCurrent = _real_glXMakeContextCurrent; + glx.QueryContext = _real_glXQueryContext; + glx.QueryDrawable = _real_glXQueryDrawable; + glx.SelectEvent = _real_glXSelectEvent; + + /*** GLX_SGI_swap_control ***/ + glx.SwapIntervalSGI = _real_glXSwapIntervalSGI; + + /*** GLX_SGI_video_sync ***/ + glx.GetVideoSyncSGI = _real_glXGetVideoSyncSGI; + glx.WaitVideoSyncSGI = _real_glXWaitVideoSyncSGI; + + /*** GLX_SGI_make_current_read ***/ + glx.MakeCurrentReadSGI = _real_glXMakeCurrentReadSGI; + /*glx.GetCurrentReadDrawableSGI = _real_glXGetCurrentReadDrawableSGI;*/ + +#if defined(_VL_H) + /*** GLX_SGIX_video_source ***/ + glx.CreateGLXVideoSourceSGIX = _real_glXCreateGLXVideoSourceSGIX; + glx.DestroyGLXVideoSourceSGIX = _real_glXDestroyGLXVideoSourceSGIX; +#endif + + /*** 
GLX_EXT_import_context ***/ + glx.FreeContextEXT = _real_glXFreeContextEXT; + /*glx.GetContextIDEXT = _real_glXGetContextIDEXT;*/ + /*glx.GetCurrentDisplayEXT = _real_glXGetCurrentDisplayEXT;*/ + glx.ImportContextEXT = _real_glXImportContextEXT; + glx.QueryContextInfoEXT = _real_glXQueryContextInfoEXT; + + /*** GLX_SGIX_fbconfig ***/ + glx.GetFBConfigAttribSGIX = _real_glXGetFBConfigAttribSGIX; + glx.ChooseFBConfigSGIX = _real_glXChooseFBConfigSGIX; + glx.CreateGLXPixmapWithConfigSGIX = _real_glXCreateGLXPixmapWithConfigSGIX; + glx.CreateContextWithConfigSGIX = _real_glXCreateContextWithConfigSGIX; + glx.GetVisualFromFBConfigSGIX = _real_glXGetVisualFromFBConfigSGIX; + glx.GetFBConfigFromVisualSGIX = _real_glXGetFBConfigFromVisualSGIX; + + /*** GLX_SGIX_pbuffer ***/ + glx.CreateGLXPbufferSGIX = _real_glXCreateGLXPbufferSGIX; + glx.DestroyGLXPbufferSGIX = _real_glXDestroyGLXPbufferSGIX; + glx.QueryGLXPbufferSGIX = _real_glXQueryGLXPbufferSGIX; + glx.SelectEventSGIX = _real_glXSelectEventSGIX; + glx.GetSelectedEventSGIX = _real_glXGetSelectedEventSGIX; + + /*** GLX_SGI_cushion ***/ + glx.CushionSGI = _real_glXCushionSGI; + + /*** GLX_SGIX_video_resize ***/ + glx.BindChannelToWindowSGIX = _real_glXBindChannelToWindowSGIX; + glx.ChannelRectSGIX = _real_glXChannelRectSGIX; + glx.QueryChannelRectSGIX = _real_glXQueryChannelRectSGIX; + glx.QueryChannelDeltasSGIX = _real_glXQueryChannelDeltasSGIX; + glx.ChannelRectSyncSGIX = _real_glXChannelRectSyncSGIX; + +#if defined(_DM_BUFFER_H_) + /*** (GLX_SGIX_dmbuffer ***/ + glx.AssociateDMPbufferSGIX = NULL; +#endif + + /*** GLX_SGIX_swap_group ***/ + glx.JoinSwapGroupSGIX = _real_glXJoinSwapGroupSGIX; + + /*** GLX_SGIX_swap_barrier ***/ + glx.BindSwapBarrierSGIX = _real_glXBindSwapBarrierSGIX; + glx.QueryMaxSwapBarriersSGIX = _real_glXQueryMaxSwapBarriersSGIX; + + /*** GLX_SUN_get_transparent_index ***/ + glx.GetTransparentIndexSUN = _real_glXGetTransparentIndexSUN; + + /*** GLX_MESA_copy_sub_buffer ***/ + glx.CopySubBufferMESA = _real_glXCopySubBufferMESA; + + /*** GLX_MESA_release_buffers ***/ + glx.ReleaseBuffersMESA = _real_glXReleaseBuffersMESA; + + /*** GLX_MESA_pixmap_colormap ***/ + glx.CreateGLXPixmapMESA = _real_glXCreateGLXPixmapMESA; + + /*** GLX_MESA_set_3dfx_mode ***/ + glx.Set3DfxModeMESA = _real_glXSet3DfxModeMESA; + + /*** GLX_NV_vertex_array_range ***/ + glx.AllocateMemoryNV = _real_glXAllocateMemoryNV; + glx.FreeMemoryNV = _real_glXFreeMemoryNV; + + /*** GLX_MESA_agp_offset ***/ + glx.GetAGPOffsetMESA = _real_glXGetAGPOffsetMESA; + + return &glx; +} diff --git a/src/gallium/winsys/xlib/realglx.h b/src/gallium/winsys/xlib/realglx.h new file mode 100644 index 0000000000..150129db68 --- /dev/null +++ b/src/gallium/winsys/xlib/realglx.h @@ -0,0 +1,326 @@ + +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2001 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. 
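Both glxapi.h and realglx.c guard the SGIX video-source and dmbuffer entries on _VL_H_ / _DM_BUFFER_H_: when the optional header is missing, the slot degenerates to a plain void pointer, so the table keeps the same size and member offsets either way. Below is a minimal stand-alone illustration of that trick; the FANCY_H macro and the fancy_handle type are invented for the example, and the layout assertions assume the common case where function pointers and data pointers have the same size.

/* Illustration only: keeping a struct layout identical whether or not an
 * optional header is available.  FANCY_H and fancy_handle are hypothetical. */
#include <assert.h>
#include <stddef.h>

#ifdef FANCY_H
typedef struct fancy *fancy_handle;
#endif

struct ext_table {
#ifdef FANCY_H
   fancy_handle (*CreateFancy)(int flags);   /* typed slot when the header exists */
#else
   void *CreateFancy;                        /* same-size placeholder otherwise   */
#endif
   void (*OtherEntry)(void);
};

int main(void)
{
   /* On typical platforms the slot is one pointer either way, so code built
    * with and without FANCY_H agrees on the offset of OtherEntry. */
   assert(sizeof(((struct ext_table *) 0)->CreateFancy) == sizeof(void *));
   assert(offsetof(struct ext_table, OtherEntry) == sizeof(void *));
   return 0;
}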
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +#ifndef REALGLX_H +#define REALGLX_H + + +extern struct _glxapi_table * +_real_GetGLXDispatchTable(void); + + +/* + * Basically just need these to prevent compiler warnings. + */ + + +extern XVisualInfo * +_real_glXChooseVisual( Display *dpy, int screen, int *list ); + +extern GLXContext +_real_glXCreateContext( Display *dpy, XVisualInfo *visinfo, + GLXContext share_list, Bool direct ); + +extern GLXPixmap +_real_glXCreateGLXPixmap( Display *dpy, XVisualInfo *visinfo, Pixmap pixmap ); + +extern GLXPixmap +_real_glXCreateGLXPixmapMESA( Display *dpy, XVisualInfo *visinfo, + Pixmap pixmap, Colormap cmap ); + +extern void +_real_glXDestroyGLXPixmap( Display *dpy, GLXPixmap pixmap ); + +extern void +_real_glXCopyContext( Display *dpy, GLXContext src, GLXContext dst, + unsigned long mask ); + +extern Bool +_real_glXMakeCurrent( Display *dpy, GLXDrawable drawable, GLXContext ctx ); + +extern Bool +_real_glXQueryExtension( Display *dpy, int *errorb, int *event ); + +extern void +_real_glXDestroyContext( Display *dpy, GLXContext ctx ); + +extern Bool +_real_glXIsDirect( Display *dpy, GLXContext ctx ); + +extern void +_real_glXSwapBuffers( Display *dpy, GLXDrawable drawable ); + +extern void +_real_glXUseXFont( Font font, int first, int count, int listbase ); + +extern Bool +_real_glXQueryVersion( Display *dpy, int *maj, int *min ); + +extern int +_real_glXGetConfig( Display *dpy, XVisualInfo *visinfo, + int attrib, int *value ); + +extern void +_real_glXWaitGL( void ); + + +extern void +_real_glXWaitX( void ); + +/* GLX 1.1 and later */ +extern const char * +_real_glXQueryExtensionsString( Display *dpy, int screen ); + +/* GLX 1.1 and later */ +extern const char * +_real_glXQueryServerString( Display *dpy, int screen, int name ); + +/* GLX 1.1 and later */ +extern const char * +_real_glXGetClientString( Display *dpy, int name ); + + +/* + * GLX 1.3 and later + */ + +extern GLXFBConfig * +_real_glXChooseFBConfig( Display *dpy, int screen, + const int *attribList, int *nitems ); + +extern int +_real_glXGetFBConfigAttrib( Display *dpy, GLXFBConfig config, + int attribute, int *value ); + +extern GLXFBConfig * +_real_glXGetFBConfigs( Display *dpy, int screen, int *nelements ); + +extern XVisualInfo * +_real_glXGetVisualFromFBConfig( Display *dpy, GLXFBConfig config ); + +extern GLXWindow +_real_glXCreateWindow( Display *dpy, GLXFBConfig config, Window win, + const int *attribList ); + +extern void +_real_glXDestroyWindow( Display *dpy, GLXWindow window ); + +extern GLXPixmap +_real_glXCreatePixmap( Display *dpy, GLXFBConfig config, Pixmap pixmap, + const int *attribList ); + +extern void +_real_glXDestroyPixmap( Display *dpy, GLXPixmap pixmap ); + +extern GLXPbuffer +_real_glXCreatePbuffer( Display *dpy, GLXFBConfig config, + const int *attribList ); + +extern void +_real_glXDestroyPbuffer( Display *dpy, GLXPbuffer pbuf ); + +extern void +_real_glXQueryDrawable( Display *dpy, GLXDrawable draw, int attribute, + unsigned int *value ); + +extern GLXContext +_real_glXCreateNewContext( Display *dpy, GLXFBConfig config, + int renderType, 
GLXContext shareList, Bool direct ); + + +extern Bool +_real_glXMakeContextCurrent( Display *dpy, GLXDrawable draw, + GLXDrawable read, GLXContext ctx ); + +extern int +_real_glXQueryContext( Display *dpy, GLXContext ctx, int attribute, int *value ); + +extern void +_real_glXSelectEvent( Display *dpy, GLXDrawable drawable, unsigned long mask ); + +extern void +_real_glXGetSelectedEvent( Display *dpy, GLXDrawable drawable, + unsigned long *mask ); + +#ifdef GLX_SGI_swap_control +extern int +_real_glXSwapIntervalSGI(int interval); +#endif + + +#ifdef GLX_SGI_video_sync +extern int +_real_glXGetVideoSyncSGI(unsigned int *count); + +extern int +_real_glXWaitVideoSyncSGI(int divisor, int remainder, unsigned int *count); +#endif + + +#ifdef GLX_SGI_make_current_read +extern Bool +_real_glXMakeCurrentReadSGI(Display *dpy, GLXDrawable draw, GLXDrawable read, GLXContext ctx); + +extern GLXDrawable +_real_glXGetCurrentReadDrawableSGI(void); +#endif + +#if defined(_VL_H) && defined(GLX_SGIX_video_source) +extern GLXVideoSourceSGIX +_real_glXCreateGLXVideoSourceSGIX(Display *dpy, int screen, VLServer server, VLPath path, int nodeClass, VLNode drainNode); + +extern void +_real_glXDestroyGLXVideoSourceSGIX(Display *dpy, GLXVideoSourceSGIX src); +#endif + +#ifdef GLX_EXT_import_context +extern void +_real_glXFreeContextEXT(Display *dpy, GLXContext context); + +extern GLXContextID +_real_glXGetContextIDEXT(const GLXContext context); + +extern Display * +_real_glXGetCurrentDisplayEXT(void); + +extern GLXContext +_real_glXImportContextEXT(Display *dpy, GLXContextID contextID); + +extern int +_real_glXQueryContextInfoEXT(Display *dpy, GLXContext context, int attribute, int *value); +#endif + +#ifdef GLX_SGIX_fbconfig +extern int +_real_glXGetFBConfigAttribSGIX(Display *dpy, GLXFBConfigSGIX config, int attribute, int *value); + +extern GLXFBConfigSGIX * +_real_glXChooseFBConfigSGIX(Display *dpy, int screen, int *attrib_list, int *nelements); + +extern GLXPixmap +_real_glXCreateGLXPixmapWithConfigSGIX(Display *dpy, GLXFBConfigSGIX config, Pixmap pixmap); + +extern GLXContext +_real_glXCreateContextWithConfigSGIX(Display *dpy, GLXFBConfigSGIX config, int render_type, GLXContext share_list, Bool direct); + +extern XVisualInfo * +_real_glXGetVisualFromFBConfigSGIX(Display *dpy, GLXFBConfigSGIX config); + +extern GLXFBConfigSGIX +_real_glXGetFBConfigFromVisualSGIX(Display *dpy, XVisualInfo *vis); +#endif + +#ifdef GLX_SGIX_pbuffer +extern GLXPbufferSGIX +_real_glXCreateGLXPbufferSGIX(Display *dpy, GLXFBConfigSGIX config, unsigned int width, unsigned int height, int *attrib_list); + +extern void +_real_glXDestroyGLXPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuf); + +extern int +_real_glXQueryGLXPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuf, int attribute, unsigned int *value); + +extern void +_real_glXSelectEventSGIX(Display *dpy, GLXDrawable drawable, unsigned long mask); + +extern void +_real_glXGetSelectedEventSGIX(Display *dpy, GLXDrawable drawable, unsigned long *mask); +#endif + +#ifdef GLX_SGI_cushion +extern void +_real_glXCushionSGI(Display *dpy, Window win, float cushion); +#endif + +#ifdef GLX_SGIX_video_resize +extern int +_real_glXBindChannelToWindowSGIX(Display *dpy, int screen, int channel , Window window); + +extern int +_real_glXChannelRectSGIX(Display *dpy, int screen, int channel, int x, int y, int w, int h); + +extern int +_real_glXQueryChannelRectSGIX(Display *dpy, int screen, int channel, int *x, int *y, int *w, int *h); + +extern int +_real_glXQueryChannelDeltasSGIX(Display *dpy, int screen, 
int channel, int *dx, int *dy, int *dw, int *dh); + +extern int +_real_glXChannelRectSyncSGIX(Display *dpy, int screen, int channel, GLenum synctype); +#endif + +#if defined(_DM_BUFFER_H_) && defined(GLX_SGIX_dmbuffer) +extern Bool +_real_glXAssociateDMPbufferSGIX(Display *dpy, GLXPbufferSGIX pbuffer, DMparams *params, DMbuffer dmbuffer); +#endif + +#ifdef GLX_SGIX_swap_group +extern void +_real_glXJoinSwapGroupSGIX(Display *dpy, GLXDrawable drawable, GLXDrawable member); +#endif + +#ifdef GLX_SGIX_swap_barrier +extern void +_real_glXBindSwapBarrierSGIX(Display *dpy, GLXDrawable drawable, int barrier); + +extern Bool +_real_glXQueryMaxSwapBarriersSGIX(Display *dpy, int screen, int *max); +#endif + +#ifdef GLX_SUN_get_transparent_index +extern Status +_real_glXGetTransparentIndexSUN(Display *dpy, Window overlay, Window underlay, long *pTransparent); +#endif + +#ifdef GLX_MESA_release_buffers +extern Bool +_real_glXReleaseBuffersMESA( Display *dpy, GLXDrawable d ); +#endif + +#ifdef GLX_MESA_set_3dfx_mode +extern Bool +_real_glXSet3DfxModeMESA( int mode ); +#endif + +#ifdef GLX_NV_vertex_array_range +extern void * +_real_glXAllocateMemoryNV(GLsizei size, GLfloat readfreq, GLfloat writefreq, GLfloat priority); +extern void +_real_glXFreeMemoryNV(GLvoid *pointer); +#endif + +#ifdef GLX_MESA_agp_offset +extern GLuint +_real_glXGetAGPOffsetMESA(const GLvoid *pointer); +#endif + +#ifdef GLX_MESA_copy_sub_buffer +extern void +_real_glXCopySubBufferMESA( Display *dpy, GLXDrawable drawable, + int x, int y, int width, int height ); +#endif + +#endif /* REALGLX_H */ diff --git a/src/gallium/winsys/xlib/xfonts.c b/src/gallium/winsys/xlib/xfonts.c new file mode 100644 index 0000000000..d72c600bd1 --- /dev/null +++ b/src/gallium/winsys/xlib/xfonts.c @@ -0,0 +1,377 @@ + +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2000 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +/* xfonts.c -- glXUseXFont() for Mesa written by + * Copyright (C) 1995 Thorsten.Ohl @ Physik.TH-Darmstadt.de + */ + +#ifdef __VMS +#include <GL/vms_x_fix.h> +#endif + +#include "glxheader.h" +#include "context.h" +#include "imports.h" +#include "xfonts.h" + + +/* Some debugging info. 
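The GLX 1.3 prototypes mirrored in realglx.h above follow the standard FBConfig flow. For reference, a typical client-side sequence looks like the sketch below; this is ordinary public GLX API, not Mesa-internal code, and error handling is trimmed to the minimum.

/* Typical GLX 1.3 FBConfig selection, shown only to put the _real_glX*
 * prototypes above in context. */
#include <stdio.h>
#include <X11/Xlib.h>
#include <GL/glx.h>

int main(void)
{
   Display *dpy = XOpenDisplay(NULL);
   if (!dpy)
      return 1;

   static const int attribs[] = {
      GLX_RENDER_TYPE,   GLX_RGBA_BIT,
      GLX_DRAWABLE_TYPE, GLX_WINDOW_BIT,
      GLX_DOUBLEBUFFER,  True,
      GLX_DEPTH_SIZE,    24,
      None
   };

   int n = 0;
   GLXFBConfig *configs = glXChooseFBConfig(dpy, DefaultScreen(dpy), attribs, &n);
   if (!configs || n == 0) {
      fprintf(stderr, "no matching FBConfig\n");
      return 1;
   }

   /* the FBConfig drives both visual selection and context creation */
   XVisualInfo *vis = glXGetVisualFromFBConfig(dpy, configs[0]);
   GLXContext ctx = glXCreateNewContext(dpy, configs[0], GLX_RGBA_TYPE,
                                        NULL, True);

   printf("visual id 0x%lx, context %p\n",
          vis ? vis->visualid : 0UL, (void *) ctx);

   if (ctx)
      glXDestroyContext(dpy, ctx);
   if (vis)
      XFree(vis);
   XFree(configs);
   XCloseDisplay(dpy);
   return 0;
}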
*/ + +#ifdef DEBUG +#undef _R +#undef _G +#undef _B +#include <ctype.h> + +int debug_xfonts = 0; + +static void +dump_char_struct(XCharStruct * ch, char *prefix) +{ + printf("%slbearing = %d, rbearing = %d, width = %d\n", + prefix, ch->lbearing, ch->rbearing, ch->width); + printf("%sascent = %d, descent = %d, attributes = %u\n", + prefix, ch->ascent, ch->descent, (unsigned int) ch->attributes); +} + +static void +dump_font_struct(XFontStruct * font) +{ + printf("ascent = %d, descent = %d\n", font->ascent, font->descent); + printf("char_or_byte2 = (%u,%u)\n", + font->min_char_or_byte2, font->max_char_or_byte2); + printf("byte1 = (%u,%u)\n", font->min_byte1, font->max_byte1); + printf("all_chars_exist = %s\n", font->all_chars_exist ? "True" : "False"); + printf("default_char = %c (\\%03o)\n", + (char) (isprint(font->default_char) ? font->default_char : ' '), + font->default_char); + dump_char_struct(&font->min_bounds, "min> "); + dump_char_struct(&font->max_bounds, "max> "); +#if 0 + for (c = font->min_char_or_byte2; c <= font->max_char_or_byte2; c++) { + char prefix[8]; + sprintf(prefix, "%d> ", c); + dump_char_struct(&font->per_char[c], prefix); + } +#endif +} + +static void +dump_bitmap(unsigned int width, unsigned int height, GLubyte * bitmap) +{ + unsigned int x, y; + + printf(" "); + for (x = 0; x < 8 * width; x++) + printf("%o", 7 - (x % 8)); + putchar('\n'); + for (y = 0; y < height; y++) { + printf("%3o:", y); + for (x = 0; x < 8 * width; x++) + putchar((bitmap[width * (height - y - 1) + x / 8] & (1 << (7 - (x % + 8)))) + ? '*' : '.'); + printf(" "); + for (x = 0; x < width; x++) + printf("0x%02x, ", bitmap[width * (height - y - 1) + x]); + putchar('\n'); + } +} +#endif /* DEBUG */ + + +/* Implementation. */ + +/* Fill a BITMAP with a character C from thew current font + in the graphics context GC. WIDTH is the width in bytes + and HEIGHT is the height in bits. + + Note that the generated bitmaps must be used with + + glPixelStorei (GL_UNPACK_SWAP_BYTES, GL_FALSE); + glPixelStorei (GL_UNPACK_LSB_FIRST, GL_FALSE); + glPixelStorei (GL_UNPACK_ROW_LENGTH, 0); + glPixelStorei (GL_UNPACK_SKIP_ROWS, 0); + glPixelStorei (GL_UNPACK_SKIP_PIXELS, 0); + glPixelStorei (GL_UNPACK_ALIGNMENT, 1); + + Possible optimizations: + + * use only one reusable pixmap with the maximum dimensions. + * draw the entire font into a single pixmap (careful with + proportional fonts!). +*/ + + +/* + * Generate OpenGL-compatible bitmap. + */ +static void +fill_bitmap(Display * dpy, Window win, GC gc, + unsigned int width, unsigned int height, + int x0, int y0, unsigned int c, GLubyte * bitmap) +{ + XImage *image; + unsigned int x, y; + Pixmap pixmap; + XChar2b char2b; + + pixmap = XCreatePixmap(dpy, win, 8 * width, height, 1); + XSetForeground(dpy, gc, 0); + XFillRectangle(dpy, pixmap, gc, 0, 0, 8 * width, height); + XSetForeground(dpy, gc, 1); + + char2b.byte1 = (c >> 8) & 0xff; + char2b.byte2 = (c & 0xff); + + XDrawString16(dpy, pixmap, gc, x0, y0, &char2b, 1); + + image = XGetImage(dpy, pixmap, 0, 0, 8 * width, height, 1, XYPixmap); + if (image) { + /* Fill the bitmap (X11 and OpenGL are upside down wrt each other). */ + for (y = 0; y < height; y++) + for (x = 0; x < 8 * width; x++) + if (XGetPixel(image, x, y)) + bitmap[width * (height - y - 1) + x / 8] |= + (1 << (7 - (x % 8))); + XDestroyImage(image); + } + + XFreePixmap(dpy, pixmap); +} + +/* + * determine if a given glyph is valid and return the + * corresponding XCharStruct. 
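fill_bitmap() above converts an XImage, whose rows run top-down, into a glBitmap()-style buffer whose rows run bottom-up with the leftmost pixel in the most significant bit of each byte. The self-contained sketch below reproduces just that packing step; the toy glyph data is invented for the example.

/* Stand-alone illustration of the packing done in fill_bitmap(): rows are
 * flipped top<->bottom and pixels packed MSB-first, as glBitmap() expects
 * with GL_UNPACK_LSB_FIRST = GL_FALSE and GL_UNPACK_ALIGNMENT = 1. */
#include <stdio.h>
#include <string.h>

#define W 8   /* glyph width in pixels  */
#define H 4   /* glyph height in pixels */

int main(void)
{
   /* toy "XImage": 1 = set pixel, row 0 is the top row (X11 convention) */
   static const unsigned char img[H][W] = {
      {1,1,1,1,1,1,1,1},
      {1,0,0,0,0,0,0,1},
      {1,0,0,0,0,0,0,1},
      {1,1,1,1,1,1,1,1},
   };

   const unsigned bytes_per_row = (W + 7) / 8;
   unsigned char bitmap[((W + 7) / 8) * H];
   unsigned x, y;

   memset(bitmap, 0, sizeof bitmap);

   for (y = 0; y < H; y++)
      for (x = 0; x < W; x++)
         if (img[y][x])
            bitmap[bytes_per_row * (H - y - 1) + x / 8] |= 1 << (7 - (x % 8));

   /* the bottom row of the glyph now sits at the start of the buffer */
   for (y = 0; y < H; y++)
      printf("row %u: 0x%02x\n", y, bitmap[bytes_per_row * y]);
   return 0;
}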
+ */ +static XCharStruct * +isvalid(XFontStruct * fs, unsigned int which) +{ + unsigned int rows, pages; + unsigned int byte1 = 0, byte2 = 0; + int i, valid = 1; + + rows = fs->max_byte1 - fs->min_byte1 + 1; + pages = fs->max_char_or_byte2 - fs->min_char_or_byte2 + 1; + + if (rows == 1) { + /* "linear" fonts */ + if ((fs->min_char_or_byte2 > which) || (fs->max_char_or_byte2 < which)) + valid = 0; + } + else { + /* "matrix" fonts */ + byte2 = which & 0xff; + byte1 = which >> 8; + if ((fs->min_char_or_byte2 > byte2) || + (fs->max_char_or_byte2 < byte2) || + (fs->min_byte1 > byte1) || (fs->max_byte1 < byte1)) + valid = 0; + } + + if (valid) { + if (fs->per_char) { + if (rows == 1) { + /* "linear" fonts */ + return (fs->per_char + (which - fs->min_char_or_byte2)); + } + else { + /* "matrix" fonts */ + i = ((byte1 - fs->min_byte1) * pages) + + (byte2 - fs->min_char_or_byte2); + return (fs->per_char + i); + } + } + else { + return (&fs->min_bounds); + } + } + return (NULL); +} + + +void +Fake_glXUseXFont(Font font, int first, int count, int listbase) +{ + Display *dpy; + Window win; + Pixmap pixmap; + GC gc; + XGCValues values; + unsigned long valuemask; + XFontStruct *fs; + GLint swapbytes, lsbfirst, rowlength; + GLint skiprows, skippixels, alignment; + unsigned int max_width, max_height, max_bm_width, max_bm_height; + GLubyte *bm; + int i; + + dpy = glXGetCurrentDisplay(); + if (!dpy) + return; /* I guess glXMakeCurrent wasn't called */ + win = RootWindow(dpy, DefaultScreen(dpy)); + + fs = XQueryFont(dpy, font); + if (!fs) { + _mesa_error(NULL, GL_INVALID_VALUE, + "Couldn't get font structure information"); + return; + } + + /* Allocate a bitmap that can fit all characters. */ + max_width = fs->max_bounds.rbearing - fs->min_bounds.lbearing; + max_height = fs->max_bounds.ascent + fs->max_bounds.descent; + max_bm_width = (max_width + 7) / 8; + max_bm_height = max_height; + + bm = (GLubyte *) MALLOC((max_bm_width * max_bm_height) * sizeof(GLubyte)); + if (!bm) { + XFreeFontInfo(NULL, fs, 1); + _mesa_error(NULL, GL_OUT_OF_MEMORY, + "Couldn't allocate bitmap in glXUseXFont()"); + return; + } + +#if 0 + /* get the page info */ + pages = fs->max_char_or_byte2 - fs->min_char_or_byte2 + 1; + firstchar = (fs->min_byte1 << 8) + fs->min_char_or_byte2; + lastchar = (fs->max_byte1 << 8) + fs->max_char_or_byte2; + rows = fs->max_byte1 - fs->min_byte1 + 1; + unsigned int first_char, last_char, pages, rows; +#endif + + /* Save the current packing mode for bitmaps. */ + glGetIntegerv(GL_UNPACK_SWAP_BYTES, &swapbytes); + glGetIntegerv(GL_UNPACK_LSB_FIRST, &lsbfirst); + glGetIntegerv(GL_UNPACK_ROW_LENGTH, &rowlength); + glGetIntegerv(GL_UNPACK_SKIP_ROWS, &skiprows); + glGetIntegerv(GL_UNPACK_SKIP_PIXELS, &skippixels); + glGetIntegerv(GL_UNPACK_ALIGNMENT, &alignment); + + /* Enforce a standard packing mode which is compatible with + fill_bitmap() from above. This is actually the default mode, + except for the (non)alignment. 
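isvalid() above distinguishes "linear" fonts (a single byte1 row) from "matrix" two-byte fonts and computes the per_char index accordingly. The same arithmetic in isolation, with made-up font bounds for the sake of the example:

/* The index arithmetic from isvalid(), pulled out with invented font bounds.
 * A two-byte character code is split into byte1 (row) and byte2 (column). */
#include <stdio.h>

int main(void)
{
   /* pretend XFontStruct bounds for a "matrix" font */
   const unsigned min_byte1 = 0x20, max_byte1 = 0x4f;                 /* rows    */
   const unsigned min_char_or_byte2 = 0x20, max_char_or_byte2 = 0x7e; /* columns */

   const unsigned pages = max_char_or_byte2 - min_char_or_byte2 + 1;
   const unsigned which = 0x2141;        /* character code to look up */
   const unsigned byte1 = which >> 8;
   const unsigned byte2 = which & 0xff;

   if (byte1 < min_byte1 || byte1 > max_byte1 ||
       byte2 < min_char_or_byte2 || byte2 > max_char_or_byte2) {
      printf("glyph 0x%04x not covered by this font\n", which);
      return 0;
   }

   /* same formula as isvalid(): row offset times row length, plus column */
   const unsigned i = (byte1 - min_byte1) * pages + (byte2 - min_char_or_byte2);
   printf("glyph 0x%04x -> per_char[%u]\n", which, i);
   return 0;
}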
*/ + glPixelStorei(GL_UNPACK_SWAP_BYTES, GL_FALSE); + glPixelStorei(GL_UNPACK_LSB_FIRST, GL_FALSE); + glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); + glPixelStorei(GL_UNPACK_SKIP_ROWS, 0); + glPixelStorei(GL_UNPACK_SKIP_PIXELS, 0); + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + + pixmap = XCreatePixmap(dpy, win, 10, 10, 1); + values.foreground = BlackPixel(dpy, DefaultScreen(dpy)); + values.background = WhitePixel(dpy, DefaultScreen(dpy)); + values.font = fs->fid; + valuemask = GCForeground | GCBackground | GCFont; + gc = XCreateGC(dpy, pixmap, valuemask, &values); + XFreePixmap(dpy, pixmap); + +#ifdef DEBUG + if (debug_xfonts) + dump_font_struct(fs); +#endif + + for (i = 0; i < count; i++) { + unsigned int width, height, bm_width, bm_height; + GLfloat x0, y0, dx, dy; + XCharStruct *ch; + int x, y; + unsigned int c = first + i; + int list = listbase + i; + int valid; + + /* check on index validity and get the bounds */ + ch = isvalid(fs, c); + if (!ch) { + ch = &fs->max_bounds; + valid = 0; + } + else { + valid = 1; + } + +#ifdef DEBUG + if (debug_xfonts) { + char s[7]; + sprintf(s, isprint(c) ? "%c> " : "\\%03o> ", c); + dump_char_struct(ch, s); + } +#endif + + /* glBitmap()' parameters: + straight from the glXUseXFont(3) manpage. */ + width = ch->rbearing - ch->lbearing; + height = ch->ascent + ch->descent; + x0 = -ch->lbearing; + y0 = ch->descent - 0; /* XXX used to subtract 1 here */ + /* but that caused a conformace failure */ + dx = ch->width; + dy = 0; + + /* X11's starting point. */ + x = -ch->lbearing; + y = ch->ascent; + + /* Round the width to a multiple of eight. We will use this also + for the pixmap for capturing the X11 font. This is slightly + inefficient, but it makes the OpenGL part real easy. */ + bm_width = (width + 7) / 8; + bm_height = height; + + glNewList(list, GL_COMPILE); + if (valid && (bm_width > 0) && (bm_height > 0)) { + + MEMSET(bm, '\0', bm_width * bm_height); + fill_bitmap(dpy, win, gc, bm_width, bm_height, x, y, c, bm); + + glBitmap(width, height, x0, y0, dx, dy, bm); +#ifdef DEBUG + if (debug_xfonts) { + printf("width/height = %u/%u\n", width, height); + printf("bm_width/bm_height = %u/%u\n", bm_width, bm_height); + dump_bitmap(bm_width, bm_height, bm); + } +#endif + } + else { + glBitmap(0, 0, 0.0, 0.0, dx, dy, NULL); + } + glEndList(); + } + + FREE(bm); + XFreeFontInfo(NULL, fs, 1); + XFreeGC(dpy, gc); + + /* Restore saved packing modes. */ + glPixelStorei(GL_UNPACK_SWAP_BYTES, swapbytes); + glPixelStorei(GL_UNPACK_LSB_FIRST, lsbfirst); + glPixelStorei(GL_UNPACK_ROW_LENGTH, rowlength); + glPixelStorei(GL_UNPACK_SKIP_ROWS, skiprows); + glPixelStorei(GL_UNPACK_SKIP_PIXELS, skippixels); + glPixelStorei(GL_UNPACK_ALIGNMENT, alignment); +} diff --git a/src/gallium/winsys/xlib/xfonts.h b/src/gallium/winsys/xlib/xfonts.h new file mode 100644 index 0000000000..e36f42f817 --- /dev/null +++ b/src/gallium/winsys/xlib/xfonts.h @@ -0,0 +1,41 @@ + +/* + * Mesa 3-D graphics library + * Version: 3.5 + * + * Copyright (C) 1999-2000 Brian Paul All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +#ifndef XFONTS_H +#define XFONTS_H + +#ifdef __VMS +#include <GL/vms_x_fix.h> +#endif + +#include <X11/Xlib.h> + + +extern void Fake_glXUseXFont( Font font, int first, int count, int listbase ); + + +#endif + diff --git a/src/gallium/winsys/xlib/xm_api.c b/src/gallium/winsys/xlib/xm_api.c new file mode 100644 index 0000000000..d28a6423b9 --- /dev/null +++ b/src/gallium/winsys/xlib/xm_api.c @@ -0,0 +1,1422 @@ +/* + * Mesa 3-D graphics library + * Version: 7.1 + * + * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/** + * \file xm_api.c + * + * All the XMesa* API functions. + * + * + * NOTES: + * + * The window coordinate system origin (0,0) is in the lower-left corner + * of the window. X11's window coordinate origin is in the upper-left + * corner of the window. Therefore, most drawing functions in this + * file have to flip Y coordinates. + * + * Define USE_XSHM in the Makefile with -DUSE_XSHM if you want to compile + * in support for the MIT Shared Memory extension. If enabled, when you + * use an Ximage for the back buffer in double buffered mode, the "swap" + * operation will be faster. You must also link with -lXext. + * + * Byte swapping: If the Mesa host and the X display use a different + * byte order then there's some trickiness to be aware of when using + * XImages. The byte ordering used for the XImage is that of the X + * display, not the Mesa host. 
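Fake_glXUseXFont() above is what backs glXUseXFont() in this winsys; client code normally drives it through display lists. A typical legacy, fixed-function usage sketch follows; it assumes a GLX context is already current and that the named core font is installed.

/* Typical client-side use of glXUseXFont(); assumes a GLX context is current
 * on `dpy' and that the "fixed" font is available. */
#include <X11/Xlib.h>
#include <GL/gl.h>
#include <GL/glx.h>
#include <string.h>

static GLuint font_base;

static void load_font(Display *dpy)
{
   XFontStruct *fs = XLoadQueryFont(dpy, "fixed");
   if (!fs)
      return;

   /* one display list per glyph, built from the X font's bitmaps */
   font_base = glGenLists(128);
   glXUseXFont(fs->fid, 0, 128, font_base);
   XFreeFont(dpy, fs);
}

static void draw_text(const char *s)
{
   glRasterPos2f(-0.9f, 0.0f);          /* anchor in the current view     */
   glListBase(font_base);               /* map character code -> list id  */
   glCallLists((GLsizei) strlen(s), GL_UNSIGNED_BYTE, s);
}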
+ * The color-to-pixel encoding for True/DirectColor must be done + * according to the display's visual red_mask, green_mask, and blue_mask. + * If XPutPixel is used to put a pixel into an XImage then XPutPixel will + * do byte swapping if needed. If one wants to directly "poke" the pixel + * into the XImage's buffer then the pixel must be byte swapped first. + * + */ + +#ifdef __CYGWIN__ +#undef WIN32 +#undef __WIN32__ +#endif + +#include "glxheader.h" +#include "GL/xmesa.h" +#include "xmesaP.h" +#include "main/context.h" +#include "main/framebuffer.h" + +#include "state_tracker/st_public.h" +#include "state_tracker/st_context.h" +#include "pipe/p_defines.h" +#include "pipe/p_screen.h" +#include "pipe/p_context.h" + +#include "xm_winsys_aub.h" + +/** + * Global X driver lock + */ +pipe_mutex _xmesa_lock; + + +int xmesa_mode; + + +/**********************************************************************/ +/***** X Utility Functions *****/ +/**********************************************************************/ + + +/** + * Return the host's byte order as LSBFirst or MSBFirst ala X. + */ +#ifndef XFree86Server +static int host_byte_order( void ) +{ + int i = 1; + char *cptr = (char *) &i; + return (*cptr==1) ? LSBFirst : MSBFirst; +} +#endif + + +/** + * Check if the X Shared Memory extension is available. + * Return: 0 = not available + * 1 = shared XImage support available + * 2 = shared Pixmap support available also + */ +int xmesa_check_for_xshm( XMesaDisplay *display ) +{ +#if defined(USE_XSHM) && !defined(XFree86Server) + int major, minor, ignore; + Bool pixmaps; + + if (getenv("SP_NO_RAST")) + return 0; + + if (getenv("MESA_NOSHM")) { + return 0; + } + + if (XQueryExtension( display, "MIT-SHM", &ignore, &ignore, &ignore )) { + if (XShmQueryVersion( display, &major, &minor, &pixmaps )==True) { + return (pixmaps==True) ? 2 : 1; + } + else { + return 0; + } + } + else { + return 0; + } +#else + /* No XSHM support */ + return 0; +#endif +} + + +/** + * Return the true number of bits per pixel for XImages. + * For example, if we request a 24-bit deep visual we may actually need/get + * 32bpp XImages. This function returns the appropriate bpp. + * Input: dpy - the X display + * visinfo - desribes the visual to be used for XImages + * Return: true number of bits per pixel for XImages + */ +static int +bits_per_pixel( XMesaVisual xmv ) +{ +#ifdef XFree86Server + const int depth = xmv->nplanes; + int i; + assert(depth > 0); + for (i = 0; i < screenInfo.numPixmapFormats; i++) { + if (screenInfo.formats[i].depth == depth) + return screenInfo.formats[i].bitsPerPixel; + } + return depth; /* should never get here, but this should be safe */ +#else + XMesaDisplay *dpy = xmv->display; + XMesaVisualInfo visinfo = xmv->visinfo; + XMesaImage *img; + int bitsPerPixel; + /* Create a temporary XImage */ + img = XCreateImage( dpy, visinfo->visual, visinfo->depth, + ZPixmap, 0, /*format, offset*/ + (char*) MALLOC(8), /*data*/ + 1, 1, /*width, height*/ + 32, /*bitmap_pad*/ + 0 /*bytes_per_line*/ + ); + assert(img); + /* grab the bits/pixel value */ + bitsPerPixel = img->bits_per_pixel; + /* free the XImage */ + _mesa_free( img->data ); + img->data = NULL; + XMesaDestroyImage( img ); + return bitsPerPixel; +#endif +} + + + +/* + * Determine if a given X window ID is valid (window exists). + * Do this by calling XGetWindowAttributes() for the window and + * checking if we catch an X error. 
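bits_per_pixel() above answers a common Xlib question: a 24-bit-deep visual usually yields 32-bpp XImages. Outside the server build the probe is simply "create a scratch 1x1 XImage and read bits_per_pixel"; a stand-alone version of that probe:

/* Stand-alone version of the XImage depth-vs-bpp probe used above. */
#include <stdio.h>
#include <stdlib.h>
#include <X11/Xlib.h>
#include <X11/Xutil.h>

int main(void)
{
   Display *dpy = XOpenDisplay(NULL);
   if (!dpy)
      return 1;

   int screen = DefaultScreen(dpy);
   int depth  = DefaultDepth(dpy, screen);

   /* a throw-away 1x1 image tells us how pixels are really packed */
   XImage *img = XCreateImage(dpy, DefaultVisual(dpy, screen), depth,
                              ZPixmap, 0, malloc(8), 1, 1, 32, 0);
   if (img) {
      printf("visual depth %d -> %d bits per pixel in XImages\n",
             depth, img->bits_per_pixel);
      XDestroyImage(img);   /* also frees the malloc'd data */
   }

   XCloseDisplay(dpy);
   return 0;
}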
+ * Input: dpy - the display + * win - the window to check for existance + * Return: GL_TRUE - window exists + * GL_FALSE - window doesn't exist + */ +#ifndef XFree86Server +static GLboolean WindowExistsFlag; + +static int window_exists_err_handler( XMesaDisplay* dpy, XErrorEvent* xerr ) +{ + (void) dpy; + if (xerr->error_code == BadWindow) { + WindowExistsFlag = GL_FALSE; + } + return 0; +} + +static GLboolean window_exists( XMesaDisplay *dpy, Window win ) +{ + XWindowAttributes wa; + int (*old_handler)( XMesaDisplay*, XErrorEvent* ); + WindowExistsFlag = GL_TRUE; + old_handler = XSetErrorHandler(window_exists_err_handler); + XGetWindowAttributes( dpy, win, &wa ); /* dummy request */ + XSetErrorHandler(old_handler); + return WindowExistsFlag; +} + +static Status +get_drawable_size( XMesaDisplay *dpy, Drawable d, uint *width, uint *height ) +{ + Window root; + Status stat; + int xpos, ypos; + unsigned int w, h, bw, depth; + stat = XGetGeometry(dpy, d, &root, &xpos, &ypos, &w, &h, &bw, &depth); + *width = w; + *height = h; + return stat; +} +#endif + + +/** + * Return the size of the window (or pixmap) that corresponds to the + * given XMesaBuffer. + * \param width returns width in pixels + * \param height returns height in pixels + */ +static void +xmesa_get_window_size(XMesaDisplay *dpy, XMesaBuffer b, + GLuint *width, GLuint *height) +{ +#ifdef XFree86Server + *width = MIN2(b->drawable->width, MAX_WIDTH); + *height = MIN2(b->drawable->height, MAX_HEIGHT); +#else + Status stat; + + pipe_mutex_lock(_xmesa_lock); + XSync(b->xm_visual->display, 0); /* added for Chromium */ + stat = get_drawable_size(dpy, b->drawable, width, height); + pipe_mutex_unlock(_xmesa_lock); + + if (!stat) { + /* probably querying a window that's recently been destroyed */ + _mesa_warning(NULL, "XGetGeometry failed!\n"); + *width = *height = 1; + } +#endif +} + + +/** + * Choose the pixel format for the given visual. + * This will tell the gallium driver how to pack pixel data into + * drawing surfaces. + */ +static GLuint +choose_pixel_format(XMesaVisual v) +{ + if ( GET_REDMASK(v) == 0x0000ff + && GET_GREENMASK(v) == 0x00ff00 + && GET_BLUEMASK(v) == 0xff0000 + && v->BitsPerPixel == 32) { + if (CHECK_BYTE_ORDER(v)) { + /* no byteswapping needed */ + return 0 /* PIXEL_FORMAT_U_A8_B8_G8_R8 */; + } + else { + return PIPE_FORMAT_R8G8B8A8_UNORM; + } + } + else if ( GET_REDMASK(v) == 0xff0000 + && GET_GREENMASK(v) == 0x00ff00 + && GET_BLUEMASK(v) == 0x0000ff + && v->BitsPerPixel == 32) { + if (CHECK_BYTE_ORDER(v)) { + /* no byteswapping needed */ + return PIPE_FORMAT_A8R8G8B8_UNORM; + } + else { + return PIPE_FORMAT_B8G8R8A8_UNORM; + } + } + else if ( GET_REDMASK(v) == 0xf800 + && GET_GREENMASK(v) == 0x07e0 + && GET_BLUEMASK(v) == 0x001f + && CHECK_BYTE_ORDER(v) + && v->BitsPerPixel == 16) { + /* 5-6-5 RGB */ + return PIPE_FORMAT_R5G6B5_UNORM; + } + + assert(0); + return 0; +} + + + +/**********************************************************************/ +/***** Linked list of XMesaBuffers *****/ +/**********************************************************************/ + +XMesaBuffer XMesaBufferList = NULL; + + +/** + * Allocate a new XMesaBuffer object which corresponds to the given drawable. + * Note that XMesaBuffer is derived from GLframebuffer. + * The new XMesaBuffer will not have any size (Width=Height=0). + * + * \param d the corresponding X drawable (window or pixmap) + * \param type either WINDOW, PIXMAP or PBUFFER, describing d + * \param vis the buffer's visual + * \param cmap the window's colormap, if known. 
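window_exists() above relies on a classic Xlib idiom: install a temporary error handler, issue a round-trip request against the drawable, and check whether BadWindow came back. The same idiom in isolation, probing a window id passed on the command line:

/* The temporary-error-handler probe used by window_exists(), stand-alone. */
#include <stdio.h>
#include <stdlib.h>
#include <X11/Xlib.h>

static int bad_drawable;

static int probe_error_handler(Display *dpy, XErrorEvent *ev)
{
   (void) dpy;
   if (ev->error_code == BadWindow || ev->error_code == BadDrawable)
      bad_drawable = 1;
   return 0;                /* swallow the error instead of exiting */
}

int main(int argc, char **argv)
{
   Display *dpy = XOpenDisplay(NULL);
   if (!dpy || argc < 2)
      return 1;

   Window win = (Window) strtoul(argv[1], NULL, 0);
   XWindowAttributes wa;

   bad_drawable = 0;
   int (*old)(Display *, XErrorEvent *) = XSetErrorHandler(probe_error_handler);
   XGetWindowAttributes(dpy, win, &wa);   /* dummy round-trip request      */
   XSync(dpy, False);                     /* make sure any error arrived   */
   XSetErrorHandler(old);

   printf("window 0x%lx %s\n", (unsigned long) win,
          bad_drawable ? "is gone" : "still exists");
   XCloseDisplay(dpy);
   return 0;
}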
+ * \return new XMesaBuffer or NULL if any problem + */ +static XMesaBuffer +create_xmesa_buffer(XMesaDrawable d, BufferType type, + XMesaVisual vis, XMesaColormap cmap) +{ + XMesaBuffer b; + GLframebuffer *fb; + enum pipe_format colorFormat, depthFormat, stencilFormat; + uint width, height; + + ASSERT(type == WINDOW || type == PIXMAP || type == PBUFFER); + + b = (XMesaBuffer) CALLOC_STRUCT(xmesa_buffer); + if (!b) + return NULL; + + b->drawable = d; + + b->xm_visual = vis; + b->type = type; + b->cmap = cmap; + + /* determine PIPE_FORMATs for buffers */ + colorFormat = choose_pixel_format(vis); + + if (vis->mesa_visual.depthBits == 0) + depthFormat = PIPE_FORMAT_NONE; +#ifdef GALLIUM_CELL /* XXX temporary for Cell! */ + else + depthFormat = PIPE_FORMAT_S8Z24_UNORM; +#else + else if (vis->mesa_visual.depthBits <= 16) + depthFormat = PIPE_FORMAT_Z16_UNORM; + else if (vis->mesa_visual.depthBits <= 24) + depthFormat = PIPE_FORMAT_S8Z24_UNORM; + else + depthFormat = PIPE_FORMAT_Z32_UNORM; +#endif + + if (vis->mesa_visual.stencilBits == 8) { + if (depthFormat == PIPE_FORMAT_S8Z24_UNORM) + stencilFormat = depthFormat; + else + stencilFormat = PIPE_FORMAT_S8_UNORM; + } + else { + /* no stencil */ + stencilFormat = PIPE_FORMAT_NONE; + if (depthFormat == PIPE_FORMAT_S8Z24_UNORM) { + /* use 24-bit Z, undefined stencil channel */ + depthFormat = PIPE_FORMAT_X8Z24_UNORM; + } + } + + + get_drawable_size(vis->display, d, &width, &height); + + /* + * Create framebuffer, but we'll plug in our own renderbuffers below. + */ + b->stfb = st_create_framebuffer(&vis->mesa_visual, + colorFormat, depthFormat, stencilFormat, + width, height, + (void *) b); + fb = &b->stfb->Base; + + /* + * Create scratch XImage for xmesa_display_surface() + */ + b->tempImage = XCreateImage(vis->display, + vis->visinfo->visual, + vis->visinfo->depth, + ZPixmap, 0, /* format, offset */ + NULL, /* data */ + 0, 0, /* size */ + 32, /* bitmap_pad */ + 0); /* bytes_per_line */ + + /* GLX_EXT_texture_from_pixmap */ + b->TextureTarget = 0; + b->TextureFormat = GLX_TEXTURE_FORMAT_NONE_EXT; + b->TextureMipmap = 0; + + /* insert buffer into linked list */ + b->Next = XMesaBufferList; + XMesaBufferList = b; + + return b; +} + + +/** + * Find an XMesaBuffer by matching X display and colormap but NOT matching + * the notThis buffer. + */ +XMesaBuffer +xmesa_find_buffer(XMesaDisplay *dpy, XMesaColormap cmap, XMesaBuffer notThis) +{ + XMesaBuffer b; + for (b = XMesaBufferList; b; b = b->Next) { + if (b->xm_visual->display == dpy && + b->cmap == cmap && + b != notThis) { + return b; + } + } + return NULL; +} + + +/** + * Remove buffer from linked list, delete if no longer referenced. + */ +static void +xmesa_free_buffer(XMesaBuffer buffer) +{ + XMesaBuffer prev = NULL, b; + + for (b = XMesaBufferList; b; b = b->Next) { + if (b == buffer) { + struct gl_framebuffer *fb = &buffer->stfb->Base; + + /* unlink buffer from list */ + if (prev) + prev->Next = buffer->Next; + else + XMesaBufferList = buffer->Next; + + /* mark as delete pending */ + fb->DeletePending = GL_TRUE; + + /* Since the X window for the XMesaBuffer is going away, we don't + * want to dereference this pointer in the future. + */ + b->drawable = 0; + + buffer->tempImage->data = NULL; + XDestroyImage(buffer->tempImage); + + /* Unreference. 
If count = zero we'll really delete the buffer */ + _mesa_unreference_framebuffer(&fb); + + XFreeGC(b->xm_visual->display, b->gc); + + free(buffer); + + return; + } + /* continue search */ + prev = b; + } + /* buffer not found in XMesaBufferList */ + _mesa_problem(NULL,"xmesa_free_buffer() - buffer not found\n"); +} + + + +/**********************************************************************/ +/***** Misc Private Functions *****/ +/**********************************************************************/ + + +/** + * When a context is bound for the first time, we can finally finish + * initializing the context's visual and buffer information. + * \param v the XMesaVisual to initialize + * \param b the XMesaBuffer to initialize (may be NULL) + * \param rgb_flag TRUE = RGBA mode, FALSE = color index mode + * \param window the window/pixmap we're rendering into + * \param cmap the colormap associated with the window/pixmap + * \return GL_TRUE=success, GL_FALSE=failure + */ +static GLboolean +initialize_visual_and_buffer(XMesaVisual v, XMesaBuffer b, + GLboolean rgb_flag, XMesaDrawable window, + XMesaColormap cmap) +{ +#ifdef XFree86Server + int client = (window) ? CLIENT_ID(window->id) : 0; +#endif + + ASSERT(!b || b->xm_visual == v); + + /* Save true bits/pixel */ + v->BitsPerPixel = bits_per_pixel(v); + assert(v->BitsPerPixel > 0); + + if (rgb_flag == GL_FALSE) { + /* COLOR-INDEXED WINDOW: not supported*/ + return GL_FALSE; + } + else { + /* RGB WINDOW: + * We support RGB rendering into almost any kind of visual. + */ + const int xclass = v->mesa_visual.visualType; + if (xclass != GLX_TRUE_COLOR && xclass == !GLX_DIRECT_COLOR) { + _mesa_warning(NULL, + "XMesa: RGB mode rendering not supported in given visual.\n"); + return GL_FALSE; + } + v->mesa_visual.indexBits = 0; + + if (v->BitsPerPixel == 32) { + /* We use XImages for all front/back buffers. If an X Window or + * X Pixmap is 32bpp, there's no guarantee that the alpha channel + * will be preserved. For XImages we're in luck. + */ + v->mesa_visual.alphaBits = 8; + } + } + + /* + * If MESA_INFO env var is set print out some debugging info + * which can help Brian figure out what's going on when a user + * reports bugs. + */ + if (_mesa_getenv("MESA_INFO")) { + _mesa_printf("X/Mesa visual = %p\n", (void *) v); + _mesa_printf("X/Mesa level = %d\n", v->mesa_visual.level); + _mesa_printf("X/Mesa depth = %d\n", GET_VISUAL_DEPTH(v)); + _mesa_printf("X/Mesa bits per pixel = %d\n", v->BitsPerPixel); + } + + if (b && window) { + /* these should have been set in create_xmesa_buffer */ + ASSERT(b->drawable == window); + + /* Setup for single/double buffering */ + if (v->mesa_visual.doubleBufferMode) { + /* Double buffered */ + b->shm = xmesa_check_for_xshm( v->display ); + } + + /* X11 graphics context */ +#ifdef XFree86Server + b->gc = CreateScratchGC(v->display, window->depth); +#else + b->gc = XCreateGC( v->display, window, 0, NULL ); +#endif + XMesaSetFunction( v->display, b->gc, GXcopy ); + } + + return GL_TRUE; +} + + + +#define NUM_VISUAL_TYPES 6 + +/** + * Convert an X visual type to a GLX visual type. + * + * \param visualType X visual type (i.e., \c TrueColor, \c StaticGray, etc.) + * to be converted. + * \return If \c visualType is a valid X visual type, a GLX visual type will + * be returned. Otherwise \c GLX_NONE will be returned. + * + * \note + * This code was lifted directly from lib/GL/glx/glcontextmodes.c in the + * DRI CVS tree. 
+ */ +static GLint +xmesa_convert_from_x_visual_type( int visualType ) +{ + static const int glx_visual_types[ NUM_VISUAL_TYPES ] = { + GLX_STATIC_GRAY, GLX_GRAY_SCALE, + GLX_STATIC_COLOR, GLX_PSEUDO_COLOR, + GLX_TRUE_COLOR, GLX_DIRECT_COLOR + }; + + return ( (unsigned) visualType < NUM_VISUAL_TYPES ) + ? glx_visual_types[ visualType ] : GLX_NONE; +} + + +/**********************************************************************/ +/***** Public Functions *****/ +/**********************************************************************/ + + +/* + * Create a new X/Mesa visual. + * Input: display - X11 display + * visinfo - an XVisualInfo pointer + * rgb_flag - GL_TRUE = RGB mode, + * GL_FALSE = color index mode + * alpha_flag - alpha buffer requested? + * db_flag - GL_TRUE = double-buffered, + * GL_FALSE = single buffered + * stereo_flag - stereo visual? + * ximage_flag - GL_TRUE = use an XImage for back buffer, + * GL_FALSE = use an off-screen pixmap for back buffer + * depth_size - requested bits/depth values, or zero + * stencil_size - requested bits/stencil values, or zero + * accum_red_size - requested bits/red accum values, or zero + * accum_green_size - requested bits/green accum values, or zero + * accum_blue_size - requested bits/blue accum values, or zero + * accum_alpha_size - requested bits/alpha accum values, or zero + * num_samples - number of samples/pixel if multisampling, or zero + * level - visual level, usually 0 + * visualCaveat - ala the GLX extension, usually GLX_NONE + * Return; a new XMesaVisual or 0 if error. + */ +PUBLIC +XMesaVisual XMesaCreateVisual( XMesaDisplay *display, + XMesaVisualInfo visinfo, + GLboolean rgb_flag, + GLboolean alpha_flag, + GLboolean db_flag, + GLboolean stereo_flag, + GLboolean ximage_flag, + GLint depth_size, + GLint stencil_size, + GLint accum_red_size, + GLint accum_green_size, + GLint accum_blue_size, + GLint accum_alpha_size, + GLint num_samples, + GLint level, + GLint visualCaveat ) +{ + XMesaVisual v; + GLint red_bits, green_bits, blue_bits, alpha_bits; + +#ifndef XFree86Server + /* For debugging only */ + if (_mesa_getenv("MESA_XSYNC")) { + /* This makes debugging X easier. + * In your debugger, set a breakpoint on _XError to stop when an + * X protocol error is generated. + */ + XSynchronize( display, 1 ); + } +#endif + + v = (XMesaVisual) CALLOC_STRUCT(xmesa_visual); + if (!v) { + return NULL; + } + + v->display = display; + + /* Save a copy of the XVisualInfo struct because the user may X_mesa_free() + * the struct but we may need some of the information contained in it + * at a later time. + */ +#ifndef XFree86Server + v->visinfo = (XVisualInfo *) MALLOC(sizeof(*visinfo)); + if(!v->visinfo) { + _mesa_free(v); + return NULL; + } + MEMCPY(v->visinfo, visinfo, sizeof(*visinfo)); +#endif + + v->ximage_flag = ximage_flag; + +#ifdef XFree86Server + /* We could calculate these values by ourselves. nplanes is either the sum + * of the red, green, and blue bits or the number index bits. + * ColormapEntries is either (1U << index_bits) or + * (1U << max(redBits, greenBits, blueBits)). + */ + assert(visinfo->nplanes > 0); + v->nplanes = visinfo->nplanes; + v->ColormapEntries = visinfo->ColormapEntries; + + v->mesa_visual.redMask = visinfo->redMask; + v->mesa_visual.greenMask = visinfo->greenMask; + v->mesa_visual.blueMask = visinfo->blueMask; + v->mesa_visual.visualID = visinfo->vid; + v->mesa_visual.screen = 0; /* FIXME: What should be done here? 
*/ +#else + v->mesa_visual.redMask = visinfo->red_mask; + v->mesa_visual.greenMask = visinfo->green_mask; + v->mesa_visual.blueMask = visinfo->blue_mask; + v->mesa_visual.visualID = visinfo->visualid; + v->mesa_visual.screen = visinfo->screen; +#endif + +#if defined(XFree86Server) || !(defined(__cplusplus) || defined(c_plusplus)) + v->mesa_visual.visualType = xmesa_convert_from_x_visual_type(visinfo->class); +#else + v->mesa_visual.visualType = xmesa_convert_from_x_visual_type(visinfo->c_class); +#endif + + v->mesa_visual.visualRating = visualCaveat; + + if (alpha_flag) + v->mesa_visual.alphaBits = 8; + + (void) initialize_visual_and_buffer( v, NULL, rgb_flag, 0, 0 ); + + { + const int xclass = v->mesa_visual.visualType; + if (xclass == GLX_TRUE_COLOR || xclass == GLX_DIRECT_COLOR) { + red_bits = _mesa_bitcount(GET_REDMASK(v)); + green_bits = _mesa_bitcount(GET_GREENMASK(v)); + blue_bits = _mesa_bitcount(GET_BLUEMASK(v)); + } + else { + /* this is an approximation */ + int depth; + depth = GET_VISUAL_DEPTH(v); + red_bits = depth / 3; + depth -= red_bits; + green_bits = depth / 2; + depth -= green_bits; + blue_bits = depth; + alpha_bits = 0; + assert( red_bits + green_bits + blue_bits == GET_VISUAL_DEPTH(v) ); + } + alpha_bits = v->mesa_visual.alphaBits; + } + + _mesa_initialize_visual( &v->mesa_visual, + rgb_flag, db_flag, stereo_flag, + red_bits, green_bits, + blue_bits, alpha_bits, + v->mesa_visual.indexBits, + depth_size, + stencil_size, + accum_red_size, accum_green_size, + accum_blue_size, accum_alpha_size, + 0 ); + + /* XXX minor hack */ + v->mesa_visual.level = level; + return v; +} + + +PUBLIC +void XMesaDestroyVisual( XMesaVisual v ) +{ +#ifndef XFree86Server + _mesa_free(v->visinfo); +#endif + _mesa_free(v); +} + + + +/** + * Create a new XMesaContext. + * \param v the XMesaVisual + * \param share_list another XMesaContext with which to share display + * lists or NULL if no sharing is wanted. + * \return an XMesaContext or NULL if error. + */ +PUBLIC +XMesaContext XMesaCreateContext( XMesaVisual v, XMesaContext share_list ) +{ + static GLboolean firstTime = GL_TRUE; + struct pipe_context *pipe; + XMesaContext c; + GLcontext *mesaCtx; + uint pf; + + if (firstTime) { + pipe_mutex_init(_xmesa_lock); + firstTime = GL_FALSE; + } + + /* Note: the XMesaContext contains a Mesa GLcontext struct (inheritance) */ + c = (XMesaContext) CALLOC_STRUCT(xmesa_context); + if (!c) + return NULL; + + pf = choose_pixel_format(v); + assert(pf); + + c->xm_visual = v; + c->xm_buffer = NULL; /* set later by XMesaMakeCurrent */ + + if (!getenv("XM_AUB")) { + xmesa_mode = XMESA_SOFTPIPE; + pipe = xmesa_create_pipe_context( c, pf ); + } + else { + xmesa_mode = XMESA_AUB; + pipe = xmesa_create_i965simple(xmesa_get_pipe_winsys_aub(v)); + } + + if (pipe == NULL) + goto fail; + + c->st = st_create_context(pipe, &v->mesa_visual, + share_list ? share_list->st : NULL); + if (c->st == NULL) + goto fail; + + mesaCtx = c->st->ctx; + c->st->ctx->DriverCtx = c; + +#if 00 + _mesa_enable_sw_extensions(mesaCtx); + _mesa_enable_1_3_extensions(mesaCtx); + _mesa_enable_1_4_extensions(mesaCtx); + _mesa_enable_1_5_extensions(mesaCtx); + _mesa_enable_2_0_extensions(mesaCtx); +#endif + +#ifdef XFree86Server + /* If we're running in the X server, do bounds checking to prevent + * segfaults and server crashes! 
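XMesaCreateVisual() and XMesaCreateContext() above are the internal entry points the fake-GLX layer calls on behalf of glXChooseVisual() and glXCreateContext(). The sketch below shows one plausible caller-side pairing of the two, with visual selection trimmed to the minimum; the argument order follows the parameter comment block above, and the in-tree GL/xmesa.h header is assumed.

/* Condensed caller-side pairing of the two calls above; most error handling
 * and attribute negotiation is left out. */
#include <X11/Xlib.h>
#include <X11/Xutil.h>
#include <GL/glx.h>
#include "GL/xmesa.h"

XMesaContext make_xmesa_context(Display *dpy)
{
   XVisualInfo templ, *vis;
   int n = 0;

   /* pick any 24-bit TrueColor visual on the default screen */
   templ.screen = DefaultScreen(dpy);
   templ.depth  = 24;
   templ.class  = TrueColor;
   vis = XGetVisualInfo(dpy,
                        VisualScreenMask | VisualDepthMask | VisualClassMask,
                        &templ, &n);
   if (!vis || n == 0)
      return NULL;

   /* rgb, no alpha, double buffered, no stereo, ximage back buffer,
    * 24-bit depth, 8-bit stencil, no accum, no multisample, level 0 */
   XMesaVisual xmvis = XMesaCreateVisual(dpy, vis, GL_TRUE, GL_FALSE,
                                         GL_TRUE, GL_FALSE, GL_TRUE,
                                         24, 8, 0, 0, 0, 0, 0, 0, GLX_NONE);

   XFree(vis);   /* XMesaCreateVisual() keeps its own copy of the XVisualInfo */
   if (!xmvis)
      return NULL;

   return XMesaCreateContext(xmvis, NULL);
}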
+ */ + mesaCtx->Const.CheckArrayBounds = GL_TRUE; +#endif + + return c; + + fail: + if (c->st) + st_destroy_context(c->st); + else if (pipe) + pipe->destroy(pipe); + FREE(c); + return NULL; +} + + + +PUBLIC +void XMesaDestroyContext( XMesaContext c ) +{ + struct pipe_screen *screen = c->st->pipe->screen; + st_destroy_context(c->st); + /* FIXME: We should destroy the screen here, but if we do so, surfaces may + * outlive it, causing segfaults + screen->destroy(screen); + */ + _mesa_free(c); +} + + + +/** + * Private function for creating an XMesaBuffer which corresponds to an + * X window or pixmap. + * \param v the window's XMesaVisual + * \param w the window we're wrapping + * \return new XMesaBuffer or NULL if error + */ +PUBLIC XMesaBuffer +XMesaCreateWindowBuffer(XMesaVisual v, XMesaWindow w) +{ +#ifndef XFree86Server + XWindowAttributes attr; +#endif + XMesaBuffer b; + XMesaColormap cmap; + int depth; + + assert(v); + assert(w); + + /* Check that window depth matches visual depth */ +#ifdef XFree86Server + depth = ((XMesaDrawable)w)->depth; +#else + XGetWindowAttributes( v->display, w, &attr ); + depth = attr.depth; +#endif + if (GET_VISUAL_DEPTH(v) != depth) { + _mesa_warning(NULL, "XMesaCreateWindowBuffer: depth mismatch between visual (%d) and window (%d)!\n", + GET_VISUAL_DEPTH(v), depth); + return NULL; + } + + /* Find colormap */ +#ifdef XFree86Server + cmap = (ColormapPtr)LookupIDByType(wColormap(w), RT_COLORMAP); +#else + if (attr.colormap) { + cmap = attr.colormap; + } + else { + _mesa_warning(NULL, "Window %u has no colormap!\n", (unsigned int) w); + /* this is weird, a window w/out a colormap!? */ + /* OK, let's just allocate a new one and hope for the best */ + cmap = XCreateColormap(v->display, w, attr.visual, AllocNone); + } +#endif + + b = create_xmesa_buffer((XMesaDrawable) w, WINDOW, v, cmap); + if (!b) + return NULL; + + if (!initialize_visual_and_buffer( v, b, v->mesa_visual.rgbMode, + (XMesaDrawable) w, cmap )) { + xmesa_free_buffer(b); + return NULL; + } + + return b; +} + + + +/** + * Create a new XMesaBuffer from an X pixmap. 
+ * + * \param v the XMesaVisual + * \param p the pixmap + * \param cmap the colormap, may be 0 if using a \c GLX_TRUE_COLOR or + * \c GLX_DIRECT_COLOR visual for the pixmap + * \returns new XMesaBuffer or NULL if error + */ +PUBLIC XMesaBuffer +XMesaCreatePixmapBuffer(XMesaVisual v, XMesaPixmap p, XMesaColormap cmap) +{ + XMesaBuffer b; + + assert(v); + + b = create_xmesa_buffer((XMesaDrawable) p, PIXMAP, v, cmap); + if (!b) + return NULL; + + if (!initialize_visual_and_buffer(v, b, v->mesa_visual.rgbMode, + (XMesaDrawable) p, cmap)) { + xmesa_free_buffer(b); + return NULL; + } + + return b; +} + + +/** + * For GLX_EXT_texture_from_pixmap + */ +XMesaBuffer +XMesaCreatePixmapTextureBuffer(XMesaVisual v, XMesaPixmap p, + XMesaColormap cmap, + int format, int target, int mipmap) +{ + GET_CURRENT_CONTEXT(ctx); + XMesaBuffer b; + GLuint width, height; + + assert(v); + + b = create_xmesa_buffer((XMesaDrawable) p, PIXMAP, v, cmap); + if (!b) + return NULL; + + /* get pixmap size, update framebuffer/renderbuffer dims */ + xmesa_get_window_size(v->display, b, &width, &height); + _mesa_resize_framebuffer(NULL, &(b->stfb->Base), width, height); + + if (target == 0) { + /* examine dims */ + if (ctx->Extensions.ARB_texture_non_power_of_two) { + target = GLX_TEXTURE_2D_EXT; + } + else if ( _mesa_bitcount(width) == 1 + && _mesa_bitcount(height) == 1) { + /* power of two size */ + if (height == 1) { + target = GLX_TEXTURE_1D_EXT; + } + else { + target = GLX_TEXTURE_2D_EXT; + } + } + else if (ctx->Extensions.NV_texture_rectangle) { + target = GLX_TEXTURE_RECTANGLE_EXT; + } + else { + /* non power of two textures not supported */ + XMesaDestroyBuffer(b); + return 0; + } + } + + b->TextureTarget = target; + b->TextureFormat = format; + b->TextureMipmap = mipmap; + + if (!initialize_visual_and_buffer(v, b, v->mesa_visual.rgbMode, + (XMesaDrawable) p, cmap)) { + xmesa_free_buffer(b); + return NULL; + } + + return b; +} + + + +XMesaBuffer +XMesaCreatePBuffer(XMesaVisual v, XMesaColormap cmap, + unsigned int width, unsigned int height) +{ +#ifndef XFree86Server + XMesaWindow root; + XMesaDrawable drawable; /* X Pixmap Drawable */ + XMesaBuffer b; + + /* allocate pixmap for front buffer */ + root = RootWindow( v->display, v->visinfo->screen ); + drawable = XCreatePixmap(v->display, root, width, height, + v->visinfo->depth); + if (!drawable) + return NULL; + + b = create_xmesa_buffer(drawable, PBUFFER, v, cmap); + if (!b) + return NULL; + + if (!initialize_visual_and_buffer(v, b, v->mesa_visual.rgbMode, + drawable, cmap)) { + xmesa_free_buffer(b); + return NULL; + } + + return b; +#else + return 0; +#endif +} + + + +/* + * Deallocate an XMesaBuffer structure and all related info. + */ +PUBLIC void +XMesaDestroyBuffer(XMesaBuffer b) +{ + xmesa_free_buffer(b); +} + + +/** + * Query the current window size and update the corresponding GLframebuffer + * and all attached renderbuffers. + * Called when: + * 1. the first time a buffer is bound to a context. + * 2. from the XMesaResizeBuffers() API function. + * 3. SwapBuffers. XXX probabaly from xm_flush_frontbuffer() too... + * Note: it's possible (and legal) for xmctx to be NULL. That can happen + * when resizing a buffer when no rendering context is bound. 
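XMesaCreatePixmapTextureBuffer() above is the backend half of GLX_EXT_texture_from_pixmap; on the client side a compositor reaches it through glXCreatePixmap() attributes plus glXBindTexImageEXT(). The sketch below shows that calling sequence under a few stated assumptions: the fbconfig and pixmap already exist and were chosen with GLX_BIND_TO_TEXTURE_RGBA_EXT support, the extension tokens come from GL/glxext.h, and the Bind/Release entry points are fetched through glXGetProcAddress at run time.

/* Client-side GLX_EXT_texture_from_pixmap sequence that ends up in
 * XMesaCreatePixmapTextureBuffer(); `fbconfig' and `pixmap' are assumed to
 * have been created elsewhere. */
#include <X11/Xlib.h>
#include <GL/gl.h>
#include <GL/glx.h>

typedef void (*BindTexImage)(Display *, GLXDrawable, int, const int *);

GLXPixmap wrap_pixmap_as_texture(Display *dpy, GLXFBConfig fbconfig,
                                 Pixmap pixmap, GLuint texture)
{
   static const int attribs[] = {
      GLX_TEXTURE_TARGET_EXT, GLX_TEXTURE_2D_EXT,
      GLX_TEXTURE_FORMAT_EXT, GLX_TEXTURE_FORMAT_RGBA_EXT,
      None
   };

   BindTexImage bind = (BindTexImage)
      glXGetProcAddress((const GLubyte *) "glXBindTexImageEXT");
   if (!bind)
      return 0;

   GLXPixmap glxpixmap = glXCreatePixmap(dpy, fbconfig, pixmap, attribs);

   /* the pixmap's contents become the contents of `texture' */
   glBindTexture(GL_TEXTURE_2D, texture);
   bind(dpy, glxpixmap, GLX_FRONT_LEFT_EXT, NULL);

   return glxpixmap;   /* release with glXReleaseTexImageEXT before redrawing it */
}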
+ */ +void +xmesa_check_and_update_buffer_size(XMesaContext xmctx, XMesaBuffer drawBuffer) +{ + GLuint width, height; + xmesa_get_window_size(drawBuffer->xm_visual->display, drawBuffer, &width, &height); + st_resize_framebuffer(drawBuffer->stfb, width, height); +} + + +/* + * Bind buffer b to context c and make c the current rendering context. + */ +GLboolean XMesaMakeCurrent( XMesaContext c, XMesaBuffer b ) +{ + return XMesaMakeCurrent2( c, b, b ); +} + + +/* + * Bind buffer b to context c and make c the current rendering context. + */ +PUBLIC +GLboolean XMesaMakeCurrent2( XMesaContext c, XMesaBuffer drawBuffer, + XMesaBuffer readBuffer ) +{ + if (c) { + if (!drawBuffer || !readBuffer) + return GL_FALSE; /* must specify buffers! */ + +#if 0 + /* XXX restore this optimization */ + if (&(c->mesa) == _mesa_get_current_context() + && c->mesa.DrawBuffer == &drawBuffer->mesa_buffer + && c->mesa.ReadBuffer == &readBuffer->mesa_buffer + && xmesa_buffer(c->mesa.DrawBuffer)->wasCurrent) { + /* same context and buffer, do nothing */ + return GL_TRUE; + } +#endif + + c->xm_buffer = drawBuffer; + + /* Call this periodically to detect when the user has begun using + * GL rendering from multiple threads. + */ + _glapi_check_multithread(); + + st_make_current(c->st, drawBuffer->stfb, readBuffer->stfb); + + xmesa_check_and_update_buffer_size(c, drawBuffer); + if (readBuffer != drawBuffer) + xmesa_check_and_update_buffer_size(c, readBuffer); + + /* Solution to Stephane Rehel's problem with glXReleaseBuffersMESA(): */ + drawBuffer->wasCurrent = GL_TRUE; + } + else { + /* Detach */ + st_make_current( NULL, NULL, NULL ); + } + return GL_TRUE; +} + + +/* + * Unbind the context c from its buffer. + */ +GLboolean XMesaUnbindContext( XMesaContext c ) +{ + /* A no-op for XFree86 integration purposes */ + return GL_TRUE; +} + + +XMesaContext XMesaGetCurrentContext( void ) +{ + GET_CURRENT_CONTEXT(ctx); + if (ctx) { + XMesaContext xmesa = xmesa_context(ctx); + return xmesa; + } + else { + return 0; + } +} + + +XMesaBuffer XMesaGetCurrentBuffer( void ) +{ + GET_CURRENT_CONTEXT(ctx); + if (ctx) { + XMesaBuffer xmbuf = xmesa_buffer(ctx->DrawBuffer); + return xmbuf; + } + else { + return 0; + } +} + + +/* New in Mesa 3.1 */ +XMesaBuffer XMesaGetCurrentReadBuffer( void ) +{ + GET_CURRENT_CONTEXT(ctx); + if (ctx) { + return xmesa_buffer(ctx->ReadBuffer); + } + else { + return 0; + } +} + + +#ifdef XFree86Server +PUBLIC +GLboolean XMesaForceCurrent(XMesaContext c) +{ + if (c) { + _glapi_set_dispatch(c->mesa.CurrentDispatch); + + if (&(c->mesa) != _mesa_get_current_context()) { + _mesa_make_current(&c->mesa, c->mesa.DrawBuffer, c->mesa.ReadBuffer); + } + } + else { + _mesa_make_current(NULL, NULL, NULL); + } + return GL_TRUE; +} + + +PUBLIC +GLboolean XMesaLoseCurrent(XMesaContext c) +{ + (void) c; + _mesa_make_current(NULL, NULL, NULL); + return GL_TRUE; +} + + +PUBLIC +GLboolean XMesaCopyContext( XMesaContext xm_src, XMesaContext xm_dst, GLuint mask ) +{ + _mesa_copy_context(&xm_src->mesa, &xm_dst->mesa, mask); + return GL_TRUE; +} +#endif /* XFree86Server */ + + +#ifndef FX +GLboolean XMesaSetFXmode( GLint mode ) +{ + (void) mode; + return GL_FALSE; +} +#endif + + + +/* + * Copy the back buffer to the front buffer. If there's no back buffer + * this is a no-op. + */ +PUBLIC +void XMesaSwapBuffers( XMesaBuffer b ) +{ + struct pipe_surface *surf; + + /* If we're swapping the buffer associated with the current context + * we have to flush any pending rendering commands first. 
+ */ + st_notify_swapbuffers(b->stfb); + + surf = st_get_framebuffer_surface(b->stfb, ST_SURFACE_BACK_LEFT); + if (surf) { + if (xmesa_mode == XMESA_AUB) + xmesa_display_aub( surf ); + else + xmesa_display_surface(b, surf); + } + + xmesa_check_and_update_buffer_size(NULL, b); +} + + + +/* + * Copy sub-region of back buffer to front buffer + */ +void XMesaCopySubBuffer( XMesaBuffer b, int x, int y, int width, int height ) +{ + struct pipe_surface *surf_front + = st_get_framebuffer_surface(b->stfb, ST_SURFACE_FRONT_LEFT); + struct pipe_surface *surf_back + = st_get_framebuffer_surface(b->stfb, ST_SURFACE_BACK_LEFT); + struct pipe_context *pipe = NULL; /* XXX fix */ + + if (!surf_front || !surf_back) + return; + + pipe->surface_copy(pipe, + FALSE, + surf_front, x, y, /* dest */ + surf_back, x, y, /* src */ + width, height); +} + + + +/* + * Return the depth buffer associated with an XMesaBuffer. + * Input: b - the XMesa buffer handle + * Output: width, height - size of buffer in pixels + * bytesPerValue - bytes per depth value (2 or 4) + * buffer - pointer to depth buffer values + * Return: GL_TRUE or GL_FALSE to indicate success or failure. + */ +GLboolean XMesaGetDepthBuffer( XMesaBuffer b, GLint *width, GLint *height, + GLint *bytesPerValue, void **buffer ) +{ + *width = 0; + *height = 0; + *bytesPerValue = 0; + *buffer = 0; + return GL_FALSE; +} + + +void XMesaFlush( XMesaContext c ) +{ + if (c && c->xm_visual->display) { +#ifdef XFree86Server + /* NOT_NEEDED */ +#else + st_finish(c->st); + XSync( c->xm_visual->display, False ); +#endif + } +} + + + +const char *XMesaGetString( XMesaContext c, int name ) +{ + (void) c; + if (name==XMESA_VERSION) { + return "5.0"; + } + else if (name==XMESA_EXTENSIONS) { + return ""; + } + else { + return NULL; + } +} + + + +XMesaBuffer XMesaFindBuffer( XMesaDisplay *dpy, XMesaDrawable d ) +{ + XMesaBuffer b; + for (b=XMesaBufferList; b; b=b->Next) { + if (b->drawable == d && b->xm_visual->display == dpy) { + return b; + } + } + return NULL; +} + + +/** + * Free/destroy all XMesaBuffers associated with given display. + */ +void xmesa_destroy_buffers_on_display(XMesaDisplay *dpy) +{ + XMesaBuffer b, next; + for (b = XMesaBufferList; b; b = next) { + next = b->Next; + if (b->xm_visual->display == dpy) { + xmesa_free_buffer(b); + } + } +} + + +/* + * Look for XMesaBuffers whose X window has been destroyed. + * Deallocate any such XMesaBuffers. + */ +void XMesaGarbageCollect( void ) +{ + XMesaBuffer b, next; + for (b=XMesaBufferList; b; b=next) { + next = b->Next; + if (b->xm_visual && + b->xm_visual->display && + b->drawable && + b->type == WINDOW) { +#ifdef XFree86Server + /* NOT_NEEDED */ +#else + XSync(b->xm_visual->display, False); + if (!window_exists( b->xm_visual->display, b->drawable )) { + /* found a dead window, free the ancillary info */ + XMesaDestroyBuffer( b ); + } +#endif + } + } +} + + +unsigned long XMesaDitherColor( XMesaContext xmesa, GLint x, GLint y, + GLfloat red, GLfloat green, + GLfloat blue, GLfloat alpha ) +{ + /* no longer supported */ + return 0; +} + + +/* + * This is typically called when the window size changes and we need + * to reallocate the buffer's back/depth/stencil/accum buffers. 
+ */ +PUBLIC void +XMesaResizeBuffers( XMesaBuffer b ) +{ + GET_CURRENT_CONTEXT(ctx); + XMesaContext xmctx = xmesa_context(ctx); + if (!xmctx) + return; + xmesa_check_and_update_buffer_size(xmctx, b); +} + + + + +PUBLIC void +XMesaBindTexImage(XMesaDisplay *dpy, XMesaBuffer drawable, int buffer, + const int *attrib_list) +{ +} + + + +PUBLIC void +XMesaReleaseTexImage(XMesaDisplay *dpy, XMesaBuffer drawable, int buffer) +{ +} + diff --git a/src/gallium/winsys/xlib/xm_image.c b/src/gallium/winsys/xlib/xm_image.c new file mode 100644 index 0000000000..087b4e4c3a --- /dev/null +++ b/src/gallium/winsys/xlib/xm_image.c @@ -0,0 +1,133 @@ +/************************************************************************** + +Copyright 1998-1999 Precision Insight, Inc., Cedar Park, Texas. +All Rights Reserved. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sub license, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice (including the +next paragraph) shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. +IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR +ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +**************************************************************************/ + +/* + * Authors: + * Kevin E. 
Martin <kevin@precisioninsight.com>
+ *   Brian Paul <brian@precisioninsight.com>
+ */
+
+#include <stdlib.h>
+#include <X11/Xmd.h>
+
+#include "glxheader.h"
+#include "xmesaP.h"
+
+#ifdef XFree86Server
+
+#ifdef ROUNDUP
+#undef ROUNDUP
+#endif
+
+#define ROUNDUP(nbytes, pad) ((((nbytes) + ((pad)-1)) / (pad)) * ((pad)>>3))
+
+XMesaImage *XMesaCreateImage(int bitsPerPixel, int width, int height, char *data)
+{
+   XMesaImage *image;
+
+   image = (XMesaImage *)xalloc(sizeof(XMesaImage));
+
+   if (image) {
+      image->width = width;
+      image->height = height;
+      image->data = data;
+      /* Always pad to 32 bits */
+      image->bytes_per_line = ROUNDUP((bitsPerPixel * width), 32);
+      image->bits_per_pixel = bitsPerPixel;
+   }
+
+   return image;
+}
+
+void XMesaDestroyImage(XMesaImage *image)
+{
+   if (image->data)
+      free(image->data);
+   xfree(image);
+}
+
+unsigned long XMesaGetPixel(XMesaImage *image, int x, int y)
+{
+   CARD8 *row = (CARD8 *)(image->data + y*image->bytes_per_line);
+   CARD8 *i8;
+   CARD16 *i16;
+   CARD32 *i32;
+   switch (image->bits_per_pixel) {
+   case 8:
+      i8 = (CARD8 *)row;
+      return i8[x];
+      break;
+   case 15:
+   case 16:
+      i16 = (CARD16 *)row;
+      return i16[x];
+      break;
+   case 24: /* WARNING: architecture specific code */
+      i8 = (CARD8 *)row;
+      return (((CARD32)i8[x*3]) |
+              (((CARD32)i8[x*3+1])<<8) |
+              (((CARD32)i8[x*3+2])<<16));
+      break;
+   case 32:
+      i32 = (CARD32 *)row;
+      return i32[x];
+      break;
+   }
+   return 0;
+}
+
+#ifndef XMESA_USE_PUTPIXEL_MACRO
+void XMesaPutPixel(XMesaImage *image, int x, int y, unsigned long pixel)
+{
+   CARD8 *row = (CARD8 *)(image->data + y*image->bytes_per_line);
+   CARD8 *i8;
+   CARD16 *i16;
+   CARD32 *i32;
+   switch (image->bits_per_pixel) {
+   case 8:
+      i8 = (CARD8 *)row;
+      i8[x] = (CARD8)pixel;
+      break;
+   case 15:
+   case 16:
+      i16 = (CARD16 *)row;
+      i16[x] = (CARD16)pixel;
+      break;
+   case 24: /* WARNING: architecture specific code */
+      i8 = (CARD8 *)row;
+      i8[x*3] = (CARD8)(pixel);
+      i8[x*3+1] = (CARD8)(pixel>>8);
+      i8[x*3+2] = (CARD8)(pixel>>16);
+      break;
+   case 32:
+      i32 = (CARD32 *)row;
+      i32[x] = (CARD32)pixel;
+      break;
+   }
+}
+#endif
+
+#endif /* XFree86Server */
diff --git a/src/gallium/winsys/xlib/xm_image.h b/src/gallium/winsys/xlib/xm_image.h
new file mode 100644
index 0000000000..2a5e0f3777
--- /dev/null
+++ b/src/gallium/winsys/xlib/xm_image.h
@@ -0,0 +1,77 @@
+/**************************************************************************
+
+Copyright 1998-1999 Precision Insight, Inc., Cedar Park, Texas.
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sub license, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR +ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +**************************************************************************/ + +/* + * Authors: + * Kevin E. Martin <kevin@precisioninsight.com> + * Brian Paul <brian@precisioninsight.com> + */ + +#ifndef _XM_IMAGE_H_ +#define _XM_IMAGE_H_ + +#define XMESA_USE_PUTPIXEL_MACRO + +extern XMesaImage *XMesaCreateImage(int bitsPerPixel, int width, int height, + char *data); +extern void XMesaDestroyImage(XMesaImage *image); +extern unsigned long XMesaGetPixel(XMesaImage *image, int x, int y); +#ifdef XMESA_USE_PUTPIXEL_MACRO +#define XMesaPutPixel(__i,__x,__y,__p) \ +{ \ + CARD8 *__row = (CARD8 *)(__i->data + __y*__i->bytes_per_line); \ + CARD8 *__i8; \ + CARD16 *__i16; \ + CARD32 *__i32; \ + switch (__i->bits_per_pixel) { \ + case 8: \ + __i8 = (CARD8 *)__row; \ + __i8[__x] = (CARD8)__p; \ + break; \ + case 15: \ + case 16: \ + __i16 = (CARD16 *)__row; \ + __i16[__x] = (CARD16)__p; \ + break; \ + case 24: /* WARNING: architecture specific code */ \ + __i8 = (CARD8 *)__row; \ + __i8[__x*3] = (CARD8)(__p); \ + __i8[__x*3+1] = (CARD8)(__p>>8); \ + __i8[__x*3+2] = (CARD8)(__p>>16); \ + break; \ + case 32: \ + __i32 = (CARD32 *)__row; \ + __i32[__x] = (CARD32)__p; \ + break; \ + } \ +} +#else +extern void XMesaPutPixel(XMesaImage *image, int x, int y, + unsigned long pixel); +#endif + +#endif /* _XM_IMAGE_H_ */ diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c new file mode 100644 index 0000000000..acb5ad8f71 --- /dev/null +++ b/src/gallium/winsys/xlib/xm_winsys.c @@ -0,0 +1,719 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * + **************************************************************************/ + +/* + * Authors: + * Keith Whitwell + * Brian Paul + */ + + +#include "glxheader.h" +#include "xmesaP.h" + +#undef ASSERT +#undef Elements + +#include "pipe/p_winsys.h" +#include "pipe/p_format.h" +#include "pipe/p_context.h" +#include "pipe/p_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "softpipe/sp_winsys.h" + +#ifdef GALLIUM_CELL +#include "cell/ppu/cell_context.h" +#include "cell/ppu/cell_screen.h" +#include "cell/ppu/cell_winsys.h" +#else +#define TILE_SIZE 32 /* avoid compilation errors */ +#endif + +#ifdef GALLIUM_TRACE +#include "trace/tr_screen.h" +#include "trace/tr_context.h" +#endif + +#include "xm_winsys_aub.h" + + +/** + * Subclass of pipe_buffer for Xlib winsys. + * Low-level OS/window system memory buffer + */ +struct xm_buffer +{ + struct pipe_buffer base; + boolean userBuffer; /** Is this a user-space buffer? */ + void *data; + void *mapped; + + XImage *tempImage; + int shm; +#if defined(USE_XSHM) && !defined(XFree86Server) + XShmSegmentInfo shminfo; +#endif +}; + + +/** + * Subclass of pipe_winsys for Xlib winsys + */ +struct xmesa_pipe_winsys +{ + struct pipe_winsys base; + struct xmesa_visual *xm_visual; + int shm; +}; + + + +/** Cast wrapper */ +static INLINE struct xm_buffer * +xm_buffer( struct pipe_buffer *buf ) +{ + return (struct xm_buffer *)buf; +} + + +/** + * X Shared Memory Image extension code + */ +#if defined(USE_XSHM) && !defined(XFree86Server) + +#define XSHM_ENABLED(b) ((b)->shm) + +static volatile int mesaXErrorFlag = 0; + +/** + * Catches potential Xlib errors. + */ +static int +mesaHandleXError(XMesaDisplay *dpy, XErrorEvent *event) +{ + (void) dpy; + (void) event; + mesaXErrorFlag = 1; + return 0; +} + + +static GLboolean alloc_shm(struct xm_buffer *buf, unsigned size) +{ + XShmSegmentInfo *const shminfo = & buf->shminfo; + + shminfo->shmid = shmget(IPC_PRIVATE, size, IPC_CREAT|0777); + if (shminfo->shmid < 0) { + return GL_FALSE; + } + + shminfo->shmaddr = (char *) shmat(shminfo->shmid, 0, 0); + if (shminfo->shmaddr == (char *) -1) { + shmctl(shminfo->shmid, IPC_RMID, 0); + return GL_FALSE; + } + + shminfo->readOnly = False; + return GL_TRUE; +} + + +/** + * Allocate a shared memory XImage back buffer for the given XMesaBuffer. + */ +static void +alloc_shm_ximage(struct xm_buffer *b, struct xmesa_buffer *xmb, + unsigned width, unsigned height) +{ + /* + * We have to do a _lot_ of error checking here to be sure we can + * really use the XSHM extension. It seems different servers trigger + * errors at different points if the extension won't work. Therefore + * we have to be very careful... 
+ */ +#if 0 + GC gc; +#endif + int (*old_handler)(XMesaDisplay *, XErrorEvent *); + + b->tempImage = XShmCreateImage(xmb->xm_visual->display, + xmb->xm_visual->visinfo->visual, + xmb->xm_visual->visinfo->depth, + ZPixmap, + NULL, + &b->shminfo, + width, height); + if (b->tempImage == NULL) { + b->shm = 0; + return; + } + + + mesaXErrorFlag = 0; + old_handler = XSetErrorHandler(mesaHandleXError); + /* This may trigger the X protocol error we're ready to catch: */ + XShmAttach(xmb->xm_visual->display, &b->shminfo); + XSync(xmb->xm_visual->display, False); + + if (mesaXErrorFlag) { + /* we are on a remote display, this error is normal, don't print it */ + XFlush(xmb->xm_visual->display); + mesaXErrorFlag = 0; + XDestroyImage(b->tempImage); + b->tempImage = NULL; + b->shm = 0; + (void) XSetErrorHandler(old_handler); + return; + } + + + /* Finally, try an XShmPutImage to be really sure the extension works */ +#if 0 + gc = XCreateGC(xmb->xm_visual->display, xmb->drawable, 0, NULL); + XShmPutImage(xmb->xm_visual->display, xmb->drawable, gc, + b->tempImage, 0, 0, 0, 0, 1, 1 /*one pixel*/, False); + XSync(xmb->xm_visual->display, False); + XFreeGC(xmb->xm_visual->display, gc); + (void) XSetErrorHandler(old_handler); + if (mesaXErrorFlag) { + XFlush(xmb->xm_visual->display); + mesaXErrorFlag = 0; + XDestroyImage(b->tempImage); + b->tempImage = NULL; + b->shm = 0; + return; + } +#endif +} + +#else + +#define XSHM_ENABLED(b) 0 + +static void +alloc_shm_ximage(struct xm_buffer *b, struct xmesa_buffer *xmb, + unsigned width, unsigned height) +{ + b->shm = 0; +} +#endif /* USE_XSHM */ + + + + +/* Most callbacks map direcly onto dri_bufmgr operations: + */ +static void * +xm_buffer_map(struct pipe_winsys *pws, struct pipe_buffer *buf, + unsigned flags) +{ + struct xm_buffer *xm_buf = xm_buffer(buf); + xm_buf->mapped = xm_buf->data; + return xm_buf->mapped; +} + +static void +xm_buffer_unmap(struct pipe_winsys *pws, struct pipe_buffer *buf) +{ + struct xm_buffer *xm_buf = xm_buffer(buf); + xm_buf->mapped = NULL; +} + +static void +xm_buffer_destroy(struct pipe_winsys *pws, + struct pipe_buffer *buf) +{ + struct xm_buffer *oldBuf = xm_buffer(buf); + + if (oldBuf->data) { +#if defined(USE_XSHM) && !defined(XFree86Server) + if (oldBuf->shminfo.shmid >= 0) { + shmdt(oldBuf->shminfo.shmaddr); + shmctl(oldBuf->shminfo.shmid, IPC_RMID, 0); + + oldBuf->shminfo.shmid = -1; + oldBuf->shminfo.shmaddr = (char *) -1; + } + else +#endif + { + if (!oldBuf->userBuffer) { + align_free(oldBuf->data); + } + } + + oldBuf->data = NULL; + } + + free(oldBuf); +} + + +/** + * For Cell. Basically, rearrange the pixels/quads from this layout: + * +--+--+--+--+ + * |p0|p1|p2|p3|.... + * +--+--+--+--+ + * + * to this layout: + * +--+--+ + * |p0|p1|.... + * +--+--+ + * |p2|p3| + * +--+--+ + */ +static void +twiddle_tile(const uint *tileIn, uint *tileOut) +{ + int y, x; + + for (y = 0; y < TILE_SIZE; y+=2) { + for (x = 0; x < TILE_SIZE; x+=2) { + int k = 4 * (y/2 * TILE_SIZE/2 + x/2); + tileOut[y * TILE_SIZE + (x + 0)] = tileIn[k]; + tileOut[y * TILE_SIZE + (x + 1)] = tileIn[k+1]; + tileOut[(y + 1) * TILE_SIZE + (x + 0)] = tileIn[k+2]; + tileOut[(y + 1) * TILE_SIZE + (x + 1)] = tileIn[k+3]; + } + } +} + + + +/** + * Display a surface that's in a tiled configuration. That is, all the + * pixels for a TILE_SIZExTILE_SIZE block are contiguous in memory. 
+ */ +static void +xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf) +{ + XImage *ximage; + struct xm_buffer *xm_buf = xm_buffer(surf->buffer); + const uint tilesPerRow = (surf->width + TILE_SIZE - 1) / TILE_SIZE; + uint x, y; + + if (XSHM_ENABLED(xm_buf) && (xm_buf->tempImage == NULL)) { + alloc_shm_ximage(xm_buf, b, TILE_SIZE, TILE_SIZE); + } + + ximage = (XSHM_ENABLED(xm_buf)) ? xm_buf->tempImage : b->tempImage; + + /* check that the XImage has been previously initialized */ + assert(ximage->format); + assert(ximage->bitmap_unit); + + if (!XSHM_ENABLED(xm_buf)) { + /* update XImage's fields */ + ximage->width = TILE_SIZE; + ximage->height = TILE_SIZE; + ximage->bytes_per_line = TILE_SIZE * 4; + } + + for (y = 0; y < surf->height; y += TILE_SIZE) { + for (x = 0; x < surf->width; x += TILE_SIZE) { + uint tmpTile[TILE_SIZE * TILE_SIZE]; + int tx = x / TILE_SIZE; + int ty = y / TILE_SIZE; + int offset = ty * tilesPerRow + tx; + int w = TILE_SIZE; + int h = TILE_SIZE; + + if (y + h > surf->height) + h = surf->height - y; + if (x + w > surf->width) + w = surf->width - x; + + /* offset in pixels */ + offset *= TILE_SIZE * TILE_SIZE; + + if (0 && XSHM_ENABLED(xm_buf)) { + ximage->data = (char *) xm_buf->data + 4 * offset; + /* make copy of tile data */ + memcpy(tmpTile, (uint *) ximage->data, sizeof(tmpTile)); + /* twiddle from temp to ximage in shared memory */ + twiddle_tile(tmpTile, (uint *) ximage->data); + /* display image in shared memory */ +#if defined(USE_XSHM) && !defined(XFree86Server) + XShmPutImage(b->xm_visual->display, b->drawable, b->gc, + ximage, 0, 0, x, y, w, h, False); +#endif + } + else { + /* twiddle from ximage buffer to temp tile */ + twiddle_tile((uint *) xm_buf->data + offset, tmpTile); + /* display temp tile data */ + ximage->data = (char *) tmpTile; + XPutImage(b->xm_visual->display, b->drawable, b->gc, + ximage, 0, 0, x, y, w, h); + } + } + } +} + + +/** + * Display/copy the image in the surface into the X window specified + * by the XMesaBuffer. + */ +void +xmesa_display_surface(XMesaBuffer b, const struct pipe_surface *surf) +{ + XImage *ximage; + struct xm_buffer *xm_buf = xm_buffer(surf->buffer); + static boolean no_swap = 0; + static boolean firsttime = 1; + static int tileSize = 0; + + if (firsttime) { + no_swap = getenv("SP_NO_RAST") != NULL; +#ifdef GALLIUM_CELL + if (!getenv("GALLIUM_NOCELL")) { + tileSize = 32; /** probably temporary */ + } +#endif + firsttime = 0; + } + + if (no_swap) + return; + + if (tileSize) { + xmesa_display_surface_tiled(b, surf); + return; + } + + if (XSHM_ENABLED(xm_buf) && (xm_buf->tempImage == NULL)) { + assert(surf->block.width == 1); + assert(surf->block.height == 1); + alloc_shm_ximage(xm_buf, b, surf->stride/surf->block.size, surf->height); + } + + ximage = (XSHM_ENABLED(xm_buf)) ? 
xm_buf->tempImage : b->tempImage; + ximage->data = xm_buf->data; + + /* display image in Window */ + if (XSHM_ENABLED(xm_buf)) { +#if defined(USE_XSHM) && !defined(XFree86Server) + XShmPutImage(b->xm_visual->display, b->drawable, b->gc, + ximage, 0, 0, 0, 0, surf->width, surf->height, False); +#endif + } else { + /* check that the XImage has been previously initialized */ + assert(ximage->format); + assert(ximage->bitmap_unit); + + /* update XImage's fields */ + ximage->width = surf->width; + ximage->height = surf->height; + ximage->bytes_per_line = surf->stride; + + XPutImage(b->xm_visual->display, b->drawable, b->gc, + ximage, 0, 0, 0, 0, surf->width, surf->height); + } +} + + +static void +xm_flush_frontbuffer(struct pipe_winsys *pws, + struct pipe_surface *surf, + void *context_private) +{ + /* + * The front color buffer is actually just another XImage buffer. + * This function copies that XImage to the actual X Window. + */ + XMesaContext xmctx = (XMesaContext) context_private; + xmesa_display_surface(xmctx->xm_buffer, surf); +} + + + +static const char * +xm_get_name(struct pipe_winsys *pws) +{ + return "Xlib"; +} + + +static struct pipe_buffer * +xm_buffer_create(struct pipe_winsys *pws, + unsigned alignment, + unsigned usage, + unsigned size) +{ + struct xm_buffer *buffer = CALLOC_STRUCT(xm_buffer); +#if defined(USE_XSHM) && !defined(XFree86Server) + struct xmesa_pipe_winsys *xpws = (struct xmesa_pipe_winsys *) pws; +#endif + + buffer->base.refcount = 1; + buffer->base.alignment = alignment; + buffer->base.usage = usage; + buffer->base.size = size; + + +#if defined(USE_XSHM) && !defined(XFree86Server) + buffer->shminfo.shmid = -1; + buffer->shminfo.shmaddr = (char *) -1; + + if (xpws->shm && (usage & PIPE_BUFFER_USAGE_PIXEL) != 0) { + buffer->shm = xpws->shm; + + if (alloc_shm(buffer, size)) { + buffer->data = buffer->shminfo.shmaddr; + } + } +#endif + + if (buffer->data == NULL) { + buffer->shm = 0; + + /* align to 16-byte multiple for Cell */ + buffer->data = align_malloc(size, max(alignment, 16)); + } + + return &buffer->base; +} + + +/** + * Create buffer which wraps user-space data. + */ +static struct pipe_buffer * +xm_user_buffer_create(struct pipe_winsys *pws, void *ptr, unsigned bytes) +{ + struct xm_buffer *buffer = CALLOC_STRUCT(xm_buffer); + buffer->base.refcount = 1; + buffer->base.size = bytes; + buffer->userBuffer = TRUE; + buffer->data = ptr; + buffer->shm = 0; + + return &buffer->base; +} + + + +/** + * Round n up to next multiple. 
+ */ +static INLINE unsigned +round_up(unsigned n, unsigned multiple) +{ + return (n + multiple - 1) & ~(multiple - 1); +} + +static int +xm_surface_alloc_storage(struct pipe_winsys *winsys, + struct pipe_surface *surf, + unsigned width, unsigned height, + enum pipe_format format, + unsigned flags, + unsigned tex_usage) +{ + const unsigned alignment = 64; + + surf->width = width; + surf->height = height; + surf->format = format; + pf_get_block(format, &surf->block); + surf->nblocksx = pf_get_nblocksx(&surf->block, width); + surf->nblocksy = pf_get_nblocksy(&surf->block, height); + surf->stride = round_up(surf->nblocksx * surf->block.size, alignment); + surf->usage = flags; + + assert(!surf->buffer); + surf->buffer = winsys->buffer_create(winsys, alignment, + PIPE_BUFFER_USAGE_PIXEL, +#ifdef GALLIUM_CELL /* XXX a bit of a hack */ + surf->stride * round_up(surf->nblocksy, TILE_SIZE)); +#else + surf->stride * surf->nblocksy); +#endif + + if(!surf->buffer) + return -1; + + return 0; +} + + +/** + * Called via winsys->surface_alloc() to create new surfaces. + */ +static struct pipe_surface * +xm_surface_alloc(struct pipe_winsys *ws) +{ + struct pipe_surface *surface = CALLOC_STRUCT(pipe_surface); + + assert(ws); + + surface->refcount = 1; + surface->winsys = ws; + + return surface; +} + + + +static void +xm_surface_release(struct pipe_winsys *winsys, struct pipe_surface **s) +{ + struct pipe_surface *surf = *s; + assert(!surf->texture); + surf->refcount--; + if (surf->refcount == 0) { + if (surf->buffer) + winsys_buffer_reference(winsys, &surf->buffer, NULL); + free(surf); + } + *s = NULL; +} + + +/* + * Fence functions - basically nothing to do, as we don't create any actual + * fence objects. + */ + +static void +xm_fence_reference(struct pipe_winsys *sws, struct pipe_fence_handle **ptr, + struct pipe_fence_handle *fence) +{ +} + + +static int +xm_fence_signalled(struct pipe_winsys *sws, struct pipe_fence_handle *fence, + unsigned flag) +{ + return 0; +} + + +static int +xm_fence_finish(struct pipe_winsys *sws, struct pipe_fence_handle *fence, + unsigned flag) +{ + return 0; +} + + +/** + * Return pointer to a pipe_winsys object. + * For Xlib, this is a singleton object. + * Nothing special for the Xlib driver so no subclassing or anything. + */ +struct pipe_winsys * +xmesa_get_pipe_winsys_aub(struct xmesa_visual *xm_vis) +{ + static struct xmesa_pipe_winsys *ws = NULL; + + if (!ws) { + ws = (struct xmesa_pipe_winsys *) xmesa_create_pipe_winsys_aub(); + } + return &ws->base; +} + + +static struct pipe_winsys * +xmesa_get_pipe_winsys(struct xmesa_visual *xm_vis) +{ + static struct xmesa_pipe_winsys *ws = NULL; + + if (!ws) { + ws = CALLOC_STRUCT(xmesa_pipe_winsys); + + ws->xm_visual = xm_vis; + ws->shm = xmesa_check_for_xshm(xm_vis->display); + + /* Fill in this struct with callbacks that pipe will need to + * communicate with the window system, buffer manager, etc. 
+ */ + ws->base.buffer_create = xm_buffer_create; + ws->base.user_buffer_create = xm_user_buffer_create; + ws->base.buffer_map = xm_buffer_map; + ws->base.buffer_unmap = xm_buffer_unmap; + ws->base.buffer_destroy = xm_buffer_destroy; + + ws->base.surface_alloc = xm_surface_alloc; + ws->base.surface_alloc_storage = xm_surface_alloc_storage; + ws->base.surface_release = xm_surface_release; + + ws->base.fence_reference = xm_fence_reference; + ws->base.fence_signalled = xm_fence_signalled; + ws->base.fence_finish = xm_fence_finish; + + ws->base.flush_frontbuffer = xm_flush_frontbuffer; + ws->base.get_name = xm_get_name; + } + + return &ws->base; +} + + +struct pipe_context * +xmesa_create_pipe_context(XMesaContext xmesa, uint pixelformat) +{ + struct pipe_winsys *pws; + struct pipe_context *pipe; + + if (getenv("XM_AUB")) { + pws = xmesa_get_pipe_winsys_aub(xmesa->xm_visual); + } + else { + pws = xmesa_get_pipe_winsys(xmesa->xm_visual); + } + +#ifdef GALLIUM_CELL + if (!getenv("GALLIUM_NOCELL")) { + struct cell_winsys *cws = cell_get_winsys(pixelformat); + struct pipe_screen *screen = cell_create_screen(pws); + + pipe = cell_create_context(screen, cws); + } + else +#endif + { + struct pipe_screen *screen = softpipe_create_screen(pws); + + pipe = softpipe_create(screen, pws, NULL); + +#ifdef GALLIUM_TRACE + screen = trace_screen_create(screen); + + pipe = trace_context_create(screen, pipe); +#endif + } + + if (pipe) + pipe->priv = xmesa; + + return pipe; +} diff --git a/src/gallium/winsys/xlib/xm_winsys_aub.c b/src/gallium/winsys/xlib/xm_winsys_aub.c new file mode 100644 index 0000000000..b7c10b6bca --- /dev/null +++ b/src/gallium/winsys/xlib/xm_winsys_aub.c @@ -0,0 +1,586 @@ +/************************************************************************** + * + * Copyright 2007 Tungsten Graphics, Inc., Bismarck, ND., USA + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * + **************************************************************************/ + +/* + * Authors: + * Keith Whitwell + * Brian Paul + */ + + +#include "glxheader.h" +#include "xmesaP.h" + +#include "pipe/p_winsys.h" +#include "pipe/p_inlines.h" +#include "util/u_math.h" +#include "util/u_memory.h" +#include "i965simple/brw_winsys.h" +#include "i965simple/brw_screen.h" +#include "brw_aub.h" +#include "xm_winsys_aub.h" + + + +struct aub_buffer { + char *data; + unsigned offset; + unsigned size; + unsigned refcount; + unsigned map_count; + boolean dump_on_unmap; +}; + + + +struct aub_pipe_winsys { + struct pipe_winsys winsys; + + struct brw_aubfile *aubfile; + + /* This is simple, isn't it: + */ + char *pool; + unsigned size; + unsigned used; +}; + + +/* Turn a pipe winsys into an aub/pipe winsys: + */ +static inline struct aub_pipe_winsys * +aub_pipe_winsys( struct pipe_winsys *winsys ) +{ + return (struct aub_pipe_winsys *)winsys; +} + + + +static INLINE struct aub_buffer * +aub_bo( struct pipe_buffer *bo ) +{ + return (struct aub_buffer *)bo; +} + +static INLINE struct pipe_buffer * +pipe_bo( struct aub_buffer *bo ) +{ + return (struct pipe_buffer *)bo; +} + + + + +static void *aub_buffer_map(struct pipe_winsys *winsys, + struct pipe_buffer *buf, + unsigned flags ) +{ + struct aub_buffer *sbo = aub_bo(buf); + + assert(sbo->data); + + if (flags & PIPE_BUFFER_USAGE_CPU_WRITE) + sbo->dump_on_unmap = 1; + + sbo->map_count++; + return sbo->data; +} + +static void aub_buffer_unmap(struct pipe_winsys *winsys, + struct pipe_buffer *buf) +{ + struct aub_pipe_winsys *iws = aub_pipe_winsys(winsys); + struct aub_buffer *sbo = aub_bo(buf); + + sbo->map_count--; + + if (sbo->map_count == 0 && + sbo->dump_on_unmap) { + + sbo->dump_on_unmap = 0; + + brw_aub_gtt_data( iws->aubfile, + sbo->offset, + sbo->data, + sbo->size, + 0, + 0); + } +} + + +static void +aub_buffer_destroy(struct pipe_winsys *winsys, + struct pipe_buffer *buf) +{ + free(buf); +} + + +void xmesa_buffer_subdata_aub(struct pipe_winsys *winsys, + struct pipe_buffer *buf, + unsigned long offset, + unsigned long size, + const void *data, + unsigned aub_type, + unsigned aub_sub_type) +{ + struct aub_pipe_winsys *iws = aub_pipe_winsys(winsys); + struct aub_buffer *sbo = aub_bo(buf); + + assert(sbo->size > offset + size); + memcpy(sbo->data + offset, data, size); + + brw_aub_gtt_data( iws->aubfile, + sbo->offset + offset, + sbo->data + offset, + size, + aub_type, + aub_sub_type ); +} + +void xmesa_commands_aub(struct pipe_winsys *winsys, + unsigned *cmds, + unsigned nr_dwords) +{ + struct aub_pipe_winsys *iws = aub_pipe_winsys(winsys); + unsigned size = nr_dwords * 4; + + assert(iws->used + size < iws->size); + + brw_aub_gtt_cmds( iws->aubfile, + AUB_BUF_START + iws->used, + cmds, + nr_dwords * sizeof(int) ); + + iws->used += align(size, 4096); +} + + +static struct aub_pipe_winsys *global_winsys = NULL; + +void xmesa_display_aub( /* struct pipe_winsys *winsys, */ + struct pipe_surface *surface ) +{ +// struct aub_pipe_winsys *iws = aub_pipe_winsys(winsys); + brw_aub_dump_bmp( global_winsys->aubfile, + surface, + aub_bo(surface->buffer)->offset ); +} + + + +/* Pipe has no concept of pools. We choose the tex/region pool + * for all buffers. 
+ */ +static struct pipe_buffer * +aub_buffer_create(struct pipe_winsys *winsys, + unsigned alignment, + unsigned usage, + unsigned size) +{ + struct aub_pipe_winsys *iws = aub_pipe_winsys(winsys); + struct aub_buffer *sbo = CALLOC_STRUCT(aub_buffer); + + sbo->refcount = 1; + + /* Could reuse buffers that are not referenced in current + * batchbuffer. Can't do that atm, so always reallocate: + */ + assert(iws->used + size < iws->size); + sbo->data = iws->pool + iws->used; + sbo->offset = AUB_BUF_START + iws->used; + iws->used += align(size, 4096); + + sbo->size = size; + + return pipe_bo(sbo); +} + + +static struct pipe_buffer * +aub_user_buffer_create(struct pipe_winsys *winsys, void *ptr, unsigned bytes) +{ + struct aub_buffer *sbo; + + /* Lets hope this is meant for upload, not as a result! + */ + sbo = aub_bo(aub_buffer_create( winsys, 0, 0, 0 )); + + sbo->data = ptr; + sbo->size = bytes; + + return pipe_bo(sbo); +} + + +/* The state tracker (should!) keep track of whether the fake + * frontbuffer has been touched by any rendering since the last time + * we copied its contents to the real frontbuffer. Our task is easy: + */ +static void +aub_flush_frontbuffer( struct pipe_winsys *winsys, + struct pipe_surface *surf, + void *context_private) +{ + xmesa_display_aub( surf ); +} + +static struct pipe_surface * +aub_i915_surface_alloc(struct pipe_winsys *winsys) +{ + struct pipe_surface *surf = CALLOC_STRUCT(pipe_surface); + if (surf) { + surf->refcount = 1; + surf->winsys = winsys; + } + return surf; +} + + +/** + * Round n up to next multiple. + */ +static INLINE unsigned +round_up(unsigned n, unsigned multiple) +{ + return (n + multiple - 1) & ~(multiple - 1); +} + +static int +aub_i915_surface_alloc_storage(struct pipe_winsys *winsys, + struct pipe_surface *surf, + unsigned width, unsigned height, + enum pipe_format format, + unsigned flags, + unsigned tex_usage) +{ + const unsigned alignment = 64; + + surf->width = width; + surf->height = height; + surf->format = format; + pf_get_block(format, &surf->block); + surf->nblocksx = pf_get_nblocksx(&surf->block, width); + surf->nblocksy = pf_get_nblocksy(&surf->block, height); + surf->stride = round_up(surf->nblocksx * surf->block.size, alignment); + surf->usage = flags; + + assert(!surf->buffer); + surf->buffer = winsys->buffer_create(winsys, alignment, + PIPE_BUFFER_USAGE_PIXEL, + surf->stride * surf->nblocksy); + if(!surf->buffer) + return -1; + + return 0; +} + +static void +aub_i915_surface_release(struct pipe_winsys *winsys, struct pipe_surface **s) +{ + struct pipe_surface *surf = *s; + surf->refcount--; + if (surf->refcount == 0) { + if (surf->buffer) + winsys_buffer_reference(winsys, &surf->buffer, NULL); + free(surf); + } + *s = NULL; +} + + + +static const char * +aub_get_name( struct pipe_winsys *winsys ) +{ + return "Aub/xlib"; +} + +struct pipe_winsys * +xmesa_create_pipe_winsys_aub( void ) +{ + struct aub_pipe_winsys *iws = CALLOC_STRUCT( aub_pipe_winsys ); + + /* Fill in this struct with callbacks that pipe will need to + * communicate with the window system, buffer manager, etc. + * + * Pipe would be happy with a malloc based memory manager, but + * the SwapBuffers implementation in this winsys driver requires + * that rendering be done to an appropriate _DriBufferObject. 
+ */ + iws->winsys.buffer_create = aub_buffer_create; + iws->winsys.user_buffer_create = aub_user_buffer_create; + iws->winsys.buffer_map = aub_buffer_map; + iws->winsys.buffer_unmap = aub_buffer_unmap; + iws->winsys.buffer_destroy = aub_buffer_destroy; + iws->winsys.flush_frontbuffer = aub_flush_frontbuffer; + iws->winsys.get_name = aub_get_name; + + iws->winsys.surface_alloc = aub_i915_surface_alloc; + iws->winsys.surface_alloc_storage = aub_i915_surface_alloc_storage; + iws->winsys.surface_release = aub_i915_surface_release; + + iws->aubfile = brw_aubfile_create(); + iws->size = AUB_BUF_SIZE; + iws->pool = malloc(AUB_BUF_SIZE); + + /* HACK: static copy of this pointer: + */ + assert(global_winsys == NULL); + global_winsys = iws; + + return &iws->winsys; +} + + +void +xmesa_destroy_pipe_winsys_aub( struct pipe_winsys *winsys ) + +{ + struct aub_pipe_winsys *iws = aub_pipe_winsys(winsys); + brw_aub_destroy(iws->aubfile); + free(iws->pool); + free(iws); +} + + + + + + + +#define IWS_BATCHBUFFER_SIZE 1024 + +struct aub_brw_winsys { + struct brw_winsys winsys; /**< batch buffer funcs */ + struct aub_context *aub; + + struct pipe_winsys *pipe_winsys; + + unsigned batch_data[IWS_BATCHBUFFER_SIZE]; + unsigned batch_nr; + unsigned batch_size; + unsigned batch_alloc; +}; + + +/* Turn a i965simple winsys into an aub/i965simple winsys: + */ +static inline struct aub_brw_winsys * +aub_brw_winsys( struct brw_winsys *sws ) +{ + return (struct aub_brw_winsys *)sws; +} + + +/* Simple batchbuffer interface: + */ + +static unsigned *aub_i965_batch_start( struct brw_winsys *sws, + unsigned dwords, + unsigned relocs ) +{ + struct aub_brw_winsys *iws = aub_brw_winsys(sws); + + if (iws->batch_size < iws->batch_nr + dwords) + return NULL; + + iws->batch_alloc = iws->batch_nr + dwords; + return (void *)1; /* not a valid pointer! 
*/ +} + +static void aub_i965_batch_dword( struct brw_winsys *sws, + unsigned dword ) +{ + struct aub_brw_winsys *iws = aub_brw_winsys(sws); + + assert(iws->batch_nr < iws->batch_alloc); + iws->batch_data[iws->batch_nr++] = dword; +} + +static void aub_i965_batch_reloc( struct brw_winsys *sws, + struct pipe_buffer *buf, + unsigned access_flags, + unsigned delta ) +{ + struct aub_brw_winsys *iws = aub_brw_winsys(sws); + + assert(iws->batch_nr < iws->batch_alloc); + iws->batch_data[iws->batch_nr++] = aub_bo(buf)->offset + delta; +} + +static unsigned aub_i965_get_buffer_offset( struct brw_winsys *sws, + struct pipe_buffer *buf, + unsigned access_flags ) +{ + return aub_bo(buf)->offset; +} + +static void aub_i965_batch_end( struct brw_winsys *sws ) +{ + struct aub_brw_winsys *iws = aub_brw_winsys(sws); + + assert(iws->batch_nr <= iws->batch_alloc); + iws->batch_alloc = 0; +} + +static void aub_i965_batch_flush( struct brw_winsys *sws, + struct pipe_fence_handle **fence ) +{ + struct aub_brw_winsys *iws = aub_brw_winsys(sws); + assert(iws->batch_nr <= iws->batch_size); + + if (iws->batch_nr) { + xmesa_commands_aub( iws->pipe_winsys, + iws->batch_data, + iws->batch_nr ); + } + + iws->batch_nr = 0; +} + + + +static void aub_i965_buffer_subdata_typed(struct brw_winsys *winsys, + struct pipe_buffer *buf, + unsigned long offset, + unsigned long size, + const void *data, + unsigned data_type) +{ + struct aub_brw_winsys *iws = aub_brw_winsys(winsys); + unsigned aub_type = DW_GENERAL_STATE; + unsigned aub_sub_type; + + switch (data_type) { + case BRW_CC_VP: + aub_sub_type = DWGS_COLOR_CALC_VIEWPORT_STATE; + break; + case BRW_CC_UNIT: + aub_sub_type = DWGS_COLOR_CALC_STATE; + break; + case BRW_WM_PROG: + aub_sub_type = DWGS_KERNEL_INSTRUCTIONS; + break; + case BRW_SAMPLER_DEFAULT_COLOR: + aub_sub_type = DWGS_SAMPLER_DEFAULT_COLOR; + break; + case BRW_SAMPLER: + aub_sub_type = DWGS_SAMPLER_STATE; + break; + case BRW_WM_UNIT: + aub_sub_type = DWGS_WINDOWER_IZ_STATE; + break; + case BRW_SF_PROG: + aub_sub_type = DWGS_KERNEL_INSTRUCTIONS; + break; + case BRW_SF_VP: + aub_sub_type = DWGS_STRIPS_FANS_VIEWPORT_STATE; + break; + case BRW_SF_UNIT: + aub_sub_type = DWGS_STRIPS_FANS_STATE; + break; + case BRW_VS_UNIT: + aub_sub_type = DWGS_VERTEX_SHADER_STATE; + break; + case BRW_VS_PROG: + aub_sub_type = DWGS_KERNEL_INSTRUCTIONS; + break; + case BRW_GS_UNIT: + aub_sub_type = DWGS_GEOMETRY_SHADER_STATE; + break; + case BRW_GS_PROG: + aub_sub_type = DWGS_KERNEL_INSTRUCTIONS; + break; + case BRW_CLIP_VP: + aub_sub_type = DWGS_CLIPPER_VIEWPORT_STATE; + break; + case BRW_CLIP_UNIT: + aub_sub_type = DWGS_CLIPPER_STATE; + break; + case BRW_CLIP_PROG: + aub_sub_type = DWGS_KERNEL_INSTRUCTIONS; + break; + case BRW_SS_SURFACE: + aub_type = DW_SURFACE_STATE; + aub_sub_type = DWSS_SURFACE_STATE; + break; + case BRW_SS_SURF_BIND: + aub_type = DW_SURFACE_STATE; + aub_sub_type = DWSS_BINDING_TABLE_STATE; + break; + case BRW_CONSTANT_BUFFER: + aub_type = DW_CONSTANT_URB_ENTRY; + aub_sub_type = 0; + break; + + default: + assert(0); + break; + } + + xmesa_buffer_subdata_aub( iws->pipe_winsys, + buf, + offset, + size, + data, + aub_type, + aub_sub_type ); +} + +/** + * Create i965 hardware rendering context. 
+ */ +struct pipe_context * +xmesa_create_i965simple( struct pipe_winsys *winsys ) +{ + struct aub_brw_winsys *iws = CALLOC_STRUCT( aub_brw_winsys ); + struct pipe_screen *screen = brw_create_screen(winsys, 0/* XXX pci_id */); + + /* Fill in this struct with callbacks that i965simple will need to + * communicate with the window system, buffer manager, etc. + */ + iws->winsys.batch_start = aub_i965_batch_start; + iws->winsys.batch_dword = aub_i965_batch_dword; + iws->winsys.batch_reloc = aub_i965_batch_reloc; + iws->winsys.batch_end = aub_i965_batch_end; + iws->winsys.batch_flush = aub_i965_batch_flush; + iws->winsys.buffer_subdata_typed = aub_i965_buffer_subdata_typed; + iws->winsys.get_buffer_offset = aub_i965_get_buffer_offset; + + iws->pipe_winsys = winsys; + + iws->batch_size = IWS_BATCHBUFFER_SIZE; + + /* Create the i965simple context: + */ + return brw_create( screen, + &iws->winsys, + 0 ); +} diff --git a/src/gallium/winsys/xlib/xm_winsys_aub.h b/src/gallium/winsys/xlib/xm_winsys_aub.h new file mode 100644 index 0000000000..cc2a755277 --- /dev/null +++ b/src/gallium/winsys/xlib/xm_winsys_aub.h @@ -0,0 +1,68 @@ +/************************************************************************** + * + * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#ifndef AUB_WINSYS_H +#define AUB_WINSYS_H + +struct pipe_context; +struct pipe_winsys; +struct pipe_buffer; +struct pipe_surface; + +struct pipe_winsys * +xmesa_create_pipe_winsys_aub( void ); + +void +xmesa_destroy_pipe_winsys_aub( struct pipe_winsys *winsys ); + + + +struct pipe_context * +xmesa_create_i965simple( struct pipe_winsys *winsys ); + + + +void xmesa_buffer_subdata_aub(struct pipe_winsys *winsys, + struct pipe_buffer *buf, + unsigned long offset, + unsigned long size, + const void *data, + unsigned aub_type, + unsigned aub_sub_type); + +void xmesa_commands_aub(struct pipe_winsys *winsys, + unsigned *cmds, + unsigned nr_dwords); + + +void xmesa_display_aub( /* struct pipe_winsys *winsys, */ + struct pipe_surface *surface ); + +extern struct pipe_winsys * +xmesa_get_pipe_winsys_aub(struct xmesa_visual *xm_vis); + +#endif diff --git a/src/gallium/winsys/xlib/xmesaP.h b/src/gallium/winsys/xlib/xmesaP.h new file mode 100644 index 0000000000..fcaeee52bc --- /dev/null +++ b/src/gallium/winsys/xlib/xmesaP.h @@ -0,0 +1,180 @@ +/* + * Mesa 3-D graphics library + * Version: 7.1 + * + * Copyright (C) 1999-2007 Brian Paul All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + +#ifndef XMESAP_H +#define XMESAP_H + + +#include "GL/xmesa.h" +#include "mtypes.h" +#ifdef XFree86Server +#include "xm_image.h" +#endif + +#include "state_tracker/st_context.h" +#include "state_tracker/st_public.h" +#include "pipe/p_thread.h" + + +extern pipe_mutex _xmesa_lock; + +extern XMesaBuffer XMesaBufferList; + +/* + */ +#define XMESA_SOFTPIPE 1 +#define XMESA_AUB 2 +extern int xmesa_mode; + + +/** + * Visual inforation, derived from GLvisual. + * Basically corresponds to an XVisualInfo. + */ +struct xmesa_visual { + GLvisual mesa_visual; /* Device independent visual parameters */ + XMesaDisplay *display; /* The X11 display */ +#ifdef XFree86Server + GLint ColormapEntries; + GLint nplanes; +#else + XMesaVisualInfo visinfo; /* X's visual info (pointer to private copy) */ + XVisualInfo *vishandle; /* Only used in fakeglx.c */ +#endif + GLint BitsPerPixel; /* True bits per pixel for XImages */ + + GLboolean ximage_flag; /* Use XImage for back buffer (not pixmap)? */ +}; + + +/** + * Context info, derived from st_context. + * Basically corresponds to a GLXContext. 
+ */
+struct xmesa_context {
+   struct st_context *st;
+   XMesaVisual xm_visual;      /** pixel format info */
+   XMesaBuffer xm_buffer;      /** current drawbuffer */
+};
+
+
+/**
+ * Types of X/GLX drawables we might render into.
+ */
+typedef enum {
+   WINDOW,          /* An X window */
+   GLXWINDOW,       /* GLX window */
+   PIXMAP,          /* GLX pixmap */
+   PBUFFER          /* GLX Pbuffer */
+} BufferType;
+
+
+/**
+ * Framebuffer information, derived from st_framebuffer.
+ * Basically corresponds to a GLXDrawable.
+ */
+struct xmesa_buffer {
+   struct st_framebuffer *stfb;
+
+   GLboolean wasCurrent;        /* was ever the current buffer? */
+   XMesaVisual xm_visual;       /* the X/Mesa visual */
+   XMesaDrawable drawable;      /* Usually the X window ID */
+   XMesaColormap cmap;          /* the X colormap */
+   BufferType type;             /* window, pixmap, pbuffer or glxwindow */
+
+   XMesaImage *tempImage;
+   unsigned long selectedEvents;/* for pbuffers only */
+
+   GLuint shm;                  /* X Shared Memory extension status:   */
+                                /*    0 = not available                */
+                                /*    1 = XImage support available     */
+                                /*    2 = Pixmap support available too */
+#if defined(USE_XSHM) && !defined(XFree86Server)
+   XShmSegmentInfo shminfo;
+#endif
+
+   XMesaGC gc;                  /* scratch GC for span, line, tri drawing */
+
+   /* GLX_EXT_texture_from_pixmap */
+   GLint TextureTarget;         /** GLX_TEXTURE_1D_EXT, for example */
+   GLint TextureFormat;         /** GLX_TEXTURE_FORMAT_RGB_EXT, for example */
+   GLint TextureMipmap;         /** 0 or 1 */
+
+   struct xmesa_buffer *Next;   /* Linked list pointer: */
+};
+
+
+
+/** cast wrapper */
+static INLINE XMesaContext
+xmesa_context(GLcontext *ctx)
+{
+   return (XMesaContext) ctx->DriverCtx;
+}
+
+
+/** cast wrapper */
+static INLINE XMesaBuffer
+xmesa_buffer(GLframebuffer *fb)
+{
+   struct st_framebuffer *stfb = (struct st_framebuffer *) fb;
+   return (XMesaBuffer) st_framebuffer_private(stfb);
+}
+
+
+extern void
+xmesa_delete_framebuffer(struct gl_framebuffer *fb);
+
+extern XMesaBuffer
+xmesa_find_buffer(XMesaDisplay *dpy, XMesaColormap cmap, XMesaBuffer notThis);
+
+extern void
+xmesa_check_and_update_buffer_size(XMesaContext xmctx, XMesaBuffer drawBuffer);
+
+extern void
+xmesa_destroy_buffers_on_display(XMesaDisplay *dpy);
+
+extern struct pipe_context *
+xmesa_create_pipe_context(XMesaContext xm, uint pixelformat);
+
+static INLINE GLuint
+xmesa_buffer_width(XMesaBuffer b)
+{
+   return b->stfb->Base.Width;
+}
+
+static INLINE GLuint
+xmesa_buffer_height(XMesaBuffer b)
+{
+   return b->stfb->Base.Height;
+}
+
+extern void
+xmesa_display_surface(XMesaBuffer b, const struct pipe_surface *surf);
+
+extern int
+xmesa_check_for_xshm(XMesaDisplay *display);
+
+#endif
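
The xmesa.c entry points added above are meant to be driven by a GLX-style frontend. The sketch below is not part of the patch; it only illustrates the expected call sequence for the window-buffer, make-current and swap paths, assuming an XMesaVisual and XMesaContext already created by the earlier parts of xmesa.c. The function name example_draw_frame is hypothetical.

static void
example_draw_frame(XMesaVisual v, XMesaContext c, XMesaWindow win)
{
   /* wrap the X window; fails if the window depth does not match the visual */
   XMesaBuffer b = XMesaCreateWindowBuffer(v, win);
   if (!b)
      return;

   /* binds the draw/read framebuffers and ends up in st_make_current() */
   if (XMesaMakeCurrent(c, b)) {
      /* ... ordinary GL rendering goes here ... */

      /* copies ST_SURFACE_BACK_LEFT to the window via XPutImage/XShmPutImage
       * (or to the AUB dump when running in XM_AUB mode) */
      XMesaSwapBuffers(b);
   }

   XMesaDestroyBuffer(b);
}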
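
xmesa_display_surface_tiled() and twiddle_tile() in xm_winsys.c assume the Cell layout: the surface is an array of TILE_SIZE x TILE_SIZE tiles stored row-major, and inside each tile the pixels are grouped into 2x2 quads of four consecutive values. The helper below is hypothetical (not in the patch) and only spells out the addressing those two functions imply; TILE_SIZE and tilesPerRow are assumed to match the definitions used in that file.

static unsigned
tiled_pixel_index(unsigned x, unsigned y, unsigned tilesPerRow)
{
   const unsigned tx = x / TILE_SIZE, ty = y / TILE_SIZE;  /* which tile */
   const unsigned ix = x % TILE_SIZE, iy = y % TILE_SIZE;  /* position inside it */
   /* tiles are laid out row-major, TILE_SIZE*TILE_SIZE pixels each */
   const unsigned tile_base = (ty * tilesPerRow + tx) * TILE_SIZE * TILE_SIZE;
   /* inside a tile, 2x2 quads are row-major and each quad stores its four
    * pixels consecutively; this is exactly the ordering twiddle_tile() undoes */
   const unsigned quad = (iy / 2) * (TILE_SIZE / 2) + (ix / 2);
   return tile_base + 4 * quad + 2 * (iy & 1) + (ix & 1);
}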
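
The pipe_winsys vtable filled in by xmesa_get_pipe_winsys() is how softpipe allocates and touches memory. A minimal sketch of the buffer life cycle it implements follows; it is not part of the patch, example_fill_buffer is a hypothetical name, and <string.h> is assumed for memset().

static void
example_fill_buffer(struct pipe_winsys *ws, unsigned size)
{
   struct pipe_buffer *buf;
   void *map;

   /* xm_buffer_create(): uses XSHM for PIPE_BUFFER_USAGE_PIXEL buffers when
    * available, otherwise falls back to align_malloc() */
   buf = ws->buffer_create(ws, 64, PIPE_BUFFER_USAGE_PIXEL, size);
   if (!buf)
      return;

   /* xm_buffer_map() simply returns the shm/malloc pointer */
   map = ws->buffer_map(ws, buf, PIPE_BUFFER_USAGE_CPU_WRITE);
   if (map)
      memset(map, 0, size);
   ws->buffer_unmap(ws, buf);

   /* dropping the last reference ends up in xm_buffer_destroy() */
   winsys_buffer_reference(ws, &buf, NULL);
}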
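
On the AUB side, i965simple talks to the file writer through the brw_winsys batch callbacks installed in xmesa_create_i965simple(). The sketch below is illustrative only (example_emit_noops is hypothetical and the dword values are placeholders); it shows the intended start/emit/flush pattern, where relocations are resolved immediately against aub_buffer offsets and batch_flush() hands the accumulated dwords to xmesa_commands_aub().

static void
example_emit_noops(struct brw_winsys *sws, struct pipe_buffer *state_buf)
{
   /* batch_start() returns NULL when the requested dwords would not fit */
   if (!sws->batch_start(sws, 3, 1))
      return;

   sws->batch_dword(sws, 0);                /* 0 used here as a placeholder */
   sws->batch_dword(sws, 0);
   sws->batch_reloc(sws, state_buf, 0, 0);  /* emits the buffer's offset + 0 */
   sws->batch_end(sws);

   /* writes the accumulated dwords to the .aub file via xmesa_commands_aub() */
   sws->batch_flush(sws, NULL);
}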