54 files changed, 4511 insertions, 261 deletions
diff --git a/src/gallium/auxiliary/cso_cache/Makefile b/src/gallium/auxiliary/cso_cache/Makefile
index 3e49266163..6bd6602088 100644
--- a/src/gallium/auxiliary/cso_cache/Makefile
+++ b/src/gallium/auxiliary/cso_cache/Makefile
@@ -4,6 +4,7 @@ include $(TOP)/configs/current
 LIBNAME = cso_cache
 
 C_SOURCES = \
+	cso_context.c \
 	cso_cache.c \
 	cso_hash.c
 
diff --git a/src/gallium/auxiliary/cso_cache/SConscript b/src/gallium/auxiliary/cso_cache/SConscript
index 9751881613..651e68a191 100644
--- a/src/gallium/auxiliary/cso_cache/SConscript
+++ b/src/gallium/auxiliary/cso_cache/SConscript
@@ -3,6 +3,7 @@ Import('*')
 cso_cache = env.ConvenienceLibrary(
 	target = 'cso_cache',
 	source = [
+		'cso_context.c',
 		'cso_cache.c',
 		'cso_hash.c',
 	])
diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.c b/src/gallium/auxiliary/cso_cache/cso_cache.c
index b427b509f8..a2764b4265 100644
--- a/src/gallium/auxiliary/cso_cache/cso_cache.c
+++ b/src/gallium/auxiliary/cso_cache/cso_cache.c
@@ -207,8 +207,11 @@ static INLINE void sanitize_hash(struct cso_hash *hash, enum cso_cache_type type
 {
    /* if we're approach the maximum size, remove fourth of the entries
     * otherwise every subsequent call will go through the same */
-   int max_entries = (max_size > cso_hash_size(hash)) ? max_size : cso_hash_size(hash);
+   int hash_size = cso_hash_size(hash);
+   int max_entries = (max_size > hash_size) ? max_size : hash_size;
    int to_remove =  (max_size < max_entries) * max_entries/4;
+   if (hash_size > max_size)
+      to_remove += hash_size - max_size;
    while (to_remove) {
       /*remove elements until we're good */
       /*fixme: currently we pick the nodes to remove at random*/
diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.h b/src/gallium/auxiliary/cso_cache/cso_cache.h
index 44ee128a4a..e5edbbb556 100644
--- a/src/gallium/auxiliary/cso_cache/cso_cache.h
+++ b/src/gallium/auxiliary/cso_cache/cso_cache.h
@@ -84,47 +84,49 @@
 extern "C" {
 #endif
 
+typedef void (*cso_state_callback)(void *ctx, void *obj);
+
 struct cso_cache;
 
 struct cso_blend {
    struct pipe_blend_state state;
    void *data;
-   void (*delete_state)(struct pipe_context *, void *);
+   cso_state_callback delete_state;
    struct pipe_context *context;
 };
 
 struct cso_depth_stencil_alpha {
    struct pipe_depth_stencil_alpha_state state;
    void *data;
-   void (*delete_state)(struct pipe_context *, void *);
+   cso_state_callback delete_state;
    struct pipe_context *context;
 };
 
 struct cso_rasterizer {
    struct pipe_rasterizer_state state;
    void *data;
-   void (*delete_state)(struct pipe_context *, void *);
+   cso_state_callback delete_state;
    struct pipe_context *context;
 };
 
 struct cso_fragment_shader {
    struct pipe_shader_state state;
    void *data;
-   void (*delete_state)(struct pipe_context *, void *);
+   cso_state_callback delete_state;
    struct pipe_context *context;
 };
 
 struct cso_vertex_shader {
    struct pipe_shader_state state;
    void *data;
-   void (*delete_state)(struct pipe_context *, void *);
+   cso_state_callback delete_state;
    struct pipe_context *context;
 };
 
 struct cso_sampler {
    struct pipe_sampler_state state;
    void *data;
-   void (*delete_state)(struct pipe_context *, void *);
+   cso_state_callback delete_state;
    struct pipe_context *context;
 };
 
@@ -138,8 +140,6 @@ enum cso_cache_type {
    CSO_VERTEX_SHADER
 };
 
-typedef void (*cso_state_callback)(void *, void *);
-
 unsigned cso_construct_key(void *item, int item_size);
 
 struct cso_cache *cso_cache_create(void);
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
new file mode 100644
index 0000000000..f7f4aebb16
--- /dev/null
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -0,0 +1,354 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /* Wrap the cso cache & hash mechanisms in a simplified
+  * pipe-driver-specific interface.
+  *
+  * Authors:
+  *   Zack Rusin <zack@tungstengraphics.com>
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "pipe/p_state.h"
+#include "pipe/p_util.h"
+
+#include "cso_cache/cso_context.h"
+#include "cso_cache/cso_cache.h"
+#include "cso_cache/cso_hash.h"
+
+struct cso_context {
+   struct pipe_context *pipe;
+   struct cso_cache *cache;
+   /* Sampler handles and count last handed to pipe->bind_sampler_states(). */
+   struct {
+      void *samplers[PIPE_MAX_SAMPLERS];
+      unsigned nr_samplers;
+   } hw;
+   /* Staged by cso_single_sampler(), counted by cso_single_sampler_done(). */
+   void *samplers[PIPE_MAX_SAMPLERS];
+   unsigned nr_samplers;
+   /* Driver handle most recently bound for each of the other state types. */
+   void *blend;
+   void *depth_stencil;
+   void *rasterizer;
+   void *fragment_shader;
+   void *vertex_shader;
+};
+
+/* Create a CSO context wrapping `pipe`; returns NULL if allocation fails. */
+struct cso_context *cso_create_context( struct pipe_context *pipe )
+{
+   struct cso_context *ctx = CALLOC_STRUCT(cso_context);
+   if (ctx == NULL)
+      goto out;
+
+   ctx->cache = cso_cache_create();
+   if (ctx->cache == NULL)
+      goto out;
+
+   ctx->pipe = pipe;
+
+   /* Enable for testing: */
+   if (0) cso_set_maximum_cache_size( ctx->cache, 4 );
+
+   return ctx;
+   /* Failure: ctx may be NULL or half-built; cso_destroy_context() copes. */
+out:
+   cso_destroy_context( ctx );      
+   return NULL;
+}
+
+static void cso_release_all( struct cso_context *ctx )
+{
+   if (ctx->pipe) {
+      ctx->pipe->bind_blend_state( ctx->pipe, NULL );
+      ctx->pipe->bind_rasterizer_state( ctx->pipe, NULL );
+      ctx->pipe->bind_sampler_states( ctx->pipe, 0, NULL );
+      ctx->pipe->bind_depth_stencil_alpha_state( ctx->pipe, NULL );
+      ctx->pipe->bind_fs_state( ctx->pipe, NULL );
+      ctx->pipe->bind_vs_state( ctx->pipe, NULL );
+   }
+   /* State is unbound from the driver above before the cache is deleted. */
+   if (ctx->cache) {
+      cso_cache_delete( ctx->cache );
+      ctx->cache = NULL;
+   }
+}
+
+/* Unbind state, delete the cache and free ctx; ctx may be NULL. */
+void cso_destroy_context( struct cso_context *ctx )
+{
+   debug_printf("%s\n", __FUNCTION__);
+
+   if (ctx)
+      cso_release_all( ctx );
+
+   FREE( ctx );
+}
+
+
+/* These functions will either find the state of the given template
+ * in the cache or they will create a new state from the given
+ * template, insert it in the cache and bind it.
+ */
+
+/*
+ * If the driver returns 0 from the create method then they will assign
+ * the data member of the cso to be the template itself.
+ */
+
+void cso_set_blend(struct cso_context *ctx,
+                   const struct pipe_blend_state *templ)
+{
+   unsigned hash_key = cso_construct_key((void*)templ, sizeof(struct pipe_blend_state));
+   struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
+                                                       hash_key, CSO_BLEND,
+                                                       (void*)templ);
+   void *handle;
+   /* Cache miss: create the driver object and insert it into the cache. */
+   if (cso_hash_iter_is_null(iter)) {
+      struct cso_blend *cso = MALLOC(sizeof(struct cso_blend));
+      if (cso == NULL)
+         return;   /* out of memory: leave the current binding untouched */
+      cso->state = *templ;
+      cso->data = ctx->pipe->create_blend_state(ctx->pipe, &cso->state);
+      cso->delete_state = (cso_state_callback)ctx->pipe->delete_blend_state;
+      cso->context = ctx->pipe;
+      iter = cso_insert_state(ctx->cache, hash_key, CSO_BLEND, cso);
+      handle = cso->data;
+   }
+   else {
+      handle = ((struct cso_blend *)cso_hash_iter_data(iter))->data;
+   }
+   /* Skip the driver call when this handle is already bound. */
+   if (ctx->blend != handle) {
+      ctx->blend = handle;
+      ctx->pipe->bind_blend_state(ctx->pipe, handle);
+   }
+}
+
+void cso_single_sampler(struct cso_context *ctx,
+                        unsigned idx,
+                        const struct pipe_sampler_state *templ)
+{
+   void *handle = NULL;
+   /* A NULL template clears slot idx (handle stays NULL). */
+   if (templ != NULL) {
+      unsigned hash_key = cso_construct_key((void*)templ, sizeof(struct pipe_sampler_state));
+      struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
+                                                          hash_key, CSO_SAMPLER,
+                                                          (void*)templ);
+
+      if (cso_hash_iter_is_null(iter)) {
+         struct cso_sampler *cso = MALLOC(sizeof(struct cso_sampler));
+         if (cso == NULL)
+            return;   /* out of memory: leave slot idx unchanged */
+         cso->state = *templ;
+         cso->data = ctx->pipe->create_sampler_state(ctx->pipe, &cso->state);
+         cso->delete_state = (cso_state_callback)ctx->pipe->delete_sampler_state;
+         cso->context = ctx->pipe;
+         iter = cso_insert_state(ctx->cache, hash_key, CSO_SAMPLER, cso);
+         handle = cso->data;
+      }
+      else {
+         handle = ((struct cso_sampler *)cso_hash_iter_data(iter))->data;
+      }
+   }
+   /* Staged only; cso_single_sampler_done() performs the actual bind. */
+   ctx->samplers[idx] = handle;
+}
+
+void cso_single_sampler_done( struct cso_context *ctx )
+{
+   unsigned i;
+   /* Count leading non-NULL slots; bound by the array size, not a literal. */
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
+      if (ctx->samplers[i] == NULL)
+         break;
+
+   ctx->nr_samplers = i;
+   /* Rebind only if the count or one of the staged handles changed. */
+   if (ctx->hw.nr_samplers != ctx->nr_samplers ||
+       memcmp(ctx->hw.samplers,
+              ctx->samplers,
+              ctx->nr_samplers * sizeof(void *)) != 0)
+   {
+      memcpy(ctx->hw.samplers, ctx->samplers, ctx->nr_samplers * sizeof(void *));
+      ctx->hw.nr_samplers = ctx->nr_samplers;
+
+      ctx->pipe->bind_sampler_states(ctx->pipe, ctx->nr_samplers, ctx->samplers);
+   }
+}
+
+void cso_set_samplers( struct cso_context *ctx,
+                       unsigned nr,
+                       const struct pipe_sampler_state **templates )
+{
+   unsigned i;
+   
+   /* TODO: fastpath
+    */
+   /* Stage a sampler for each of the first nr units. */
+   for (i = 0; i < nr; i++)
+      cso_single_sampler( ctx, i, templates[i] );
+   /* Clear units beyond nr that the previous sampler set had counted. */
+   for ( ; i < ctx->nr_samplers; i++)
+      cso_single_sampler( ctx, i, NULL );
+   /* Recount and bind if the set differs from what was last bound. */
+   cso_single_sampler_done( ctx );
+}
+
+void cso_set_depth_stencil_alpha(struct cso_context *ctx,
+                                 const struct pipe_depth_stencil_alpha_state *templ)
+{
+   unsigned hash_key = cso_construct_key((void*)templ,
+                                         sizeof(struct pipe_depth_stencil_alpha_state));
+   struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
+                                                       hash_key,
+                                                       CSO_DEPTH_STENCIL_ALPHA,
+                                                       (void*)templ);
+   void *handle;
+   /* Cache miss: create the driver object and insert it into the cache. */
+   if (cso_hash_iter_is_null(iter)) {
+      struct cso_depth_stencil_alpha *cso = MALLOC(sizeof(struct cso_depth_stencil_alpha));
+      if (cso == NULL)
+         return;   /* out of memory: leave the current binding untouched */
+      cso->state = *templ;
+      cso->data = ctx->pipe->create_depth_stencil_alpha_state(ctx->pipe, &cso->state);
+      cso->delete_state = (cso_state_callback)ctx->pipe->delete_depth_stencil_alpha_state;
+      cso->context = ctx->pipe;
+      cso_insert_state(ctx->cache, hash_key, CSO_DEPTH_STENCIL_ALPHA, cso);
+      handle = cso->data;
+   }
+   else {
+      handle = ((struct cso_depth_stencil_alpha *)cso_hash_iter_data(iter))->data;
+   }
+   /* Skip the driver call when this handle is already bound. */
+   if (ctx->depth_stencil != handle) {
+      ctx->depth_stencil = handle;
+      ctx->pipe->bind_depth_stencil_alpha_state(ctx->pipe, handle);
+   }
+}
+
+
+
+void cso_set_rasterizer(struct cso_context *ctx,
+                        const struct pipe_rasterizer_state *templ)
+{
+   unsigned hash_key = cso_construct_key((void*)templ,
+                                         sizeof(struct pipe_rasterizer_state));
+   struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
+                                                       hash_key, CSO_RASTERIZER,
+                                                       (void*)templ);
+   void *handle = NULL;
+   /* Cache miss: create the driver object and insert it into the cache. */
+   if (cso_hash_iter_is_null(iter)) {
+      struct cso_rasterizer *cso = MALLOC(sizeof(struct cso_rasterizer));
+      if (cso == NULL)
+         return;   /* out of memory: leave the current binding untouched */
+      cso->state = *templ;
+      cso->data = ctx->pipe->create_rasterizer_state(ctx->pipe, &cso->state);
+      cso->delete_state = (cso_state_callback)ctx->pipe->delete_rasterizer_state;
+      cso->context = ctx->pipe;
+      cso_insert_state(ctx->cache, hash_key, CSO_RASTERIZER, cso);
+      handle = cso->data;
+   }
+   else {
+      handle = ((struct cso_rasterizer *)cso_hash_iter_data(iter))->data;
+   }
+   /* Skip the driver call when this handle is already bound. */
+   if (ctx->rasterizer != handle) {
+      ctx->rasterizer = handle;
+      ctx->pipe->bind_rasterizer_state(ctx->pipe, handle);
+   }
+}
+
+
+
+
+
+void cso_set_fragment_shader(struct cso_context *ctx,
+                             const struct pipe_shader_state *templ)
+{
+   unsigned hash_key = cso_construct_key((void*)templ,
+                                         sizeof(struct pipe_shader_state));
+   struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
+                                                       hash_key, CSO_FRAGMENT_SHADER,
+                                                       (void*)templ);
+   void *handle = NULL;
+   /* Cache miss: create the driver object and insert it into the cache. */
+   if (cso_hash_iter_is_null(iter)) {
+      struct cso_fragment_shader *cso = MALLOC(sizeof(struct cso_fragment_shader));
+      if (cso == NULL)
+         return;   /* out of memory: leave the current binding untouched */
+      cso->state = *templ;
+      cso->data = ctx->pipe->create_fs_state(ctx->pipe, &cso->state);
+      cso->delete_state = (cso_state_callback)ctx->pipe->delete_fs_state;
+      cso->context = ctx->pipe;
+      iter = cso_insert_state(ctx->cache, hash_key, CSO_FRAGMENT_SHADER, cso);
+      handle = cso->data;
+   }
+   else {
+      handle = ((struct cso_fragment_shader *)cso_hash_iter_data(iter))->data;
+   }
+   /* Skip the driver call when this handle is already bound. */
+   if (ctx->fragment_shader != handle) {
+      ctx->fragment_shader = handle;
+      ctx->pipe->bind_fs_state(ctx->pipe, handle);
+   }
+}
+
+void cso_set_vertex_shader(struct cso_context *ctx,
+                           const struct pipe_shader_state *templ)
+{
+   unsigned hash_key = cso_construct_key((void*)templ,
+                                         sizeof(struct pipe_shader_state));
+   struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
+                                                       hash_key, CSO_VERTEX_SHADER,
+                                                       (void*)templ);
+   void *handle = NULL;
+   /* Cache miss: create the driver object and insert it into the cache. */
+   if (cso_hash_iter_is_null(iter)) {
+      struct cso_vertex_shader *cso = MALLOC(sizeof(struct cso_vertex_shader));
+      if (cso == NULL)
+         return;   /* out of memory: leave the current binding untouched */
+      cso->state = *templ;
+      cso->data = ctx->pipe->create_vs_state(ctx->pipe, &cso->state);
+      cso->delete_state = (cso_state_callback)ctx->pipe->delete_vs_state;
+      cso->context = ctx->pipe;
+      iter = cso_insert_state(ctx->cache, hash_key, CSO_VERTEX_SHADER, cso);
+      handle = cso->data;
+   }
+   else {
+      handle = ((struct cso_vertex_shader *)cso_hash_iter_data(iter))->data;
+   }
+   /* Skip the driver call when this handle is already bound. */
+   if (ctx->vertex_shader != handle) {
+      ctx->vertex_shader = handle;
+      ctx->pipe->bind_vs_state(ctx->pipe, handle);
+   }
+}
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
new file mode 100644
index 0000000000..1f2a630804
--- /dev/null
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -0,0 +1,85 @@
+/**************************************************************************
+ *
+ * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#ifndef CSO_CONTEXT_H
+#define CSO_CONTEXT_H
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct cso_context;
+
+struct cso_context *cso_create_context( struct pipe_context *pipe );
+
+void cso_set_blend( struct cso_context *cso,
+                    const struct pipe_blend_state *blend );
+
+void cso_set_depth_stencil_alpha( struct cso_context *cso,
+                                  const struct pipe_depth_stencil_alpha_state *dsa );
+
+void cso_set_rasterizer( struct cso_context *cso,
+                         const struct pipe_rasterizer_state *rasterizer );
+
+void cso_set_samplers( struct cso_context *cso,
+                       unsigned count,
+                       const struct pipe_sampler_state **states );
+
+/* Alternate interface to support state trackers that like to modify
+ * samplers one at a time:
+ */
+void cso_single_sampler( struct cso_context *cso,
+                         unsigned nr,
+                         const struct pipe_sampler_state *states );
+
+void cso_single_sampler_done( struct cso_context *cso );
+
+
+/* These aren't really sensible -- most of the time the api provides
+ * object semantics for shaders anyway, and the cases where it doesn't
+ * (eg mesa's internally-generated texenv programs), it will be up to
+ * the state tracker to implement their own specialized caching.
+ */
+void cso_set_fragment_shader( struct cso_context *cso,
+                              const struct pipe_shader_state *shader );
+
+void cso_set_vertex_shader( struct cso_context *cso,
+                            const struct pipe_shader_state *shader );
+
+void cso_destroy_context( struct cso_context *cso );
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.c b/src/gallium/auxiliary/cso_cache/cso_hash.c
index b3b4d667d2..5cad5d3be7 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.c
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.c
@@ -101,13 +101,6 @@ static void *cso_data_allocate_node(struct cso_hash_data *hash)
 
 static void cso_data_free_node(struct cso_node *node)
 {
-   /* XXX still a leak here.
-    * Need to cast value ptr to original cso type, then free the
-    * driver-specific data hanging off of it.  For example:
-   struct cso_sampler *csamp = (struct cso_sampler *) node->value;
-   FREE(csamp->data);
-   */
-   FREE(node->value);
    FREE(node);
 }
 
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.h b/src/gallium/auxiliary/cso_cache/cso_hash.h
index d5bca9d591..84b45a5963 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.h
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.h
@@ -25,6 +25,16 @@
  *
  **************************************************************************/
 
+/**
+ This file provides a hash implementation that is capable of dealing
+ with collisions. It stores colliding entries in a linked list. All
+ functions operating on the hash return an iterator. The iterator
+ itself points to the collision list. If there wasn't any collision
+ the list will have just one entry, otherwise client code should
+ iterate over the entries to find the exact entry among ones that
+ had the same key (e.g. memcmp could be used on the data to check
+ that)
+*/
  /*
   * Authors:
   *   Zack Rusin <zack@tungstengraphics.com>
diff --git a/src/gallium/auxiliary/draw/SConscript b/src/gallium/auxiliary/draw/SConscript
index c18dcb2927..5cb7664c85 100644
--- a/src/gallium/auxiliary/draw/SConscript
+++ b/src/gallium/auxiliary/draw/SConscript
@@ -16,6 +16,7 @@ draw = env.ConvenienceLibrary(
 		'draw_offset.c',
 		'draw_prim.c',
 		'draw_pstipple.c',
+		'draw_passthrough.c',
 		'draw_stipple.c',
 		'draw_twoside.c',
 		'draw_unfilled.c',
diff --git a/src/gallium/auxiliary/draw/draw_aaline.c b/src/gallium/auxiliary/draw/draw_aaline.c
index 7660e56fe6..6b1e640ae9 100644
--- a/src/gallium/auxiliary/draw/draw_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_aaline.c
@@ -78,7 +78,8 @@ struct aaline_stage
 
    void *sampler_cso;
    struct pipe_texture *texture;
-   uint sampler_unit;
+   uint num_samplers;
+   uint num_textures;
 
 
    /*
@@ -98,11 +99,10 @@ struct aaline_stage
    void (*driver_bind_fs_state)(struct pipe_context *, void *);
    void (*driver_delete_fs_state)(struct pipe_context *, void *);
 
-   void (*driver_bind_sampler_state)(struct pipe_context *, unsigned, void *);
-
-   void (*driver_set_sampler_texture)(struct pipe_context *,
-                                      unsigned sampler,
-                                      struct pipe_texture *);
+   void (*driver_bind_sampler_states)(struct pipe_context *, unsigned,
+                                      void **);
+   void (*driver_set_sampler_textures)(struct pipe_context *, unsigned,
+                                       struct pipe_texture **);
 
    struct pipe_context *pipe;
 };
@@ -607,6 +607,7 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)
    auto struct aaline_stage *aaline = aaline_stage(stage);
    struct draw_context *draw = stage->draw;
    struct pipe_context *pipe = aaline->pipe;
+   uint num = MAX2(aaline->num_textures, aaline->num_samplers);
 
    assert(draw->rasterizer->line_smooth);
 
@@ -624,8 +625,11 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)
     */
    bind_aaline_fragment_shader(aaline);
 
-   aaline->driver_bind_sampler_state(pipe, aaline->sampler_unit, aaline->sampler_cso);
-   aaline->driver_set_sampler_texture(pipe, aaline->sampler_unit, aaline->texture);
+   aaline->state.sampler[num] = aaline->sampler_cso;
+   aaline->state.texture[num] = aaline->texture;
+
+   aaline->driver_bind_sampler_states(pipe, num + 1, aaline->state.sampler);
+   aaline->driver_set_sampler_textures(pipe, num + 1, aaline->state.texture);
 
    /* now really draw first line */
    stage->line = aaline_line;
@@ -647,10 +651,10 @@ aaline_flush(struct draw_stage *stage, unsigned flags)
    aaline->driver_bind_fs_state(pipe, aaline->fs->driver_fs);
 
    /* XXX restore original texture, sampler state */
-   aaline->driver_bind_sampler_state(pipe, aaline->sampler_unit,
-                                 aaline->state.sampler[aaline->sampler_unit]);
-   aaline->driver_set_sampler_texture(pipe, aaline->sampler_unit,
-                                 aaline->state.texture[aaline->sampler_unit]);
+   aaline->driver_bind_sampler_states(pipe, aaline->num_samplers,
+                                      aaline->state.sampler);
+   aaline->driver_set_sampler_textures(pipe, aaline->num_textures,
+                                       aaline->state.texture);
 
    draw->extra_vp_outputs.slot = 0;
 }
@@ -729,7 +733,8 @@ aaline_bind_fs_state(struct pipe_context *pipe, void *fs)
    /* save current */
    aaline->fs = aafs;
    /* pass-through */
-   aaline->driver_bind_fs_state(aaline->pipe, aafs->driver_fs);
+   aaline->driver_bind_fs_state(aaline->pipe,
+                                (aafs ? aafs->driver_fs : NULL));
 }
 
 
@@ -745,26 +750,28 @@ aaline_delete_fs_state(struct pipe_context *pipe, void *fs)
 
 
 static void
-aaline_bind_sampler_state(struct pipe_context *pipe,
-                          unsigned unit, void *sampler)
+aaline_bind_sampler_states(struct pipe_context *pipe,
+                           unsigned num, void **sampler)
 {
    struct aaline_stage *aaline = aaline_stage_from_pipe(pipe);
    /* save current */
-   aaline->state.sampler[unit] = sampler;
+   memcpy(aaline->state.sampler, sampler, num * sizeof(void *));
+   aaline->num_samplers = num;
    /* pass-through */
-   aaline->driver_bind_sampler_state(aaline->pipe, unit, sampler);
+   aaline->driver_bind_sampler_states(aaline->pipe, num, sampler);
 }
 
 
 static void
-aaline_set_sampler_texture(struct pipe_context *pipe,
-                           unsigned sampler, struct pipe_texture *texture)
+aaline_set_sampler_textures(struct pipe_context *pipe,
+                            unsigned num, struct pipe_texture **texture)
 {
    struct aaline_stage *aaline = aaline_stage_from_pipe(pipe);
    /* save current */
-   aaline->state.texture[sampler] = texture;
+   memcpy(aaline->state.texture, texture, num * sizeof(struct pipe_texture *));
+   aaline->num_textures = num;
    /* pass-through */
-   aaline->driver_set_sampler_texture(aaline->pipe, sampler, texture);
+   aaline->driver_set_sampler_textures(aaline->pipe, num, texture);
 }
 
 
@@ -798,14 +805,14 @@ draw_install_aaline_stage(struct draw_context *draw, struct pipe_context *pipe)
    aaline->driver_bind_fs_state = pipe->bind_fs_state;
    aaline->driver_delete_fs_state = pipe->delete_fs_state;
 
-   aaline->driver_bind_sampler_state = pipe->bind_sampler_state;
-   aaline->driver_set_sampler_texture = pipe->set_sampler_texture;
+   aaline->driver_bind_sampler_states = pipe->bind_sampler_states;
+   aaline->driver_set_sampler_textures = pipe->set_sampler_textures;
 
    /* override the driver's functions */
    pipe->create_fs_state = aaline_create_fs_state;
    pipe->bind_fs_state = aaline_bind_fs_state;
    pipe->delete_fs_state = aaline_delete_fs_state;
 
-   pipe->bind_sampler_state = aaline_bind_sampler_state;
-   pipe->set_sampler_texture = aaline_set_sampler_texture;
+   pipe->bind_sampler_states = aaline_bind_sampler_states;
+   pipe->set_sampler_textures = aaline_set_sampler_textures;
 }
diff --git a/src/gallium/auxiliary/draw/draw_aapoint.c b/src/gallium/auxiliary/draw/draw_aapoint.c
index 70f696475f..99e9e9fe34 100644
--- a/src/gallium/auxiliary/draw/draw_aapoint.c
+++ b/src/gallium/auxiliary/draw/draw_aapoint.c
@@ -800,7 +800,8 @@ aapoint_bind_fs_state(struct pipe_context *pipe, void *fs)
    /* save current */
    aapoint->fs = aafs;
    /* pass-through */
-   aapoint->driver_bind_fs_state(aapoint->pipe, aafs->driver_fs);
+   aapoint->driver_bind_fs_state(aapoint->pipe,
+                                 (aafs ? aafs->driver_fs : NULL));
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 428b6209e0..fed2b6e759 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -34,6 +34,7 @@
 #include "pipe/p_util.h"
 #include "draw_context.h"
 #include "draw_private.h"
+#include "draw_vbuf.h"
 
 
 
@@ -114,6 +115,13 @@ void draw_destroy( struct draw_context *draw )
       draw->pipeline.rasterize->destroy( draw->pipeline.rasterize );
    tgsi_exec_machine_free_data(&draw->machine);
    align_free( draw->vs.queue[0].vertex ); /* Frees all the vertices. */
+
+   /* Not so fast -- we're just borrowing this at the moment.
+    * 
+   if (draw->render)
+      draw->render->destroy( draw->render );
+   */
+
    FREE( draw );
 }
 
@@ -349,3 +357,10 @@ void draw_reset_vertex_ids(struct draw_context *draw)
 
    draw_vertex_cache_reset_vertex_ids(draw);
 }
+
+/* Attach `render`; it is only borrowed -- draw_destroy() does not free it. */
+void draw_set_render( struct draw_context *draw, 
+		      struct vbuf_render *render )
+{
+   draw->render = render;
+}
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index ab87b4127c..df63e91a22 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -168,4 +168,9 @@ unsigned draw_trim_prim( unsigned mode, unsigned count );
 
 
 
+
+struct vbuf_render;
+void draw_set_render( struct draw_context *draw, 
+		      struct vbuf_render *render );
+
 #endif /* DRAW_CONTEXT_H */
diff --git a/src/gallium/auxiliary/draw/draw_passthrough.c b/src/gallium/auxiliary/draw/draw_passthrough.c
new file mode 100644
index 0000000000..a51fa0ab23
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_passthrough.c
@@ -0,0 +1,222 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+/* This code is a prototype of what a passthrough vertex shader might
+ * look like.
+ *
+ * Probably the best approach for us is to do:
+ *    - vertex fetch
+ *    - vertex shader
+ *    - cliptest / viewport transform
+ *
+ * in one step, then examine the clipOrMask & choose between two paths:
+ *
+ * Either:
+ *    - build primitive headers
+ *    - clip and the primitive path
+ *    - build clipped vertex buffers,
+ *    - vertex-emit to vbuf buffers
+ *
+ * Or, if no clipping:
+ *    - vertex-emit directly to vbuf buffers
+ *
+ * But when bypass clipping is enabled, we just take the latter
+ * choice.  If (some new) passthrough-vertex-shader flag is also set,
+ * the pipeline degenerates to:
+ *
+ *    - vertex fetch
+ *    - vertex emit to vbuf buffers
+ *
+ * Which is what is prototyped here.
+ */
+#include "pipe/p_util.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+
+
+
+/* Example of the fetch/emit routine that could be generated for a
+ * passthrough vertex shader with bypass_clipping: copies xyzw[4], rgba[4]
+ * and st[2] floats per vertex into 'out' (10 floats/vertex), unconverted.
+ */
+static void fetch_xyz_rgb_st( struct draw_context *draw,
+			      float *out,
+			      unsigned start,
+			      unsigned count )
+{
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   unsigned i;
+
+   const ubyte *xyzw = src[0] + start * pitch[0];
+   const ubyte *rgba = src[1] + start * pitch[1];
+   const ubyte *st = src[2] + start * pitch[2];
+
+   /* loop over vertices; each inner block below copies one attribute
+    */
+   for (i = 0; i < count; i++) {
+      {
+	 const float *in = (const float *)xyzw; xyzw += pitch[0];
+         /* xyzw: plain copy, no conversion; assumes float[4] in and out */
+	 out[0] = in[0];
+	 out[1] = in[1];
+	 out[2] = in[2];
+	 out[3] = in[3];
+      }
+
+      {
+	 const float *in = (const float *)rgba; rgba += pitch[1];
+         /* rgba: plain copy, no conversion; assumes float[4] in and out */
+	 out[4] = in[0];
+	 out[5] = in[1];
+	 out[6] = in[2];
+ 	 out[7] = in[3];
+      }
+
+      {
+	 const float *in = (const float *)st; st += pitch[2];
+         /* st: plain copy, no conversion; assumes float[2] in and out */
+	 out[8] = in[0];
+	 out[9] = in[1];
+      }
+
+      out += 10;
+   }
+}
+
+/* Point draw->vertex_fetch at the user vertex buffers; TRUE only when a pt_fetch routine was selected. */
+static boolean update_shader( struct draw_context *draw )
+{
+   const struct vertex_info *vinfo = draw->render->get_vertex_info(draw->render);
+
+   unsigned nr_attrs = vinfo->num_attribs;
+   unsigned i;
+
+   for (i = 0; i < nr_attrs; i++) {
+      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
+
+      draw->vertex_fetch.src_ptr[i] = (const ubyte *) draw->user.vbuffer[buf] + 
+						       draw->vertex_buffer[buf].buffer_offset + 
+						       draw->vertex_element[i].src_offset;
+
+      draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch;
+      draw->vertex_fetch.fetch[i] = NULL;
+   }
+
+   draw->vertex_fetch.nr_attrs = nr_attrs;
+   draw->vertex_fetch.fetch_func = NULL;
+   draw->vertex_fetch.pt_fetch = NULL;
+
+   draw->pt.hw_vertex_size = vinfo->size * 4;
+
+   /* Just trying to figure out how this would work: the literal 0 below
+    * disables this branch, so for now the function always returns FALSE. */
+   if (nr_attrs == 3 &&
+       0 /* some other tests */)
+   {
+      draw->vertex_fetch.pt_fetch = fetch_xyz_rgb_st;
+      assert(vinfo->size == 10);
+      return TRUE;
+   }
+   
+   return FALSE;
+}
+
+/* Record 'prim' in draw->pt and hand it to the vbuf_render backend.
+ * Returns FALSE, without calling the backend, for line loops, quads and quad strips. */
+static boolean set_prim( struct draw_context *draw,
+		      unsigned prim )
+{
+   assert(!draw->user.elts);   
+
+   draw->pt.prim = prim;
+
+   switch (prim) { 
+   case PIPE_PRIM_LINE_LOOP:
+   case PIPE_PRIM_QUADS:
+   case PIPE_PRIM_QUAD_STRIP:
+      return FALSE;
+   default:
+      draw->render->set_primitive( draw->render, prim );
+      return TRUE;
+   }
+}
+
+/* Prototype non-indexed draw: fetch 'count' vertices straight into backend
+ * storage and draw them.  FALSE if prim, fetch routine or allocation is unavailable. */
+boolean
+draw_passthrough_arrays(struct draw_context *draw, 
+                        unsigned prim,
+                        unsigned start, 
+                        unsigned count)
+{
+   float *hw_verts;
+
+   if (!set_prim(draw, prim))
+      return FALSE;
+
+   if (!update_shader( draw ))
+      return FALSE;
+
+   hw_verts = draw->render->allocate_vertices( draw->render,
+                                               draw->pt.hw_vertex_size,
+                                               count );
+
+   if (!hw_verts)
+      return FALSE;
+					
+   /* Single routine to fetch vertices, run shader and emit HW verts.
+    * Clipping and viewport transformation are done on hardware.
+    */
+   draw->vertex_fetch.pt_fetch( draw, 
+				hw_verts,
+				start, count );
+
+   /* Draw arrays path to avoid re-emitting index list again and again.
+    * NOTE(review): hw_verts was filled from offset 0, yet 'start' is passed
+    * here -- confirm draw_arrays() really expects the source-array index. */
+   draw->render->draw_arrays( draw->render,
+                              start,
+                              count );
+   
+
+   draw->render->release_vertices( draw->render, 
+				   hw_verts, 
+				   draw->pt.hw_vertex_size, 
+				   count );
+
+   return TRUE;
+}
+
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index c732d723a7..4147472d45 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -133,7 +133,7 @@ struct draw_vertex_shader {
 
    /* This member will disappear shortly:
     */
-   const struct pipe_shader_state   *state;
+   struct pipe_shader_state   state;
 
    struct tgsi_shader_info info;
 
@@ -162,8 +162,14 @@ typedef void (*full_fetch_func)( struct draw_context *draw,
 				 const unsigned *elts,
 				 unsigned count );
 
+typedef void (*pt_fetch_func)( struct draw_context *draw,
+			      float *out,
+			      unsigned start,
+			       unsigned count );
 
 
+struct vbuf_render;
+
 /**
  * Private context for the drawing module.
  */
@@ -191,6 +197,17 @@ struct draw_context
       struct draw_stage *rasterize;
    } pipeline;
 
+
+   struct vbuf_render *render;
+
+   /* Support prototype passthrough path:
+    */
+   struct {
+      unsigned prim;
+      unsigned hw_vertex_size;
+   } pt;
+
+
    /* pipe state that we need: */
    const struct pipe_rasterizer_state *rasterizer;
    struct pipe_viewport_state viewport;
@@ -244,6 +261,7 @@ struct draw_context
       fetch_func fetch[PIPE_ATTRIB_MAX];
       unsigned nr_attrs;
       full_fetch_func fetch_func;
+      pt_fetch_func pt_fetch;
    } vertex_fetch;
 
    /* Post-tnl vertex cache:
@@ -331,6 +349,15 @@ struct tgsi_exec_machine;
 extern void draw_update_vertex_fetch( struct draw_context *draw );
 
 
+/* Prototype/hack
+ */
+boolean
+draw_passthrough_arrays(struct draw_context *draw, 
+                        unsigned prim,
+                        unsigned start, 
+                        unsigned count);
+
+
 #define DRAW_FLUSH_SHADER_QUEUE              0x1 /* sized not to overflow, never raised */
 #define DRAW_FLUSH_PRIM_QUEUE                0x2
 #define DRAW_FLUSH_VERTEX_CACHE              0x4
diff --git a/src/gallium/auxiliary/draw/draw_pstipple.c b/src/gallium/auxiliary/draw/draw_pstipple.c
index 2cfeb813b3..8b3e84a9a0 100644
--- a/src/gallium/auxiliary/draw/draw_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pstipple.c
@@ -67,16 +67,18 @@ struct pstip_stage
    struct draw_stage stage;
 
    void *sampler_cso;
-   struct pipe_texture *texture;
    uint sampler_unit;
+   struct pipe_texture *texture;
+   uint num_samplers;
+   uint num_textures;
 
    /*
     * Currently bound state
     */
    struct pstip_fragment_shader *fs;
    struct {
-      void *sampler[PIPE_MAX_SAMPLERS];
-      struct pipe_texture *texture[PIPE_MAX_SAMPLERS];
+      void *samplers[PIPE_MAX_SAMPLERS];
+      struct pipe_texture *textures[PIPE_MAX_SAMPLERS];
       const struct pipe_poly_stipple *stipple;
    } state;
 
@@ -88,11 +90,10 @@ struct pstip_stage
    void (*driver_bind_fs_state)(struct pipe_context *, void *);
    void (*driver_delete_fs_state)(struct pipe_context *, void *);
 
-   void (*driver_bind_sampler_state)(struct pipe_context *, unsigned, void *);
+   void (*driver_bind_sampler_states)(struct pipe_context *, unsigned, void **);
 
-   void (*driver_set_sampler_texture)(struct pipe_context *,
-                                      unsigned sampler,
-                                      struct pipe_texture *);
+   void (*driver_set_sampler_textures)(struct pipe_context *, unsigned,
+                                       struct pipe_texture **);
 
    void (*driver_set_polygon_stipple)(struct pipe_context *,
                                       const struct pipe_poly_stipple *);
@@ -484,18 +485,25 @@ static void
 pstip_first_tri(struct draw_stage *stage, struct prim_header *header)
 {
    struct pstip_stage *pstip = pstip_stage(stage);
-   struct draw_context *draw = stage->draw;
    struct pipe_context *pipe = pstip->pipe;
+   uint num_samplers;
 
-   assert(draw->rasterizer->poly_stipple_enable);
+   /* how many samplers? */
+   /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */
+   num_samplers = MAX2(pstip->num_textures, pstip->num_samplers);
+   num_samplers = MAX2(num_samplers, pstip->sampler_unit + 1);
 
-   /*
-    * Bind our fragprog, sampler and texture
-    */
+   assert(stage->draw->rasterizer->poly_stipple_enable);
+
+   /* bind our fragprog */
    bind_pstip_fragment_shader(pstip);
 
-   pstip->driver_bind_sampler_state(pipe, pstip->sampler_unit, pstip->sampler_cso);
-   pstip->driver_set_sampler_texture(pipe, pstip->sampler_unit, pstip->texture);
+   /* plug in our sampler, texture */
+   pstip->state.samplers[pstip->sampler_unit] = pstip->sampler_cso;
+   pstip->state.textures[pstip->sampler_unit] = pstip->texture;
+
+   pstip->driver_bind_sampler_states(pipe, num_samplers, pstip->state.samplers);
+   pstip->driver_set_sampler_textures(pipe, num_samplers, pstip->state.textures);
 
    /* now really draw first line */
    stage->tri = passthrough_tri;
@@ -517,10 +525,10 @@ pstip_flush(struct draw_stage *stage, unsigned flags)
    pstip->driver_bind_fs_state(pipe, pstip->fs->driver_fs);
 
    /* XXX restore original texture, sampler state */
-   pstip->driver_bind_sampler_state(pipe, pstip->sampler_unit,
-                                 pstip->state.sampler[pstip->sampler_unit]);
-   pstip->driver_set_sampler_texture(pipe, pstip->sampler_unit,
-                                 pstip->state.texture[pstip->sampler_unit]);
+   pstip->driver_bind_sampler_states(pipe, pstip->num_samplers,
+                                     pstip->state.samplers);
+   pstip->driver_set_sampler_textures(pipe, pstip->num_textures,
+                                      pstip->state.textures);
 }
 
 
@@ -597,7 +605,8 @@ pstip_bind_fs_state(struct pipe_context *pipe, void *fs)
    /* save current */
    pstip->fs = aafs;
    /* pass-through */
-   pstip->driver_bind_fs_state(pstip->pipe, aafs->driver_fs);
+   pstip->driver_bind_fs_state(pstip->pipe,
+                               (aafs ? aafs->driver_fs : NULL));
 }
 
 
@@ -613,26 +622,28 @@ pstip_delete_fs_state(struct pipe_context *pipe, void *fs)
 
 
 static void
-pstip_bind_sampler_state(struct pipe_context *pipe,
-                         unsigned unit, void *sampler)
+pstip_bind_sampler_states(struct pipe_context *pipe,
+                          unsigned num, void **sampler)
 {
    struct pstip_stage *pstip = pstip_stage_from_pipe(pipe);
    /* save current */
-   pstip->state.sampler[unit] = sampler;
+   memcpy(pstip->state.samplers, sampler, num * sizeof(void *));
+   pstip->num_samplers = num;
    /* pass-through */
-   pstip->driver_bind_sampler_state(pstip->pipe, unit, sampler);
+   pstip->driver_bind_sampler_states(pstip->pipe, num, sampler);
 }
 
 
 static void
-pstip_set_sampler_texture(struct pipe_context *pipe,
-                          unsigned sampler, struct pipe_texture *texture)
+pstip_set_sampler_textures(struct pipe_context *pipe,
+                           unsigned num, struct pipe_texture **texture)
 {
    struct pstip_stage *pstip = pstip_stage_from_pipe(pipe);
    /* save current */
-   pstip->state.texture[sampler] = texture;
+   memcpy(pstip->state.textures, texture, num * sizeof(struct pipe_texture *));
+   pstip->num_textures = num;
    /* pass-through */
-   pstip->driver_set_sampler_texture(pstip->pipe, sampler, texture);
+   pstip->driver_set_sampler_textures(pstip->pipe, num, texture);
 }
 
 
@@ -682,8 +693,8 @@ draw_install_pstipple_stage(struct draw_context *draw,
    pstip->driver_bind_fs_state = pipe->bind_fs_state;
    pstip->driver_delete_fs_state = pipe->delete_fs_state;
 
-   pstip->driver_bind_sampler_state = pipe->bind_sampler_state;
-   pstip->driver_set_sampler_texture = pipe->set_sampler_texture;
+   pstip->driver_bind_sampler_states = pipe->bind_sampler_states;
+   pstip->driver_set_sampler_textures = pipe->set_sampler_textures;
    pstip->driver_set_polygon_stipple = pipe->set_polygon_stipple;
 
    /* override the driver's functions */
@@ -691,7 +702,7 @@ draw_install_pstipple_stage(struct draw_context *draw,
    pipe->bind_fs_state = pstip_bind_fs_state;
    pipe->delete_fs_state = pstip_delete_fs_state;
 
-   pipe->bind_sampler_state = pstip_bind_sampler_state;
-   pipe->set_sampler_texture = pstip_set_sampler_texture;
+   pipe->bind_sampler_states = pstip_bind_sampler_states;
+   pipe->set_sampler_textures = pstip_set_sampler_textures;
    pipe->set_polygon_stipple = pstip_set_polygon_stipple;
 }
diff --git a/src/gallium/auxiliary/draw/draw_unfilled.c b/src/gallium/auxiliary/draw/draw_unfilled.c
index 4d718d514c..b07860cd9e 100644
--- a/src/gallium/auxiliary/draw/draw_unfilled.c
+++ b/src/gallium/auxiliary/draw/draw_unfilled.c
@@ -129,7 +129,7 @@ static void unfilled_tri( struct draw_stage *stage,
       points( stage, header );
       break;
    default:
-      abort();
+      assert(0);
    }   
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_vbuf.h b/src/gallium/auxiliary/draw/draw_vbuf.h
index cfd2b9820c..5e7de905c1 100644
--- a/src/gallium/auxiliary/draw/draw_vbuf.h
+++ b/src/gallium/auxiliary/draw/draw_vbuf.h
@@ -85,6 +85,12 @@ struct vbuf_render {
 		 const ushort *indices,
 		 uint nr_indices );
 
+   /* Draw Arrays path too.
+    */
+   void (*draw_arrays)( struct vbuf_render *,
+			unsigned start,
+			uint nr );
+
    /**
     * Called when vbuf is done with this set of vertices:
     */
diff --git a/src/gallium/auxiliary/draw/draw_vertex_cache.c b/src/gallium/auxiliary/draw/draw_vertex_cache.c
index 53f8bbec44..161b247d4e 100644
--- a/src/gallium/auxiliary/draw/draw_vertex_cache.c
+++ b/src/gallium/auxiliary/draw/draw_vertex_cache.c
@@ -41,7 +41,11 @@ void draw_vertex_cache_invalidate( struct draw_context *draw )
    assert(draw->vs.queue_nr == 0);
    assert(draw->vcache.referenced == 0);
 
-//   memset(draw->vcache.idx, ~0, sizeof(draw->vcache.idx));
+   /* There's an error somewhere in the vcache code that requires this
+    * memset.  The bug is exposed in q3demo demo001, but probably
+    * elsewhere as well.  Will track it down later.
+    */
+   memset(draw->vcache.idx, ~0, sizeof(draw->vcache.idx));
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_vertex_fetch.c b/src/gallium/auxiliary/draw/draw_vertex_fetch.c
index cb8cdd04a3..b56d85396d 100644
--- a/src/gallium/auxiliary/draw/draw_vertex_fetch.c
+++ b/src/gallium/auxiliary/draw/draw_vertex_fetch.c
@@ -54,7 +54,7 @@ fetch_##NAME(const void *ptr, float *attrib)		\
    int i;						\
 							\
    for (i = 0; i < SZ; i++) {				\
-      attrib[i] = CVT;					\
+      attrib[i] = CVT(i);                              \
    }							\
 							\
    for (; i < 4; i++) {					\
@@ -62,24 +62,24 @@ fetch_##NAME(const void *ptr, float *attrib)		\
    }							\
 }
 
-#define CVT_64_FLOAT   (float) ((double *) ptr)[i]
-#define CVT_32_FLOAT   ((float *) ptr)[i]
+#define CVT_64_FLOAT(i)   (float) ((double *) ptr)[i]
+#define CVT_32_FLOAT(i)   ((float *) ptr)[i]
 
-#define CVT_8_USCALED  (float) ((unsigned char *) ptr)[i]
-#define CVT_16_USCALED (float) ((unsigned short *) ptr)[i]
-#define CVT_32_USCALED (float) ((unsigned int *) ptr)[i]
+#define CVT_8_USCALED(i)  (float) ((unsigned char *) ptr)[i]
+#define CVT_16_USCALED(i) (float) ((unsigned short *) ptr)[i]
+#define CVT_32_USCALED(i) (float) ((unsigned int *) ptr)[i]
 
-#define CVT_8_SSCALED  (float) ((char *) ptr)[i]
-#define CVT_16_SSCALED (float) ((short *) ptr)[i]
-#define CVT_32_SSCALED (float) ((int *) ptr)[i]
+#define CVT_8_SSCALED(i)  (float) ((char *) ptr)[i]
+#define CVT_16_SSCALED(i) (float) ((short *) ptr)[i]
+#define CVT_32_SSCALED(i) (float) ((int *) ptr)[i]
 
-#define CVT_8_UNORM    (float) ((unsigned char *) ptr)[i] / 255.0f
-#define CVT_16_UNORM   (float) ((unsigned short *) ptr)[i] / 65535.0f
-#define CVT_32_UNORM   (float) ((unsigned int *) ptr)[i] / 4294967295.0f
+#define CVT_8_UNORM(i)    (float) ((unsigned char *) ptr)[i] / 255.0f
+#define CVT_16_UNORM(i)   (float) ((unsigned short *) ptr)[i] / 65535.0f
+#define CVT_32_UNORM(i)   (float) ((unsigned int *) ptr)[i] / 4294967295.0f
 
-#define CVT_8_SNORM    (float) ((char *) ptr)[i] / 127.0f
-#define CVT_16_SNORM   (float) ((short *) ptr)[i] / 32767.0f
-#define CVT_32_SNORM   (float) ((int *) ptr)[i] / 2147483647.0f
+#define CVT_8_SNORM(i)    (float) ((char *) ptr)[i] / 127.0f
+#define CVT_16_SNORM(i)   (float) ((short *) ptr)[i] / 32767.0f
+#define CVT_32_SNORM(i)   (float) ((int *) ptr)[i] / 2147483647.0f
 
 FETCH_ATTRIB( R64G64B64A64_FLOAT,   4, CVT_64_FLOAT )
 FETCH_ATTRIB( R64G64B64_FLOAT,      3, CVT_64_FLOAT )
@@ -156,6 +156,16 @@ FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
 
 
 
+static void
+fetch_B8G8R8A8_UNORM(const void *ptr, float *attrib)
+{
+   attrib[2] = CVT_8_UNORM(0);
+   attrib[1] = CVT_8_UNORM(1);
+   attrib[0] = CVT_8_UNORM(2);
+   attrib[3] = CVT_8_UNORM(3);
+}
+
+
 static fetch_func get_fetch_func( enum pipe_format format )
 {
 #if 0
@@ -296,6 +306,10 @@ static fetch_func get_fetch_func( enum pipe_format format )
    case PIPE_FORMAT_A8R8G8B8_UNORM:
       return fetch_A8R8G8B8_UNORM;
 
+
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      return fetch_B8G8R8A8_UNORM;
+
    case 0:
       return NULL;		/* not sure why this is needed */
 
diff --git a/src/gallium/auxiliary/draw/draw_vertex_shader.c b/src/gallium/auxiliary/draw/draw_vertex_shader.c
index 1e95355555..133418baca 100644
--- a/src/gallium/auxiliary/draw/draw_vertex_shader.c
+++ b/src/gallium/auxiliary/draw/draw_vertex_shader.c
@@ -110,13 +110,20 @@ draw_bind_vertex_shader(struct draw_context *draw,
                         struct draw_vertex_shader *dvs)
 {
    draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+   
+   if (dvs) 
+   {
+      draw->vertex_shader = dvs;
+      draw->num_vs_outputs = dvs->info.num_outputs;
 
-   draw->vertex_shader = dvs;
-   draw->num_vs_outputs = dvs->info.num_outputs;
+      tgsi_exec_machine_init(&draw->machine);
 
-   tgsi_exec_machine_init(&draw->machine);
-
-   dvs->prepare( dvs, draw );
+      dvs->prepare( dvs, draw );
+   }
+   else {
+      draw->vertex_shader = NULL;
+      draw->num_vs_outputs = 0;
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 583812aadd..55bec14116 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -71,7 +71,7 @@ vs_exec_prepare( struct draw_vertex_shader *shader,
 {
    /* specify the vertex program to interpret/execute */
    tgsi_exec_machine_bind_shader(&draw->machine,
-				 shader->state->tokens,
+				 shader->state.tokens,
 				 PIPE_MAX_SAMPLERS,
 				 NULL /*samplers*/ );
 
@@ -132,20 +132,30 @@ vs_exec_run( struct draw_vertex_shader *shader,
       z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
       w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
 
-      vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane, draw->nr_planes);
-      vOut[j]->edgeflag = 1;
-
-      /* divide by w */
-      w = 1.0f / w;
-      x *= w;
-      y *= w;
-      z *= w;
-
-      /* Viewport mapping */
-      vOut[j]->data[0][0] = x * scale[0] + trans[0];
-      vOut[j]->data[0][1] = y * scale[1] + trans[1];
-      vOut[j]->data[0][2] = z * scale[2] + trans[2];
-      vOut[j]->data[0][3] = w;
+      if (!draw->rasterizer->bypass_clipping) {
+         vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane, draw->nr_planes);
+         vOut[j]->edgeflag = 1;
+
+         /* divide by w */
+         w = 1.0f / w;
+         x *= w;
+         y *= w;
+         z *= w;
+         
+         /* Viewport mapping */
+         vOut[j]->data[0][0] = x * scale[0] + trans[0];
+         vOut[j]->data[0][1] = y * scale[1] + trans[1];
+         vOut[j]->data[0][2] = z * scale[2] + trans[2];
+         vOut[j]->data[0][3] = w;
+      }
+      else {
+         vOut[j]->clipmask = 0;
+         vOut[j]->edgeflag = 1;
+         vOut[j]->data[0][0] = x;
+         vOut[j]->data[0][1] = y;
+         vOut[j]->data[0][2] = z;
+         vOut[j]->data[0][3] = w;
+      }
 
       /* Remaining attributes are packed into sequential post-transform
        * vertex attrib slots.
@@ -177,7 +187,7 @@ draw_create_vs_exec(struct draw_context *draw,
    if (vs == NULL) 
       return NULL;
 
-   vs->state = state;
+   vs->state = *state;
    vs->prepare = vs_exec_prepare;
    vs->run = vs_exec_run;
    vs->delete = vs_exec_delete;
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index 0fd557d667..53c260be53 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -135,25 +135,35 @@ vs_llvm_run( struct draw_vertex_shader *base,
       unsigned slot;
       float x, y, z, w;
 
-      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
-
-      vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane, draw->nr_planes);
-      vOut[j]->edgeflag = 1;
-
-      /* divide by w */
-      w = 1.0f / w;
-      x *= w;
-      y *= w;
-      z *= w;
-
-      /* Viewport mapping */
-      vOut[j]->data[0][0] = x * scale[0] + trans[0];
-      vOut[j]->data[0][1] = y * scale[1] + trans[1];
-      vOut[j]->data[0][2] = z * scale[2] + trans[2];
-      vOut[j]->data[0][3] = w;
+      /* Load clip coords first: the bypass path below reads x/y/z/w too. */
+      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+
+      if (!draw->rasterizer->bypass_clipping) {
+         vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane, draw->nr_planes);
+         vOut[j]->edgeflag = 1;
+         /* divide by w */
+         w = 1.0f / w;
+         x *= w;
+         y *= w;
+         z *= w;
+
+         /* Viewport mapping */
+         vOut[j]->data[0][0] = x * scale[0] + trans[0];
+         vOut[j]->data[0][1] = y * scale[1] + trans[1];
+         vOut[j]->data[0][2] = z * scale[2] + trans[2];
+         vOut[j]->data[0][3] = w;
+      }
+      else {
+         vOut[j]->clipmask = 0;
+         vOut[j]->edgeflag = 1;
+         vOut[j]->data[0][0] = x;
+         vOut[j]->data[0][1] = y;
+         vOut[j]->data[0][2] = z;
+         vOut[j]->data[0][3] = w;
+      }
 
       /* Remaining attributes are packed into sequential post-transform
        * vertex attrib slots.
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index 0b8bc2bf14..5ee2adb344 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -158,20 +158,30 @@ vs_sse_run( struct draw_vertex_shader *base,
       z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
       w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
 
-      vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane, draw->nr_planes);
-      vOut[j]->edgeflag = 1;
-
-      /* divide by w */
-      w = 1.0f / w;
-      x *= w;
-      y *= w;
-      z *= w;
-
-      /* Viewport mapping */
-      vOut[j]->data[0][0] = x * scale[0] + trans[0];
-      vOut[j]->data[0][1] = y * scale[1] + trans[1];
-      vOut[j]->data[0][2] = z * scale[2] + trans[2];
-      vOut[j]->data[0][3] = w;
+      if (!draw->rasterizer->bypass_clipping) {
+         vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane, draw->nr_planes);
+         vOut[j]->edgeflag = 1;
+
+         /* divide by w */
+         w = 1.0f / w;
+         x *= w;
+         y *= w;
+         z *= w;
+         
+         /* Viewport mapping */
+         vOut[j]->data[0][0] = x * scale[0] + trans[0];
+         vOut[j]->data[0][1] = y * scale[1] + trans[1];
+         vOut[j]->data[0][2] = z * scale[2] + trans[2];
+         vOut[j]->data[0][3] = w;
+      }
+      else {
+         vOut[j]->clipmask = 0;
+         vOut[j]->edgeflag = 1;
+         vOut[j]->data[0][0] = x;
+         vOut[j]->data[0][1] = y;
+         vOut[j]->data[0][2] = z;
+         vOut[j]->data[0][3] = w;
+      }
 
       /* Remaining attributes are packed into sequential post-transform
        * vertex attrib slots.
@@ -211,14 +221,14 @@ draw_create_vs_sse(struct draw_context *draw,
    if (vs == NULL) 
       return NULL;
 
-   vs->base.state = templ;
+   vs->base.state = *templ;
    vs->base.prepare = vs_sse_prepare;
    vs->base.run = vs_sse_run;
    vs->base.delete = vs_sse_delete;
    
    x86_init_func( &vs->sse2_program );
 
-   if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state->tokens,
+   if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state.tokens,
 			&vs->sse2_program )) 
       goto fail;
       
diff --git a/src/gallium/auxiliary/gallivm/Makefile b/src/gallium/auxiliary/gallivm/Makefile
index 39fac6ea4a..c24e19e062 100644
--- a/src/gallium/auxiliary/gallivm/Makefile
+++ b/src/gallium/auxiliary/gallivm/Makefile
@@ -15,7 +15,7 @@ GALLIVM_SOURCES = \
         storagesoa.cpp \
         instructionssoa.cpp
 
-INC_SOURCES = gallivm_builtins.cpp
+INC_SOURCES = gallivm_builtins.cpp gallivmsoabuiltins.cpp
 
 CPP_SOURCES = \
 	$(GALLIVM_SOURCES)
@@ -65,8 +65,10 @@ depend: $(C_SOURCES) $(CPP_SOURCES) $(ASM_SOURCES) $(INC_SOURCES)
 
 
 gallivm_builtins.cpp: llvm_builtins.c
-	clang --emit-llvm $< |llvm-as|opt -std-compile-opts|llvm2cpp -gen-contents -o=$@ -f -for=shader -funcname=createGallivmBuiltins
+	clang --emit-llvm < $< |llvm-as|opt -std-compile-opts|llvm2cpp -gen-contents -o=$@ -f -for=shader -funcname=createGallivmBuiltins
 
+gallivmsoabuiltins.cpp: soabuiltins.c
+	clang --emit-llvm < $< |llvm-as|opt -std-compile-opts|llvm2cpp -gen-module -o=$@ -f -for=shader -funcname=createSoaBuiltins
 
 # Emacs tags
 tags:
@@ -78,6 +80,7 @@ clean:
 	-rm -f *.o */*.o *~ *.so *~ server/*.o
 	-rm -f depend depend.bak
 	-rm -f gallivm_builtins.cpp
+	-rm -f gallivmsoabuiltins.cpp
 
 symlinks:
 
diff --git a/src/gallium/auxiliary/gallivm/gallivm.cpp b/src/gallium/auxiliary/gallivm/gallivm.cpp
index d14bb3b99a..b6f641a3f8 100644
--- a/src/gallium/auxiliary/gallivm/gallivm.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm.cpp
@@ -306,11 +306,19 @@ struct gallivm_prog * gallivm_ir_compile(struct gallivm_ir *ir)
 {
    struct gallivm_prog *prog =
       (struct gallivm_prog *)calloc(1, sizeof(struct gallivm_prog));
+   
+   std::cout << "Before optimizations:"<<std::endl;
+   ir->module->dump();
+   std::cout<<"-------------------------------"<<std::endl;
+   
+   PassManager veri;
+   veri.add(createVerifierPass());
+   veri.run(*ir->module);
    llvm::Module *mod = llvm::CloneModule(ir->module);
    prog->num_consts = ir->num_consts;
    memcpy(prog->interpolators, ir->interpolators, sizeof(prog->interpolators));
    prog->num_interp = ir->num_interp;
-
+   
    /* Run optimization passes over it */
    PassManager passes;
    passes.add(new TargetData(mod));
diff --git a/src/gallium/auxiliary/gallivm/gallivm.h b/src/gallium/auxiliary/gallivm/gallivm.h
index 57912a952f..b4d6555d2f 100644
--- a/src/gallium/auxiliary/gallivm/gallivm.h
+++ b/src/gallium/auxiliary/gallivm/gallivm.h
@@ -33,6 +33,16 @@
 #ifndef GALLIVM_H
 #define GALLIVM_H
 
+/*
+  The LLVM representation consists of two stages: the layout-independent
+  intermediate representation gallivm_ir and the driver-specific
+  gallivm_prog. TGSI is first translated into gallivm_ir; after that
+  the driver can set a number of options on the gallivm_ir and have it
+  compiled into a gallivm_prog. A gallivm_prog can either be executed
+  (assuming there's an LLVM JIT backend for the current target) or
+  used for machine code generation (assuming there's an LLVM code
+  generator for the current target).
+ */
 #if defined __cplusplus
 extern "C" {
 #endif
diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp
index 55d39fa5f1..8919491792 100644
--- a/src/gallium/auxiliary/gallivm/instructions.cpp
+++ b/src/gallium/auxiliary/gallivm/instructions.cpp
@@ -42,6 +42,7 @@
 #include <llvm/InstrTypes.h>
 #include <llvm/Instructions.h>
 #include <llvm/ParameterAttributes.h>
+#include <llvm/ParamAttrsList.h>
 
 #include <sstream>
 #include <fstream>
@@ -51,6 +52,15 @@ using namespace llvm;
 
 #include "gallivm_builtins.cpp"
 
+#if 0
+
+llvm::Value *arrayFromChannels(std::vector<llvm::Value*> &vals)
+{
+   VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
+   ArrayType *vectorArray = ArrayType::get(vectorType, 4);
+}
+#endif
+
 static inline std::string createFuncName(int label)
 {
    std::ostringstream stream;
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index a4d5046637..89d513afd0 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -2,9 +2,28 @@
 
 #include "storagesoa.h"
 
+#include "pipe/p_shader_tokens.h"
+
+#include <llvm/CallingConv.h>
 #include <llvm/Constants.h>
+#include <llvm/Module.h>
+#include <llvm/Function.h>
+#include <llvm/Instructions.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+#include <llvm/ParamAttrsList.h>
+
+#include <iostream>
 
+
+/* disable some warnings. this file is autogenerated */
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#endif
 using namespace llvm;
+#include "gallivmsoabuiltins.cpp"
+#if defined(__GNUC__)
+#pragma GCC diagnostic warning "-Wunused-variable"
+#endif
 
 InstructionsSoa::InstructionsSoa(llvm::Module *mod, llvm::Function *func,
                                  llvm::BasicBlock *block, StorageSoa *storage)
@@ -12,6 +31,8 @@ InstructionsSoa::InstructionsSoa(llvm::Module *mod, llvm::Function *func,
      m_storage(storage),
      m_idx(0)
 {
+   createFunctionMap();
+   createBuiltins();
 }
 
 const char * InstructionsSoa::name(const char *prefix) const
@@ -119,3 +140,167 @@ std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
 
    return res;
 }
+
+void InstructionsSoa::createFunctionMap()
+{  /* Associate each supported TGSI opcode with the name function() looks up in m_builtins. */
+   m_functionsMap[TGSI_OPCODE_DP3] = "dp3";
+   m_functionsMap[TGSI_OPCODE_DP4] = "dp4";
+}
+
+llvm::Function * InstructionsSoa::function(int op)
+{   /* Returns the builtin for op, cloning it into the current module on first use. */
+    if (m_functions.find(op) != m_functions.end())
+       return m_functions[op];
+    /* Not cached yet: fetch the builtin's name (operator[] inserts "" for unmapped ops). */
+    std::string name = m_functionsMap[op];
+    /* NOTE(review): neither originalFunc nor currentModule() is checked for null below. */
+    llvm::Function *originalFunc = m_builtins->getFunction(name);
+    llvm::Function *func = CloneFunction(originalFunc);
+    currentModule()->getFunctionList().push_back(func);
+    std::cout << "Func parent is "<<func->getParent()
+              <<", cur is "<<currentModule() <<std::endl;
+    func->dump();
+       //func->setParent(currentModule());
+    m_functions[op] = func;   /* cache the clone for later calls */
+    return func;
+}
+
+llvm::Module * InstructionsSoa::currentModule() const
+{
+   /* Module owning the function that contains the builder's insertion block, or 0. */
+   BasicBlock *insertAt = m_builder.GetInsertBlock();
+   llvm::Function *owner = insertAt ? insertAt->getParent() : 0;
+
+   return owner ? owner->getParent() : 0;
+}
+
+void InstructionsSoa::createBuiltins()
+{  /* createSoaBuiltins() is presumably defined by the included gallivmsoabuiltins.cpp -- confirm. */
+   m_builtins = createSoaBuiltins();
+}
+
+std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   /* Emit a call to the builtin registered for DP3 and return its four result channels. */
+   return callBuiltin(function(TGSI_OPCODE_DP3), in1, in2);
+}
+
+llvm::Value * InstructionsSoa::allocaTemp()
+{
+   /* Reserve a [4 x <4 x float>] stack slot in the current insertion block:
+    * one float vector for each of the four result channels. */
+   BasicBlock *insertAt = m_builder.GetInsertBlock();
+   ArrayType  *slotType = ArrayType::get(VectorType::get(Type::FloatTy, 4), 4);
+   AllocaInst *slot = new AllocaInst(slotType, name("tmpRes"), insertAt);
+
+   /* Indexing the slot with (0, 0) yields a pointer to its first vector,
+    * which is what gets returned. */
+   std::vector<Value*> zeroes;
+   zeroes.push_back(m_storage->constantInt(0));
+   zeroes.push_back(m_storage->constantInt(0));
+
+   return new GetElementPtrInst(slot, zeroes.begin(), zeroes.end(),
+                                name("allocaPtr"), insertAt);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::allocaToResult(llvm::Value *allocaPtr)
+{
+   /* Instruction name prefixes, indexed by channel (x, y, z, w). */
+   static const char * const ptrNames[4] = { "xPtr", "yPtr", "zPtr", "wPtr" };
+   static const char * const resNames[4] = { "xRes", "yRes", "zRes", "wRes" };
+
+   BasicBlock *insertAt = m_builder.GetInsertBlock();
+   GetElementPtrInst *channelPtr[4];
+   std::vector<llvm::Value*> res(4);
+   int chan;
+
+   /* First emit the four element pointers allocaPtr[0..3] ... */
+   for (chan = 0; chan < 4; ++chan) {
+      channelPtr[chan] = new GetElementPtrInst(allocaPtr,
+                                               m_storage->constantInt(chan),
+                                               name(ptrNames[chan]),
+                                               insertAt);
+   }
+
+   /* ... then the four loads, keeping the original instruction order
+    * (all pointers are created before any load). */
+   for (chan = 0; chan < 4; ++chan) {
+      res[chan] = new LoadInst(channelPtr[chan], name(resNames[chan]),
+                               false, insertAt);
+   }
+   return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::dp4(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   /* Emit a call to the builtin registered for DP4 and return its four result channels. */
+   return callBuiltin(function(TGSI_OPCODE_DP4), in1, in2);
+}
+
+std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std::vector<llvm::Value*> in1)
+{
+   /* The result slot is always the first call argument. */
+   llvm::Value *resultSlot = allocaTemp();
+   std::vector<Value*> args(1, resultSlot);
+
+   /* It is followed by the x, y, z, w channels of the operand. */
+   args.insert(args.end(), in1.begin(), in1.begin() + 4);
+
+   CallInst *invoke = m_builder.CreateCall(func, args.begin(), args.end());
+   invoke->setCallingConv(CallingConv::C);
+   invoke->setTailCall(false);
+
+   /* Load the four result channels back out of the slot. */
+   return allocaToResult(resultSlot);
+}
+
+std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std::vector<llvm::Value*> in1,
+                                                 const std::vector<llvm::Value*> in2)
+{
+   const std::vector<llvm::Value*> *operands[2] = { &in1, &in2 };
+
+   /* The result slot is always the first call argument. */
+   llvm::Value *resultSlot = allocaTemp();
+   std::vector<Value*> args(1, resultSlot);
+
+   /* Then the x, y, z, w channels of each operand, in operand order. */
+   for (int op = 0; op < 2; ++op)
+      for (int chan = 0; chan < 4; ++chan)
+         args.push_back((*operands[op])[chan]);
+
+   CallInst *invoke = m_builder.CreateCall(func, args.begin(), args.end());
+   invoke->setCallingConv(CallingConv::C);
+   invoke->setTailCall(false);
+
+   /* Load the four result channels back out of the slot. */
+   return allocaToResult(resultSlot);
+}
+
+std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std::vector<llvm::Value*> in1,
+                                                 const std::vector<llvm::Value*> in2,
+                                                 const std::vector<llvm::Value*> in3)
+{
+   /* Operands in the order their channels are passed to the builtin. */
+   const std::vector<llvm::Value*> *operands[3] = { &in1, &in2, &in3 };
+
+   /* The result slot is always the first call argument; it is
+    * allocated before any operand is appended. */
+   llvm::Value *resultSlot = allocaTemp();
+   std::vector<Value*> args(1, resultSlot);
+
+   /* Then the x, y, z, w channels of each operand, in operand order,
+    * for thirteen arguments in total. */
+   for (int op = 0; op < 3; ++op)
+      for (int chan = 0; chan < 4; ++chan)
+         args.push_back((*operands[op])[chan]);
+
+   CallInst *invoke = m_builder.CreateCall(func, args.begin(), args.end());
+   invoke->setCallingConv(CallingConv::C);
+   invoke->setTailCall(false);
+
+   /* Load the four result channels back out of the slot. */
+   return allocaToResult(resultSlot);
+}
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h
index 4169dcbb2e..3ef51dcaff 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.h
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.h
@@ -30,6 +30,7 @@
 
 #include <llvm/Support/LLVMBuilder.h>
 
+#include <map>
 #include <vector>
 
 namespace llvm {
@@ -47,9 +48,12 @@ public:
                    llvm::BasicBlock *block, StorageSoa *storage);
 
    std::vector<llvm::Value*> arl(const std::vector<llvm::Value*> in);
-
    std::vector<llvm::Value*> add(const std::vector<llvm::Value*> in1,
                                  const std::vector<llvm::Value*> in2);
+   std::vector<llvm::Value*> dp3(const std::vector<llvm::Value*> in1,
+                                 const std::vector<llvm::Value*> in2);
+   std::vector<llvm::Value*> dp4(const std::vector<llvm::Value*> in1,
+                                 const std::vector<llvm::Value*> in2);
    std::vector<llvm::Value*> madd(const std::vector<llvm::Value*> in1,
                                   const std::vector<llvm::Value*> in2,
                                   const std::vector<llvm::Value*> in3);
@@ -62,9 +66,29 @@ private:
    const char * name(const char *prefix) const;
    llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
                                llvm::Value *z, llvm::Value *w);
+   void createFunctionMap();
+   void createBuiltins();
+   llvm::Function *function(int);
+   llvm::Module *currentModule() const;
+   llvm::Value *allocaTemp();
+   std::vector<llvm::Value*> allocaToResult(llvm::Value *allocaPtr);
+   std::vector<llvm::Value*> callBuiltin(llvm::Function *func,
+                                         const std::vector<llvm::Value*> in1);
+   std::vector<llvm::Value*> callBuiltin(llvm::Function *func,
+                                         const std::vector<llvm::Value*> in1,
+                                         const std::vector<llvm::Value*> in2);
+   std::vector<llvm::Value*> callBuiltin(llvm::Function *func,
+                                         const std::vector<llvm::Value*> in1,
+                                         const std::vector<llvm::Value*> in2,
+                                         const std::vector<llvm::Value*> in3);
 private:
    llvm::LLVMFoldingBuilder  m_builder;
    StorageSoa *m_storage;
+
+   std::map<int, std::string> m_functionsMap;
+   std::map<int, llvm::Function*> m_functions;
+   llvm::Module *m_builtins;
+
 private:
    mutable int  m_idx;
    mutable char m_name[32];
diff --git a/src/gallium/auxiliary/gallivm/soabuiltins.c b/src/gallium/auxiliary/gallivm/soabuiltins.c
new file mode 100644
index 0000000000..24c14e1b69
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/soabuiltins.c
@@ -0,0 +1,72 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /*
+  * This file is compiled with clang into the LLVM bitcode
+  *
+  * Authors:
+  *   Zack Rusin zack@tungstengraphics.com
+  */
+typedef __attribute__(( ocu_vector_type(4) )) float float4; /* 4-element float vector */
+
+void dp3(float4 *res,
+         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
+         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
+{
+   /* Three-component dot product; the w operands are accepted but unused. */
+   const float4 xy  = (tmp0x * tmp1x) + (tmp0y * tmp1y);
+   const float4 dot = xy + (tmp0z * tmp1z);
+   int chan;
+   /* Broadcast the result to all four output channels. */
+   for (chan = 0; chan < 4; chan++)
+      res[chan] = dot;
+}
+
+
+void dp4(float4 *res,
+         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
+         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
+{
+   /* Four-component dot product of the two operands. */
+   const float4 xy  = (tmp0x * tmp1x) + (tmp0y * tmp1y);
+   const float4 dot = (xy + (tmp0z * tmp1z)) + (tmp0w * tmp1w);
+   int chan;
+   /* Broadcast the result to all four output channels. */
+   for (chan = 0; chan < 4; chan++)
+      res[chan] = dot;
+}
+
+#if 0 /* compiled out: sample caller that runs dp3 on in[0..7] and copies res[1] to out[1] */
+void yo(float4 *out, float4 *in)
+{
+   float4 res[4];
+
+   dp3(res, in[0], in[1], in[2], in[3],
+       in[4], in[5], in[6], in[7]);
+   out[1] = res[1];
+}
+#endif
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp
index ed0674a96f..bb6fe3d7e1 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.cpp
+++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp
@@ -277,7 +277,7 @@ llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &v
    return constVector;
 }
 
-std::vector<llvm::Value*> StorageSoa::load(Argument type, int idx, int swizzle,
+std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, int swizzle,
                                            llvm::Value *indIdx)
 {
    std::vector<llvm::Value*> val(4);
@@ -292,25 +292,29 @@ std::vector<llvm::Value*> StorageSoa::load(Argument type, int idx, int swizzle,
    debug_printf("XXXXXXXXX realIdx = %p, indIdx = %p\n", realIndex, indIdx);
 
    switch(type) {
-   case Input:
+   case TGSI_FILE_INPUT:
       val = inputElement(realIndex);
       break;
-   case Output:
+   case TGSI_FILE_OUTPUT:
       val = outputElement(realIndex);
       break;
-   case Temp:
+   case TGSI_FILE_TEMPORARY:
       val = tempElement(realIndex);
       break;
-   case Const:
+   case TGSI_FILE_CONSTANT:
       val = constElement(realIndex);
       break;
-   case Immediate:
+   case TGSI_FILE_IMMEDIATE:
       val = immediateElement(realIndex);
       break;
-   case Address:
+   case TGSI_FILE_ADDRESS:
       debug_printf("Address not handled in the load phase!\n");
       assert(0);
       break;
+   default:
+      debug_printf("Unknown load!\n");
+      assert(0);
+      break;
    }
    if (!gallivm_is_swizzle(swizzle))
       return val;
@@ -324,21 +328,21 @@ std::vector<llvm::Value*> StorageSoa::load(Argument type, int idx, int swizzle,
    return res;
 }
 
-void StorageSoa::store(Argument type, int idx, const std::vector<llvm::Value*> &val,
+void StorageSoa::store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
                        int mask)
 {
    llvm::Value *out = 0;
    switch(type) {
-   case Output:
+   case TGSI_FILE_OUTPUT:
       out = m_output;
       break;
-   case Temp:
+   case TGSI_FILE_TEMPORARY:
       out = m_temps;
       break;
-   case Input:
+   case TGSI_FILE_INPUT:
       out = m_input;
       break;
-   case Address: {
+   case TGSI_FILE_ADDRESS: {
       llvm::Value *addr = m_addresses[idx];
       if (!addr) {
          addAddress(idx);
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h b/src/gallium/auxiliary/gallivm/storagesoa.h
index 6443351f27..ae2fc7c6ae 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.h
+++ b/src/gallium/auxiliary/gallivm/storagesoa.h
@@ -28,6 +28,8 @@
 #ifndef STORAGESOA_H
 #define STORAGESOA_H
 
+#include <pipe/p_shader_tokens.h>
+
 #include <vector>
 #include <list>
 #include <map>
@@ -46,15 +48,6 @@ namespace llvm {
 class StorageSoa
 {
 public:
-   enum Argument {
-      Input,
-      Output,
-      Temp,
-      Const,
-      Immediate,
-      Address
-   };
-public:
    StorageSoa(llvm::BasicBlock *block,
               llvm::Value *input,
               llvm::Value *output,
@@ -62,9 +55,9 @@ public:
               llvm::Value *temps);
 
 
-   std::vector<llvm::Value*> load(Argument type, int idx, int swizzle, 
+   std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle, 
                                   llvm::Value *indIdx =0);
-   void store(Argument type, int idx, const std::vector<llvm::Value*> &val,
+   void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
               int mask);
 
    void addImmediate(float *vec);
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index 2cb4acce32..3f65865a5a 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -708,25 +708,9 @@ translate_instructionir(llvm::Module *module,
       if (src->SrcRegister.Indirect) {
          indIdx = storage->addrElement(src->SrcRegisterInd.Index);
       }
-      if (src->SrcRegister.File == TGSI_FILE_CONSTANT) {
-         val = storage->load(StorageSoa::Const,
-                             src->SrcRegister.Index, swizzle, indIdx);
-      } else if (src->SrcRegister.File == TGSI_FILE_INPUT) {
-         val = storage->load(StorageSoa::Input,
-                             src->SrcRegister.Index, swizzle, indIdx);
-      } else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) {
-         val = storage->load(StorageSoa::Temp,
-                             src->SrcRegister.Index, swizzle, indIdx);
-      } else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) {
-         val = storage->load(StorageSoa::Output,
-                             src->SrcRegister.Index, swizzle, indIdx);
-      } else if (src->SrcRegister.File == TGSI_FILE_IMMEDIATE) {
-         val = storage->load(StorageSoa::Immediate,
-                             src->SrcRegister.Index, swizzle, indIdx);
-      } else {
-         fprintf(stderr, "ERROR: not supported llvm source %d\n", src->SrcRegister.File);
-         return;
-      }
+
+      val = storage->load((enum tgsi_file_type)src->SrcRegister.File,
+                          src->SrcRegister.Index, swizzle, indIdx);
 
       inputs[i] = val;
    }
@@ -763,9 +747,11 @@ translate_instructionir(llvm::Module *module,
    }
       break;
    case TGSI_OPCODE_DP3: {
+      out = instr->dp3(inputs[0], inputs[1]);
    }
       break;
    case TGSI_OPCODE_DP4: {
+      out = instr->dp4(inputs[0], inputs[1]);
    }
       break;
    case TGSI_OPCODE_DST: {
@@ -1067,19 +1053,8 @@ translate_instructionir(llvm::Module *module,
    for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
       struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
 
-      if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
-         storage->store(StorageSoa::Output,
-                        dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
-      } else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) {
-         storage->store(StorageSoa::Temp,
-                        dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
-      } else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) {
-         storage->store(StorageSoa::Address,
-                        dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
-      } else {
-         fprintf(stderr, "ERROR: unsupported LLVM destination!");
-         assert(!"wrong destination");
-      }
+      storage->store((enum tgsi_file_type)dst->DstRegister.File,
+                     dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
    }
 }
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 95a2d6fcbb..a996218ce7 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -306,6 +306,11 @@ void spe_init_func(struct spe_function *p, unsigned code_size)
 {
     p->store = align_malloc(code_size, 16);
     p->csr = p->store;
+    
+    /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
+     */
+    p->regs[0] = ~7;
+    p->regs[1] = (1U << (80 - 64)) - 1;
 }
 
 
@@ -317,6 +322,48 @@ void spe_release_func(struct spe_function *p)
 }
 
 
+int spe_allocate_available_register(struct spe_function *p)
+{  /* Claim the lowest-numbered free register; returns its number, or -1 if none. */
+   unsigned i;
+   for (i = 0; i < 128; i++) {
+      const uint64_t mask = (1ULL << (i % 64));   /* bit within the 64-bit word */
+      const unsigned idx = i / 64;                /* which word of regs[2] */
+
+      if ((p->regs[idx] & mask) != 0) {
+         p->regs[idx] &= ~mask;   /* cleared bit = allocated */
+         return i;
+      }
+   }
+
+   return -1;
+}
+
+
+int spe_allocate_register(struct spe_function *p, int reg)
+{  /* Mark a specific register (0..127) as allocated and return its number. */
+   const unsigned idx = reg / 64;   /* regs[] packs 64 registers per word */
+   const unsigned bit = reg % 64;
+
+   assert((p->regs[idx] & (1ULL << bit)) != 0);   /* must currently be free */
+
+   p->regs[idx] &= ~(1ULL << bit);
+   return reg;
+}
+
+
+void spe_release_register(struct spe_function *p, int reg)
+{  /* Return a previously allocated register (0..127) to the free pool. */
+   const unsigned idx = reg / 64;   /* regs[] packs 64 registers per word */
+   const unsigned bit = reg % 64;
+
+   assert((p->regs[idx] & (1ULL << bit)) == 0);   /* must currently be allocated */
+
+   p->regs[idx] |= (1ULL << bit);
+}
+
+
+
+
 void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 10ce44b3a0..5a1eb1ed8d 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -39,11 +39,27 @@ struct spe_function {
     uint32_t *store;
     uint32_t *csr;
     const char *fn;
+
+    /**
+     * Mask of used / unused registers
+     *
+     * Each set bit corresponds to an available register.  Each cleared bit
+     * corresponds to an allocated register.
+     *
+     * \sa
+     * spe_allocate_register, spe_allocate_available_register,
+     * spe_release_register
+     */
+    uint64_t regs[2];
 };
 
 extern void spe_init_func(struct spe_function *p, unsigned code_size);
 extern void spe_release_func(struct spe_function *p);
 
+extern int spe_allocate_available_register(struct spe_function *p);
+extern int spe_allocate_register(struct spe_function *p, int reg);
+extern void spe_release_register(struct spe_function *p, int reg);
+
 #endif /* RTASM_PPC_SPE_H */
 
 #ifndef EMIT_
diff --git a/src/gallium/auxiliary/sct/Makefile b/src/gallium/auxiliary/sct/Makefile
new file mode 100644
index 0000000000..516d1756cf
--- /dev/null
+++ b/src/gallium/auxiliary/sct/Makefile
@@ -0,0 +1,12 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = sct
+
+C_SOURCES = \
+	sct.c
+
+include ../../Makefile.template
+
+symlinks:
+
diff --git a/src/gallium/auxiliary/sct/SConscript b/src/gallium/auxiliary/sct/SConscript
new file mode 100644
index 0000000000..76927d973f
--- /dev/null
+++ b/src/gallium/auxiliary/sct/SConscript
@@ -0,0 +1,9 @@
+Import('*')
+
+sct = env.ConvenienceLibrary(
+	target = 'sct',
+	source = [
+		'sct.c'
+	])
+
+auxiliaries.insert(0, sct)
diff --git a/src/gallium/auxiliary/sct/sct.c b/src/gallium/auxiliary/sct/sct.c
new file mode 100644
index 0000000000..97ee5882a1
--- /dev/null
+++ b/src/gallium/auxiliary/sct/sct.c
@@ -0,0 +1,453 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "pipe/p_util.h"
+#include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
+#include "sct.h"
+
+
+struct texture_list
+{  /* Node of a context's singly linked list of used textures. */
+   struct pipe_texture *texture;   /* set via pipe_texture_reference() when the node is added */
+   struct texture_list *next;      /* following node, or NULL at the end */
+};
+
+
+
+#define MAX_SURFACES  ((PIPE_MAX_COLOR_BUFS) + 1)
+
+struct sct_context
+{  /* Per-context tracking record, linked into surface_context_tracker::contexts. */
+   const struct pipe_context *context;   /* lookup key: the context being tracked */
+
+   /** surfaces the context is drawing into */
+   struct pipe_surface *surfaces[MAX_SURFACES];
+
+   /** currently bound textures */
+   struct pipe_texture *textures[PIPE_MAX_SAMPLERS];
+
+   /** previously bound textures, used but not flushed */
+   struct texture_list *textures_used;
+
+   boolean needs_flush;   /* NOTE(review): not read or written by the code in this file */
+
+   struct sct_context *next;   /* next record in the tracker's list */
+};
+
+
+
+struct sct_surface
+{  /* Per-surface tracking record, linked into surface_context_tracker::surfaces. */
+   const struct pipe_surface *surface;   /* lookup key: the surface being tracked */
+
+   /** list of contexts drawing to this surface */
+   struct sct_context_list *contexts;
+
+   struct sct_surface *next;   /* next record in the tracker's list */
+};
+
+
+
+/**
+ * Look up the tracking record for a pipe_surface; NULL if it is not tracked.
+ */
+static struct sct_surface *
+find_surface_info(struct surface_context_tracker *sct,
+                  const struct pipe_surface *surface)
+{
+   struct sct_surface *node = sct->surfaces;
+
+   while (node && node->surface != surface)
+      node = node->next;
+   return node;
+}
+
+
+/**
+ * Like find_surface_info(), but allocate and register a new record when the
+ * surface is not tracked yet.  Returns NULL only if that allocation fails.
+ */
+static struct sct_surface *
+find_create_surface_info(struct surface_context_tracker *sct,
+                         const struct pipe_surface *surface)
+{
+   struct sct_surface *record = find_surface_info(sct, surface);
+
+   if (!record) {
+      record = CALLOC_STRUCT(sct_surface);
+      if (!record)
+         return NULL;
+
+      /* push the fresh record onto the front of the tracker's list */
+      record->surface = surface;
+      record->next = sct->surfaces;
+      sct->surfaces = record;
+   }
+
+   return record;
+}
+
+
+/**
+ * Look up the tracking record for a pipe_context; NULL if none exists.
+ */
+static struct sct_context *
+find_context_info(struct surface_context_tracker *sct,
+                  const struct pipe_context *context)
+{
+   struct sct_context *node = sct->contexts;
+
+   while (node && node->context != context)
+      node = node->next;
+   return node;
+}
+
+
+/**
+ * Like find_context_info(), but allocate and register a new record when the
+ * context is not tracked yet.  Returns NULL only if that allocation fails.
+ */
+static struct sct_context *
+find_create_context_info(struct surface_context_tracker *sct,
+                         const struct pipe_context *context)
+{
+   struct sct_context *record = find_context_info(sct, context);
+
+   if (!record) {
+      record = CALLOC_STRUCT(sct_context);
+      if (!record)
+         return NULL;
+
+      /* push the fresh record onto the front of the tracker's list */
+      record->context = context;
+      record->next = sct->contexts;
+      sct->contexts = record;
+   }
+
+   return record;
+}
+
+
+/**
+ * Report whether the context already appears in the surface's context list.
+ */
+static boolean
+find_surface_context(const struct sct_surface *si,
+                     const struct pipe_context *context)
+{
+   const struct sct_context_list *node = si->contexts;
+
+   /* walk until we hit the context or run off the end */
+   while (node && node->context != context)
+      node = node->next;
+
+   return node ? TRUE : FALSE;
+}
+
+
+/**
+ * Prepend a context to a surface's list of contexts.  Silently does
+ * nothing if the list node cannot be allocated.
+ */
+static void
+add_context_to_surface(struct sct_surface *si,
+                       const struct pipe_context *context)
+{
+   struct sct_context_list *node = CALLOC_STRUCT(sct_context_list);
+   if (!node)
+      return;
+   node->context = context;
+   node->next = si->contexts;
+   si->contexts = node;
+}
+
+
+/**
+ * Unlink and free every node for the context in the surface's context list.
+ */
+static void
+remove_context_from_surface(struct sct_surface *si,
+                            const struct pipe_context *context)
+{
+   struct sct_context_list *prev = NULL, *curr, *next;
+
+   for (curr = si->contexts; curr; curr = next) {
+      /* fetch the successor first: both paths need it, and curr may be freed */
+      next = curr->next;
+      if (curr->context == context) {
+         if (prev)
+            prev->next = next;
+         else
+            si->contexts = next;
+         FREE(curr);
+      }
+      else {
+         prev = curr;
+      }
+   }
+}
+
+
+/**
+ * Drop the context from the surface's context list, if the surface is tracked.
+ */
+static void
+unbind_context_surface(struct surface_context_tracker *sct,
+                       struct pipe_context *context,
+                       struct pipe_surface *surface)
+{
+   struct sct_surface *record = find_surface_info(sct, surface);
+   if (!record)
+      return;
+   remove_context_from_surface(record, context);
+}
+
+
+/**
+ * Bind context to a set of surfaces (color + Z).
+ * Like MakeCurrent().
+ */
+void
+sct_bind_surfaces(struct surface_context_tracker *sct,
+                  struct pipe_context *context,
+                  uint num_surf,
+                  struct pipe_surface **surfaces)
+{
+   struct sct_context *ci = find_create_context_info(sct, context);
+   uint i;
+
+   if (!ci) {
+      return; /* out of memory */
+   }
+
+   /* unbind currently bound surfaces.  NOTE(review): nothing in this file
+    * stores into ci->surfaces[], so this looks like a no-op -- confirm. */
+   for (i = 0; i < MAX_SURFACES; i++) {
+      if (ci->surfaces[i])
+         unbind_context_surface(sct, context, ci->surfaces[i]);
+   }
+
+   /* bind new surfaces */
+   for (i = 0; i < num_surf; i++) {
+      struct sct_surface *si = find_create_surface_info(sct, surfaces[i]);
+      /* si is NULL when its record could not be allocated: skip it */
+      if (si && !find_surface_context(si, context))
+         add_context_to_surface(si, context);
+   }
+}
+
+
+/**
+ * Return list of contexts bound to a surface (NULL if surface is untracked).
+ */
+const struct sct_context_list *
+sct_get_surface_contexts(struct surface_context_tracker *sct,
+                         const struct pipe_surface *surface)
+{
+   const struct sct_surface *si = find_surface_info(sct, surface);
+   return si ? si->contexts : NULL;
+}
+
+
+
+static boolean
+find_texture(const struct sct_context *ci,
+             const struct pipe_texture *texture)
+{
+   /* Scan the context's used-texture list for an exact pointer match. */
+   const struct texture_list *node = ci->textures_used;
+   while (node) {
+      if (node->texture == texture)
+         return TRUE;
+      node = node->next;
+   }
+   return FALSE;
+}
+
+
+/**
+ * Add the given texture to the context's list of used textures.
+ * NULL textures (empty sampler slots) and duplicates are ignored.
+ */
+static void
+add_texture_used(struct sct_context *ci,
+                 struct pipe_texture *texture)
+{
+   struct texture_list *tl;
+   if (!texture || find_texture(ci, texture))
+      return;
+   tl = CALLOC_STRUCT(texture_list);
+   if (tl) {
+      pipe_texture_reference(&tl->texture, texture);
+      tl->next = ci->textures_used;   /* insert at head */
+      ci->textures_used = tl;
+   }
+}
+
+
+/**
+ * Bind a texture to a context (tracked on demand); bad units are ignored.
+ */
+void
+sct_bind_texture(struct surface_context_tracker *sct,
+                 struct pipe_context *context,
+                 uint unit,
+                 struct pipe_texture *tex)
+{
+   struct sct_context *ci = find_create_context_info(sct, context);
+
+   if (!ci || unit >= PIPE_MAX_SAMPLERS)
+      return; /* out of memory, or no such sampler slot */
+   if (ci->textures[unit] != tex) {
+      add_texture_used(ci, tex);   /* put texture on the 'used' list */
+      pipe_texture_reference(&ci->textures[unit], tex);   /* bind new */
+   }
+}
+
+
+/**
+ * Check if the given texture has been used by the rendering context
+ * since the last call to sct_flush_textures().  FALSE if context is untracked.
+ */
+boolean
+sct_is_texture_used(struct surface_context_tracker *sct,
+                    const struct pipe_context *context,
+                    const struct pipe_texture *texture)
+{
+   const struct sct_context *ci = find_context_info(sct, context);
+   return ci ? find_texture(ci, texture) : FALSE;
+}
+
+
+/**
+ * To be called when the image contents of a texture are changed, such
+ * as for gl[Copy]TexSubImage().
+ * XXX this may not be needed
+ */
+void
+sct_update_texture(struct pipe_texture *tex)
+{
+   /* currently a no-op: tex is not examined */
+}
+
+
+/**
+ * When a scene is flushed/rendered we can release the list of
+ * used textures.  Does nothing for a context that is not tracked.
+ */
+void
+sct_flush_textures(struct surface_context_tracker *sct,
+                   struct pipe_context *context)
+{
+   struct sct_context *ci = find_context_info(sct, context);
+   struct texture_list *tl, *next;
+   uint i;
+
+   if (!ci)
+      return; /* nothing tracked for this context */
+   for (tl = ci->textures_used; tl; tl = next) {
+      next = tl->next;
+      pipe_texture_release(&tl->texture);
+      FREE(tl);
+   }
+   ci->textures_used = NULL;
+   /* put the currently bound textures on the 'used' list */
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
+      add_texture_used(ci, ci->textures[i]);
+}
+
+
+
+void
+sct_destroy_context(struct surface_context_tracker *sct,
+                    struct pipe_context *context)
+{
+   struct sct_context **link = &sct->contexts;
+   struct sct_surface *si;
+   uint i;
+   /* XXX should we require an unbinding first? */
+   for (si = sct->surfaces; si; si = si->next)
+      remove_context_from_surface(si, context);
+
+   /* unlink and free every context_info record for this context */
+   while (*link) {
+      struct sct_context *ci = *link;
+      if (ci->context != context) {
+         link = &ci->next;
+         continue;
+      }
+      *link = ci->next;
+      /* release the texture references the record holds before freeing it */
+      while (ci->textures_used) {
+         struct texture_list *tl = ci->textures_used;
+         ci->textures_used = tl->next;
+         pipe_texture_reference(&tl->texture, NULL);
+         FREE(tl);
+      }
+      for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
+         pipe_texture_reference(&ci->textures[i], NULL);
+      FREE(ci);
+   }
+}
+
+
+void
+sct_destroy_surface(struct surface_context_tracker *sct,
+                    struct pipe_surface *surface)
+{
+   struct sct_surface **link = &sct->surfaces;
+   struct sct_context *ci;
+   uint i;
+
+   /* debug/sanity: no context should be bound to surface */
+   for (ci = sct->contexts; ci; ci = ci->next) {
+      for (i = 0; i < MAX_SURFACES; i++) {
+         assert(ci->surfaces[i] != surface);
+      }
+   }
+
+   /* remove surface from sct_surface list */
+   while (*link) {
+      struct sct_surface *si = *link;
+
+      if (si->surface != surface) {
+         link = &si->next;
+         continue;
+      }
+      *link = si->next;   /* unlink */
+
+      /* free the per-surface context list too, otherwise its nodes leak */
+      while (si->contexts) {
+         struct sct_context_list *cl = si->contexts;
+         si->contexts = cl->next;
+         FREE(cl);
+      }
+      FREE(si);
+   }
+}
diff --git a/src/gallium/auxiliary/sct/sct.h b/src/gallium/auxiliary/sct/sct.h
new file mode 100644
index 0000000000..cf7c4d3bdf
--- /dev/null
+++ b/src/gallium/auxiliary/sct/sct.h
@@ -0,0 +1,123 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Surface/Context Tracking
+ *
+ * For some drivers, we need to monitor the binding between contexts and
+ * surfaces/textures.
+ * This code may evolve quite a bit...
+ */
+
+
+#ifndef SCT_H
+#define SCT_H
+
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+
+struct pipe_context;
+struct pipe_surface;
+struct pipe_texture;   /* used by the sct_*_texture() prototypes below */
+
+struct sct_context;
+struct sct_surface;
+
+/**
+ * Per-device info, basically
+ */
+struct surface_context_tracker
+{
+   struct sct_context *contexts;
+   struct sct_surface *surfaces;
+};
+
+
+
+/**
+ * Simple linked list of contexts
+ */
+struct sct_context_list
+{
+   const struct pipe_context *context;
+   struct sct_context_list *next;
+};
+
+
+
+extern void
+sct_bind_surfaces(struct surface_context_tracker *sct,
+                  struct pipe_context *context,
+                  uint num_surf,
+                  struct pipe_surface **surfaces);
+
+
+extern void
+sct_bind_texture(struct surface_context_tracker *sct,
+                 struct pipe_context *context,
+                 uint unit,
+                 struct pipe_texture *texture);
+
+
+extern void
+sct_update_texture(struct pipe_texture *tex);
+
+
+extern boolean
+sct_is_texture_used(struct surface_context_tracker *sct,
+                    const struct pipe_context *context,
+                    const struct pipe_texture *texture);
+
+extern void
+sct_flush_textures(struct surface_context_tracker *sct,
+                   struct pipe_context *context);
+
+
+extern const struct sct_context_list *
+sct_get_surface_contexts(struct surface_context_tracker *sct,
+                         const struct pipe_surface *surf);
+
+
+extern void
+sct_destroy_context(struct surface_context_tracker *sct,
+                    struct pipe_context *context);
+
+
+extern void
+sct_destroy_surface(struct surface_context_tracker *sct,
+                    struct pipe_surface *surface);
+
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* SCT_H */
diff --git a/src/gallium/auxiliary/sct/usage.c b/src/gallium/auxiliary/sct/usage.c
new file mode 100644
index 0000000000..6227f19962
--- /dev/null
+++ b/src/gallium/auxiliary/sct/usage.c
@@ -0,0 +1,61 @@
+/* surface / context tracking */
+
+
+/*
+
+context A:
+  render to texture T
+
+context B:
+  texture from T
+
+-----------------------
+
+flush surface:
+  which contexts are bound to the surface?
+
+-----------------------
+
+glTexSubImage():
+  which contexts need to be flushed?
+
+ */
+
+
+/*
+
+in MakeCurrent():
+
+  call sct_bind_surfaces(context, list of surfaces) to update the
+  dependencies between context and surfaces
+
+
+in SurfaceFlush(), or whatever it is in D3D:
+
+  call sct_get_surface_contexts(surface) to get a list of contexts
+  which are currently bound to the surface.
+
+
+
+in BindTexture():
+
+  call sct_bind_texture(context, texture) to indicate that the texture
+  is used in the scene.
+
+
+in glTexSubImage() or RenderToTexture():
+
+  call sct_is_texture_used(context, texture) to determine if the texture
+  has been used in the scene, but the scene's not flushed.  If TRUE is
+  returned it means the scene has to be rendered/flushed before the contents
+  of the texture can be changed.
+
+
+in psb_scene_flush/terminate():
+
+  call sct_flush_textures(context) to tell the SCT that the textures which
+  were used in the scene can be released.
+
+
+
+*/
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
index ac52441400..f2ed9e0353 100644
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
@@ -1220,7 +1220,8 @@ fetch_texel( struct tgsi_sampler *sampler,
 static void
 exec_tex(struct tgsi_exec_machine *mach,
          const struct tgsi_full_instruction *inst,
-         boolean biasLod)
+         boolean biasLod,
+         boolean projected)
 {
    const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
    union tgsi_exec_channel r[8];
@@ -1234,17 +1235,9 @@ exec_tex(struct tgsi_exec_machine *mach,
 
       FETCH(&r[0], 0, CHAN_X);
 
-      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
-      case TGSI_EXTSWIZZLE_W:
+      if (projected) {
          FETCH(&r[1], 0, CHAN_W);
          micro_div( &r[0], &r[0], &r[1] );
-         break;
-
-      case TGSI_EXTSWIZZLE_ONE:
-         break;
-
-      default:
-         assert (0);
       }
 
       if (biasLod) {
@@ -1266,19 +1259,11 @@ exec_tex(struct tgsi_exec_machine *mach,
       FETCH(&r[1], 0, CHAN_Y);
       FETCH(&r[2], 0, CHAN_Z);
 
-      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
-      case TGSI_EXTSWIZZLE_W:
+      if (projected) {
          FETCH(&r[3], 0, CHAN_W);
          micro_div( &r[0], &r[0], &r[3] );
          micro_div( &r[1], &r[1], &r[3] );
          micro_div( &r[2], &r[2], &r[3] );
-         break;
-
-      case TGSI_EXTSWIZZLE_ONE:
-         break;
-
-      default:
-         assert (0);
       }
 
       if (biasLod) {
@@ -1300,19 +1285,11 @@ exec_tex(struct tgsi_exec_machine *mach,
       FETCH(&r[1], 0, CHAN_Y);
       FETCH(&r[2], 0, CHAN_Z);
 
-      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
-      case TGSI_EXTSWIZZLE_W:
+      if (projected) {
          FETCH(&r[3], 0, CHAN_W);
          micro_div( &r[0], &r[0], &r[3] );
          micro_div( &r[1], &r[1], &r[3] );
          micro_div( &r[2], &r[2], &r[3] );
-         break;
-
-      case TGSI_EXTSWIZZLE_ONE:
-         break;
-
-      default:
-         assert (0);
       }
 
       if (biasLod) {
@@ -2007,14 +1984,14 @@ exec_instruction(
       /* simple texture lookup */
       /* src[0] = texcoord */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, FALSE);
+      exec_tex(mach, inst, FALSE, FALSE);
       break;
 
    case TGSI_OPCODE_TXB:
       /* Texture lookup with lod bias */
       /* src[0] = texcoord (src[0].w = LOD bias) */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, TRUE);
+      exec_tex(mach, inst, TRUE, FALSE);
       break;
 
    case TGSI_OPCODE_TXD:
@@ -2030,7 +2007,14 @@ exec_instruction(
       /* Texture lookup with explit LOD */
       /* src[0] = texcoord (src[0].w = LOD) */
       /* src[1] = sampler unit */
-      exec_tex(mach, inst, TRUE);
+      exec_tex(mach, inst, TRUE, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXP:
+      /* Texture lookup with projection */
+      /* src[0] = texcoord (src[0].w = projection) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, FALSE, TRUE);
       break;
 
    case TGSI_OPCODE_UP2H:
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_build.c b/src/gallium/auxiliary/tgsi/util/tgsi_build.c
index a00ff1c2a5..9c883ab704 100644
--- a/src/gallium/auxiliary/tgsi/util/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_build.c
@@ -719,7 +719,6 @@ tgsi_build_full_instruction(
             reg->SrcRegisterExtSwz.NegateY,
             reg->SrcRegisterExtSwz.NegateZ,
             reg->SrcRegisterExtSwz.NegateW,
-            reg->SrcRegisterExtSwz.ExtDivide,
             prev_token,
             instruction,
             header );
@@ -1057,7 +1056,6 @@ tgsi_default_src_register_ext_swz( void )
    src_register_ext_swz.NegateY = 0;
    src_register_ext_swz.NegateZ = 0;
    src_register_ext_swz.NegateW = 0;
-   src_register_ext_swz.ExtDivide = TGSI_EXTSWIZZLE_ONE;
    src_register_ext_swz.Padding = 0;
    src_register_ext_swz.Extended = 0;
 
@@ -1084,7 +1082,6 @@ tgsi_build_src_register_ext_swz(
    unsigned negate_y,
    unsigned negate_z,
    unsigned negate_w,
-   unsigned ext_divide,
    struct tgsi_token *prev_token,
    struct tgsi_instruction *instruction,
    struct tgsi_header *header )
@@ -1099,7 +1096,6 @@ tgsi_build_src_register_ext_swz(
    assert( negate_y <= 1 );
    assert( negate_z <= 1 );
    assert( negate_w <= 1 );
-   assert( ext_divide <= TGSI_EXTSWIZZLE_ONE );
 
    src_register_ext_swz = tgsi_default_src_register_ext_swz();
    src_register_ext_swz.ExtSwizzleX = ext_swizzle_x;
@@ -1110,7 +1106,6 @@ tgsi_build_src_register_ext_swz(
    src_register_ext_swz.NegateY = negate_y;
    src_register_ext_swz.NegateZ = negate_z;
    src_register_ext_swz.NegateW = negate_w;
-   src_register_ext_swz.ExtDivide = ext_divide;
 
    prev_token->Extended = 1;
    instruction_grow( instruction, header );
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_build.h b/src/gallium/auxiliary/tgsi/util/tgsi_build.h
index 607860e7fc..80bffc4ae7 100644
--- a/src/gallium/auxiliary/tgsi/util/tgsi_build.h
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_build.h
@@ -229,7 +229,6 @@ tgsi_build_src_register_ext_swz(
    unsigned negate_y,
    unsigned negate_z,
    unsigned negate_w,
-   unsigned ext_divide,
    struct tgsi_token *prev_token,
    struct tgsi_instruction *instruction,
    struct tgsi_header *header );
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_dump.c b/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
index 59be14a748..ceb407b884 100644
--- a/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
@@ -459,7 +459,8 @@ static const char *TGSI_OPCODES[] =
    "OPCODE_IFC",
    "OPCODE_BREAKC",
    "OPCODE_KIL",
-   "OPCODE_END"
+   "OPCODE_END",
+   "OPCODE_TXP"
 };
 
 static const char *TGSI_OPCODES_SHORT[] =
@@ -597,7 +598,8 @@ static const char *TGSI_OPCODES_SHORT[] =
    "IFC",
    "BREAKC",
    "KIL",
-   "END"
+   "END",
+   "TXP"
 };
 
 static const char *TGSI_SATS[] =
@@ -1361,10 +1363,6 @@ dump_instruction_verbose(
             TXT( "\nNegateW   : " );
             UID( src->SrcRegisterExtSwz.NegateW );
          }
-         if( deflt || fs->SrcRegisterExtSwz.ExtDivide != src->SrcRegisterExtSwz.ExtDivide ) {
-            TXT( "\nExtDivide  : " );
-            ENM( src->SrcRegisterExtSwz.ExtDivide, TGSI_EXTSWIZZLES );
-         }
          if( ignored ) {
             TXT( "\nPadding   : " );
             UIX( src->SrcRegisterExtSwz.Padding );
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index 906a46d6b4..2abbe9500e 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -7,7 +7,9 @@ C_SOURCES = \
 	p_debug.c \
 	p_tile.c \
 	p_util.c \
-	u_mm.c
+	u_handle_table.c \
+	u_mm.c \
+	u_snprintf.c
 
 include ../../Makefile.template
 
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
index 4717941434..2030214aa7 100644
--- a/src/gallium/auxiliary/util/SConscript
+++ b/src/gallium/auxiliary/util/SConscript
@@ -6,7 +6,9 @@ util = env.ConvenienceLibrary(
 		'p_debug.c',
 		'p_tile.c',
 		'p_util.c',
+		'u_handle_table.c',
 		'u_mm.c',
+		'u_snprintf.c',
 	])
 
 auxiliaries.insert(0, util)
diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c
index b9607a6ba7..09cabdae25 100644
--- a/src/gallium/auxiliary/util/p_debug.c
+++ b/src/gallium/auxiliary/util/p_debug.c
@@ -36,14 +36,37 @@
 #include <stdlib.h>
 #endif
 
-#include "pipe/p_debug.h" 
 #include "pipe/p_compiler.h" 
+#include "pipe/p_util.h" 
+#include "pipe/p_debug.h" 
+
+
+#ifdef WIN32
+static INLINE void 
+rpl_EngDebugPrint(const char *format, ...)
+{
+   va_list ap;
+   va_start(ap, format);
+   EngDebugPrint("", (PCHAR)format, ap);
+   va_end(ap);
+}
+
+int rpl_vsnprintf(char *, size_t, const char *, va_list);
+#endif
 
 
 void debug_vprintf(const char *format, va_list ap)
 {
 #ifdef WIN32
-   EngDebugPrint("Gallium3D: ", (PCHAR)format, ap);
+#ifndef WINCE
+   /* EngDebugPrint does not handle floating-point arguments, so we need to
+    * use our own vsnprintf implementation */
+   char buf[512 + 1];
+   rpl_vsnprintf(buf, sizeof(buf), format, ap);
+   rpl_EngDebugPrint("%s", buf);
+#else
+   /* TODO: Implement debug print for WINCE */
+#endif
 #else
    vfprintf(stderr, format, ap);
 #endif
@@ -59,18 +82,92 @@ void debug_printf(const char *format, ...)
 }
 
 
-static INLINE void debug_abort(void) 
+/* TODO: implement a debug_abort that calls EngBugCheckEx on WIN32 */
+
+
+static INLINE void debug_break(void) 
 {
-#ifdef WIN32
+#if (defined(__i386__) || defined(__386__)) && defined(__GNUC__)
+   __asm("int3");
+#elif (defined(__i386__) || defined(__386__)) && defined(__MSC__)
+   _asm {int 3};
+#elif defined(WIN32) && !defined(WINCE)
    EngDebugBreak();
 #else
    abort();
 #endif
 }
 
+#if defined(WIN32)
+ULONG_PTR debug_config_file = 0;
+void *mapped_config_file = 0;
+
+enum {
+	eAssertAbortEn = 0x1,
+};
+
+/* Check for aborts enabled: bit 0 of the ASCII digit at the start of the 8 byte gaDebug.cfg file. */
+static unsigned abort_en()
+{
+	if (!mapped_config_file)
+	{
+		mapped_config_file = EngMapFile(L"\\??\\c:\\gaDebug.cfg", 8, &debug_config_file);
+	}
+	/* EngMapFile returns NULL on failure (e.g. no config file): leave aborts disabled. */
+	if (!mapped_config_file)
+		return 0;
+	/* "0" clears the flag bits, "1" sets bit 0 (abort enable), "2" sets bit 1. */
+	return ((((char *)mapped_config_file)[0]) - 0x30) & eAssertAbortEn;
+}
+#else /* WIN32 */
+static unsigned abort_en()
+{
+	return !GETENV("GALLIUM_ABORT_ON_ASSERT");
+}
+#endif
 
 void debug_assert_fail(const char *expr, const char *file, unsigned line) 
 {
    debug_printf("%s:%i: Assertion `%s' failed.\n", file, line, expr);
-   debug_abort();
+   if (abort_en())
+   {
+      debug_break();
+   } else
+   {
+      debug_printf("continuing...\n");
+   }
+}
+
+
+#define DEBUG_MASK_TABLE_SIZE 256
+
+
+/**
+ * Mask hash table.
+ * 
+ * For now we just take the lower bits of the key and make no attempt to
+ * resolve collisions. Use a proper hash table when we have dozens of drivers.
+ */
+static uint32_t debug_mask_table[DEBUG_MASK_TABLE_SIZE];
+
+
+void debug_mask_set(uint32_t uuid, uint32_t mask) 
+{
+   unsigned hash = uuid & (DEBUG_MASK_TABLE_SIZE - 1);
+   debug_mask_table[hash] = mask;
+}
+
+
+uint32_t debug_mask_get(uint32_t uuid)
+{
+   unsigned hash = uuid & (DEBUG_MASK_TABLE_SIZE - 1);
+   return debug_mask_table[hash];
+}
+
+
+void debug_mask_vprintf(uint32_t uuid, uint32_t what, const char *format, va_list ap)
+{
+   uint32_t mask = debug_mask_get(uuid);
+   if(mask & what)
+      debug_vprintf(format, ap);
 }
diff --git a/src/gallium/auxiliary/util/p_tile.h b/src/gallium/auxiliary/util/p_tile.h
index 318b6d11a6..fdc80a13b3 100644
--- a/src/gallium/auxiliary/util/p_tile.h
+++ b/src/gallium/auxiliary/util/p_tile.h
@@ -52,44 +52,50 @@ pipe_clip_tile(uint x, uint y, uint *w, uint *h, const struct pipe_surface *ps)
    return FALSE;
 }
 
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-extern void
+void
 pipe_get_tile_raw(struct pipe_context *pipe,
                   struct pipe_surface *ps,
                   uint x, uint y, uint w, uint h,
                   void *p, int dst_stride);
 
-extern void
+void
 pipe_put_tile_raw(struct pipe_context *pipe,
                   struct pipe_surface *ps,
                   uint x, uint y, uint w, uint h,
                   const void *p, int src_stride);
 
 
-extern void
+void
 pipe_get_tile_rgba(struct pipe_context *pipe,
                    struct pipe_surface *ps,
                    uint x, uint y, uint w, uint h,
                    float *p);
 
-extern void
+void
 pipe_put_tile_rgba(struct pipe_context *pipe,
                    struct pipe_surface *ps,
                    uint x, uint y, uint w, uint h,
                    const float *p);
 
 
-extern void
+void
 pipe_get_tile_z(struct pipe_context *pipe,
                 struct pipe_surface *ps,
                 uint x, uint y, uint w, uint h,
                 uint *z);
 
-extern void
+void
 pipe_put_tile_z(struct pipe_context *pipe,
                 struct pipe_surface *ps,
                 uint x, uint y, uint w, uint h,
                 const uint *z);
 
+#ifdef __cplusplus
+}
+#endif
 
 #endif
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.c b/src/gallium/auxiliary/util/u_cpu_detect.c
new file mode 100644
index 0000000000..d9f2f8fc28
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_cpu_detect.c
@@ -0,0 +1,506 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Dennis Smit
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/*
+ * Based on the work of Eric Anholt <anholt@FreeBSD.org>
+ */
+
+/* FIXME: clean this entire file up */
+
+#include "u_cpu_detect.h"
+
+#ifdef __linux__
+#define OS_LINUX
+#endif
+#ifdef WIN32
+#define OS_WIN32
+#endif
+
+#if defined(ARCH_POWERPC)
+#if defined(OS_DARWIN)
+#include <sys/sysctl.h>
+#else
+#include <signal.h>
+#include <setjmp.h>
+#endif
+#endif
+
+#if defined(OS_NETBSD) || defined(OS_OPENBSD)
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <machine/cpu.h>
+#endif
+
+#if defined(OS_FREEBSD)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
+
+#if defined(OS_LINUX)
+#include <signal.h>
+#endif
+
+#if defined(OS_WIN32)
+#include <windows.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+
+static struct cpu_detect_caps __cpu_detect_caps;
+static int __cpu_detect_initialized = 0;
+
+static int has_cpuid(void);
+static int cpuid(unsigned int ax, unsigned int *p);
+
+/* The sigill handlers */
+#if defined(ARCH_X86) /* x86 (linux katmai handler check thing) */
+#if defined(OS_LINUX) && defined(_POSIX_SOURCE) && defined(X86_FXSR_MAGIC)
+static void sigill_handler_sse(int signal, struct sigcontext sc)
+{
+	/* Both the "xorps %%xmm0,%%xmm0" and "divps %xmm0,%%xmm1"
+	 * instructions are 3 bytes long.  We must increment the instruction
+	 * pointer manually to avoid repeated execution of the offending
+	 * instruction.
+	 *
+	 * If the SIGILL is caused by a divide-by-zero when unmasked
+	 * exceptions aren't supported, the SIMD FPU status and control
+	 * word will be restored at the end of the test, so we don't need
+	 * to worry about doing it here.  Besides, we may not be able to...
+	 */
+	sc.eip += 3;
+
+	__cpu_detect_caps.hasSSE=0;
+}
+
+static void sigfpe_handler_sse(int signal, struct sigcontext sc)
+{
+	if (sc.fpstate->magic != 0xffff) {
+		/* Our signal context has the extended FPU state, so reset the
+		 * divide-by-zero exception mask and clear the divide-by-zero
+		 * exception bit.
+		 */
+		sc.fpstate->mxcsr |= 0x00000200;
+		sc.fpstate->mxcsr &= 0xfffffffb;
+	} else {
+		/* If we ever get here, we're completely hosed.
+		*/
+	}
+}
+#endif /* OS_LINUX && _POSIX_SOURCE && X86_FXSR_MAGIC */
+#endif /* ARCH_X86 */
+
+#if defined(OS_WIN32)
+LONG CALLBACK win32_sig_handler_sse(EXCEPTION_POINTERS* ep)
+{
+	if(ep->ExceptionRecord->ExceptionCode==EXCEPTION_ILLEGAL_INSTRUCTION){
+		ep->ContextRecord->Eip +=3;
+		__cpu_detect_caps.hasSSE=0;
+		return EXCEPTION_CONTINUE_EXECUTION;
+	}
+	return EXCEPTION_CONTINUE_SEARCH;
+}
+#endif /* OS_WIN32 */
+
+
+#if defined(ARCH_POWERPC)
+#if !defined(OS_DARWIN)
+/* State shared between the SIGILL probe and its handler (non-Darwin only). */
+static sigjmp_buf __lv_powerpc_jmpbuf;
+static volatile sig_atomic_t __lv_powerpc_canjump = 0;
+
+/* SIGILL handler: jump back into the probe below, or re-raise with the default action. */
+static void sigill_handler (int sig)
+{
+	if (!__lv_powerpc_canjump) {
+		signal (sig, SIG_DFL);
+		raise (sig);
+	}
+
+	__lv_powerpc_canjump = 0;
+	siglongjmp(__lv_powerpc_jmpbuf, 1);
+}
+#endif /* !OS_DARWIN */
+
+/* Set hasAltiVec when the vector unit is usable: sysctl on Darwin, SIGILL probe elsewhere. */
+static void check_os_altivec_support(void)
+{
+#if defined(OS_DARWIN)
+	int sels[2] = {CTL_HW, HW_VECTORUNIT};
+	int has_vu = 0;
+	size_t len = sizeof (has_vu);
+	int err;
+
+	err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
+
+	if (err == 0 && has_vu != 0) {
+		__cpu_detect_caps.hasAltiVec = 1;
+	}
+#else /* !OS_DARWIN */
+	/* no Darwin, do it the brute-force way (borrowed from the libmpeg2 library) */
+	signal(SIGILL, sigill_handler);
+	if (sigsetjmp(__lv_powerpc_jmpbuf, 1)) {
+		signal(SIGILL, SIG_DFL);
+	} else {
+		__lv_powerpc_canjump = 1;
+
+		__asm __volatile
+			("mtspr 256, %0\n\t"
+			 "vand %%v0, %%v0, %%v0"
+			 :
+			 : "r" (-1));
+
+		signal(SIGILL, SIG_DFL);
+		__cpu_detect_caps.hasAltiVec = 1;
+	}
+#endif
+}
+#endif /* ARCH_POWERPC */
+
+/* If we're running on a processor that can do SSE, let's see if we
+ * are allowed to or not.  This will catch 2.4.0 or later kernels that
+ * haven't been configured for a Pentium III but are running on one,
+ * and RedHat patched 2.2 kernels that have broken exception handling
+ * support for user space apps that do SSE.
+ */
+static void check_os_katmai_support(void)
+{
+#if defined(ARCH_X86)
+#if defined(OS_FREEBSD)
+	int has_sse=0, ret;
+	int len = sizeof (has_sse);
+
+	ret = sysctlbyname("hw.instruction_sse", &has_sse, &len, NULL, 0);
+	if (ret || !has_sse)
+		__cpu_detect_caps.hasSSE=0;
+
+#elif defined(OS_NETBSD) || defined(OS_OPENBSD)
+	int has_sse, has_sse2, ret, mib[2];
+	int varlen;
+
+	mib[0] = CTL_MACHDEP;
+	mib[1] = CPU_SSE;
+	varlen = sizeof (has_sse);
+
+	ret = sysctl(mib, 2, &has_sse, &varlen, NULL, 0);
+	if (ret < 0 || !has_sse) {
+		__cpu_detect_caps.hasSSE = 0;
+	} else {
+		__cpu_detect_caps.hasSSE = 1;
+	}
+
+	mib[1] = CPU_SSE2;
+	varlen = sizeof (has_sse2);
+	ret = sysctl(mib, 2, &has_sse2, &varlen, NULL, 0);
+	if (ret < 0 || !has_sse2) {
+		__cpu_detect_caps.hasSSE2 = 0;
+	} else {
+		__cpu_detect_caps.hasSSE2 = 1;
+	}
+	__cpu_detect_caps.hasSSE = 0; /* FIXME ?!?!? */
+
+#elif defined(OS_WIN32)
+	LPTOP_LEVEL_EXCEPTION_FILTER exc_fil;
+	if (__cpu_detect_caps.hasSSE) {
+		exc_fil = SetUnhandledExceptionFilter(win32_sig_handler_sse);
+		__asm __volatile ("xorps %xmm0, %xmm0");
+		SetUnhandledExceptionFilter(exc_fil);
+	}
+#elif defined(OS_LINUX)
+	struct sigaction saved_sigill;
+	struct sigaction saved_sigfpe;
+
+	/* Save the original signal handlers.
+	*/
+	sigaction(SIGILL, NULL, &saved_sigill);
+	sigaction(SIGFPE, NULL, &saved_sigfpe);
+
+	signal(SIGILL, (void (*)(int))sigill_handler_sse);
+	signal(SIGFPE, (void (*)(int))sigfpe_handler_sse);
+
+	/* Emulate test for OSFXSR in CR4.  The OS will set this bit if it
+	 * supports the extended FPU save and restore required for SSE.  If
+	 * we execute an SSE instruction on a PIII and get a SIGILL, the OS
+	 * doesn't support Streaming SIMD Extensions, even if the processor
+	 * does.
+	 */
+	if (__cpu_detect_caps.hasSSE) {
+		__asm __volatile ("xorps %xmm1, %xmm0");
+	}
+
+	/* Emulate test for OSXMMEXCPT in CR4.  The OS will set this bit if
+	 * it supports unmasked SIMD FPU exceptions.  If we unmask the
+	 * exceptions, do a SIMD divide-by-zero and get a SIGILL, the OS
+	 * doesn't support unmasked SIMD FPU exceptions.  If we get a SIGFPE
+	 * as expected, we're okay but we need to clean up after it.
+	 *
+	 * Are we being too stringent in our requirement that the OS support
+	 * unmasked exceptions?  Certain RedHat 2.2 kernels enable SSE by
+	 * setting CR4.OSFXSR but don't support unmasked exceptions.  Win98
+	 * doesn't even support them.  We at least know the user-space SSE
+	 * support is good in kernels that do support unmasked exceptions,
+	 * and therefore to be safe I'm going to leave this test in here.
+	 */
+	if (__cpu_detect_caps.hasSSE) {
+		//      test_os_katmai_exception_support();
+	}
+
+	/* Restore the original signal handlers.
+	*/
+	sigaction(SIGILL, &saved_sigill, NULL);
+	sigaction(SIGFPE, &saved_sigfpe, NULL);
+
+#else
+	/* We can't use POSIX signal handling to test the availability of
+	 * SSE, so we disable it by default.
+	 */
+	__cpu_detect_caps.hasSSE = 0;
+#endif /* __linux__ */
+#endif
+}
+
+
+static int has_cpuid(void)
+{
+#if defined(ARCH_X86)
+	int a, c;
+
+	__asm __volatile
+		("pushf\n"
+		 "popl %0\n"
+		 "movl %0, %1\n"
+		 "xorl $0x200000, %0\n"
+		 "push %0\n"
+		 "popf\n"
+		 "pushf\n"
+		 "popl %0\n"
+		 : "=a" (a), "=c" (c)
+		 :
+		 : "cc");
+
+	return a != c;
+#else
+	return 0;
+#endif
+}
+
+static int cpuid(unsigned int ax, unsigned int *p)
+{
+#if defined(ARCH_X86)
+	unsigned int flags;
+
+	__asm __volatile
+		("movl %%ebx, %%esi\n\t"
+		 "cpuid\n\t"
+		 "xchgl %%ebx, %%esi"
+		 : "=a" (p[0]), "=S" (p[1]),
+		 "=c" (p[2]), "=d" (p[3])
+		 : "0" (ax));
+
+	return 0;
+#else
+	return -1;
+#endif
+}
+
+/* Detect CPU type, CPU count and (on x86) feature flags / cache line size into __cpu_detect_caps. */
+void cpu_detect_initialize()
+{
+	unsigned int regs[4];
+	unsigned int regs2[4];
+
+	int mib[2], ncpu;
+	size_t len; /* sysctl() takes a size_t * for the old-value length */
+
+	memset(&__cpu_detect_caps, 0, sizeof (struct cpu_detect_caps));
+
+	/* Check for arch type */
+#if defined(ARCH_MIPS)
+	__cpu_detect_caps.type = CPU_DETECT_TYPE_MIPS;
+#elif defined(ARCH_ALPHA)
+	__cpu_detect_caps.type = CPU_DETECT_TYPE_ALPHA;
+#elif defined(ARCH_SPARC)
+	__cpu_detect_caps.type = CPU_DETECT_TYPE_SPARC;
+#elif defined(ARCH_X86)
+	__cpu_detect_caps.type = CPU_DETECT_TYPE_X86;
+#elif defined(ARCH_POWERPC)
+	__cpu_detect_caps.type = CPU_DETECT_TYPE_POWERPC;
+#else
+	__cpu_detect_caps.type = CPU_DETECT_TYPE_OTHER;
+#endif
+
+	/* Count the number of CPUs in system */
+#if !defined(OS_WIN32) && !defined(OS_UNKNOWN) && defined(_SC_NPROCESSORS_ONLN)
+	__cpu_detect_caps.nrcpu = sysconf(_SC_NPROCESSORS_ONLN);
+	if (__cpu_detect_caps.nrcpu == -1)
+		__cpu_detect_caps.nrcpu = 1;
+
+#elif defined(OS_NETBSD) || defined(OS_FREEBSD) || defined(OS_OPENBSD)
+
+	mib[0] = CTL_HW;
+	mib[1] = HW_NCPU;
+
+	len = sizeof (ncpu);
+	sysctl(mib, 2, &ncpu, &len, NULL, 0);
+	__cpu_detect_caps.nrcpu = ncpu;
+
+#else
+	__cpu_detect_caps.nrcpu = 1;
+#endif
+
+#if defined(ARCH_X86)
+	/* No cpuid, old 486 or lower */
+	if (has_cpuid() == 0)
+		return;
+
+	__cpu_detect_caps.cacheline = 32;
+
+	/* Get max cpuid level */
+	cpuid(0x00000000, regs);
+
+	if (regs[0] >= 0x00000001) {
+		unsigned int cacheline;
+
+		cpuid (0x00000001, regs2);
+
+		__cpu_detect_caps.x86cpuType = (regs2[0] >> 8) & 0xf;
+		if (__cpu_detect_caps.x86cpuType == 0xf)
+		    __cpu_detect_caps.x86cpuType = 8 + ((regs2[0] >> 20) & 255); /* use extended family (P4, IA64) */
+
+		/* general feature flags (regs2[3] = EDX, regs2[2] = ECX); TSC is EDX bit 4, bit 8 is CX8 */
+		__cpu_detect_caps.hasTSC  = (regs2[3] & (1 << 4  )) >>  4; /* 0x0000010 */
+		__cpu_detect_caps.hasMMX  = (regs2[3] & (1 << 23 )) >> 23; /* 0x0800000 */
+		__cpu_detect_caps.hasSSE  = (regs2[3] & (1 << 25 )) >> 25; /* 0x2000000 */
+		__cpu_detect_caps.hasSSE2 = (regs2[3] & (1 << 26 )) >> 26; /* 0x4000000 */
+		__cpu_detect_caps.hasSSE3 = (regs2[2] & (1));	       /* 0x0000001 */
+		__cpu_detect_caps.hasSSSE3 = (regs2[2] & (1 << 9 )) >> 9;   /* 0x0000200 */
+		__cpu_detect_caps.hasMMX2 = __cpu_detect_caps.hasSSE; /* SSE cpus supports mmxext too */
+
+		cacheline = ((regs2[1] >> 8) & 0xFF) * 8;
+		if (cacheline > 0)
+			__cpu_detect_caps.cacheline = cacheline;
+	}
+
+	cpuid(0x80000000, regs);
+
+	if (regs[0] >= 0x80000001) {
+
+		cpuid(0x80000001, regs2);
+
+		__cpu_detect_caps.hasMMX  |= (regs2[3] & (1 << 23 )) >> 23; /* 0x0800000 */
+		__cpu_detect_caps.hasMMX2 |= (regs2[3] & (1 << 22 )) >> 22; /* 0x400000 */
+		__cpu_detect_caps.has3DNow    = (regs2[3] & (1u << 31 )) >> 31; /* 0x80000000; unsigned to avoid shifting into the sign bit */
+		__cpu_detect_caps.has3DNowExt = (regs2[3] & (1 << 30 )) >> 30;
+	}
+
+	if (regs[0] >= 0x80000006) {
+		cpuid(0x80000006, regs2);
+		__cpu_detect_caps.cacheline = regs2[2] & 0xFF;
+	}
+
+#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_CYGWIN) || defined(OS_OPENBSD)
+	if (__cpu_detect_caps.hasSSE)
+		check_os_katmai_support();
+
+	if (!__cpu_detect_caps.hasSSE) {
+		__cpu_detect_caps.hasSSE2 = 0;
+		__cpu_detect_caps.hasSSE3 = 0;
+		__cpu_detect_caps.hasSSSE3 = 0;
+	}
+#else
+	__cpu_detect_caps.hasSSE = 0;
+	__cpu_detect_caps.hasSSE2 = 0;
+	__cpu_detect_caps.hasSSE3 = 0;
+	__cpu_detect_caps.hasSSSE3 = 0;
+#endif
+#endif /* ARCH_X86 */
+
+#if defined(ARCH_POWERPC)
+	check_os_altivec_support();
+#endif /* ARCH_POWERPC */
+
+	__cpu_detect_initialized = 1;
+}
+
+struct cpu_detect_caps *cpu_detect_get_caps()
+{
+	return &__cpu_detect_caps;
+}
+
+/* The getters and setters for feature flags */
+int cpu_detect_get_tsc()
+{
+	return __cpu_detect_caps.hasTSC;
+}
+
+int cpu_detect_get_mmx()
+{
+	return __cpu_detect_caps.hasMMX;
+}
+
+int cpu_detect_get_mmx2()
+{
+	return __cpu_detect_caps.hasMMX2;
+}
+
+int cpu_detect_get_sse()
+{
+	return __cpu_detect_caps.hasSSE;
+}
+
+int cpu_detect_get_sse2()
+{
+	return __cpu_detect_caps.hasSSE2;
+}
+
+int cpu_detect_get_sse3()
+{
+	return __cpu_detect_caps.hasSSE3;
+}
+
+int cpu_detect_get_ssse3()
+{
+	return __cpu_detect_caps.hasSSSE3;
+}
+
+int cpu_detect_get_3dnow()
+{
+	return __cpu_detect_caps.has3DNow;
+}
+
+int cpu_detect_get_3dnow2()
+{
+	return __cpu_detect_caps.has3DNowExt;
+}
+
+int cpu_detect_get_altivec()
+{
+	return __cpu_detect_caps.hasAltiVec;
+}
+
diff --git a/src/gallium/auxiliary/util/u_cpu_detect.h b/src/gallium/auxiliary/util/u_cpu_detect.h
new file mode 100644
index 0000000000..1612d49286
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_cpu_detect.h
@@ -0,0 +1,78 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Dennis Smit
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ ***************************************************************************/
+
+/*
+ * Based on the work of Eric Anholt <anholt@FreeBSD.org>
+ */
+
+#ifndef _CPU_DETECT_H
+#define _CPU_DETECT_H
+
+typedef enum {
+	CPU_DETECT_TYPE_MIPS,
+	CPU_DETECT_TYPE_ALPHA,
+	CPU_DETECT_TYPE_SPARC,
+	CPU_DETECT_TYPE_X86,
+	CPU_DETECT_TYPE_POWERPC,
+	CPU_DETECT_TYPE_OTHER
+} cpu_detect_type;	/* CPU family tag; this is the type of cpu_detect_caps.type */
+
+struct cpu_detect_caps {
+	cpu_detect_type	type;
+	int		nrcpu;	/* presumably the number of CPUs -- TODO confirm where it is set */
+
+	/* Feature flags */
+	int		x86cpuType;
+	int		cacheline;	/* low 8 bits of regs2[2] after cpuid(0x80000006), when that leaf exists */
+
+	int		hasTSC;
+	int		hasMMX;
+	int		hasMMX2;	/* also OR-ed with bit 22 of an extended cpuid result word */
+	int		hasSSE;	/* on x86, forced to 0 for OSes other than Linux/FreeBSD/NetBSD/Cygwin/OpenBSD */
+	int		hasSSE2;	/* on x86, SSE2/SSE3/SSSE3 are forced to 0 whenever hasSSE ends up 0 */
+	int		hasSSE3;
+	int		hasSSSE3;
+	int		has3DNow;	/* bit 31 of the same extended cpuid result word */
+	int		has3DNowExt;	/* bit 30 of the same extended cpuid result word */
+	int		hasAltiVec;	/* presumably set by check_os_altivec_support() on PowerPC -- confirm */
+};
+
+/* prototypes */
+void cpu_detect_initialize(void);
+struct cpu_detect_caps *cpu_detect_get_caps(void);	/* pointer to the shared capability record */
+
+int cpu_detect_get_tsc(void);	/* each getter below returns the matching has* field */
+int cpu_detect_get_mmx(void);
+int cpu_detect_get_mmx2(void);
+int cpu_detect_get_sse(void);
+int cpu_detect_get_sse2(void);
+int cpu_detect_get_sse3(void);
+int cpu_detect_get_ssse3(void);
+int cpu_detect_get_3dnow(void);
+int cpu_detect_get_3dnow2(void);	/* returns has3DNowExt */
+int cpu_detect_get_altivec(void);
+
+#endif /* _CPU_DETECT_H */
diff --git a/src/gallium/auxiliary/util/u_handle_table.c b/src/gallium/auxiliary/util/u_handle_table.c
new file mode 100644
index 0000000000..8a298f7c41
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_handle_table.c
@@ -0,0 +1,207 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Generic handle table implementation.
+ *  
+ * @author José Fonseca <jrfonseca@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+
+#include "u_handle_table.h"
+
+
+#define HANDLE_TABLE_INITIAL_SIZE 16  
+
+
+struct handle_table
+{
+   /** Object array. Empty handles have a null object */
+   void **objects;
+   
+   /** Number of objects the table can currently hold */
+   unsigned size;
+   /** Every slot below this index is occupied; handle_table_add starts its free-slot search here */
+   unsigned filled;
+   
+   /** Optional object destructor */
+   void (*destroy)(void *object);
+};
+
+
+struct handle_table *
+handle_table_create(void)
+{
+   struct handle_table *ht;
+   /* Create an empty table with HANDLE_TABLE_INITIAL_SIZE slots; NULL if either allocation fails. */
+   ht = MALLOC_STRUCT(handle_table);
+   if(!ht)
+      return NULL;
+   /* slot array; a null entry marks a free handle (presumably CALLOC zero-fills -- confirm) */
+   ht->objects = (void **)CALLOC(HANDLE_TABLE_INITIAL_SIZE, sizeof(void *));
+   if(!ht->objects) {
+      FREE(ht);
+      return NULL;
+   }
+   
+   ht->size = HANDLE_TABLE_INITIAL_SIZE;
+   ht->filled = 0;
+   /* no destructor until handle_table_set_destroy installs one */
+   ht->destroy = NULL;
+   
+   return ht;
+}
+
+
+void
+handle_table_set_destroy(struct handle_table *ht,
+                         void (*destroy)(void *object))
+{
+   assert(ht);
+   ht->destroy = destroy;   /* read by handle_table_remove and handle_table_destroy; both skip the call when it is NULL */
+}
+
+
+unsigned
+handle_table_add(struct handle_table *ht, 
+                 void *object)
+{
+   unsigned index;
+   unsigned handle;
+   /* Store object in the first free slot and return its 1-based handle, or 0 on failure. */
+   assert(ht);
+   assert(object);
+   if(!object)
+      return 0;
+
+   /* linear search for an empty handle */
+   while(ht->filled < ht->size) {
+      if(!ht->objects[ht->filled])
+	 break;
+      ++ht->filled;
+   }
+  
+   /* grow the table */
+   if(ht->filled == ht->size) {
+      unsigned new_size;
+      void **new_objects;
+      /* the doubled count and its byte size must not wrap, or the memset below would run past the array */
+      new_size = ht->size*2;
+      assert(new_size);
+      if(new_size <= ht->size || (new_size*sizeof(void *))/sizeof(void *) != new_size) return 0;
+      new_objects = (void **)REALLOC((void *)ht->objects,
+                                     ht->size*sizeof(void *),
+                                     new_size*sizeof(void *));
+      if(!new_objects)
+	 return 0;
+      /* the newly added tail slots start out free */
+      memset(new_objects + ht->size, 0, (new_size - ht->size)*sizeof(void *));
+      
+      ht->size = new_size;
+      ht->objects = new_objects;
+   }
+
+   index = ht->filled;
+   
+   handle = index + 1;
+   
+   /* check integer overflow */
+   if(!handle)
+      return 0;
+   
+   assert(!ht->objects[index]);
+   ht->objects[index] = object;
+   ++ht->filled;
+   
+   return handle;
+}
+
+
+void *
+handle_table_get(struct handle_table *ht, 
+                 unsigned handle)
+{
+   void *object;
+   /* Look up the object stored under a 1-based handle; NULL when handle is 0 or beyond the table size. */
+   assert(ht);
+   assert(handle > 0);
+   assert(handle <= ht->size);
+   if(!handle || handle > ht->size)
+      return NULL;
+
+   object = ht->objects[handle - 1];
+   assert(object);   /* an in-range but empty slot trips this; with asserts disabled the NULL is simply returned */
+   
+   return object;
+}
+
+
+void
+handle_table_remove(struct handle_table *ht, 
+                    unsigned handle)
+{
+   void *object;
+   unsigned index;
+   /* Free the slot behind a 1-based handle; does nothing when handle is 0 or beyond the table size. */
+   assert(ht);
+   assert(handle > 0);
+   assert(handle <= ht->size);
+   if(!handle || handle > ht->size)
+      return;
+
+   index = handle - 1;
+   object = ht->objects[index];
+   assert(object);
+   /* run the optional destructor on the object before the slot is cleared */
+   if(object && ht->destroy)
+      ht->destroy(object);
+
+   ht->objects[index] = NULL;
+   if(index < ht->filled)   /* pull filled back so the next handle_table_add finds this free slot */
+      ht->filled = index;
+}
+
+
+void
+handle_table_destroy(struct handle_table *ht)
+{
+   unsigned index;
+   assert(ht);
+   /* Destroy every remaining object (only when a destructor is set), then free the array and the table. */
+   if(ht->destroy)
+      for(index = 0; index < ht->size; ++index)
+	 if(ht->objects[index])
+	    ht->destroy(ht->objects[index]);
+   
+   FREE(ht->objects);
+   FREE(ht);
+}
+
diff --git a/src/gallium/auxiliary/util/u_handle_table.h b/src/gallium/auxiliary/util/u_handle_table.h
new file mode 100644
index 0000000000..51fc273865
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_handle_table.h
@@ -0,0 +1,96 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Generic handle table.
+ *  
+ * @author José Fonseca <jrfonseca@tungstengraphics.com>
+ */
+
+#ifndef U_HANDLE_TABLE_H_
+#define U_HANDLE_TABLE_H_
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+   
+/**
+ * Abstract data type to map integer handles to objects.
+ */
+struct handle_table;
+
+/** Create an empty table. Returns NULL when allocation fails. */
+struct handle_table *
+handle_table_create(void);
+
+
+/**
+ * Set an optional destructor callback.
+ * 
+ * If set, it will be called during handle_table_remove and 
+ * handle_table_destroy calls.
+ */
+void
+handle_table_set_destroy(struct handle_table *ht,
+                         void (*destroy)(void *object));
+
+
+/**
+ * Add a new object.
+ * 
+ * Returns a zero handle on failure (null object or out of memory).
+ */
+unsigned
+handle_table_add(struct handle_table *ht, 
+                 void *object);
+
+/**
+ * Fetch an existing object.
+ * 
+ * Returns NULL for an invalid handle.
+ */
+void *
+handle_table_get(struct handle_table *ht, 
+                 unsigned handle);
+
+/** Remove a handle, calling the destructor (if one is set) on its object. */
+void
+handle_table_remove(struct handle_table *ht, 
+                    unsigned handle);
+
+/** Call the destructor (if one is set) on every remaining object, then free the table. */
+void
+handle_table_destroy(struct handle_table *ht);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* U_HANDLE_TABLE_H_ */
diff --git a/src/gallium/auxiliary/util/u_snprintf.c b/src/gallium/auxiliary/util/u_snprintf.c
new file mode 100644
index 0000000000..61c20b48f7
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_snprintf.c
@@ -0,0 +1,1478 @@
+/*
+ * Copyright (c) 1995 Patrick Powell.
+ *
+ * This code is based on code written by Patrick Powell <papowell@astart.com>.
+ * It may be used for any purpose as long as this notice remains intact on all
+ * source code distributions.
+ */
+
+/*
+ * Copyright (c) 2008 Holger Weiss.
+ *
+ * This version of the code is maintained by Holger Weiss <holger@jhweiss.de>.
+ * My changes to the code may freely be used, modified and/or redistributed for
+ * any purpose.  It would be nice if additions and fixes to this file (including
+ * trivial code cleanups) would be sent back in order to let me include them in
+ * the version available at <http://www.jhweiss.de/software/snprintf.html>.
+ * However, this is not a requirement for using or redistributing (possibly
+ * modified) versions of this file, nor is leaving this notice intact mandatory.
+ */
+
+/*
+ * History
+ *
+ * 2008-01-20 Holger Weiss <holger@jhweiss.de> for C99-snprintf 1.1:
+ *
+ * 	Fixed the detection of infinite floating point values on IRIX (and
+ * 	possibly other systems) and applied another few minor cleanups.
+ *
+ * 2008-01-06 Holger Weiss <holger@jhweiss.de> for C99-snprintf 1.0:
+ *
+ * 	Added a lot of new features, fixed many bugs, and incorporated various
+ * 	improvements done by Andrew Tridgell <tridge@samba.org>, Russ Allbery
+ * 	<rra@stanford.edu>, Hrvoje Niksic <hniksic@xemacs.org>, Damien Miller
+ * 	<djm@mindrot.org>, and others for the Samba, INN, Wget, and OpenSSH
+ * 	projects.  The additions include: support the "e", "E", "g", "G", and
+ * 	"F" conversion specifiers (and use conversion style "f" or "F" for the
+ * 	still unsupported "a" and "A" specifiers); support the "hh", "ll", "j",
+ * 	"t", and "z" length modifiers; support the "#" flag and the (non-C99)
+ * 	"'" flag; use localeconv(3) (if available) to get both the current
+ * 	locale's decimal point character and the separator between groups of
+ * 	digits; fix the handling of various corner cases of field width and
+ * 	precision specifications; fix various floating point conversion bugs;
+ * 	handle infinite and NaN floating point values; don't attempt to write to
+ * 	the output buffer (which may be NULL) if a size of zero was specified;
+ * 	check for integer overflow of the field width, precision, and return
+ * 	values and during the floating point conversion; use the OUTCHAR() macro
+ * 	instead of a function for better performance; provide asprintf(3) and
+ * 	vasprintf(3) functions; add new test cases.  The replacement functions
+ * 	have been renamed to use an "rpl_" prefix, the function calls in the
+ * 	main project (and in this file) must be redefined accordingly for each
+ * 	replacement function which is needed (by using Autoconf or other means).
+ * 	Various other minor improvements have been applied and the coding style
+ * 	was cleaned up for consistency.
+ *
+ * 2007-07-23 Holger Weiss <holger@jhweiss.de> for Mutt 1.5.13:
+ *
+ * 	C99 compliant snprintf(3) and vsnprintf(3) functions return the number
+ * 	of characters that would have been written to a sufficiently sized
+ * 	buffer (excluding the '\0').  The original code simply returned the
+ * 	length of the resulting output string, so that's been fixed.
+ *
+ * 1998-03-05 Michael Elkins <me@mutt.org> for Mutt 0.90.8:
+ *
+ * 	The original code assumed that both snprintf(3) and vsnprintf(3) were
+ * 	missing.  Some systems only have snprintf(3) but not vsnprintf(3), so
+ * 	the code is now broken down under HAVE_SNPRINTF and HAVE_VSNPRINTF.
+ *
+ * 1998-01-27 Thomas Roessler <roessler@does-not-exist.org> for Mutt 0.89i:
+ *
+ * 	The PGP code was using unsigned hexadecimal formats.  Unfortunately,
+ * 	unsigned formats simply didn't work.
+ *
+ * 1997-10-22 Brandon Long <blong@fiction.net> for Mutt 0.87.1:
+ *
+ * 	Ok, added some minimal floating point support, which means this probably
+ * 	requires libm on most operating systems.  Don't yet support the exponent
+ * 	(e,E) and sigfig (g,G).  Also, fmtint() was pretty badly broken, it just
+ * 	wasn't being exercised in ways which showed it, so that's been fixed.
+ * 	Also, formatted the code to Mutt conventions, and removed dead code left
+ * 	over from the original.  Also, there is now a builtin-test, run with:
+ * 	gcc -DTEST_SNPRINTF -o snprintf snprintf.c -lm && ./snprintf
+ *
+ * 1996-09-15 Brandon Long <blong@fiction.net> for Mutt 0.43:
+ *
+ * 	This was ugly.  It is still ugly.  I opted out of floating point
+ * 	numbers, but the formatter understands just about everything from the
+ * 	normal C string format, at least as far as I can tell from the Solaris
+ * 	2.5 printf(3S) man page.
+ */
+
+/*
+ * ToDo
+ *
+ * - Add wide character support.
+ * - Add support for "%a" and "%A" conversions.
+ * - Create test routines which predefine the expected results.  Our test cases
+ *   usually expose bugs in system implementations rather than in ours :-)
+ */
+
+/*
+ * Usage
+ *
+ * 1) The following preprocessor macros should be defined to 1 if the feature or
+ *    file in question is available on the target system (by using Autoconf or
+ *    other means), though basic functionality should be available as long as
+ *    HAVE_STDARG_H and HAVE_STDLIB_H are defined correctly:
+ *
+ *    	HAVE_VSNPRINTF
+ *    	HAVE_SNPRINTF
+ *    	HAVE_VASPRINTF
+ *    	HAVE_ASPRINTF
+ *    	HAVE_STDARG_H
+ *    	HAVE_STDDEF_H
+ *    	HAVE_STDINT_H
+ *    	HAVE_STDLIB_H
+ *    	HAVE_INTTYPES_H
+ *    	HAVE_LOCALE_H
+ *    	HAVE_LOCALECONV
+ *    	HAVE_LCONV_DECIMAL_POINT
+ *    	HAVE_LCONV_THOUSANDS_SEP
+ *    	HAVE_LONG_DOUBLE
+ *    	HAVE_LONG_LONG_INT
+ *    	HAVE_UNSIGNED_LONG_LONG_INT
+ *    	HAVE_INTMAX_T
+ *    	HAVE_UINTMAX_T
+ *    	HAVE_UINTPTR_T
+ *    	HAVE_PTRDIFF_T
+ *    	HAVE_VA_COPY
+ *    	HAVE___VA_COPY
+ *
+ * 2) The calls to the functions which should be replaced must be redefined
+ *    throughout the project files (by using Autoconf or other means):
+ *
+ *    	#define vsnprintf rpl_vsnprintf
+ *    	#define snprintf rpl_snprintf
+ *    	#define vasprintf rpl_vasprintf
+ *    	#define asprintf rpl_asprintf
+ *
+ * 3) The required replacement functions should be declared in some header file
+ *    included throughout the project files:
+ *
+ *    	#if HAVE_CONFIG_H
+ *    	#include <config.h>
+ *    	#endif
+ *    	#if HAVE_STDARG_H
+ *    	#include <stdarg.h>
+ *    	#if !HAVE_VSNPRINTF
+ *    	int rpl_vsnprintf(char *, size_t, const char *, va_list);
+ *    	#endif
+ *    	#if !HAVE_SNPRINTF
+ *    	int rpl_snprintf(char *, size_t, const char *, ...);
+ *    	#endif
+ *    	#if !HAVE_VASPRINTF
+ *    	int rpl_vasprintf(char **, const char *, va_list);
+ *    	#endif
+ *    	#if !HAVE_ASPRINTF
+ *    	int rpl_asprintf(char **, const char *, ...);
+ *    	#endif
+ *    	#endif
+ *
+ * Autoconf macros for handling step 1 and step 2 are available at
+ * <http://www.jhweiss.de/software/snprintf.html>.
+ */
+
+#if HAVE_CONFIG_H
+#include <config.h>
+#else	/* no config.h: fall back to the built-in settings below */
+#ifdef WIN32	/* WIN32: route snprintf/vsnprintf to the rpl_ replacements defined in this file */
+#define vsnprintf rpl_vsnprintf
+#define snprintf rpl_snprintf
+#define HAVE_VSNPRINTF 0
+#define HAVE_SNPRINTF 0
+#define HAVE_VASPRINTF 1 /* not needed */
+#define HAVE_ASPRINTF 1 /* not needed */
+#define HAVE_STDARG_H 1
+#define HAVE_STDDEF_H 1
+#define HAVE_STDINT_H 0
+#define HAVE_STDLIB_H 1
+#define HAVE_INTTYPES_H 0
+#define HAVE_LOCALE_H 0
+#define HAVE_LOCALECONV 0
+#define HAVE_LCONV_DECIMAL_POINT 0
+#define HAVE_LCONV_THOUSANDS_SEP 0
+#define HAVE_LONG_DOUBLE 0
+#define HAVE_LONG_LONG_INT 1
+#define HAVE_UNSIGNED_LONG_LONG_INT 1
+#define HAVE_INTMAX_T 0
+#define HAVE_UINTMAX_T 0
+#define HAVE_UINTPTR_T 1
+#define HAVE_PTRDIFF_T 1
+#define HAVE_VA_COPY 0
+#define HAVE___VA_COPY 0
+#else	/* not WIN32: all four functions are taken as present, so the #if that follows is false */
+#define HAVE_VSNPRINTF 1
+#define HAVE_SNPRINTF 1
+#define HAVE_VASPRINTF 1
+#define HAVE_ASPRINTF 1
+#endif
+#endif	/* HAVE_CONFIG_H */
+
+#if !HAVE_SNPRINTF || !HAVE_VSNPRINTF || !HAVE_ASPRINTF || !HAVE_VASPRINTF
+#include <stdio.h>	/* For NULL, size_t, vsnprintf(3), and vasprintf(3). */
+#ifdef VA_START
+#undef VA_START
+#endif	/* defined(VA_START) */
+#ifdef VA_SHIFT
+#undef VA_SHIFT
+#endif	/* defined(VA_SHIFT) */
+#if HAVE_STDARG_H
+#include <stdarg.h>
+#define VA_START(ap, last) va_start(ap, last)
+#define VA_SHIFT(ap, value, type) /* No-op for ANSI C. */
+#else	/* Assume <varargs.h> is available. */
+#include <varargs.h>
+#define VA_START(ap, last) va_start(ap)	/* "last" is ignored. */
+#define VA_SHIFT(ap, value, type) value = va_arg(ap, type)
+#endif	/* HAVE_STDARG_H */
+
+#if !HAVE_VASPRINTF
+#if HAVE_STDLIB_H
+#include <stdlib.h>	/* For malloc(3). */
+#endif	/* HAVE_STDLIB_H */
+#ifdef VA_COPY
+#undef VA_COPY
+#endif	/* defined(VA_COPY) */
+#ifdef VA_END_COPY
+#undef VA_END_COPY
+#endif	/* defined(VA_END_COPY) */
+#if HAVE_VA_COPY
+#define VA_COPY(dest, src) va_copy(dest, src)
+#define VA_END_COPY(ap) va_end(ap)
+#elif HAVE___VA_COPY
+#define VA_COPY(dest, src) __va_copy(dest, src)
+#define VA_END_COPY(ap) va_end(ap)
+#else
+#define VA_COPY(dest, src) (void)mymemcpy(&dest, &src, sizeof(va_list))
+#define VA_END_COPY(ap) /* No-op. */
+#define NEED_MYMEMCPY 1
+static void *mymemcpy(void *, void *, size_t);
+#endif	/* HAVE_VA_COPY */
+#endif	/* !HAVE_VASPRINTF */
+
+#if !HAVE_VSNPRINTF
+#include <limits.h>	/* For *_MAX. */
+#if HAVE_INTTYPES_H
+#include <inttypes.h>	/* For intmax_t (if not defined in <stdint.h>). */
+#endif	/* HAVE_INTTYPES_H */
+#if HAVE_LOCALE_H
+#include <locale.h>	/* For localeconv(3). */
+#endif	/* HAVE_LOCALE_H */
+#if HAVE_STDDEF_H
+#include <stddef.h>	/* For ptrdiff_t. */
+#endif	/* HAVE_STDDEF_H */
+#if HAVE_STDINT_H
+#include <stdint.h>	/* For intmax_t. */
+#endif	/* HAVE_STDINT_H */
+
+/* Support for unsigned long long int.  We may also need ULLONG_MAX. */
+#ifndef ULONG_MAX	/* We may need ULONG_MAX as a fallback. */
+#ifdef UINT_MAX
+#define ULONG_MAX UINT_MAX
+#else
+#define ULONG_MAX INT_MAX
+#endif	/* defined(UINT_MAX) */
+#endif	/* !defined(ULONG_MAX) */
+#ifdef ULLONG
+#undef ULLONG
+#endif	/* defined(ULLONG) */
+#if HAVE_UNSIGNED_LONG_LONG_INT
+#define ULLONG unsigned long long int
+#ifndef ULLONG_MAX
+#define ULLONG_MAX ULONG_MAX
+#endif	/* !defined(ULLONG_MAX) */
+#else
+#define ULLONG unsigned long int
+#ifdef ULLONG_MAX
+#undef ULLONG_MAX
+#endif	/* defined(ULLONG_MAX) */
+#define ULLONG_MAX ULONG_MAX
+#endif	/* HAVE_UNSIGNED_LONG_LONG_INT */
+
+/* Support for uintmax_t.  We also need UINTMAX_MAX. */
+#ifdef UINTMAX_T
+#undef UINTMAX_T
+#endif	/* defined(UINTMAX_T) */
+#if HAVE_UINTMAX_T || defined(uintmax_t)
+#define UINTMAX_T uintmax_t
+#ifndef UINTMAX_MAX
+#define UINTMAX_MAX ULLONG_MAX
+#endif	/* !defined(UINTMAX_MAX) */
+#else
+#define UINTMAX_T ULLONG
+#ifdef UINTMAX_MAX
+#undef UINTMAX_MAX
+#endif	/* defined(UINTMAX_MAX) */
+#define UINTMAX_MAX ULLONG_MAX
+#endif	/* HAVE_UINTMAX_T || defined(uintmax_t) */
+
+/* Support for long double. */
+#ifndef LDOUBLE
+#if HAVE_LONG_DOUBLE
+#define LDOUBLE long double
+#else
+#define LDOUBLE double
+#endif	/* HAVE_LONG_DOUBLE */
+#endif	/* !defined(LDOUBLE) */
+
+/* Support for long long int. */
+#ifndef LLONG
+#if HAVE_LONG_LONG_INT
+#define LLONG long long int
+#else
+#define LLONG long int
+#endif	/* HAVE_LONG_LONG_INT */
+#endif	/* !defined(LLONG) */
+
+/* Support for intmax_t. */
+#ifndef INTMAX_T
+#if HAVE_INTMAX_T || defined(intmax_t)
+#define INTMAX_T intmax_t
+#else
+#define INTMAX_T LLONG
+#endif	/* HAVE_INTMAX_T || defined(intmax_t) */
+#endif	/* !defined(INTMAX_T) */
+
+/* Support for uintptr_t. */
+#ifndef UINTPTR_T
+#if HAVE_UINTPTR_T || defined(uintptr_t)
+#define UINTPTR_T uintptr_t
+#else
+#define UINTPTR_T unsigned long int
+#endif	/* HAVE_UINTPTR_T || defined(uintptr_t) */
+#endif	/* !defined(UINTPTR_T) */
+
+/* Support for ptrdiff_t. */
+#ifndef PTRDIFF_T
+#if HAVE_PTRDIFF_T || defined(ptrdiff_t)
+#define PTRDIFF_T ptrdiff_t
+#else
+#define PTRDIFF_T long int
+#endif	/* HAVE_PTRDIFF_T || defined(ptrdiff_t) */
+#endif	/* !defined(PTRDIFF_T) */
+
+/*
+ * We need an unsigned integer type corresponding to ptrdiff_t (cf. C99:
+ * 7.19.6.1, 7).  However, we'll simply use PTRDIFF_T and convert it to an
+ * unsigned type if necessary.  This should work just fine in practice.
+ */
+#ifndef UPTRDIFF_T
+#define UPTRDIFF_T PTRDIFF_T
+#endif	/* !defined(UPTRDIFF_T) */
+
+/*
+ * We need a signed integer type corresponding to size_t (cf. C99: 7.19.6.1, 7).
+ * However, we'll simply use size_t and convert it to a signed type if
+ * necessary.  This should work just fine in practice.
+ */
+#ifndef SSIZE_T
+#define SSIZE_T size_t
+#endif	/* !defined(SSIZE_T) */
+
+/* Either ERANGE or E2BIG should be available everywhere. */
+#ifndef ERANGE
+#define ERANGE E2BIG
+#endif	/* !defined(ERANGE) */
+#ifndef EOVERFLOW
+#define EOVERFLOW ERANGE
+#endif	/* !defined(EOVERFLOW) */
+
+/*
+ * Buffer size to hold the octal string representation of UINT128_MAX without
+ * nul-termination ("3777777777777777777777777777777777777777777").
+ */
+#ifdef MAX_CONVERT_LENGTH
+#undef MAX_CONVERT_LENGTH
+#endif	/* defined(MAX_CONVERT_LENGTH) */
+#define MAX_CONVERT_LENGTH      43
+
+/* Format read states. */
+#define PRINT_S_DEFAULT         0
+#define PRINT_S_FLAGS           1
+#define PRINT_S_WIDTH           2
+#define PRINT_S_DOT             3
+#define PRINT_S_PRECISION       4
+#define PRINT_S_MOD             5
+#define PRINT_S_CONV            6
+
+/* Format flags. */
+#define PRINT_F_MINUS           (1 << 0)
+#define PRINT_F_PLUS            (1 << 1)
+#define PRINT_F_SPACE           (1 << 2)
+#define PRINT_F_NUM             (1 << 3)
+#define PRINT_F_ZERO            (1 << 4)
+#define PRINT_F_QUOTE           (1 << 5)
+#define PRINT_F_UP              (1 << 6)
+#define PRINT_F_UNSIGNED        (1 << 7)
+#define PRINT_F_TYPE_G          (1 << 8)
+#define PRINT_F_TYPE_E          (1 << 9)
+
+/* Conversion flags. */
+#define PRINT_C_CHAR            1
+#define PRINT_C_SHORT           2
+#define PRINT_C_LONG            3
+#define PRINT_C_LLONG           4
+#define PRINT_C_LDOUBLE         5
+#define PRINT_C_SIZE            6
+#define PRINT_C_PTRDIFF         7
+#define PRINT_C_INTMAX          8
+
+#ifndef MAX	/* helper macros: arguments are fully parenthesized but may be evaluated more than once */
+#define MAX(x, y) (((x) >= (y)) ? (x) : (y))
+#endif	/* !defined(MAX) */
+#ifndef CHARTOINT	/* numeric value of a decimal digit character */
+#define CHARTOINT(ch) ((ch) - '0')
+#endif	/* !defined(CHARTOINT) */
+#ifndef ISDIGIT	/* compares as unsigned char so negative char values are never digits */
+#define ISDIGIT(ch) ('0' <= (unsigned char)(ch) && (unsigned char)(ch) <= '9')
+#endif	/* !defined(ISDIGIT) */
+#ifndef ISNAN	/* a NaN is the only value that compares unequal to itself */
+#define ISNAN(x) ((x) != (x))
+#endif	/* !defined(ISNAN) */
+#ifndef ISINF	/* nonzero and unchanged by doubling */
+#define ISINF(x) ((x) != 0.0 && (x) + (x) == (x))
+#endif	/* !defined(ISINF) */
+
+#ifdef OUTCHAR
+#undef OUTCHAR
+#endif	/* defined(OUTCHAR) */
+#define OUTCHAR(str, len, size, ch)                                          \
+do {                                                                         \
+	if (len + 1 < size)                                                  \
+		str[len] = ch;                                               \
+	(len)++;                                                             \
+} while (/* CONSTCOND */ 0)
+
+static void fmtstr(char *, size_t *, size_t, const char *, int, int, int);
+static void fmtint(char *, size_t *, size_t, INTMAX_T, int, int, int, int);
+static void fmtflt(char *, size_t *, size_t, LDOUBLE, int, int, int, int *);
+static void printsep(char *, size_t *, size_t);
+static int getnumsep(int);
+static int getexponent(LDOUBLE);
+static int convert(UINTMAX_T, char *, size_t, int, int);
+static UINTMAX_T cast(LDOUBLE);
+static UINTMAX_T myround(LDOUBLE);
+static LDOUBLE mypow10(int);
+
+int
+rpl_vsnprintf(char *str, size_t size, const char *format, va_list args)
+{
+	LDOUBLE fvalue;
+	INTMAX_T value;
+	unsigned char cvalue;
+	const char *strvalue;
+	INTMAX_T *intmaxptr;
+	PTRDIFF_T *ptrdiffptr;
+	SSIZE_T *sizeptr;
+	LLONG *llongptr;
+	long int *longptr;
+	int *intptr;
+	short int *shortptr;
+	signed char *charptr;
+	size_t len = 0;
+	int overflow = 0;
+	int base = 0;
+	int cflags = 0;
+	int flags = 0;
+	int width = 0;
+	int precision = -1;
+	int state = PRINT_S_DEFAULT;
+	char ch = *format++;
+
+	/*
+	 * C99 says: "If `n' is zero, nothing is written, and `s' may be a null
+	 * pointer." (7.19.6.5, 2)  We're forgiving and allow a NULL pointer
+	 * even if a size larger than zero was specified.  At least NetBSD's
+	 * snprintf(3) does the same, as well as other versions of this file.
+	 * (Though some of these versions will write to a non-NULL buffer even
+	 * if a size of zero was specified, which violates the standard.)
+	 */
+	if (str == NULL && size != 0)
+		size = 0;
+
+	while (ch != '\0')
+		switch (state) {
+		case PRINT_S_DEFAULT:
+			if (ch == '%')
+				state = PRINT_S_FLAGS;
+			else
+				OUTCHAR(str, len, size, ch);
+			ch = *format++;
+			break;
+		case PRINT_S_FLAGS:
+			switch (ch) {
+			case '-':
+				flags |= PRINT_F_MINUS;
+				ch = *format++;
+				break;
+			case '+':
+				flags |= PRINT_F_PLUS;
+				ch = *format++;
+				break;
+			case ' ':
+				flags |= PRINT_F_SPACE;
+				ch = *format++;
+				break;
+			case '#':
+				flags |= PRINT_F_NUM;
+				ch = *format++;
+				break;
+			case '0':
+				flags |= PRINT_F_ZERO;
+				ch = *format++;
+				break;
+			case '\'':	/* SUSv2 flag (not in C99). */
+				flags |= PRINT_F_QUOTE;
+				ch = *format++;
+				break;
+			default:
+				state = PRINT_S_WIDTH;
+				break;
+			}
+			break;
+		case PRINT_S_WIDTH:
+			if (ISDIGIT(ch)) {
+				ch = CHARTOINT(ch);
+				if (width > (INT_MAX - ch) / 10) {
+					overflow = 1;
+					goto out;
+				}
+				width = 10 * width + ch;
+				ch = *format++;
+			} else if (ch == '*') {
+				/*
+				 * C99 says: "A negative field width argument is
+				 * taken as a `-' flag followed by a positive
+				 * field width." (7.19.6.1, 5)
+				 */
+				if ((width = va_arg(args, int)) < 0) {
+					flags |= PRINT_F_MINUS;
+					width = -width;
+				}
+				ch = *format++;
+				state = PRINT_S_DOT;
+			} else
+				state = PRINT_S_DOT;
+			break;
+		case PRINT_S_DOT:
+			if (ch == '.') {
+				state = PRINT_S_PRECISION;
+				ch = *format++;
+			} else
+				state = PRINT_S_MOD;
+			break;
+		case PRINT_S_PRECISION:
+			if (precision == -1)
+				precision = 0;
+			if (ISDIGIT(ch)) {
+				ch = CHARTOINT(ch);
+				if (precision > (INT_MAX - ch) / 10) {
+					overflow = 1;
+					goto out;
+				}
+				precision = 10 * precision + ch;
+				ch = *format++;
+			} else if (ch == '*') {
+				/*
+				 * C99 says: "A negative precision argument is
+				 * taken as if the precision were omitted."
+				 * (7.19.6.1, 5)
+				 */
+				if ((precision = va_arg(args, int)) < 0)
+					precision = -1;
+				ch = *format++;
+				state = PRINT_S_MOD;
+			} else
+				state = PRINT_S_MOD;
+			break;
+		case PRINT_S_MOD:
+			switch (ch) {
+			case 'h':
+				ch = *format++;
+				if (ch == 'h') {	/* It's a char. */
+					ch = *format++;
+					cflags = PRINT_C_CHAR;
+				} else
+					cflags = PRINT_C_SHORT;
+				break;
+			case 'l':
+				ch = *format++;
+				if (ch == 'l') {	/* It's a long long. */
+					ch = *format++;
+					cflags = PRINT_C_LLONG;
+				} else
+					cflags = PRINT_C_LONG;
+				break;
+			case 'L':
+				cflags = PRINT_C_LDOUBLE;
+				ch = *format++;
+				break;
+			case 'j':
+				cflags = PRINT_C_INTMAX;
+				ch = *format++;
+				break;
+			case 't':
+				cflags = PRINT_C_PTRDIFF;
+				ch = *format++;
+				break;
+			case 'z':
+				cflags = PRINT_C_SIZE;
+				ch = *format++;
+				break;
+			}
+			state = PRINT_S_CONV;
+			break;
+		case PRINT_S_CONV:
+			switch (ch) {
+			case 'd':
+				/* FALLTHROUGH */
+			case 'i':
+				switch (cflags) {
+				case PRINT_C_CHAR:
+					value = (signed char)va_arg(args, int);
+					break;
+				case PRINT_C_SHORT:
+					value = (short int)va_arg(args, int);
+					break;
+				case PRINT_C_LONG:
+					value = va_arg(args, long int);
+					break;
+				case PRINT_C_LLONG:
+					value = va_arg(args, LLONG);
+					break;
+				case PRINT_C_SIZE:
+					value = va_arg(args, SSIZE_T);
+					break;
+				case PRINT_C_INTMAX:
+					value = va_arg(args, INTMAX_T);
+					break;
+				case PRINT_C_PTRDIFF:
+					value = va_arg(args, PTRDIFF_T);
+					break;
+				default:
+					value = va_arg(args, int);
+					break;
+				}
+				fmtint(str, &len, size, value, 10, width,
+				    precision, flags);
+				break;
+			case 'X':
+				flags |= PRINT_F_UP;
+				/* FALLTHROUGH */
+			case 'x':
+				base = 16;
+				/* FALLTHROUGH */
+			case 'o':
+				if (base == 0)
+					base = 8;
+				/* FALLTHROUGH */
+			case 'u':
+				if (base == 0)
+					base = 10;
+				flags |= PRINT_F_UNSIGNED;
+				switch (cflags) {
+				case PRINT_C_CHAR:
+					value = (unsigned char)va_arg(args,
+					    unsigned int);
+					break;
+				case PRINT_C_SHORT:
+					value = (unsigned short int)va_arg(args,
+					    unsigned int);
+					break;
+				case PRINT_C_LONG:
+					value = va_arg(args, unsigned long int);
+					break;
+				case PRINT_C_LLONG:
+					value = va_arg(args, ULLONG);
+					break;
+				case PRINT_C_SIZE:
+					value = va_arg(args, size_t);
+					break;
+				case PRINT_C_INTMAX:
+					value = va_arg(args, UINTMAX_T);
+					break;
+				case PRINT_C_PTRDIFF:
+					value = va_arg(args, UPTRDIFF_T);
+					break;
+				default:
+					value = va_arg(args, unsigned int);
+					break;
+				}
+				fmtint(str, &len, size, value, base, width,
+				    precision, flags);
+				break;
+			case 'A':
+				/* Not yet supported, we'll use "%F". */
+				/* FALLTHROUGH */
+			case 'F':
+				flags |= PRINT_F_UP;
+			case 'a':
+				/* Not yet supported, we'll use "%f". */
+				/* FALLTHROUGH */
+			case 'f':
+				if (cflags == PRINT_C_LDOUBLE)
+					fvalue = va_arg(args, LDOUBLE);
+				else
+					fvalue = va_arg(args, double);
+				fmtflt(str, &len, size, fvalue, width,
+				    precision, flags, &overflow);
+				if (overflow)
+					goto out;
+				break;
+			case 'E':
+				flags |= PRINT_F_UP;
+				/* FALLTHROUGH */
+			case 'e':
+				flags |= PRINT_F_TYPE_E;
+				if (cflags == PRINT_C_LDOUBLE)
+					fvalue = va_arg(args, LDOUBLE);
+				else
+					fvalue = va_arg(args, double);
+				fmtflt(str, &len, size, fvalue, width,
+				    precision, flags, &overflow);
+				if (overflow)
+					goto out;
+				break;
+			case 'G':
+				flags |= PRINT_F_UP;
+				/* FALLTHROUGH */
+			case 'g':
+				flags |= PRINT_F_TYPE_G;
+				if (cflags == PRINT_C_LDOUBLE)
+					fvalue = va_arg(args, LDOUBLE);
+				else
+					fvalue = va_arg(args, double);
+				/*
+				 * If the precision is zero, it is treated as
+				 * one (cf. C99: 7.19.6.1, 8).
+				 */
+				if (precision == 0)
+					precision = 1;
+				fmtflt(str, &len, size, fvalue, width,
+				    precision, flags, &overflow);
+				if (overflow)
+					goto out;
+				break;
+			case 'c':
+				cvalue = (unsigned char)va_arg(args, int);
+				OUTCHAR(str, len, size, cvalue);
+				break;
+			case 's':
+				strvalue = va_arg(args, char *);
+				fmtstr(str, &len, size, strvalue, width,
+				    precision, flags);
+				break;
+			case 'p':
+				/*
+				 * C99 says: "The value of the pointer is
+				 * converted to a sequence of printing
+				 * characters, in an implementation-defined
+				 * manner." (C99: 7.19.6.1, 8)
+				 */
+				if ((strvalue = va_arg(args, void *)) == NULL)
+					/*
+					 * We use the glibc format.  BSD prints
+					 * "0x0", SysV "0".
+					 */
+					fmtstr(str, &len, size, "(nil)", width,
+					    -1, flags);
+				else {
+					/*
+					 * We use the BSD/glibc format.  SysV
+					 * omits the "0x" prefix (which we emit
+					 * using the PRINT_F_NUM flag).
+					 */
+					flags |= PRINT_F_NUM;
+					flags |= PRINT_F_UNSIGNED;
+					fmtint(str, &len, size,
+					    (UINTPTR_T)strvalue, 16, width,
+					    precision, flags);
+				}
+				break;
+			case 'n':
+				switch (cflags) {
+				case PRINT_C_CHAR:
+					charptr = va_arg(args, signed char *);
+					*charptr = len;
+					break;
+				case PRINT_C_SHORT:
+					shortptr = va_arg(args, short int *);
+					*shortptr = len;
+					break;
+				case PRINT_C_LONG:
+					longptr = va_arg(args, long int *);
+					*longptr = len;
+					break;
+				case PRINT_C_LLONG:
+					llongptr = va_arg(args, LLONG *);
+					*llongptr = len;
+					break;
+				case PRINT_C_SIZE:
+					/*
+					 * C99 says that with the "z" length
+					 * modifier, "a following `n' conversion
+					 * specifier applies to a pointer to a
+					 * signed integer type corresponding to
+					 * size_t argument." (7.19.6.1, 7)
+					 */
+					sizeptr = va_arg(args, SSIZE_T *);
+					*sizeptr = len;
+					break;
+				case PRINT_C_INTMAX:
+					intmaxptr = va_arg(args, INTMAX_T *);
+					*intmaxptr = len;
+					break;
+				case PRINT_C_PTRDIFF:
+					ptrdiffptr = va_arg(args, PTRDIFF_T *);
+					*ptrdiffptr = len;
+					break;
+				default:
+					intptr = va_arg(args, int *);
+					*intptr = len;
+					break;
+				}
+				break;
+			case '%':	/* Print a "%" character verbatim. */
+				OUTCHAR(str, len, size, ch);
+				break;
+			default:	/* Skip other characters. */
+				break;
+			}
+			ch = *format++;
+			state = PRINT_S_DEFAULT;
+			base = cflags = flags = width = 0;
+			precision = -1;
+			break;
+		}
+out:
+	if (len < size)
+		str[len] = '\0';
+	else if (size > 0)
+		str[size - 1] = '\0';
+
+	if (overflow || len >= INT_MAX) {
+		return -1;
+	}
+	return (int)len;
+}
+
+/*
+ * Emit "value" (or "(null)" if it is NULL) through OUTCHAR.  Unless the
+ * precision is -1, at most "precision" characters are emitted.  The result
+ * is padded with spaces to "width" characters: on the left by default, on
+ * the right if PRINT_F_MINUS is set in "flags".
+ */
+static void
+fmtstr(char *str, size_t *len, size_t size, const char *value, int width,
+       int precision, int flags)
+{
+	int unbounded = (precision == -1);
+	int nchars = 0;	/* Characters of "value" to emit. */
+	int pad;
+	int i;
+
+	if (value == NULL)
+		value = "(null)";
+
+	/* Never look at the string beyond the requested precision. */
+	while (value[nchars] != '\0' && (unbounded || nchars < precision))
+		nchars++;
+
+	pad = (width > nchars) ? width - nchars : 0;
+
+	if (!(flags & PRINT_F_MINUS))	/* Right justify: pad first. */
+		for (i = 0; i < pad; i++)
+			OUTCHAR(str, *len, size, ' ');
+	for (i = 0; i < nchars; i++)	/* The string itself. */
+		OUTCHAR(str, *len, size, value[i]);
+	if (flags & PRINT_F_MINUS)	/* Left justify: pad last. */
+		for (i = 0; i < pad; i++)
+			OUTCHAR(str, *len, size, ' ');
+}
+
+/*
+ * Format the integer "value" in the given "base" and emit it through OUTCHAR,
+ * honoring "width", "precision" (-1 if none was given), and the PRINT_F_*
+ * bits in "flags".  With PRINT_F_UNSIGNED, "value" is taken as a UINTMAX_T.
+ */
+static void
+fmtint(char *str, size_t *len, size_t size, INTMAX_T value, int base, int width,
+       int precision, int flags)
+{
+	UINTMAX_T uvalue;
+	char iconvert[MAX_CONVERT_LENGTH];
+	char sign = 0;
+	char hexprefix = 0;
+	int spadlen = 0;	/* Amount to space pad. */
+	int zpadlen = 0;	/* Amount to zero pad. */
+	int pos;
+	int separators = (flags & PRINT_F_QUOTE);
+	int noprecision = (precision == -1);
+
+	if (flags & PRINT_F_UNSIGNED)
+		uvalue = value;
+	else {
+		/* Negate as unsigned; "-value" overflows for the minimum. */
+		uvalue = (value >= 0) ? value : -(UINTMAX_T)value;
+		if (value < 0)
+			sign = '-';
+		else if (flags & PRINT_F_PLUS)	/* Do a sign. */
+			sign = '+';
+		else if (flags & PRINT_F_SPACE)
+			sign = ' ';
+	}
+
+	pos = convert(uvalue, iconvert, sizeof(iconvert), base,
+	    flags & PRINT_F_UP);
+
+	if (flags & PRINT_F_NUM && uvalue != 0) {
+		/*
+		 * C99 says: "The result is converted to an `alternative form'.
+		 * For `o' conversion, it increases the precision, if and only
+		 * if necessary, to force the first digit of the result to be a
+		 * zero (if the value and precision are both 0, a single 0 is
+		 * printed).  For `x' (or `X') conversion, a nonzero result has
+		 * `0x' (or `0X') prefixed to it." (7.19.6.1, 6)
+		 */
+		switch (base) {
+		case 8:
+			if (precision <= pos)
+				precision = pos + 1;
+			break;
+		case 16:
+			hexprefix = (flags & PRINT_F_UP) ? 'X' : 'x';
+			break;
+		}
+	}
+
+	if (separators)	/* Get the number of group separators we'll print. */
+		separators = getnumsep(pos);
+
+	zpadlen = precision - pos - separators;
+	spadlen = width                         /* Minimum field width. */
+	    - separators                        /* Number of separators. */
+	    - MAX(precision, pos)               /* Number of integer digits. */
+	    - ((sign != 0) ? 1 : 0)             /* Will we print a sign? */
+	    - ((hexprefix != 0) ? 2 : 0);       /* Will we print a prefix? */
+
+	if (zpadlen < 0)
+		zpadlen = 0;
+	if (spadlen < 0)
+		spadlen = 0;
+
+	/*
+	 * C99 says: "If the `0' and `-' flags both appear, the `0' flag is
+	 * ignored.  For `d', `i', `o', `u', `x', and `X' conversions, if a
+	 * precision is specified, the `0' flag is ignored." (7.19.6.1, 6)
+	 */
+	if (flags & PRINT_F_MINUS)	/* Left justify. */
+		spadlen = -spadlen;
+	else if (flags & PRINT_F_ZERO && noprecision) {
+		zpadlen += spadlen;
+		spadlen = 0;
+	}
+	for (; spadlen > 0; spadlen--)	/* Leading spaces. */
+		OUTCHAR(str, *len, size, ' ');
+	if (sign != 0)	/* Sign. */
+		OUTCHAR(str, *len, size, sign);
+	if (hexprefix != 0) {	/* A "0x" or "0X" prefix. */
+		OUTCHAR(str, *len, size, '0');
+		OUTCHAR(str, *len, size, hexprefix);
+	}
+	for (; zpadlen > 0; zpadlen--)	/* Leading zeros. */
+		OUTCHAR(str, *len, size, '0');
+	while (pos > 0) {	/* The actual digits. */
+		pos--;
+		OUTCHAR(str, *len, size, iconvert[pos]);
+		if (separators > 0 && pos > 0 && pos % 3 == 0)
+			printsep(str, len, size);
+	}
+	for (; spadlen < 0; spadlen++)	/* Trailing spaces. */
+		OUTCHAR(str, *len, size, ' ');
+}
+
+/*
+ * Emit "fvalue" through OUTCHAR in "%f", "%e", or "%g" style, depending on
+ * the PRINT_F_TYPE_* flags.  Sets "*overflow" if the value is too large.
+ */
+static void
+fmtflt(char *str, size_t *len, size_t size, LDOUBLE fvalue, int width,
+       int precision, int flags, int *overflow)
+{
+	LDOUBLE ufvalue;	/* Magnitude of fvalue (scaled for "e-style"). */
+	UINTMAX_T intpart;	/* Integer part of ufvalue. */
+	UINTMAX_T fracpart;	/* Fractional part, scaled by mask and rounded. */
+	UINTMAX_T mask;		/* Ten to the power of the precision. */
+	const char *infnan = NULL;	/* Text for an infinite or NaN value. */
+	char iconvert[MAX_CONVERT_LENGTH];
+	char fconvert[MAX_CONVERT_LENGTH];
+	char econvert[4];	/* "e-12" (without nul-termination). */
+	char esign = 0;
+	char sign = 0;
+	int leadfraczeros = 0;	/* Zeros between the point and fconvert. */
+	int exponent = 0;
+	int emitpoint = 0;	/* Print a decimal point? */
+	int omitzeros = 0;	/* Strip trailing fraction zeros ("%g")? */
+	int omitcount = 0;	/* Number of fconvert characters skipped. */
+	int padlen = 0;	/* Amount to pad. */
+	int epos = 0;	/* Characters stored in econvert. */
+	int fpos = 0;	/* Characters stored in fconvert. */
+	int ipos = 0;	/* Characters stored in iconvert. */
+	int separators = (flags & PRINT_F_QUOTE);
+	int estyle = (flags & PRINT_F_TYPE_E);
+#if HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT
+	struct lconv *lc = localeconv();
+#endif	/* HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT */
+
+	/*
+	 * AIX' man page says the default is 0, but C99 and at least Solaris'
+	 * and NetBSD's man pages say the default is 6, and sprintf(3) on AIX
+	 * defaults to 6.
+	 */
+	if (precision == -1)
+		precision = 6;
+
+	if (fvalue < 0.0)
+		sign = '-';
+	else if (flags & PRINT_F_PLUS)	/* Do a sign. */
+		sign = '+';
+	else if (flags & PRINT_F_SPACE)
+		sign = ' ';
+
+	if (ISNAN(fvalue))
+		infnan = (flags & PRINT_F_UP) ? "NAN" : "nan";
+	else if (ISINF(fvalue))
+		infnan = (flags & PRINT_F_UP) ? "INF" : "inf";
+
+	if (infnan != NULL) {	/* iconvert stays unterminated; ipos bounds it. */
+		if (sign != 0)
+			iconvert[ipos++] = sign;
+		while (*infnan != '\0')
+			iconvert[ipos++] = *infnan++;
+		fmtstr(str, len, size, iconvert, width, ipos, flags);
+		return;
+	}
+
+	/* "%e" (or "%E") or "%g" (or "%G") conversion. */
+	if (flags & PRINT_F_TYPE_E || flags & PRINT_F_TYPE_G) {
+		if (flags & PRINT_F_TYPE_G) {
+			/*
+			 * For "%g" (and "%G") conversions, the precision is the
+			 * number of significant digits, including those of the
+			 * integer part.  Whether "e-style" is used depends on
+			 * the precision and on the exponent, but rounding may
+			 * still change the exponent, so that is decided later.
+			 * Until then, assume "e-style" (in order to get the
+			 * exponent calculated), for which the precision must be
+			 * decremented by one.
+			 */
+			precision--;
+			/*
+			 * For "%g" (and "%G") conversions, trailing zeros are
+			 * removed from the fractional portion of the result
+			 * unless the "#" flag was specified.
+			 */
+			if (!(flags & PRINT_F_NUM))
+				omitzeros = 1;
+		}
+		exponent = getexponent(fvalue);
+		estyle = 1;
+	}
+
+again:
+	/*
+	 * Sorry, we only support 9, 19, or 38 digits (that is, the number of
+	 * digits of the 32-bit, the 64-bit, or the 128-bit UINTMAX_MAX value
+	 * minus one) past the decimal point due to our conversion method.
+	 */
+	switch (sizeof(UINTMAX_T)) {
+	case 16:
+		if (precision > 38)
+			precision = 38;
+		break;
+	case 8:
+		if (precision > 19)
+			precision = 19;
+		break;
+	default:
+		if (precision > 9)
+			precision = 9;
+		break;
+	}
+
+	ufvalue = (fvalue >= 0.0) ? fvalue : -fvalue;
+	if (estyle)	/* We want exactly one integer digit. */
+		ufvalue /= mypow10(exponent);
+
+	if ((intpart = cast(ufvalue)) == UINTMAX_MAX) {
+		*overflow = 1;
+		return;
+	}
+
+	/*
+	 * Factor of ten with the number of digits needed for the fractional
+	 * part.  For example, if the precision is 3, the mask will be 1000.
+	 */
+	mask = (UINTMAX_T)mypow10(precision);
+	/*
+	 * We "cheat" by converting the fractional part to integer by
+	 * multiplying by a factor of ten.
+	 */
+	if ((fracpart = myround(mask * (ufvalue - intpart))) >= mask) {
+		/*
+		 * For example, ufvalue = 2.99962, intpart = 2, and mask = 1000
+		 * (because precision = 3).  Now, myround(1000 * 0.99962) will
+		 * return 1000.  So, the integer part must be incremented by one
+		 * and the fractional part must be set to zero.
+		 */
+		intpart++;
+		fracpart = 0;
+		if (estyle && intpart == 10) {
+			/*
+			 * The value was rounded up to ten, but we only want one
+			 * integer digit if using "e-style".  So, the integer
+			 * part must be set to one and the exponent must be
+			 * incremented by one.
+			 */
+			intpart = 1;
+			exponent++;
+		}
+	}
+
+	/*
+	 * Now that we know the real exponent, we can check whether or not to
+	 * use "e-style" for "%g" (and "%G") conversions.  If we don't need
+	 * "e-style", the precision must be adjusted and the integer and
+	 * fractional parts must be recalculated from the original value.
+	 *
+	 * C99 says: "Let P equal the precision if nonzero, 6 if the precision
+	 * is omitted, or 1 if the precision is zero.  Then, if a conversion
+	 * with style `E' would have an exponent of X:
+	 *
+	 * - if P > X >= -4, the conversion is with style `f' (or `F') and
+	 *   precision P - (X + 1).
+	 *
+	 * - otherwise, the conversion is with style `e' (or `E') and precision
+	 *   P - 1." (7.19.6.1, 8)
+	 *
+	 * Note that we had decremented the precision by one.
+	 */
+	if (flags & PRINT_F_TYPE_G && estyle &&
+	    precision + 1 > exponent && exponent >= -4) {
+		precision -= exponent;
+		estyle = 0;
+		goto again;
+	}
+
+	if (estyle) {
+		if (exponent < 0) {
+			exponent = -exponent;
+			esign = '-';
+		} else
+			esign = '+';
+
+		/*
+		 * Convert the exponent.  econvert has room for four characters
+		 * (e.g. "e+99" or "e-99"); exponents with more than two digits
+		 * are unsupported, so the stores below stay within the buffer.
+		 */
+		epos = convert(exponent, econvert, 2, 10, 0);
+		/*
+		 * C99 says: "The exponent always contains at least two digits,
+		 * and only as many more digits as necessary to represent the
+		 * exponent." (7.19.6.1, 8)
+		 */
+		if (epos == 1)
+			econvert[epos++] = '0';
+		econvert[epos++] = esign;
+		econvert[epos++] = (flags & PRINT_F_UP) ? 'E' : 'e';
+	}
+
+	/* Convert the integer part and the fractional part. */
+	ipos = convert(intpart, iconvert, sizeof(iconvert), 10, 0);
+	if (fracpart != 0)	/* convert() would return 1 if fracpart == 0. */
+		fpos = convert(fracpart, fconvert, sizeof(fconvert), 10, 0);
+
+	leadfraczeros = precision - fpos;
+
+	if (omitzeros) {
+		if (fpos > 0)	/* Omit trailing fractional part zeros. */
+			while (omitcount < fpos && fconvert[omitcount] == '0')
+				omitcount++;
+		else {	/* The fractional part is zero, omit it completely. */
+			omitcount = precision;
+			leadfraczeros = 0;
+		}
+		precision -= omitcount;
+	}
+
+	/*
+	 * Print a decimal point if either the fractional part is non-zero
+	 * and/or the "#" flag was specified.
+	 */
+	if (precision > 0 || flags & PRINT_F_NUM)
+		emitpoint = 1;
+	if (separators)	/* Get the number of group separators we'll print. */
+		separators = getnumsep(ipos);
+
+	padlen = width                  /* Minimum field width. */
+	    - ipos                      /* Number of integer digits. */
+	    - epos                      /* Number of exponent characters. */
+	    - precision                 /* Number of fractional digits. */
+	    - separators                /* Number of group separators. */
+	    - (emitpoint ? 1 : 0)       /* Will we print a decimal point? */
+	    - ((sign != 0) ? 1 : 0);    /* Will we print a sign character? */
+
+	if (padlen < 0)
+		padlen = 0;
+
+	/*
+	 * C99 says: "If the `0' and `-' flags both appear, the `0' flag is
+	 * ignored." (7.19.6.1, 6)
+	 */
+	if (flags & PRINT_F_MINUS)	/* Left justify. */
+		padlen = -padlen;
+	else if (flags & PRINT_F_ZERO && padlen > 0) {
+		if (sign != 0) {	/* Sign. */
+			OUTCHAR(str, *len, size, sign);
+			sign = 0;
+		}
+		while (padlen > 0) {	/* Leading zeros. */
+			OUTCHAR(str, *len, size, '0');
+			padlen--;
+		}
+	}
+	while (padlen > 0) {	/* Leading spaces. */
+		OUTCHAR(str, *len, size, ' ');
+		padlen--;
+	}
+	if (sign != 0)	/* Sign. */
+		OUTCHAR(str, *len, size, sign);
+	while (ipos > 0) {	/* Integer part. */
+		ipos--;
+		OUTCHAR(str, *len, size, iconvert[ipos]);
+		if (separators > 0 && ipos > 0 && ipos % 3 == 0)
+			printsep(str, len, size);
+	}
+	if (emitpoint) {	/* Decimal point. */
+#if HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT
+		if (lc->decimal_point != NULL && *lc->decimal_point != '\0')
+			OUTCHAR(str, *len, size, *lc->decimal_point);
+		else	/* We'll always print some decimal point character. */
+#endif	/* HAVE_LOCALECONV && HAVE_LCONV_DECIMAL_POINT */
+			OUTCHAR(str, *len, size, '.');
+	}
+	while (leadfraczeros > 0) {	/* Leading fractional part zeros. */
+		OUTCHAR(str, *len, size, '0');
+		leadfraczeros--;
+	}
+	while (fpos > omitcount) {	/* The remaining fractional part. */
+		fpos--;
+		OUTCHAR(str, *len, size, fconvert[fpos]);
+	}
+	while (epos > 0) {	/* Exponent. */
+		epos--;
+		OUTCHAR(str, *len, size, econvert[epos]);
+	}
+	while (padlen < 0) {	/* Trailing spaces. */
+		OUTCHAR(str, *len, size, ' ');
+		padlen++;
+	}
+}
+
+/* Emit one thousands separator: the locale's if available, else a comma. */
+static void
+printsep(char *str, size_t *len, size_t size)
+{
+	const char *sep = ",";
+#if HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP
+	struct lconv *lc = localeconv();
+
+	if (lc->thousands_sep != NULL)
+		sep = lc->thousands_sep;
+#endif	/* HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP */
+	for (; *sep != '\0'; sep++)
+		OUTCHAR(str, *len, size, *sep);
+}
+
+/* Return the number of characters the group separators will occupy. */
+static int
+getnumsep(int digits)
+{
+	int groups = (digits - (digits % 3 == 0)) / 3;
+#if HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP
+	const char *sep = localeconv()->thousands_sep;
+	int seplen = 0;
+
+	if (sep == NULL)	/* printsep() will emit ",". */
+		return groups;
+	while (sep[seplen] != '\0')	/* Any length is fine, even zero. */
+		seplen++;
+	groups *= seplen;
+#endif	/* HAVE_LOCALECONV && HAVE_LCONV_THOUSANDS_SEP */
+	return groups;
+}
+
+/* Return the decimal exponent of "value", clamped to the range [-99, 99]. */
+static int
+getexponent(LDOUBLE value)
+{
+	LDOUBLE mag = (value >= 0.0) ? value : -value;
+	int exp10 = 0;
+
+	/* The clamping also bounds both loops for an infinite "value". */
+	while (mag > 0.0 && mag < 1.0) {
+		if (--exp10 <= -99)
+			break;
+		mag *= 10;
+	}
+	while (mag >= 10.0) {
+		if (++exp10 >= 99)
+			break;
+		mag /= 10;
+	}
+	return exp10;
+}
+
+/* Store the digits of "value" in "buf", in reverse order and unterminated. */
+static int
+convert(UINTMAX_T value, char *buf, size_t size, int base, int caps)
+{
+	const char *alphabet = caps ? "0123456789ABCDEF" : "0123456789abcdef";
+	size_t n = 0;
+
+	/* One digit is stored unconditionally; the count is returned. */
+	do {
+		buf[n] = alphabet[value % base];
+		value /= base;
+	} while (++n < size && value != 0);
+	return (int)n;
+}
+
+/* Convert "value" to UINTMAX_T; values too large yield UINTMAX_MAX. */
+static UINTMAX_T
+cast(LDOUBLE value)
+{
+	UINTMAX_T truncated;
+
+	/*
+	 * Use ">=" rather than ">": UINTMAX_MAX may not be representable
+	 * exactly as an LDOUBLE and may then be rounded up for the comparison
+	 * (cf. C99: 6.3.1.4, 2), so an equal "value" could already overflow.
+	 */
+	if (value >= UINTMAX_MAX)
+		return UINTMAX_MAX;
+
+	/*
+	 * Some platforms (NetBSD/sparc64 3.0.2 and 4.99.30, at least) round,
+	 * e.g. 1.9 to 2, when casting long double to an integer; undo that.
+	 */
+	truncated = (UINTMAX_T)value;
+	if (!(truncated <= value))
+		truncated--;
+	return truncated;
+}
+
+static UINTMAX_T
+myround(LDOUBLE value)
+{
+	UINTMAX_T whole = cast(value);	/* Integer part (or UINTMAX_MAX). */
+	LDOUBLE frac = value - whole;	/* Fractional remainder. */
+	return (frac < 0.5) ? whole : whole + 1;	/* Halves round up. */
+}
+
+/*
+ * Return 10^exponent by repeated multiplication (or division) by ten.
+ */
+static LDOUBLE
+mypow10(int exponent)
+{
+	LDOUBLE power = 1;
+	int i;
+
+	for (i = 0; i < exponent; i++)
+		power *= 10;
+	for (i = 0; i > exponent; i--)
+		power /= 10;
+	return power;
+}
+#endif	/* !HAVE_VSNPRINTF */
+
+#if !HAVE_VASPRINTF
+#if NEED_MYMEMCPY
+void *
+mymemcpy(void *dst, void *src, size_t len)
+{
+	const char *in = src;
+	char *out = dst;
+
+	/* Unoptimized on purpose: this merely stands in for va_copy(3). */
+	for (; len > 0; len--)
+		*out++ = *in++;
+	return dst;
+}
+#endif	/* NEED_MYMEMCPY */
+
+int
+rpl_vasprintf(char **ret, const char *format, va_list ap)
+{
+	va_list measure;	/* Copy of "ap" consumed by the sizing pass. */
+	size_t bufsize;
+	int needed;
+
+	VA_COPY(measure, ap);
+	needed = vsnprintf(NULL, 0, format, measure);
+	VA_END_COPY(measure);
+	if (needed < 0 || (*ret = malloc(bufsize = needed + 1)) == NULL)
+		return -1;	/* Formatting or allocation failed. */
+	return vsnprintf(*ret, bufsize, format, ap);
+}
+#endif	/* !HAVE_VASPRINTF */
+
+#if !HAVE_SNPRINTF
+#if HAVE_STDARG_H
+int
+rpl_snprintf(char *str, size_t size, const char *format, ...)
+#else	/* Old-style definition using va_alist and va_dcl. */
+int
+rpl_snprintf(va_alist) va_dcl
+#endif	/* HAVE_STDARG_H */
+{
+#if !HAVE_STDARG_H
+	char *str;	/* Not parameters in this variant, so declared here. */
+	size_t size;
+	char *format;
+#endif	/* !HAVE_STDARG_H */
+	va_list ap;
+	int len;	/* Result of vsnprintf, returned unchanged. */
+
+	VA_START(ap, format);
+	VA_SHIFT(ap, str, char *);	/* NOTE(review): VA_SHIFT is defined */
+	VA_SHIFT(ap, size, size_t);	/* elsewhere; presumably it fetches */
+	VA_SHIFT(ap, format, const char *);	/* the fixed arguments; confirm. */
+	len = vsnprintf(str, size, format, ap);	/* Does all the formatting. */
+	va_end(ap);
+	return len;
+}
+#endif	/* !HAVE_SNPRINTF */
+
+#if !HAVE_ASPRINTF
+#if HAVE_STDARG_H
+int
+rpl_asprintf(char **ret, const char *format, ...)
+#else	/* Old-style definition using va_alist and va_dcl. */
+int
+rpl_asprintf(va_alist) va_dcl
+#endif	/* HAVE_STDARG_H */
+{
+#if !HAVE_STDARG_H
+	char **ret;	/* Not parameters in this variant, so declared here. */
+	char *format;
+#endif	/* !HAVE_STDARG_H */
+	va_list ap;
+	int len;	/* Result of vasprintf, returned unchanged. */
+
+	VA_START(ap, format);
+	VA_SHIFT(ap, ret, char **);	/* NOTE(review): VA_SHIFT is defined */
+	VA_SHIFT(ap, format, const char *);	/* elsewhere; confirm its effect. */
+	len = vasprintf(ret, format, ap);	/* Allocates and formats. */
+	va_end(ap);
+	return len;
+}
+#endif	/* !HAVE_ASPRINTF */
+#else	/* Dummy declaration to avoid empty translation unit warnings. */
+int main(void);
+#endif	/* !HAVE_SNPRINTF || !HAVE_VSNPRINTF || !HAVE_ASPRINTF || [...] */
+
+
+/* vim: set joinspaces textwidth=80: */