Merge branch 'upstream-gallium-0.1' into nouveau-gallium-0.1

author: Ben Skeggs <skeggsb@gmail.com> 2008-04-23 12:39:38 +1000
committer: Ben Skeggs <skeggsb@gmail.com> 2008-04-23 12:39:38 +1000
commit: 104ff59585ad1888c8cef5ad9de0e2fdb3f48c21 (patch)
tree: 9128984eef4a90cc6177d336759ce795b835d71f /src/gallium
parent: b20acef90695d6e5975f538b6e9cb812b05f0cf6 (diff)
parent: 6fc530ccda2971a5d99a955ad90ae9762238040f (diff)
109 files changed, 5395 insertions, 7577 deletions
diff --git a/src/gallium/SConscript b/src/gallium/SConscript
index f09778ce99..2653f91bd2 100644
--- a/src/gallium/SConscript
+++ b/src/gallium/SConscript
@@ -18,6 +18,7 @@ SConscript([
 	'auxiliary/rtasm/SConscript',
 	'auxiliary/tgsi/SConscript',
 	'auxiliary/cso_cache/SConscript',
+	'auxiliary/translate/SConscript',
 	'auxiliary/draw/SConscript',
 	'auxiliary/pipebuffer/SConscript',
 ])
diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.c b/src/gallium/auxiliary/cso_cache/cso_cache.c
index 18acab0967..63464e0705 100644
--- a/src/gallium/auxiliary/cso_cache/cso_cache.c
+++ b/src/gallium/auxiliary/cso_cache/cso_cache.c
@@ -290,6 +290,8 @@ void * cso_take_state(struct cso_cache *sc,
 struct cso_cache *cso_cache_create(void)
 {
    struct cso_cache *sc = MALLOC_STRUCT(cso_cache);
+   if (sc == NULL)
+      return NULL;
 
    sc->max_size           = 4096;
    sc->blend_hash         = cso_hash_create();
@@ -332,10 +334,10 @@ void cso_for_each_state(struct cso_cache *sc, enum cso_cache_type type,
    iter = cso_hash_first_node(hash);
    while (!cso_hash_iter_is_null(iter)) {
       void *state = cso_hash_iter_data(iter);
+      iter = cso_hash_iter_next(iter);
       if (state) {
          func(state, user_data);
       }
-      iter = cso_hash_iter_next(iter);
    }
 }
 
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 4a1a6cb79c..0523cb1949 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -25,16 +25,20 @@
  *
  **************************************************************************/
 
- /* Wrap the cso cache & hash mechanisms in a simplified
+ /**
+  * @file
+  * 
+  * Wrap the cso cache & hash mechanisms in a simplified
   * pipe-driver-specific interface.
   *
-  * Authors:
-  *   Zack Rusin <zack@tungstengraphics.com>
-  *   Keith Whitwell <keith@tungstengraphics.com>
+  * @author Zack Rusin <zack@tungstengraphics.com>
+  * @author Keith Whitwell <keith@tungstengraphics.com>
   */
 
 #include "pipe/p_state.h"
 #include "pipe/p_util.h"
+#include "pipe/p_inlines.h"
+#include "tgsi/util/tgsi_parse.h"
 
 #include "cso_cache/cso_context.h"
 #include "cso_cache/cso_cache.h"
@@ -135,8 +139,8 @@ void cso_destroy_context( struct cso_context *ctx )
  * the data member of the cso to be the template itself.
  */
 
-void cso_set_blend(struct cso_context *ctx,
-                   const struct pipe_blend_state *templ)
+enum pipe_error cso_set_blend(struct cso_context *ctx,
+                              const struct pipe_blend_state *templ)
 {
    unsigned hash_key = cso_construct_key((void*)templ, sizeof(struct pipe_blend_state));
    struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
@@ -146,6 +150,8 @@ void cso_set_blend(struct cso_context *ctx,
 
    if (cso_hash_iter_is_null(iter)) {
       struct cso_blend *cso = MALLOC(sizeof(struct cso_blend));
+      if (!cso)
+         return PIPE_ERROR_OUT_OF_MEMORY;
 
       cso->state = *templ;
       cso->data = ctx->pipe->create_blend_state(ctx->pipe, &cso->state);
@@ -153,6 +159,11 @@ void cso_set_blend(struct cso_context *ctx,
       cso->context = ctx->pipe;
 
       iter = cso_insert_state(ctx->cache, hash_key, CSO_BLEND, cso);
+      if (cso_hash_iter_is_null(iter)) {
+         FREE(cso);
+         return PIPE_ERROR_OUT_OF_MEMORY;
+      }
+
       handle = cso->data;
    }
    else {
@@ -163,6 +174,7 @@ void cso_set_blend(struct cso_context *ctx,
       ctx->blend = handle;
       ctx->pipe->bind_blend_state(ctx->pipe, handle);
    }
+   return PIPE_OK;
 }
 
 void cso_save_blend(struct cso_context *ctx)
@@ -182,12 +194,12 @@ void cso_restore_blend(struct cso_context *ctx)
 
 
 
-void cso_single_sampler(struct cso_context *ctx,
-                        unsigned idx,
-                        const struct pipe_sampler_state *templ)
+enum pipe_error cso_single_sampler(struct cso_context *ctx,
+                                   unsigned idx,
+                                   const struct pipe_sampler_state *templ)
 {
    void *handle = NULL;
-   
+
    if (templ != NULL) {
       unsigned hash_key = cso_construct_key((void*)templ, sizeof(struct pipe_sampler_state));
       struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
@@ -196,13 +208,20 @@ void cso_single_sampler(struct cso_context *ctx,
 
       if (cso_hash_iter_is_null(iter)) {
          struct cso_sampler *cso = MALLOC(sizeof(struct cso_sampler));
-         
+         if (!cso)
+            return PIPE_ERROR_OUT_OF_MEMORY;
+
          cso->state = *templ;
          cso->data = ctx->pipe->create_sampler_state(ctx->pipe, &cso->state);
          cso->delete_state = (cso_state_callback)ctx->pipe->delete_sampler_state;
          cso->context = ctx->pipe;
 
          iter = cso_insert_state(ctx->cache, hash_key, CSO_SAMPLER, cso);
+         if (cso_hash_iter_is_null(iter)) {
+            FREE(cso);
+            return PIPE_ERROR_OUT_OF_MEMORY;
+         }
+
          handle = cso->data;
       }
       else {
@@ -211,11 +230,12 @@ void cso_single_sampler(struct cso_context *ctx,
    }
 
    ctx->samplers[idx] = handle;
+   return PIPE_OK;
 }
 
 void cso_single_sampler_done( struct cso_context *ctx )
 {
-   unsigned i; 
+   unsigned i;
 
    /* find highest non-null sampler */
    for (i = PIPE_MAX_SAMPLERS; i > 0; i--) {
@@ -226,8 +246,8 @@ void cso_single_sampler_done( struct cso_context *ctx )
    ctx->nr_samplers = i;
 
    if (ctx->hw.nr_samplers != ctx->nr_samplers ||
-       memcmp(ctx->hw.samplers, 
-              ctx->samplers, 
+       memcmp(ctx->hw.samplers,
+              ctx->samplers,
               ctx->nr_samplers * sizeof(void *)) != 0) 
    {
       memcpy(ctx->hw.samplers, ctx->samplers, ctx->nr_samplers * sizeof(void *));
@@ -237,22 +257,36 @@ void cso_single_sampler_done( struct cso_context *ctx )
    }
 }
 
-void cso_set_samplers( struct cso_context *ctx,
-                       unsigned nr,
-                       const struct pipe_sampler_state **templates )
+/*
+ * If the function encounters any errors it will return the
+ * last one. Done to always try to set as many samplers
+ * as possible.
+ */
+enum pipe_error cso_set_samplers( struct cso_context *ctx,
+                                  unsigned nr,
+                                  const struct pipe_sampler_state **templates )
 {
    unsigned i;
-   
+   enum pipe_error temp, error = PIPE_OK;
+
    /* TODO: fastpath
     */
 
-   for (i = 0; i < nr; i++)
-      cso_single_sampler( ctx, i, templates[i] );
+   for (i = 0; i < nr; i++) {
+      temp = cso_single_sampler( ctx, i, templates[i] );
+      if (temp != PIPE_OK)
+         error = temp;
+   }
+
+   for ( ; i < ctx->nr_samplers; i++) {
+      temp = cso_single_sampler( ctx, i, NULL );
+      if (temp != PIPE_OK)
+         error = temp;
+   }
 
-   for ( ; i < ctx->nr_samplers; i++)
-      cso_single_sampler( ctx, i, NULL );
-   
    cso_single_sampler_done( ctx );
+
+   return error;
 }
 
 void cso_save_samplers(struct cso_context *ctx)
@@ -263,44 +297,64 @@ void cso_save_samplers(struct cso_context *ctx)
 
 void cso_restore_samplers(struct cso_context *ctx)
 {
-   cso_set_samplers(ctx, ctx->nr_samplers_saved,
-                    (const struct pipe_sampler_state **) ctx->samplers_saved);
+   ctx->nr_samplers = ctx->nr_samplers_saved;
+   memcpy(ctx->samplers, ctx->samplers_saved, sizeof(ctx->samplers));
+   cso_single_sampler_done( ctx );
 }
 
 
-void cso_set_sampler_textures( struct cso_context *ctx,
-                               uint count,
-                               struct pipe_texture **textures )
+enum pipe_error cso_set_sampler_textures( struct cso_context *ctx,
+                                          uint count,
+                                          struct pipe_texture **textures )
 {
    uint i;
 
    ctx->nr_textures = count;
 
    for (i = 0; i < count; i++)
-      ctx->textures[i] = textures[i];
+      pipe_texture_reference(&ctx->textures[i], textures[i]);
    for ( ; i < PIPE_MAX_SAMPLERS; i++)
-      ctx->textures[i] = NULL;
+      pipe_texture_reference(&ctx->textures[i], NULL);
 
    ctx->pipe->set_sampler_textures(ctx->pipe, count, textures);
+
+   return PIPE_OK;
 }
 
 void cso_save_sampler_textures( struct cso_context *ctx )
 {
+   uint i;
+
    ctx->nr_textures_saved = ctx->nr_textures;
-   memcpy(ctx->textures_saved, ctx->textures, sizeof(ctx->textures));
+   for (i = 0; i < ctx->nr_textures; i++) {
+      assert(!ctx->textures_saved[i]);
+      pipe_texture_reference(&ctx->textures_saved[i], ctx->textures[i]);
+   }
 }
 
 void cso_restore_sampler_textures( struct cso_context *ctx )
 {
-   cso_set_sampler_textures(ctx, ctx->nr_textures_saved, ctx->textures_saved);
+   uint i;
+
+   ctx->nr_textures = ctx->nr_textures_saved;
+
+   for (i = 0; i < ctx->nr_textures; i++) {
+      pipe_texture_reference(&ctx->textures[i], NULL);
+      ctx->textures[i] = ctx->textures_saved[i];
+      ctx->textures_saved[i] = NULL;
+   }
+   for ( ; i < PIPE_MAX_SAMPLERS; i++)
+      pipe_texture_reference(&ctx->textures[i], NULL);
+
+   ctx->pipe->set_sampler_textures(ctx->pipe, ctx->nr_textures, ctx->textures);
+
    ctx->nr_textures_saved = 0;
 }
 
 
 
-
-void cso_set_depth_stencil_alpha(struct cso_context *ctx,
-                                 const struct pipe_depth_stencil_alpha_state *templ)
+enum pipe_error cso_set_depth_stencil_alpha(struct cso_context *ctx,
+                                            const struct pipe_depth_stencil_alpha_state *templ)
 {
    unsigned hash_key = cso_construct_key((void*)templ,
                                          sizeof(struct pipe_depth_stencil_alpha_state));
@@ -312,13 +366,20 @@ void cso_set_depth_stencil_alpha(struct cso_context *ctx,
 
    if (cso_hash_iter_is_null(iter)) {
       struct cso_depth_stencil_alpha *cso = MALLOC(sizeof(struct cso_depth_stencil_alpha));
+      if (!cso)
+         return PIPE_ERROR_OUT_OF_MEMORY;
 
       cso->state = *templ;
       cso->data = ctx->pipe->create_depth_stencil_alpha_state(ctx->pipe, &cso->state);
       cso->delete_state = (cso_state_callback)ctx->pipe->delete_depth_stencil_alpha_state;
       cso->context = ctx->pipe;
 
-      cso_insert_state(ctx->cache, hash_key, CSO_DEPTH_STENCIL_ALPHA, cso);
+      iter = cso_insert_state(ctx->cache, hash_key, CSO_DEPTH_STENCIL_ALPHA, cso);
+      if (cso_hash_iter_is_null(iter)) {
+         FREE(cso);
+         return PIPE_ERROR_OUT_OF_MEMORY;
+      }
+
       handle = cso->data;
    }
    else {
@@ -329,6 +390,7 @@ void cso_set_depth_stencil_alpha(struct cso_context *ctx,
       ctx->depth_stencil = handle;
       ctx->pipe->bind_depth_stencil_alpha_state(ctx->pipe, handle);
    }
+   return PIPE_OK;
 }
 
 void cso_save_depth_stencil_alpha(struct cso_context *ctx)
@@ -348,8 +410,8 @@ void cso_restore_depth_stencil_alpha(struct cso_context *ctx)
 
 
 
-void cso_set_rasterizer(struct cso_context *ctx,
-                        const struct pipe_rasterizer_state *templ)
+enum pipe_error cso_set_rasterizer(struct cso_context *ctx,
+                                   const struct pipe_rasterizer_state *templ)
 {
    unsigned hash_key = cso_construct_key((void*)templ,
                                          sizeof(struct pipe_rasterizer_state));
@@ -360,13 +422,20 @@ void cso_set_rasterizer(struct cso_context *ctx,
 
    if (cso_hash_iter_is_null(iter)) {
       struct cso_rasterizer *cso = MALLOC(sizeof(struct cso_rasterizer));
+      if (!cso)
+         return PIPE_ERROR_OUT_OF_MEMORY;
 
       cso->state = *templ;
       cso->data = ctx->pipe->create_rasterizer_state(ctx->pipe, &cso->state);
       cso->delete_state = (cso_state_callback)ctx->pipe->delete_rasterizer_state;
       cso->context = ctx->pipe;
 
-      cso_insert_state(ctx->cache, hash_key, CSO_RASTERIZER, cso);
+      iter = cso_insert_state(ctx->cache, hash_key, CSO_RASTERIZER, cso);
+      if (cso_hash_iter_is_null(iter)) {
+         FREE(cso);
+         return PIPE_ERROR_OUT_OF_MEMORY;
+      }
+
       handle = cso->data;
    }
    else {
@@ -377,6 +446,7 @@ void cso_set_rasterizer(struct cso_context *ctx,
       ctx->rasterizer = handle;
       ctx->pipe->bind_rasterizer_state(ctx->pipe, handle);
    }
+   return PIPE_OK;
 }
 
 void cso_save_rasterizer(struct cso_context *ctx)
@@ -394,37 +464,61 @@ void cso_restore_rasterizer(struct cso_context *ctx)
    ctx->rasterizer_saved = NULL;
 }
 
-
-void cso_set_fragment_shader(struct cso_context *ctx,
-                             const struct pipe_shader_state *templ)
+enum pipe_error cso_set_fragment_shader_handle(struct cso_context *ctx,
+                                               void *handle )
 {
-   unsigned hash_key = cso_construct_key((void*)templ,
-                                         sizeof(struct pipe_shader_state));
+   if (ctx->fragment_shader != handle) {
+      ctx->fragment_shader = handle;
+      ctx->pipe->bind_fs_state(ctx->pipe, handle);
+   }
+   return PIPE_OK;
+}
+
+
+/* Not really working:
+ */
+#if 0
+enum pipe_error cso_set_fragment_shader(struct cso_context *ctx,
+                                        const struct pipe_shader_state *templ)
+{
+   const struct tgsi_token *tokens = templ->tokens;
+   unsigned num_tokens = tgsi_num_tokens(tokens);
+   size_t tokens_size = num_tokens*sizeof(struct tgsi_token);
+   unsigned hash_key = cso_construct_key((void*)tokens, tokens_size);
    struct cso_hash_iter iter = cso_find_state_template(ctx->cache,
-                                                       hash_key, CSO_FRAGMENT_SHADER,
-                                                       (void*)templ);
+                                                       hash_key, 
+                                                       CSO_FRAGMENT_SHADER,
+                                                       (void*)tokens);
    void *handle = NULL;
 
    if (cso_hash_iter_is_null(iter)) {
-      struct cso_fragment_shader *cso = MALLOC(sizeof(struct cso_fragment_shader));
+      struct cso_fragment_shader *cso = MALLOC(sizeof(struct cso_fragment_shader) + tokens_size);
+      struct tgsi_token *cso_tokens = (struct tgsi_token *)((char *)cso + sizeof(*cso));
 
-      cso->state = *templ;
+      if (!cso)
+         return PIPE_ERROR_OUT_OF_MEMORY;
+
+      memcpy(cso_tokens, tokens, tokens_size);
+      cso->state.tokens = cso_tokens;
       cso->data = ctx->pipe->create_fs_state(ctx->pipe, &cso->state);
       cso->delete_state = (cso_state_callback)ctx->pipe->delete_fs_state;
       cso->context = ctx->pipe;
 
       iter = cso_insert_state(ctx->cache, hash_key, CSO_FRAGMENT_SHADER, cso);
+      if (cso_hash_iter_is_null(iter)) {
+         FREE(cso);
+         return PIPE_ERROR_OUT_OF_MEMORY;
+      }
+
       handle = cso->data;
    }
    else {
       handle = ((struct cso_fragment_shader *)cso_hash_iter_data(iter))->data;
    }
 
-   if (ctx->fragment_shader != handle) {
-      ctx->fragment_shader = handle;
-      ctx->pipe->bind_fs_state(ctx->pipe, handle);
-   }
+   return cso_set_fragment_shader_handle( ctx, handle );
 }
+#endif
 
 void cso_save_fragment_shader(struct cso_context *ctx)
 {
@@ -434,7 +528,6 @@ void cso_save_fragment_shader(struct cso_context *ctx)
 
 void cso_restore_fragment_shader(struct cso_context *ctx)
 {
-   assert(ctx->fragment_shader_saved);
    if (ctx->fragment_shader_saved != ctx->fragment_shader) {
       ctx->pipe->bind_fs_state(ctx->pipe, ctx->fragment_shader_saved);
       ctx->fragment_shader = ctx->fragment_shader_saved;
@@ -443,9 +536,22 @@ void cso_restore_fragment_shader(struct cso_context *ctx)
 }
 
 
+enum pipe_error cso_set_vertex_shader_handle(struct cso_context *ctx,
+                                             void *handle )
+{
+   if (ctx->vertex_shader != handle) {
+      ctx->vertex_shader = handle;
+      ctx->pipe->bind_vs_state(ctx->pipe, handle);
+   }
+   return PIPE_OK;
+}
+
 
-void cso_set_vertex_shader(struct cso_context *ctx,
-                           const struct pipe_shader_state *templ)
+/* Not really working:
+ */
+#if 0
+enum pipe_error cso_set_vertex_shader(struct cso_context *ctx,
+                                      const struct pipe_shader_state *templ)
 {
    unsigned hash_key = cso_construct_key((void*)templ,
                                          sizeof(struct pipe_shader_state));
@@ -457,23 +563,31 @@ void cso_set_vertex_shader(struct cso_context *ctx,
    if (cso_hash_iter_is_null(iter)) {
       struct cso_vertex_shader *cso = MALLOC(sizeof(struct cso_vertex_shader));
 
+      if (!cso)
+         return PIPE_ERROR_OUT_OF_MEMORY;
+
       cso->state = *templ;
       cso->data = ctx->pipe->create_vs_state(ctx->pipe, &cso->state);
       cso->delete_state = (cso_state_callback)ctx->pipe->delete_vs_state;
       cso->context = ctx->pipe;
 
       iter = cso_insert_state(ctx->cache, hash_key, CSO_VERTEX_SHADER, cso);
+      if (cso_hash_iter_is_null(iter)) {
+         FREE(cso);
+         return PIPE_ERROR_OUT_OF_MEMORY;
+      }
+
       handle = cso->data;
    }
    else {
       handle = ((struct cso_vertex_shader *)cso_hash_iter_data(iter))->data;
    }
 
-   if (ctx->vertex_shader != handle) {
-      ctx->vertex_shader = handle;
-      ctx->pipe->bind_vs_state(ctx->pipe, handle);
-   }
+   return cso_set_vertex_shader_handle( ctx, handle );
 }
+#endif
+
+
 
 void cso_save_vertex_shader(struct cso_context *ctx)
 {
@@ -483,9 +597,8 @@ void cso_save_vertex_shader(struct cso_context *ctx)
 
 void cso_restore_vertex_shader(struct cso_context *ctx)
 {
-   assert(ctx->vertex_shader_saved);
    if (ctx->vertex_shader_saved != ctx->vertex_shader) {
-      ctx->pipe->bind_fs_state(ctx->pipe, ctx->vertex_shader_saved);
+      ctx->pipe->bind_vs_state(ctx->pipe, ctx->vertex_shader_saved);
       ctx->vertex_shader = ctx->vertex_shader_saved;
    }
    ctx->vertex_shader_saved = NULL;
@@ -493,14 +606,15 @@ void cso_restore_vertex_shader(struct cso_context *ctx)
 
 
 
-void cso_set_framebuffer(struct cso_context *ctx,
-                         const struct pipe_framebuffer_state *fb)
+enum pipe_error cso_set_framebuffer(struct cso_context *ctx,
+                                    const struct pipe_framebuffer_state *fb)
 {
    /* XXX this memcmp() fails to detect buffer size changes */
    if (1/*memcmp(&ctx->fb, fb, sizeof(*fb))*/) {
       ctx->fb = *fb;
       ctx->pipe->set_framebuffer_state(ctx->pipe, fb);
    }
+   return PIPE_OK;
 }
 
 void cso_save_framebuffer(struct cso_context *ctx)
@@ -517,13 +631,14 @@ void cso_restore_framebuffer(struct cso_context *ctx)
 }
 
 
-void cso_set_viewport(struct cso_context *ctx,
-                      const struct pipe_viewport_state *vp)
+enum pipe_error cso_set_viewport(struct cso_context *ctx,
+                                 const struct pipe_viewport_state *vp)
 {
    if (memcmp(&ctx->vp, vp, sizeof(*vp))) {
       ctx->vp = *vp;
       ctx->pipe->set_viewport_state(ctx->pipe, vp);
    }
+   return PIPE_OK;
 }
 
 void cso_save_viewport(struct cso_context *ctx)
@@ -543,11 +658,12 @@ void cso_restore_viewport(struct cso_context *ctx)
 
 
 
-void cso_set_blend_color(struct cso_context *ctx,
-                         const struct pipe_blend_color *bc)
+enum pipe_error cso_set_blend_color(struct cso_context *ctx,
+                                    const struct pipe_blend_color *bc)
 {
    if (memcmp(&ctx->blend_color, bc, sizeof(ctx->blend_color))) {
       ctx->blend_color = *bc;
       ctx->pipe->set_blend_color(ctx->pipe, bc);
    }
+   return PIPE_OK;
 }
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index 665e8d9911..0405944132 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -31,6 +31,7 @@
 
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
+#include "pipe/p_error.h"
 
 
 #ifdef	__cplusplus
@@ -45,47 +46,47 @@ void cso_destroy_context( struct cso_context *cso );
 
 
 
-void cso_set_blend( struct cso_context *cso,
-                    const struct pipe_blend_state *blend );
+enum pipe_error cso_set_blend( struct cso_context *cso,
+                               const struct pipe_blend_state *blend );
 void cso_save_blend(struct cso_context *cso);
 void cso_restore_blend(struct cso_context *cso);
 
 
 
-void cso_set_depth_stencil_alpha( struct cso_context *cso,
-                                  const struct pipe_depth_stencil_alpha_state *dsa );
+enum pipe_error cso_set_depth_stencil_alpha( struct cso_context *cso,
+                                             const struct pipe_depth_stencil_alpha_state *dsa );
 void cso_save_depth_stencil_alpha(struct cso_context *cso);
 void cso_restore_depth_stencil_alpha(struct cso_context *cso);
 
 
 
-void cso_set_rasterizer( struct cso_context *cso,
-                         const struct pipe_rasterizer_state *rasterizer );
+enum pipe_error cso_set_rasterizer( struct cso_context *cso,
+                                    const struct pipe_rasterizer_state *rasterizer );
 void cso_save_rasterizer(struct cso_context *cso);
 void cso_restore_rasterizer(struct cso_context *cso);
 
 
 
-void cso_set_samplers( struct cso_context *cso,
-                       unsigned count,
-                       const struct pipe_sampler_state **states );
+enum pipe_error cso_set_samplers( struct cso_context *cso,
+                                  unsigned count,
+                                  const struct pipe_sampler_state **states );
 void cso_save_samplers(struct cso_context *cso);
 void cso_restore_samplers(struct cso_context *cso);
 
 /* Alternate interface to support state trackers that like to modify
  * samplers one at a time:
  */
-void cso_single_sampler( struct cso_context *cso,
-                         unsigned nr,
-                         const struct pipe_sampler_state *states );
+enum pipe_error cso_single_sampler( struct cso_context *cso,
+                                    unsigned nr,
+                                    const struct pipe_sampler_state *states );
 
 void cso_single_sampler_done( struct cso_context *cso );
 
 
 
-void cso_set_sampler_textures( struct cso_context *cso,
-                               uint count,
-                               struct pipe_texture **textures );
+enum pipe_error cso_set_sampler_textures( struct cso_context *cso,
+                                          uint count,
+                                          struct pipe_texture **textures );
 void cso_save_sampler_textures( struct cso_context *cso );
 void cso_restore_sampler_textures( struct cso_context *cso );
 
@@ -96,34 +97,37 @@ void cso_restore_sampler_textures( struct cso_context *cso );
  * (eg mesa's internall-generated texenv programs), it will be up to
  * the state tracker to implement their own specialized caching.
  */
-void cso_set_fragment_shader( struct cso_context *cso,
-                              const struct pipe_shader_state *shader );
+enum pipe_error cso_set_fragment_shader_handle(struct cso_context *ctx,
+                                               void *handle );
+enum pipe_error cso_set_fragment_shader( struct cso_context *cso,
+                                         const struct pipe_shader_state *shader );
 void cso_save_fragment_shader(struct cso_context *cso);
 void cso_restore_fragment_shader(struct cso_context *cso);
 
 
-
-void cso_set_vertex_shader( struct cso_context *cso,
-                            const struct pipe_shader_state *shader );
+enum pipe_error cso_set_vertex_shader_handle(struct cso_context *ctx,
+                                             void *handle );
+enum pipe_error cso_set_vertex_shader( struct cso_context *cso,
+                                       const struct pipe_shader_state *shader );
 void cso_save_vertex_shader(struct cso_context *cso);
 void cso_restore_vertex_shader(struct cso_context *cso);
 
 
 
-void cso_set_framebuffer(struct cso_context *cso,
-                         const struct pipe_framebuffer_state *fb);
+enum pipe_error cso_set_framebuffer(struct cso_context *cso,
+                                    const struct pipe_framebuffer_state *fb);
 void cso_save_framebuffer(struct cso_context *cso);
 void cso_restore_framebuffer(struct cso_context *cso);
 
 
-void cso_set_viewport(struct cso_context *cso,
-                      const struct pipe_viewport_state *vp);
+enum pipe_error cso_set_viewport(struct cso_context *cso,
+                                 const struct pipe_viewport_state *vp);
 void cso_save_viewport(struct cso_context *cso);
 void cso_restore_viewport(struct cso_context *cso);
 
 
-void cso_set_blend_color(struct cso_context *cso,
-                         const struct pipe_blend_color *bc);
+enum pipe_error cso_set_blend_color(struct cso_context *cso,
+                                    const struct pipe_blend_color *bc);
 
 
 #ifdef	__cplusplus
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.c b/src/gallium/auxiliary/cso_cache/cso_hash.c
index ddce3822f7..0646efd952 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.c
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.c
@@ -110,6 +110,10 @@ cso_hash_create_node(struct cso_hash *hash,
                       struct cso_node **anextNode)
 {
    struct cso_node *node = cso_data_allocate_node(hash->data.d);
+
+   if (!node)
+      return NULL;
+
    node->key = akey;
    node->value = avalue;
 
@@ -219,15 +223,30 @@ struct cso_hash_iter cso_hash_insert(struct cso_hash *hash,
    {
       struct cso_node **nextNode = cso_hash_find_node(hash, key);
       struct cso_node *node = cso_hash_create_node(hash, key, data, nextNode);
-      struct cso_hash_iter iter = {hash, node};
-      return iter;
+      if (!node) {
+         struct cso_hash_iter null_iter = {hash, 0};
+         return null_iter;
+      }
+
+      {
+         struct cso_hash_iter iter = {hash, node};
+         return iter;
+      }
    }
 }
 
 struct cso_hash * cso_hash_create(void)
 {
    struct cso_hash *hash = MALLOC_STRUCT(cso_hash);
+   if (!hash)
+      return NULL;
+
    hash->data.d = MALLOC_STRUCT(cso_hash_data);
+   if (!hash->data.d) {
+      FREE(hash);
+      return NULL;
+   }
+
    hash->data.d->fakeNext = 0;
    hash->data.d->buckets = 0;
    hash->data.d->size = 0;
diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile
index 5ab3cfe5ce..bc6acfe458 100644
--- a/src/gallium/auxiliary/draw/Makefile
+++ b/src/gallium/auxiliary/draw/Makefile
@@ -4,40 +4,36 @@ include $(TOP)/configs/current
 LIBNAME = draw
 
 C_SOURCES = \
-	draw_aaline.c \
-	draw_aapoint.c \
-	draw_clip.c \
-	draw_vs_exec.c \
-	draw_vs_sse.c \
-	draw_vs_llvm.c \
-	draw_context.c\
-	draw_cull.c \
-	draw_debug.c \
-	draw_flatshade.c \
-	draw_offset.c \
+	draw_context.c \
+	draw_pipe.c \
+	draw_pipe_aaline.c \
+	draw_pipe_aapoint.c \
+	draw_pipe_clip.c \
+	draw_pipe_cull.c \
+	draw_pipe_flatshade.c \
+	draw_pipe_offset.c \
+	draw_pipe_pstipple.c \
+	draw_pipe_stipple.c \
+	draw_pipe_twoside.c \
+	draw_pipe_unfilled.c \
+	draw_pipe_util.c \
+	draw_pipe_validate.c \
+	draw_pipe_vbuf.c \
+	draw_pipe_wide_line.c \
+	draw_pipe_wide_point.c \
 	draw_pt.c \
-	draw_pt_vcache.c \
+	draw_pt_elts.c \
+	draw_pt_emit.c \
+	draw_pt_fetch.c \
 	draw_pt_fetch_emit.c \
-	draw_pt_fetch_pipeline.c \
 	draw_pt_fetch_shade_pipeline.c \
-	draw_pt_pipeline.c \
-	draw_pt_elts.c \
-	draw_prim.c \
-	draw_pstipple.c \
-	draw_stipple.c \
-	draw_twoside.c \
-	draw_unfilled.c \
-	draw_validate.c \
-	draw_vbuf.c \
+	draw_pt_post_vs.c \
+	draw_pt_vcache.c \
 	draw_vertex.c \
-	draw_vertex_cache.c \
-	draw_vertex_fetch.c \
-	draw_vertex_shader.c \
-	draw_vf.c \
-	draw_vf_generic.c \
-	draw_vf_sse.c \
-	draw_wide_line.c \
-	draw_wide_point.c
+	draw_vs.c \
+	draw_vs_exec.c \
+	draw_vs_llvm.c \
+	draw_vs_sse.c 
 
 
 include ../../Makefile.template
diff --git a/src/gallium/auxiliary/draw/SConscript b/src/gallium/auxiliary/draw/SConscript
index a7fb5dbd61..0b9852f633 100644
--- a/src/gallium/auxiliary/draw/SConscript
+++ b/src/gallium/auxiliary/draw/SConscript
@@ -3,40 +3,36 @@ Import('*')
 draw = env.ConvenienceLibrary(
 	target = 'draw',
 	source = [
-		'draw_aaline.c',
-		'draw_aapoint.c',
-		'draw_clip.c',
-		'draw_vs_exec.c',
-		'draw_vs_sse.c',
-		'draw_vs_llvm.c',
 		'draw_context.c',
-		'draw_cull.c',
-		'draw_debug.c',
-		'draw_flatshade.c',
-		'draw_offset.c',
+		'draw_pipe.c',
+		'draw_pipe_aaline.c',
+		'draw_pipe_aapoint.c',
+		'draw_pipe_clip.c',
+		'draw_pipe_cull.c',
+		'draw_pipe_flatshade.c',
+		'draw_pipe_offset.c',
+		'draw_pipe_pstipple.c',
+		'draw_pipe_stipple.c',
+		'draw_pipe_twoside.c',
+		'draw_pipe_unfilled.c',
+        	'draw_pipe_util.c',
+		'draw_pipe_validate.c',
+		'draw_pipe_vbuf.c',
+		'draw_pipe_wide_line.c',
+		'draw_pipe_wide_point.c',
 		'draw_pt.c',
-		'draw_pt_vcache.c',
+		'draw_pt_elts.c',
+		'draw_pt_emit.c',
+		'draw_pt_fetch.c',
 		'draw_pt_fetch_emit.c',
-		'draw_pt_fetch_pipeline.c',
 		'draw_pt_fetch_shade_pipeline.c',
-		'draw_pt_pipeline.c',
-		'draw_pt_elts.c',
-		'draw_prim.c',
-		'draw_pstipple.c',
-		'draw_stipple.c',
-		'draw_twoside.c',
-		'draw_unfilled.c',
-		'draw_validate.c',
-		'draw_vbuf.c',
+		'draw_pt_post_vs.c',
+		'draw_pt_vcache.c',
 		'draw_vertex.c',
-		'draw_vertex_cache.c',
-		'draw_vertex_fetch.c',
-		'draw_vertex_shader.c',
-		'draw_vf.c',
-		'draw_vf_generic.c',
-		'draw_vf_sse.c',
-		'draw_wide_point.c',
-		'draw_wide_line.c'
+		'draw_vs.c',
+		'draw_vs_exec.c',
+		'draw_vs_llvm.c',
+		'draw_vs_sse.c',
 	])
 
 auxiliaries.insert(0, draw)
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 0c314f6e1d..f90187816b 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -33,8 +33,10 @@
 
 #include "pipe/p_util.h"
 #include "draw_context.h"
-#include "draw_private.h"
 #include "draw_vbuf.h"
+#include "draw_vs.h"
+#include "draw_pt.h"
+#include "draw_pipe.h"
 
 
 struct draw_context *draw_create( void )
@@ -43,40 +45,6 @@ struct draw_context *draw_create( void )
    if (draw == NULL)
       goto fail;
 
-#if defined(__i386__) || defined(__386__)
-   draw->use_sse = GETENV( "GALLIUM_NOSSE" ) == NULL;
-#else
-   draw->use_sse = FALSE;
-#endif
-
-   draw->use_pt_shaders = GETENV( "GALLIUM_PT_SHADERS" ) != NULL;
-
-   /* create pipeline stages */
-   draw->pipeline.wide_line  = draw_wide_line_stage( draw );
-   draw->pipeline.wide_point = draw_wide_point_stage( draw );
-   draw->pipeline.stipple   = draw_stipple_stage( draw );
-   draw->pipeline.unfilled  = draw_unfilled_stage( draw );
-   draw->pipeline.twoside   = draw_twoside_stage( draw );
-   draw->pipeline.offset    = draw_offset_stage( draw );
-   draw->pipeline.clip      = draw_clip_stage( draw );
-   draw->pipeline.flatshade = draw_flatshade_stage( draw );
-   draw->pipeline.cull      = draw_cull_stage( draw );
-   draw->pipeline.validate  = draw_validate_stage( draw );
-   draw->pipeline.first     = draw->pipeline.validate;
-
-   if (!draw->pipeline.wide_line ||
-       !draw->pipeline.wide_point ||
-       !draw->pipeline.stipple ||
-       !draw->pipeline.unfilled ||
-       !draw->pipeline.twoside ||
-       !draw->pipeline.offset ||
-       !draw->pipeline.clip ||
-       !draw->pipeline.flatshade ||
-       !draw->pipeline.cull ||
-       !draw->pipeline.validate)
-      goto fail;
-
-
    ASSIGN_4V( draw->plane[0], -1,  0,  0, 1 );
    ASSIGN_4V( draw->plane[1],  1,  0,  0, 1 );
    ASSIGN_4V( draw->plane[2],  0, -1,  0, 1 );
@@ -85,28 +53,18 @@ struct draw_context *draw_create( void )
    ASSIGN_4V( draw->plane[5],  0,  0, -1, 1 ); /* mesa's a bit wonky */
    draw->nr_planes = 6;
 
-   /* Statically allocate maximum sized vertices for the cache - could be cleverer...
-    */
-   {
-      char *tmp = align_malloc(VS_QUEUE_LENGTH * MAX_VERTEX_ALLOCATION, 16);
-      if (!tmp)
-         goto fail;
-
-      draw->vs.vertex_cache = tmp;
-   }
 
-   draw->shader_queue_flush = draw_vertex_shader_queue_flush;
+   draw->reduced_prim = ~0; /* != any of PIPE_PRIM_x */
 
-   /* these defaults are oriented toward the needs of softpipe */
-   draw->wide_point_threshold = 1000000.0; /* infinity */
-   draw->wide_line_threshold = 1.0;
-   draw->line_stipple = TRUE;
-   draw->point_sprite = TRUE;
+   tgsi_exec_machine_init(&draw->machine);
 
-   draw->reduced_prim = ~0; /* != any of PIPE_PRIM_x */
+   /* FIXME: give this machine thing a proper constructor:
+    */
+   draw->machine.Inputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16);
+   draw->machine.Outputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16);
 
-   draw_vertex_cache_invalidate( draw );
-   draw_set_mapped_element_buffer( draw, 0, NULL );
+   if (!draw_pipeline_init( draw ))
+      goto fail;
 
    if (!draw_pt_init( draw ))
       goto fail;
@@ -124,39 +82,14 @@ void draw_destroy( struct draw_context *draw )
    if (!draw)
       return;
 
-   if (draw->pipeline.wide_line)
-      draw->pipeline.wide_line->destroy( draw->pipeline.wide_line );
-   if (draw->pipeline.wide_point)
-      draw->pipeline.wide_point->destroy( draw->pipeline.wide_point );
-   if (draw->pipeline.stipple)
-      draw->pipeline.stipple->destroy( draw->pipeline.stipple );
-   if (draw->pipeline.unfilled)
-      draw->pipeline.unfilled->destroy( draw->pipeline.unfilled );
-   if (draw->pipeline.twoside)
-      draw->pipeline.twoside->destroy( draw->pipeline.twoside );
-   if (draw->pipeline.offset)
-      draw->pipeline.offset->destroy( draw->pipeline.offset );
-   if (draw->pipeline.clip)
-      draw->pipeline.clip->destroy( draw->pipeline.clip );
-   if (draw->pipeline.flatshade)
-      draw->pipeline.flatshade->destroy( draw->pipeline.flatshade );
-   if (draw->pipeline.cull)
-      draw->pipeline.cull->destroy( draw->pipeline.cull );
-   if (draw->pipeline.validate)
-      draw->pipeline.validate->destroy( draw->pipeline.validate );
-   if (draw->pipeline.aaline)
-      draw->pipeline.aaline->destroy( draw->pipeline.aaline );
-   if (draw->pipeline.aapoint)
-      draw->pipeline.aapoint->destroy( draw->pipeline.aapoint );
-   if (draw->pipeline.pstipple)
-      draw->pipeline.pstipple->destroy( draw->pipeline.pstipple );
-   if (draw->pipeline.rasterize)
-      draw->pipeline.rasterize->destroy( draw->pipeline.rasterize );
+
+   if (draw->machine.Inputs)
+      align_free(draw->machine.Inputs);
+
+   if (draw->machine.Outputs)
+      align_free(draw->machine.Outputs);
 
    tgsi_exec_machine_free_data(&draw->machine);
-   
-   if (draw->vs.vertex_cache)
-      align_free( draw->vs.vertex_cache ); /* Frees all the vertices. */
 
    /* Not so fast -- we're just borrowing this at the moment.
     * 
@@ -164,6 +97,7 @@ void draw_destroy( struct draw_context *draw )
       draw->render->destroy( draw->render );
    */
 
+   draw_pipeline_destroy( draw );
    draw_pt_destroy( draw );
 
    FREE( draw );
@@ -188,6 +122,20 @@ void draw_set_rasterizer_state( struct draw_context *draw,
    draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
 
    draw->rasterizer = raster;
+   draw->bypass_clipping =
+      ((draw->rasterizer && draw->rasterizer->bypass_clipping) ||
+       draw->driver.bypass_clipping);
+}
+
+/* Record the driver's clipping-bypass request and refresh the combined flag. */
+void draw_set_driver_clipping( struct draw_context *draw,
+                               boolean bypass_clipping )
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+   draw->driver.bypass_clipping = bypass_clipping;
+   /* No rasterizer may be bound yet, so test the pointer before using it. */
+   draw->bypass_clipping =
+      ((draw->rasterizer && draw->rasterizer->bypass_clipping) ||
+       draw->driver.bypass_clipping);
 }
 
 
@@ -246,9 +194,8 @@ draw_set_vertex_buffers(struct draw_context *draw,
 {
    assert(count <= PIPE_MAX_ATTRIBS);
 
-   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
-
-   memcpy(draw->vertex_buffer, buffers, count * sizeof(buffers[0]));
+   memcpy(draw->pt.vertex_buffer, buffers, count * sizeof(buffers[0]));
+   draw->pt.nr_vertex_buffers = count;
 }
 
 
@@ -259,9 +206,8 @@ draw_set_vertex_elements(struct draw_context *draw,
 {
    assert(count <= PIPE_MAX_ATTRIBS);
 
-   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
-
-   memcpy(draw->vertex_element, elements, count * sizeof(elements[0]));
+   memcpy(draw->pt.vertex_element, elements, count * sizeof(elements[0]));
+   draw->pt.nr_vertex_elements = count;
 }
 
 
@@ -272,8 +218,7 @@ void
 draw_set_mapped_vertex_buffer(struct draw_context *draw,
                               unsigned attr, const void *buffer)
 {
-   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
-   draw->user.vbuffer[attr] = buffer;
+   draw->pt.user.vbuffer[attr] = buffer;
 }
 
 
@@ -281,8 +226,7 @@ void
 draw_set_mapped_constant_buffer(struct draw_context *draw,
                                 const void *buffer)
 {
-   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
-   draw->user.constants = buffer;
+   draw->pt.user.constants = buffer;
 }
 
 
@@ -294,7 +238,7 @@ void
 draw_wide_point_threshold(struct draw_context *draw, float threshold)
 {
    draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
-   draw->wide_point_threshold = threshold;
+   draw->pipeline.wide_point_threshold = threshold;
 }
 
 
@@ -306,7 +250,7 @@ void
 draw_wide_line_threshold(struct draw_context *draw, float threshold)
 {
    draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
-   draw->wide_line_threshold = threshold;
+   draw->pipeline.wide_line_threshold = threshold;
 }
 
 
@@ -317,7 +261,7 @@ void
 draw_enable_line_stipple(struct draw_context *draw, boolean enable)
 {
    draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
-   draw->line_stipple = enable;
+   draw->pipeline.line_stipple = enable;
 }
 
 
@@ -328,7 +272,7 @@ void
 draw_enable_point_sprites(struct draw_context *draw, boolean enable)
 {
    draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
-   draw->point_sprite = enable;
+   draw->pipeline.point_sprite = enable;
 }
 
 
@@ -383,79 +327,54 @@ draw_num_vs_outputs(struct draw_context *draw)
 }
 
 
-/**
- * Allocate space for temporary post-transform vertices, such as for clipping.
- */
-void draw_alloc_temp_verts( struct draw_stage *stage, unsigned nr )
-{
-   assert(!stage->tmp);
-
-   stage->nr_tmps = nr;
-
-   if (nr) {
-      ubyte *store = (ubyte *) MALLOC( MAX_VERTEX_SIZE * nr );
-      unsigned i;
-
-      stage->tmp = (struct vertex_header **) MALLOC( sizeof(struct vertex_header *) * nr );
-      
-      for (i = 0; i < nr; i++)
-	 stage->tmp[i] = (struct vertex_header *)(store + i * MAX_VERTEX_SIZE);
-   }
-}
-
 
-void draw_free_temp_verts( struct draw_stage *stage )
+void draw_set_render( struct draw_context *draw, 
+		      struct vbuf_render *render )
 {
-   if (stage->tmp) {
-      FREE( stage->tmp[0] );
-      FREE( stage->tmp );
-      stage->tmp = NULL;
-   }
+   draw->render = render;
 }
 
-
-boolean draw_use_sse(struct draw_context *draw)
+void draw_set_edgeflags( struct draw_context *draw,
+                         const unsigned *edgeflag )
 {
-   return (boolean) draw->use_sse;
+   draw->pt.user.edgeflag = edgeflag;
 }
 
 
-void draw_reset_vertex_ids(struct draw_context *draw)
-{
-   struct draw_stage *stage = draw->pipeline.first;
-   
-   while (stage) {
-      unsigned i;
-
-      for (i = 0; i < stage->nr_tmps; i++)
-	 stage->tmp[i]->vertex_id = UNDEFINED_VERTEX_ID;
 
-      stage = stage->next;
-   }
 
-   draw_vertex_cache_reset_vertex_ids(draw); /* going away soon */
-   draw_pt_reset_vertex_ids(draw);
-}
-
-
-void draw_set_render( struct draw_context *draw, 
-		      struct vbuf_render *render )
+/**
+ * Tell the drawing context about the index/element buffer to use
+ * (ala glDrawElements)
+ * If no element buffer is to be used (i.e. glDrawArrays) then this
+ * should be called with eltSize=0 and elements=NULL.
+ *
+ * \param draw  the drawing context
+ * \param eltSize  size of each element (1, 2 or 4 bytes)
+ * \param elements  the element buffer ptr
+ */
+void
+draw_set_mapped_element_buffer( struct draw_context *draw,
+                                unsigned eltSize, void *elements )
 {
-   draw->render = render;
+   draw->pt.user.elts = elements;
+   draw->pt.user.eltSize = eltSize;
 }
 
-void draw_set_edgeflags( struct draw_context *draw,
-                         const unsigned *edgeflag )
+
+ 
+/* Revamp me please:
+ */
+void draw_do_flush( struct draw_context *draw, unsigned flags )
 {
-   draw->user.edgeflag = edgeflag;
-}
+   if (!draw->flushing && !draw->vcache_flushing)
+   {
+      draw->flushing = TRUE;
 
+      draw_pipeline_flush( draw, flags );
 
-boolean draw_get_edgeflag( struct draw_context *draw,
-                           unsigned idx )
-{
-   if (draw->user.edgeflag)
-      return (draw->user.edgeflag[idx/32] & (1 << (idx%32))) != 0;
-   else
-      return 1;
+      draw->reduced_prim = ~0; /* is reduced_prim needed any more? */
+      
+      draw->flushing = FALSE;
+   }
 }
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index c7ac32b452..c5c3d3b09e 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -42,37 +42,11 @@
 
 
 struct pipe_context;
-struct vertex_buffer;
-struct vertex_info;
 struct draw_context;
 struct draw_stage;
 struct draw_vertex_shader;
 
 
-/**
- * Clipmask flags
- */
-/*@{*/
-#define CLIP_RIGHT_BIT   0x01
-#define CLIP_LEFT_BIT    0x02
-#define CLIP_TOP_BIT     0x04
-#define CLIP_BOTTOM_BIT  0x08
-#define CLIP_NEAR_BIT    0x10
-#define CLIP_FAR_BIT     0x20
-/*@}*/
-
-/**
- * Bitshift for each clip flag
- */
-/*@{*/
-#define CLIP_RIGHT_SHIFT 	0
-#define CLIP_LEFT_SHIFT 	1
-#define CLIP_TOP_SHIFT  	2
-#define CLIP_BOTTOM_SHIFT       3
-#define CLIP_NEAR_SHIFT  	4
-#define CLIP_FAR_SHIFT  	5
-/*@}*/
-
 
 struct draw_context *draw_create( void );
 
@@ -99,15 +73,13 @@ void draw_enable_line_stipple(struct draw_context *draw, boolean enable);
 void draw_enable_point_sprites(struct draw_context *draw, boolean enable);
 
 
-boolean draw_use_sse(struct draw_context *draw);
-
-void
+boolean
 draw_install_aaline_stage(struct draw_context *draw, struct pipe_context *pipe);
 
-void
+boolean
 draw_install_aapoint_stage(struct draw_context *draw, struct pipe_context *pipe);
 
-void
+boolean
 draw_install_pstipple_stage(struct draw_context *draw, struct pipe_context *pipe);
 
 
@@ -168,17 +140,24 @@ void draw_arrays(struct draw_context *draw, unsigned prim,
 
 void draw_flush(struct draw_context *draw);
 
-/***********************************************************************
- * draw_debug.c 
+
+/*******************************************************************************
+ * Driver backend interface 
  */
-boolean draw_validate_prim( unsigned prim, unsigned length );
-unsigned draw_trim_prim( unsigned mode, unsigned count );
+struct vbuf_render;
+void draw_set_render( struct draw_context *draw, 
+		      struct vbuf_render *render );
 
+void draw_set_driver_clipping( struct draw_context *draw,
+                               boolean bypass_clipping );
 
+/*******************************************************************************
+ * Draw pipeline 
+ */
+boolean draw_need_pipeline(const struct draw_context *draw,
+                           const struct pipe_rasterizer_state *rasterizer,
+                           unsigned prim );
 
 
-struct vbuf_render;
-void draw_set_render( struct draw_context *draw, 
-		      struct vbuf_render *render );
 
 #endif /* DRAW_CONTEXT_H */
diff --git a/src/gallium/auxiliary/draw/draw_pt_pipeline.c b/src/gallium/auxiliary/draw/draw_pipe.c
index e70e63d08f..d0890203a5 100644
--- a/src/gallium/auxiliary/draw/draw_pt_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe.c
@@ -31,16 +31,86 @@
   */
 
 #include "pipe/p_util.h"
-#include "draw/draw_context.h"
 #include "draw/draw_private.h"
-#include "draw/draw_vertex.h"
-#include "draw/draw_pt.h"
+#include "draw/draw_pipe.h"
+
+/* Create the built-in pipeline stages and set the default line/point options.
+ * Returns FALSE if any stage could not be created, TRUE otherwise. */
+boolean draw_pipeline_init( struct draw_context *draw )
+{
+   /* create pipeline stages */
+   draw->pipeline.wide_line  = draw_wide_line_stage( draw );
+   draw->pipeline.wide_point = draw_wide_point_stage( draw );
+   draw->pipeline.stipple   = draw_stipple_stage( draw );
+   draw->pipeline.unfilled  = draw_unfilled_stage( draw );
+   draw->pipeline.twoside   = draw_twoside_stage( draw );
+   draw->pipeline.offset    = draw_offset_stage( draw );
+   draw->pipeline.clip      = draw_clip_stage( draw );
+   draw->pipeline.flatshade = draw_flatshade_stage( draw );
+   draw->pipeline.cull      = draw_cull_stage( draw );
+   draw->pipeline.validate  = draw_validate_stage( draw );
+   draw->pipeline.first     = draw->pipeline.validate;
+
+   if (!draw->pipeline.wide_line ||
+       !draw->pipeline.wide_point ||
+       !draw->pipeline.stipple ||
+       !draw->pipeline.unfilled ||
+       !draw->pipeline.twoside ||
+       !draw->pipeline.offset ||
+       !draw->pipeline.clip ||
+       !draw->pipeline.flatshade ||
+       !draw->pipeline.cull ||
+       !draw->pipeline.validate)
+      return FALSE;
+
+   /* these defaults are oriented toward the needs of softpipe */
+   draw->pipeline.wide_point_threshold = 1000000.0; /* infinity */
+   draw->pipeline.wide_line_threshold = 1.0;
+   draw->pipeline.line_stipple = TRUE;
+   draw->pipeline.point_sprite = TRUE;
+
+   return TRUE;
+}
+
+/* Call destroy() on every pipeline stage pointer that is non-NULL. */
+void draw_pipeline_destroy( struct draw_context *draw )
+{
+   if (draw->pipeline.wide_line)
+      draw->pipeline.wide_line->destroy( draw->pipeline.wide_line );
+   if (draw->pipeline.wide_point)
+      draw->pipeline.wide_point->destroy( draw->pipeline.wide_point );
+   if (draw->pipeline.stipple)
+      draw->pipeline.stipple->destroy( draw->pipeline.stipple );
+   if (draw->pipeline.unfilled)
+      draw->pipeline.unfilled->destroy( draw->pipeline.unfilled );
+   if (draw->pipeline.twoside)
+      draw->pipeline.twoside->destroy( draw->pipeline.twoside );
+   if (draw->pipeline.offset)
+      draw->pipeline.offset->destroy( draw->pipeline.offset );
+   if (draw->pipeline.clip)
+      draw->pipeline.clip->destroy( draw->pipeline.clip );
+   if (draw->pipeline.flatshade)
+      draw->pipeline.flatshade->destroy( draw->pipeline.flatshade );
+   if (draw->pipeline.cull)
+      draw->pipeline.cull->destroy( draw->pipeline.cull );
+   if (draw->pipeline.validate)
+      draw->pipeline.validate->destroy( draw->pipeline.validate );
+   if (draw->pipeline.aaline)
+      draw->pipeline.aaline->destroy( draw->pipeline.aaline );
+   if (draw->pipeline.aapoint)
+      draw->pipeline.aapoint->destroy( draw->pipeline.aapoint );
+   if (draw->pipeline.pstipple)
+      draw->pipeline.pstipple->destroy( draw->pipeline.pstipple );
+   if (draw->pipeline.rasterize)
+      draw->pipeline.rasterize->destroy( draw->pipeline.rasterize );
+}
+
+
+
+
+
 
 
-/**
- * Add a point to the primitive queue.
- * \param i0  index into user's vertex arrays
- */
 static void do_point( struct draw_context *draw,
 		      const char *v0 )
 {
@@ -55,11 +125,6 @@ static void do_point( struct draw_context *draw,
 }
 
 
-/**
- * Add a line to the primitive queue.
- * \param i0  index into user's vertex arrays
- * \param i1  index into user's vertex arrays
- */
 static void do_line( struct draw_context *draw,
 		     const char *v0,
 		     const char *v1 )
@@ -75,9 +140,7 @@ static void do_line( struct draw_context *draw,
    draw->pipeline.first->line( draw->pipeline.first, &prim );
 }
 
-/**
- * Add a triangle to the primitive queue.
- */
+
 static void do_triangle( struct draw_context *draw,
 			 char *v0,
 			 char *v1,
@@ -94,28 +157,11 @@ static void do_triangle( struct draw_context *draw,
                      (prim.v[2]->edgeflag << 2));
    prim.pad = 0;
 
-   if (0) debug_printf("tri ef: %d %d %d\n", 
-                       prim.v[0]->edgeflag,
-                       prim.v[1]->edgeflag,
-                       prim.v[2]->edgeflag);
-   
    draw->pipeline.first->tri( draw->pipeline.first, &prim );
 }
 
 
 
-void draw_pt_reset_vertex_ids( struct draw_context *draw )
-{
-   unsigned i;
-   char *verts = draw->pt.pipeline.verts;
-   unsigned stride = draw->pt.pipeline.vertex_stride;
-
-   for (i = 0; i < draw->pt.pipeline.vertex_count; i++) {
-      ((struct vertex_header *)verts)->vertex_id = UNDEFINED_VERTEX_ID;
-      verts += stride;
-   }
-}
-
 
 /* Code to run the pipeline on a fairly arbitary collection of vertices.
  *
@@ -127,19 +173,20 @@ void draw_pt_reset_vertex_ids( struct draw_context *draw )
  * This code provides a callback to reset the vertex id's which the
  * draw_vbuf.c code uses when it has to perform a flush.
  */
-void draw_pt_run_pipeline( struct draw_context *draw,
-                           unsigned prim,
-                           char *verts,
-                           unsigned stride,
-                           unsigned vertex_count,
-                           const ushort *elts,
-                           unsigned count )
+void draw_pipeline_run( struct draw_context *draw,
+                        unsigned prim,
+                        struct vertex_header *vertices,
+                        unsigned vertex_count,
+                        unsigned stride,
+                        const ushort *elts,
+                        unsigned count )
 {
+   char *verts = (char *)vertices;
    unsigned i;
 
-   draw->pt.pipeline.verts = verts;
-   draw->pt.pipeline.vertex_stride = stride;
-   draw->pt.pipeline.vertex_count = vertex_count;
+   draw->pipeline.verts = verts;
+   draw->pipeline.vertex_stride = stride;
+   draw->pipeline.vertex_count = vertex_count;
    
    switch (prim) {
    case PIPE_PRIM_POINTS:
@@ -162,7 +209,15 @@ void draw_pt_run_pipeline( struct draw_context *draw,
       break;
    }
    
-   draw->pt.pipeline.verts = NULL;
-   draw->pt.pipeline.vertex_count = 0;
+   draw->pipeline.verts = NULL;
+   draw->pipeline.vertex_count = 0;
 }
 
+/* Flush the current first stage with the given flags, then make the
+ * validate stage the head of the pipeline again. */
+void draw_pipeline_flush( struct draw_context *draw, 
+                          unsigned flags )
+{
+   draw->pipeline.first->flush( draw->pipeline.first, flags );
+   draw->pipeline.first = draw->pipeline.validate;
+}
diff --git a/src/gallium/auxiliary/draw/draw_pipe.h b/src/gallium/auxiliary/draw/draw_pipe.h
new file mode 100644
index 0000000000..2476abb2b2
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pipe.h
@@ -0,0 +1,114 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#ifndef DRAW_PIPE_H
+#define DRAW_PIPE_H
+
+#include "pipe/p_compiler.h"
+#include "draw_private.h"       /* for sizeof(vertex_header) */
+
+
+
+/**
+ * Base class for all primitive drawing stages.
+ */
+struct draw_stage
+{
+   struct draw_context *draw;   /**< parent context */
+
+   struct draw_stage *next;     /**< next stage in pipeline */
+
+   struct vertex_header **tmp;  /**< temp vert storage, such as for clipping */
+   unsigned nr_tmps;
+
+   void (*point)( struct draw_stage *,
+		  struct prim_header * );
+
+   void (*line)( struct draw_stage *,
+		 struct prim_header * );
+
+   void (*tri)( struct draw_stage *,
+		struct prim_header * );
+
+   void (*flush)( struct draw_stage *,
+		  unsigned flags );
+
+   void (*reset_stipple_counter)( struct draw_stage * );
+
+   void (*destroy)( struct draw_stage * );
+};
+
+
+extern struct draw_stage *draw_unfilled_stage( struct draw_context *context );
+extern struct draw_stage *draw_twoside_stage( struct draw_context *context );
+extern struct draw_stage *draw_offset_stage( struct draw_context *context );
+extern struct draw_stage *draw_clip_stage( struct draw_context *context );
+extern struct draw_stage *draw_flatshade_stage( struct draw_context *context );
+extern struct draw_stage *draw_cull_stage( struct draw_context *context );
+extern struct draw_stage *draw_stipple_stage( struct draw_context *context );
+extern struct draw_stage *draw_wide_line_stage( struct draw_context *context );
+extern struct draw_stage *draw_wide_point_stage( struct draw_context *context );
+extern struct draw_stage *draw_validate_stage( struct draw_context *context );
+
+
+extern void draw_free_temp_verts( struct draw_stage *stage );
+extern boolean draw_alloc_temp_verts( struct draw_stage *stage, unsigned nr );
+
+extern void draw_reset_vertex_ids( struct draw_context *draw );
+
+void draw_pipe_passthrough_tri(struct draw_stage *stage, struct prim_header *header);
+void draw_pipe_passthrough_line(struct draw_stage *stage, struct prim_header *header);
+void draw_pipe_passthrough_point(struct draw_stage *stage, struct prim_header *header);
+
+
+
+/**
+ * Get a writeable copy of a vertex.
+ * \param stage  drawing stage info
+ * \param vert  the vertex to copy (source)
+ * \param idx  index into stage's tmp[] array to put the copy (dest)
+ * \return  pointer to the copied vertex
+ */
+static INLINE struct vertex_header *
+dup_vert( struct draw_stage *stage,
+	  const struct vertex_header *vert,
+	  unsigned idx )
+{   
+   struct vertex_header *tmp = stage->tmp[idx];
+   const uint vsize = sizeof(struct vertex_header)
+      + stage->draw->num_vs_outputs * 4 * sizeof(float);
+   memcpy(tmp, vert, vsize);
+   tmp->vertex_id = UNDEFINED_VERTEX_ID;
+   return tmp;
+}
+
+#endif
diff --git a/src/gallium/auxiliary/draw/draw_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index e8d2a45102..7e5f8bd281 100644
--- a/src/gallium/auxiliary/draw/draw_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -43,6 +43,7 @@
 
 #include "draw_context.h"
 #include "draw_private.h"
+#include "draw_pipe.h"
 
 
 /**
@@ -333,11 +334,10 @@ aa_transform_inst(struct tgsi_transform_context *ctx,
  * Generate the frag shader we'll use for drawing AA lines.
  * This will be the user's shader plus some texture/modulate instructions.
  */
-static void
+static boolean
 generate_aaline_fs(struct aaline_stage *aaline)
 {
    const struct pipe_shader_state *orig_fs = &aaline->fs->state;
-   //struct draw_context *draw = aaline->stage.draw;
    struct pipe_shader_state aaline_fs;
    struct aa_transform_context transform;
 
@@ -345,6 +345,8 @@ generate_aaline_fs(struct aaline_stage *aaline)
 
    aaline_fs = *orig_fs; /* copy to init */
    aaline_fs.tokens = MALLOC(sizeof(struct tgsi_token) * MAX);
+   if (aaline_fs.tokens == NULL)
+      return FALSE;
 
    memset(&transform, 0, sizeof(transform));
    transform.colorOutput = -1;
@@ -369,15 +371,18 @@ generate_aaline_fs(struct aaline_stage *aaline)
 
    aaline->fs->aaline_fs
       = aaline->driver_create_fs_state(aaline->pipe, &aaline_fs);
+   if (aaline->fs->aaline_fs == NULL)
+      return FALSE;
 
    aaline->fs->generic_attrib = transform.maxGeneric + 1;
+   return TRUE;
 }
 
 
 /**
  * Create the texture map we'll use for antialiasing the lines.
  */
-static void
+static boolean
 aaline_create_texture(struct aaline_stage *aaline)
 {
    struct pipe_context *pipe = aaline->pipe;
@@ -395,6 +400,8 @@ aaline_create_texture(struct aaline_stage *aaline)
    texTemp.cpp = 1;
 
    aaline->texture = screen->texture_create(screen, &texTemp);
+   if (!aaline->texture)
+      return FALSE;
 
    /* Fill in mipmap images.
     * Basically each level is solid opaque, except for the outermost
@@ -410,6 +417,8 @@ aaline_create_texture(struct aaline_stage *aaline)
 
       surface = screen->get_tex_surface(screen, aaline->texture, 0, level, 0);
       data = pipe_surface_map(surface);
+      if (data == NULL)
+         return FALSE;
 
       for (i = 0; i < size; i++) {
          for (j = 0; j < size; j++) {
@@ -435,6 +444,7 @@ aaline_create_texture(struct aaline_stage *aaline)
       pipe_surface_reference(&surface, NULL);
       pipe->texture_update(pipe, aaline->texture, 0, (1 << level));
    }
+   return TRUE;
 }
 
 
@@ -443,7 +453,7 @@ aaline_create_texture(struct aaline_stage *aaline)
  * By using a mipmapped texture, we don't have to generate a different
  * texture image for each line size.
  */
-static void
+static boolean
 aaline_create_sampler(struct aaline_stage *aaline)
 {
    struct pipe_sampler_state sampler;
@@ -461,6 +471,10 @@ aaline_create_sampler(struct aaline_stage *aaline)
    sampler.max_lod = MAX_TEXTURE_LEVEL;
 
    aaline->sampler_cso = pipe->create_sampler_state(pipe, &sampler);
+   if (aaline->sampler_cso == NULL)
+      return FALSE;
+
+   return TRUE;
 }
 
 
@@ -468,13 +482,15 @@ aaline_create_sampler(struct aaline_stage *aaline)
  * When we're about to draw our first AA line in a batch, this function is
  * called to tell the driver to bind our modified fragment shader.
  */
-static void
+static boolean
 bind_aaline_fragment_shader(struct aaline_stage *aaline)
 {
-   if (!aaline->fs->aaline_fs) {
-      generate_aaline_fs(aaline);
-   }
+   if (!aaline->fs->aaline_fs && 
+       !generate_aaline_fs(aaline))
+      return FALSE;
+
    aaline->driver_bind_fs_state(aaline->pipe, aaline->fs->aaline_fs);
+   return TRUE;
 }
 
 
@@ -486,20 +502,6 @@ aaline_stage( struct draw_stage *stage )
 }
 
 
-static void
-passthrough_point(struct draw_stage *stage, struct prim_header *header)
-{
-   stage->next->point(stage->next, header);
-}
-
-
-static void
-passthrough_tri(struct draw_stage *stage, struct prim_header *header)
-{
-   stage->next->tri(stage->next, header);
-}
-
-
 /**
  * Draw a wide line by drawing a quad, using geometry which will
  * fullfill GL's antialiased line requirements.
@@ -637,7 +639,11 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)
    /*
     * Bind (generate) our fragprog, sampler and texture
     */
-   bind_aaline_fragment_shader(aaline);
+   if (!bind_aaline_fragment_shader(aaline)) {
+      stage->line = draw_pipe_passthrough_line;
+      stage->line(stage, header);
+      return;
+   }
 
    /* update vertex attrib info */
    aaline->tex_slot = draw->num_vs_outputs;
@@ -701,9 +707,11 @@ aaline_destroy(struct draw_stage *stage)
 {
    struct aaline_stage *aaline = aaline_stage(stage);
 
-   aaline->pipe->delete_sampler_state(aaline->pipe, aaline->sampler_cso);
+   if (aaline->sampler_cso)
+      aaline->pipe->delete_sampler_state(aaline->pipe, aaline->sampler_cso);
 
-   pipe_texture_release(&aaline->texture);
+   if (aaline->texture)
+      pipe_texture_release(&aaline->texture);
 
    draw_free_temp_verts( stage );
 
@@ -715,19 +723,28 @@ static struct aaline_stage *
 draw_aaline_stage(struct draw_context *draw)
 {
    struct aaline_stage *aaline = CALLOC_STRUCT(aaline_stage);
+   if (aaline == NULL)
+      return NULL;
 
-   draw_alloc_temp_verts( &aaline->stage, 8 );
+   if (!draw_alloc_temp_verts( &aaline->stage, 8 ))
+      goto fail;
 
    aaline->stage.draw = draw;
    aaline->stage.next = NULL;
-   aaline->stage.point = passthrough_point;
+   aaline->stage.point = draw_pipe_passthrough_point;
    aaline->stage.line = aaline_first_line;
-   aaline->stage.tri = passthrough_tri;
+   aaline->stage.tri = draw_pipe_passthrough_tri;
    aaline->stage.flush = aaline_flush;
    aaline->stage.reset_stipple_counter = aaline_reset_stipple_counter;
    aaline->stage.destroy = aaline_destroy;
 
    return aaline;
+
+ fail:
+   if (aaline)
+      aaline_destroy(&aaline->stage);
+
+   return NULL;
 }
 
 
@@ -749,13 +766,13 @@ aaline_create_fs_state(struct pipe_context *pipe,
 {
    struct aaline_stage *aaline = aaline_stage_from_pipe(pipe);
    struct aaline_fragment_shader *aafs = CALLOC_STRUCT(aaline_fragment_shader);
+   if (aafs == NULL)
+      return NULL;
 
-   if (aafs) {
-      aafs->state = *fs;
+   aafs->state = *fs;
 
-      /* pass-through */
-      aafs->driver_fs = aaline->driver_create_fs_state(aaline->pipe, fs);
-   }
+   /* pass-through */
+   aafs->driver_fs = aaline->driver_create_fs_state(aaline->pipe, fs);
 
    return aafs;
 }
@@ -821,7 +838,7 @@ aaline_set_sampler_textures(struct pipe_context *pipe,
  * into the draw module's pipeline.  This will not be used if the
  * hardware has native support for AA lines.
  */
-void
+boolean
 draw_install_aaline_stage(struct draw_context *draw, struct pipe_context *pipe)
 {
    struct aaline_stage *aaline;
@@ -832,14 +849,17 @@ draw_install_aaline_stage(struct draw_context *draw, struct pipe_context *pipe)
     * Create / install AA line drawing / prim stage
     */
    aaline = draw_aaline_stage( draw );
-   assert(aaline);
-   draw->pipeline.aaline = &aaline->stage;
+   if (!aaline)
+      goto fail;
 
    aaline->pipe = pipe;
 
    /* create special texture, sampler state */
-   aaline_create_texture(aaline);
-   aaline_create_sampler(aaline);
+   if (!aaline_create_texture(aaline))
+      goto fail;
+
+   if (!aaline_create_sampler(aaline))
+      goto fail;
 
    /* save original driver functions */
    aaline->driver_create_fs_state = pipe->create_fs_state;
@@ -856,4 +876,16 @@ draw_install_aaline_stage(struct draw_context *draw, struct pipe_context *pipe)
 
    pipe->bind_sampler_states = aaline_bind_sampler_states;
    pipe->set_sampler_textures = aaline_set_sampler_textures;
+   
+   /* Install once everything is known to be OK:
+    */
+   draw->pipeline.aaline = &aaline->stage;
+
+   return TRUE;
+
+ fail:
+   if (aaline)
+      aaline->stage.destroy( &aaline->stage );
+   
+   return FALSE;
 }
diff --git a/src/gallium/auxiliary/draw/draw_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
index fcebe3e7a0..ac0aa4cd7c 100644
--- a/src/gallium/auxiliary/draw/draw_aapoint.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
@@ -48,7 +48,8 @@
 #include "tgsi/util/tgsi_dump.h"
 
 #include "draw_context.h"
-#include "draw_private.h"
+#include "draw_vs.h"
+#include "draw_pipe.h"
 
 
 /*
@@ -483,7 +484,7 @@ aa_transform_inst(struct tgsi_transform_context *ctx,
  * Generate the frag shader we'll use for drawing AA lines.
  * This will be the user's shader plus some texture/modulate instructions.
  */
-static void
+static boolean
 generate_aapoint_fs(struct aapoint_stage *aapoint)
 {
    const struct pipe_shader_state *orig_fs = &aapoint->fs->state;
@@ -494,6 +495,8 @@ generate_aapoint_fs(struct aapoint_stage *aapoint)
 
    aapoint_fs = *orig_fs; /* copy to init */
    aapoint_fs.tokens = MALLOC(sizeof(struct tgsi_token) * MAX);
+   if (aapoint_fs.tokens == NULL)
+      return FALSE;
 
    memset(&transform, 0, sizeof(transform));
    transform.colorOutput = -1;
@@ -518,8 +521,12 @@ generate_aapoint_fs(struct aapoint_stage *aapoint)
 
    aapoint->fs->aapoint_fs
       = aapoint->driver_create_fs_state(aapoint->pipe, &aapoint_fs);
+   if (aapoint->fs->aapoint_fs == NULL)
+      return FALSE;
 
    aapoint->fs->generic_attrib = transform.maxGeneric + 1;
+
+   return TRUE;
 }
 
 
@@ -527,13 +534,18 @@ generate_aapoint_fs(struct aapoint_stage *aapoint)
- * When we're about to draw our first AA line in a batch, this function is
+ * When we're about to draw our first AA point in a batch, this function is
  * called to tell the driver to bind our modified fragment shader.
+ *
+ * The shader is generated on first use.  Returns FALSE, without binding
+ * anything, if generation fails; TRUE once the shader has been bound.
  */
-static void
+static boolean
 bind_aapoint_fragment_shader(struct aapoint_stage *aapoint)
 {
-   if (!aapoint->fs->aapoint_fs) {
-      generate_aapoint_fs(aapoint);
-   }
+   if (!aapoint->fs->aapoint_fs &&
+       !generate_aapoint_fs(aapoint))
+      return FALSE;
+
    aapoint->driver_bind_fs_state(aapoint->pipe, aapoint->fs->aapoint_fs);
+   return TRUE;
 }
 
 
@@ -545,18 +554,6 @@ aapoint_stage( struct draw_stage *stage )
 }
 
 
-static void
-passthrough_line(struct draw_stage *stage, struct prim_header *header)
-{
-   stage->next->line(stage->next, header);
-}
-
-
-static void
-passthrough_tri(struct draw_stage *stage, struct prim_header *header)
-{
-   stage->next->tri(stage->next, header);
-}
 
 
 /**
@@ -742,19 +739,34 @@ static struct aapoint_stage *
 draw_aapoint_stage(struct draw_context *draw)
 {
    struct aapoint_stage *aapoint = CALLOC_STRUCT(aapoint_stage);
+   if (aapoint == NULL)
+      goto fail;
 
-   draw_alloc_temp_verts( &aapoint->stage, 4 );
+   /* Reserve storage for four temporary vertices. */
+   if (!draw_alloc_temp_verts( &aapoint->stage, 4 ))
+      goto fail;
 
    aapoint->stage.draw = draw;
    aapoint->stage.next = NULL;
    aapoint->stage.point = aapoint_first_point;
-   aapoint->stage.line = passthrough_line;
-   aapoint->stage.tri = passthrough_tri;
+   /* Lines and triangles are forwarded unchanged to the next stage. */
+   aapoint->stage.line = draw_pipe_passthrough_line;
+   aapoint->stage.tri = draw_pipe_passthrough_tri;
    aapoint->stage.flush = aapoint_flush;
    aapoint->stage.reset_stipple_counter = aapoint_reset_stipple_counter;
    aapoint->stage.destroy = aapoint_destroy;
 
    return aapoint;
+
+ fail:
+   /* Every jump to this label happens before stage.destroy is assigned,
+    * so the destructor is called by name rather than through the stage.
+    */
+   if (aapoint)
+      aapoint_destroy(&aapoint->stage);
+
+   return NULL;
+
 }
 
 
@@ -776,13 +783,13 @@ aapoint_create_fs_state(struct pipe_context *pipe,
 {
    struct aapoint_stage *aapoint = aapoint_stage_from_pipe(pipe);
    struct aapoint_fragment_shader *aafs = CALLOC_STRUCT(aapoint_fragment_shader);
+   if (aafs == NULL) 
+      return NULL;
 
-   if (aafs) {
-      aafs->state = *fs;
+   aafs->state = *fs;
 
-      /* pass-through */
-      aafs->driver_fs = aapoint->driver_create_fs_state(aapoint->pipe, fs);
-   }
+   /* pass-through */
+   aafs->driver_fs = aapoint->driver_create_fs_state(aapoint->pipe, fs);
 
    return aafs;
 }
@@ -817,7 +824,7 @@ aapoint_delete_fs_state(struct pipe_context *pipe, void *fs)
  * into the draw module's pipeline.  This will not be used if the
  * hardware has native support for AA points.
  */
-void
+boolean
 draw_install_aapoint_stage(struct draw_context *draw,
                            struct pipe_context *pipe)
 {
@@ -829,8 +836,8 @@ draw_install_aapoint_stage(struct draw_context *draw,
     * Create / install AA point drawing / prim stage
     */
    aapoint = draw_aapoint_stage( draw );
-   assert(aapoint);
-   draw->pipeline.aapoint = &aapoint->stage;
+   if (aapoint == NULL)
+      goto fail;
 
    aapoint->pipe = pipe;
 
@@ -843,4 +850,14 @@ draw_install_aapoint_stage(struct draw_context *draw,
    pipe->create_fs_state = aapoint_create_fs_state;
    pipe->bind_fs_state = aapoint_bind_fs_state;
    pipe->delete_fs_state = aapoint_delete_fs_state;
+
+   draw->pipeline.aapoint = &aapoint->stage;
+
+   return TRUE;
+
+ fail:
+   if (aapoint)
+      aapoint->stage.destroy( &aapoint->stage );
+
+   return FALSE;
 }
diff --git a/src/gallium/auxiliary/draw/draw_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index e24c5d8032..21216addea 100644
--- a/src/gallium/auxiliary/draw/draw_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -35,8 +35,8 @@
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
 
-#include "draw_context.h"
-#include "draw_private.h"
+#include "draw_vs.h"
+#include "draw_pipe.h"
 
 
 #ifndef IS_NEGATIVE
@@ -204,7 +204,17 @@ static void emit_poly( struct draw_stage *stage,
    }
 }
 
-
+/**
+ * Four-component dot product of the float arrays a and b.
+ */
+static INLINE float
+dot4(const float *a, const float *b)
+{
+   return (a[0]*b[0] +
+           a[1]*b[1] +
+           a[2]*b[2] +
+           a[3]*b[3]);
+}
 
 
 /* Clip a triangle against the viewport and user clip planes.
@@ -486,8 +493,11 @@ static void clip_destroy( struct draw_stage *stage )
 struct draw_stage *draw_clip_stage( struct draw_context *draw )
 {
    struct clipper *clipper = CALLOC_STRUCT(clipper);
+   if (clipper == NULL)
+      goto fail;
 
-   draw_alloc_temp_verts( &clipper->stage, MAX_CLIPPED_VERTICES+1 );
+   if (!draw_alloc_temp_verts( &clipper->stage, MAX_CLIPPED_VERTICES+1 ))
+      goto fail;
 
    clipper->stage.draw = draw;
    clipper->stage.point = clip_point;
@@ -500,4 +510,14 @@ struct draw_stage *draw_clip_stage( struct draw_context *draw )
    clipper->plane = draw->plane;
 
    return &clipper->stage;
+
+ fail:
+   /* A non-NULL clipper only gets here when draw_alloc_temp_verts()
+    * failed, which happens before the stage callbacks are filled in.
+    * Call the destructor by name rather than through stage.destroy.
+    */
+   if (clipper)
+      clip_destroy( &clipper->stage );
+
+   return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_cull.c b/src/gallium/auxiliary/draw/draw_pipe_cull.c
index 8177b0ac86..87aaf1f85b 100644
--- a/src/gallium/auxiliary/draw/draw_cull.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_cull.c
@@ -35,7 +35,7 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_defines.h"
-#include "draw_private.h"
+#include "draw_pipe.h"
 
 
 struct cull_stage {
@@ -95,20 +95,6 @@ static void cull_first_tri( struct draw_stage *stage,
 
 
 
-static void cull_line( struct draw_stage *stage,
-		       struct prim_header *header )
-{
-   stage->next->line( stage->next, header );
-}
-
-
-static void cull_point( struct draw_stage *stage,
-			struct prim_header *header )
-{
-   stage->next->point( stage->next, header );
-}
-
-
 static void cull_flush( struct draw_stage *stage, unsigned flags )
 {
    stage->tri = cull_first_tri;
@@ -134,17 +120,29 @@ static void cull_destroy( struct draw_stage *stage )
 struct draw_stage *draw_cull_stage( struct draw_context *draw )
 {
    struct cull_stage *cull = CALLOC_STRUCT(cull_stage);
+   if (cull == NULL)
+      goto fail;
 
-   draw_alloc_temp_verts( &cull->stage, 0 );
+   if (!draw_alloc_temp_verts( &cull->stage, 0 ))
+      goto fail;
 
    cull->stage.draw = draw;
    cull->stage.next = NULL;
-   cull->stage.point = cull_point;
-   cull->stage.line = cull_line;
+   cull->stage.point = draw_pipe_passthrough_point;
+   cull->stage.line = draw_pipe_passthrough_line;
    cull->stage.tri = cull_first_tri;
    cull->stage.flush = cull_flush;
    cull->stage.reset_stipple_counter = cull_reset_stipple_counter;
    cull->stage.destroy = cull_destroy;
 
    return &cull->stage;
+
+ fail:
+   /* Every jump to this label happens before stage.destroy is assigned,
+    * so call the destructor by name rather than through the stage.
+    */
+   if (cull)
+      cull_destroy( &cull->stage );
+
+   return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_flatshade.c b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
index af2cb05c98..205000cbea 100644
--- a/src/gallium/auxiliary/draw/draw_flatshade.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
@@ -30,7 +30,8 @@
 
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
-#include "draw_private.h"
+#include "draw_vs.h"
+#include "draw_pipe.h"
 
 
 /** subclass of draw_stage */
@@ -151,13 +152,6 @@ static void flatshade_line_1( struct draw_stage *stage,
 }
 
 
-/* Flatshade point -- passthrough.
- */
-static void flatshade_point( struct draw_stage *stage,
-                             struct prim_header *header )
-{
-   stage->next->point( stage->next, header );
-}
 
 
 static void flatshade_init_state( struct draw_stage *stage )
@@ -230,12 +224,15 @@ static void flatshade_destroy( struct draw_stage *stage )
 struct draw_stage *draw_flatshade_stage( struct draw_context *draw )
 {
    struct flat_stage *flatshade = CALLOC_STRUCT(flat_stage);
+   if (flatshade == NULL)
+      goto fail;
 
-   draw_alloc_temp_verts( &flatshade->stage, 2 );
+   if (!draw_alloc_temp_verts( &flatshade->stage, 2 ))
+      goto fail;
 
    flatshade->stage.draw = draw;
    flatshade->stage.next = NULL;
-   flatshade->stage.point = flatshade_point;
+   flatshade->stage.point = draw_pipe_passthrough_point;
    flatshade->stage.line = flatshade_first_line;
    flatshade->stage.tri = flatshade_first_tri;
    flatshade->stage.flush = flatshade_flush;
@@ -243,6 +240,15 @@ struct draw_stage *draw_flatshade_stage( struct draw_context *draw )
    flatshade->stage.destroy = flatshade_destroy;
 
    return &flatshade->stage;
+
+ fail:
+   /* Every jump to this label happens before stage.destroy is assigned,
+    * so call the destructor by name rather than through the stage.
+    */
+   if (flatshade)
+      flatshade_destroy( &flatshade->stage );
+
+   return NULL;
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_offset.c b/src/gallium/auxiliary/draw/draw_pipe_offset.c
index dbc676deae..ffec85ccdd 100644
--- a/src/gallium/auxiliary/draw/draw_offset.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_offset.c
@@ -33,7 +33,7 @@
  */
 
 #include "pipe/p_util.h"
-#include "draw_private.h"
+#include "draw_pipe.h"
 
 
 
@@ -129,18 +129,6 @@ static void offset_first_tri( struct draw_stage *stage,
 }
 
 
-static void offset_line( struct draw_stage *stage,
-			 struct prim_header *header )
-{
-   stage->next->line( stage->next, header );
-}
-
-
-static void offset_point( struct draw_stage *stage,
-			  struct prim_header *header )
-{
-   stage->next->point( stage->next, header );
-}
 
 
 static void offset_flush( struct draw_stage *stage,
@@ -170,17 +158,29 @@ static void offset_destroy( struct draw_stage *stage )
 struct draw_stage *draw_offset_stage( struct draw_context *draw )
 {
    struct offset_stage *offset = CALLOC_STRUCT(offset_stage);
+   if (offset == NULL)
+      goto fail;
 
-   draw_alloc_temp_verts( &offset->stage, 3 );
+   if (!draw_alloc_temp_verts( &offset->stage, 3 ))
+      goto fail;
 
    offset->stage.draw = draw;
    offset->stage.next = NULL;
-   offset->stage.point = offset_point;
-   offset->stage.line = offset_line;
+   offset->stage.point = draw_pipe_passthrough_point;
+   offset->stage.line = draw_pipe_passthrough_line;
    offset->stage.tri = offset_first_tri;
    offset->stage.flush = offset_flush;
    offset->stage.reset_stipple_counter = offset_reset_stipple_counter;
    offset->stage.destroy = offset_destroy;
 
    return &offset->stage;
+
+ fail:
+   /* Every jump to this label happens before stage.destroy is assigned,
+    * so call the destructor by name rather than through the stage.
+    */
+   if (offset)
+      offset_destroy( &offset->stage );
+
+   return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index 4dddb72906..aec485a6e7 100644
--- a/src/gallium/auxiliary/draw/draw_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -44,7 +44,7 @@
 #include "tgsi/util/tgsi_dump.h"
 
 #include "draw_context.h"
-#include "draw_private.h"
+#include "draw_pipe.h"
 
 
 
@@ -320,7 +320,7 @@ pstip_transform_inst(struct tgsi_transform_context *ctx,
  * Generate the frag shader we'll use for doing polygon stipple.
  * This will be the user's shader prefixed with a TEX and KIL instruction.
  */
-static void
+static boolean
 generate_pstip_fs(struct pstip_stage *pstip)
 {
    const struct pipe_shader_state *orig_fs = &pstip->fs->state;
@@ -332,6 +332,8 @@ generate_pstip_fs(struct pstip_stage *pstip)
 
    pstip_fs = *orig_fs; /* copy to init */
    pstip_fs.tokens = MALLOC(sizeof(struct tgsi_token) * MAX);
+   if (pstip_fs.tokens == NULL)
+      return FALSE;
 
    memset(&transform, 0, sizeof(transform));
    transform.wincoordInput = -1;
@@ -355,6 +357,13 @@ generate_pstip_fs(struct pstip_stage *pstip)
    assert(pstip->fs->sampler_unit < PIPE_MAX_SAMPLERS);
 
    pstip->fs->pstip_fs = pstip->driver_create_fs_state(pstip->pipe, &pstip_fs);
+   /* Report a failed shader creation so that the caller does not go on
+    * to bind a NULL fragment shader.
+    */
+   if (pstip->fs->pstip_fs == NULL)
+      return FALSE;
+
+   return TRUE;
 }
 
 
@@ -404,7 +408,7 @@ pstip_update_texture(struct pstip_stage *pstip)
 /**
  * Create the texture map we'll use for stippling.
  */
-static void
+static boolean
 pstip_create_texture(struct pstip_stage *pstip)
 {
    struct pipe_context *pipe = pstip->pipe;
@@ -421,7 +425,10 @@ pstip_create_texture(struct pstip_stage *pstip)
    texTemp.cpp = 1;
 
    pstip->texture = screen->texture_create(screen, &texTemp);
-   assert(pstip->texture->refcount == 1);
+   if (pstip->texture == NULL)
+      return FALSE;
+
+   return TRUE;
 }
 
 
@@ -430,7 +437,7 @@ pstip_create_texture(struct pstip_stage *pstip)
  * By using a mipmapped texture, we don't have to generate a different
  * texture image for each line size.
  */
-static void
+static boolean
 pstip_create_sampler(struct pstip_stage *pstip)
 {
    struct pipe_sampler_state sampler;
@@ -448,6 +455,10 @@ pstip_create_sampler(struct pstip_stage *pstip)
    sampler.max_lod = 0.0f;
 
    pstip->sampler_cso = pipe->create_sampler_state(pipe, &sampler);
+   if (pstip->sampler_cso == NULL)
+      return FALSE;
+   
+   return TRUE;
 }
 
 
@@ -455,13 +466,18 @@ pstip_create_sampler(struct pstip_stage *pstip)
- * When we're about to draw our first AA line in a batch, this function is
+ * When we're about to draw our first stippled triangle in a batch, this function is
  * called to tell the driver to bind our modified fragment shader.
+ *
+ * The shader is generated on first use.  Returns FALSE, without binding
+ * anything, if generation fails; TRUE once the shader has been bound.
  */
-static void
+static boolean
 bind_pstip_fragment_shader(struct pstip_stage *pstip)
 {
-   if (!pstip->fs->pstip_fs) {
-      generate_pstip_fs(pstip);
-   }
+   if (!pstip->fs->pstip_fs &&
+       !generate_pstip_fs(pstip))
+      return FALSE;
+
    pstip->driver_bind_fs_state(pstip->pipe, pstip->fs->pstip_fs);
+   return TRUE;
 }
 
 
@@ -473,25 +486,6 @@ pstip_stage( struct draw_stage *stage )
 }
 
 
-static void
-passthrough_point(struct draw_stage *stage, struct prim_header *header)
-{
-   stage->next->point(stage->next, header);
-}
-
-
-static void
-passthrough_line(struct draw_stage *stage, struct prim_header *header)
-{
-   stage->next->line(stage->next, header);
-}
-
-
-static void
-passthrough_tri(struct draw_stage *stage, struct prim_header *header)
-{
-   stage->next->tri(stage->next, header);
-}
 
 
 
@@ -505,7 +499,12 @@ pstip_first_tri(struct draw_stage *stage, struct prim_header *header)
    assert(stage->draw->rasterizer->poly_stipple_enable);
 
    /* bind our fragprog */
-   bind_pstip_fragment_shader(pstip);
+   if (!bind_pstip_fragment_shader(pstip)) {
+      stage->tri = draw_pipe_passthrough_tri;
+      stage->tri(stage, header);
+      return;
+   }
+      
 
    /* how many samplers? */
    /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */
@@ -523,7 +522,7 @@ pstip_first_tri(struct draw_stage *stage, struct prim_header *header)
    pstip->driver_set_sampler_textures(pipe, num_samplers, pstip->state.textures);
 
    /* now really draw first line */
-   stage->tri = passthrough_tri;
+   stage->tri = draw_pipe_passthrough_tri;
    stage->tri(stage, header);
 }
 
@@ -579,8 +578,8 @@ draw_pstip_stage(struct draw_context *draw)
 
    pstip->stage.draw = draw;
    pstip->stage.next = NULL;
-   pstip->stage.point = passthrough_point;
-   pstip->stage.line = passthrough_line;
+   pstip->stage.point = draw_pipe_passthrough_point;
+   pstip->stage.line = draw_pipe_passthrough_line;
    pstip->stage.tri = pstip_first_tri;
    pstip->stage.flush = pstip_flush;
    pstip->stage.reset_stipple_counter = pstip_reset_stipple_counter;
@@ -705,7 +704,7 @@ pstip_set_polygon_stipple(struct pipe_context *pipe,
  * into the draw module's pipeline.  This will not be used if the
  * hardware has native support for AA lines.
  */
-void
+boolean
 draw_install_pstipple_stage(struct draw_context *draw,
                             struct pipe_context *pipe)
 {
@@ -717,14 +716,19 @@ draw_install_pstipple_stage(struct draw_context *draw,
     * Create / install AA line drawing / prim stage
     */
    pstip = draw_pstip_stage( draw );
-   assert(pstip);
+   if (pstip == NULL)
+      goto fail;
+
    draw->pipeline.pstipple = &pstip->stage;
 
    pstip->pipe = pipe;
 
    /* create special texture, sampler state */
-   pstip_create_texture(pstip);
-   pstip_create_sampler(pstip);
+   if (!pstip_create_texture(pstip))
+      goto fail;
+
+   if (!pstip_create_sampler(pstip))
+      goto fail;
 
    /* save original driver functions */
    pstip->driver_create_fs_state = pipe->create_fs_state;
@@ -743,4 +747,17 @@ draw_install_pstipple_stage(struct draw_context *draw,
    pipe->bind_sampler_states = pstip_bind_sampler_states;
    pipe->set_sampler_textures = pstip_set_sampler_textures;
    pipe->set_polygon_stipple = pstip_set_polygon_stipple;
+
+   return TRUE;
+
+ fail:
+   /* The stage was stored in draw->pipeline.pstipple before the texture
+    * and sampler were created; unhook it so no dangling pointer remains.
+    */
+   if (pstip) {
+      draw->pipeline.pstipple = NULL;
+      pstip->stage.destroy( &pstip->stage );
+   }
+
+   return FALSE;
 }
diff --git a/src/gallium/auxiliary/draw/draw_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
index 506f33512c..9cf5840cce 100644
--- a/src/gallium/auxiliary/draw/draw_stipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
@@ -39,7 +39,7 @@
 #include "pipe/p_util.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
-#include "draw_private.h"
+#include "draw_pipe.h"
 
 
 /** Subclass of draw_stage */
@@ -195,18 +195,6 @@ stipple_flush(struct draw_stage *stage, unsigned flags)
 }
 
 
-static void
-passthrough_point(struct draw_stage *stage, struct prim_header *header)
-{
-   stage->next->point( stage->next, header );
-}
-
-
-static void
-passthrough_tri(struct draw_stage *stage, struct prim_header *header)
-{
-   stage->next->tri(stage->next, header);
-}
 
 
 static void 
@@ -228,9 +216,9 @@ struct draw_stage *draw_stipple_stage( struct draw_context *draw )
 
    stipple->stage.draw = draw;
    stipple->stage.next = NULL;
-   stipple->stage.point = passthrough_point;
+   stipple->stage.point = draw_pipe_passthrough_point;
    stipple->stage.line = stipple_first_line;
-   stipple->stage.tri = passthrough_tri;
+   stipple->stage.tri = draw_pipe_passthrough_tri;
    stipple->stage.reset_stipple_counter = reset_stipple_counter;
    stipple->stage.flush = stipple_flush;
    stipple->stage.destroy = stipple_destroy;
diff --git a/src/gallium/auxiliary/draw/draw_twoside.c b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
index 3debaac282..5910dccc43 100644
--- a/src/gallium/auxiliary/draw/draw_twoside.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
@@ -31,8 +31,8 @@
 #include "pipe/p_util.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
-#include "draw_private.h"
-
+#include "draw_vs.h"
+#include "draw_pipe.h"
 
 struct twoside_stage {
    struct draw_stage stage;
@@ -99,21 +99,6 @@ static void twoside_tri( struct draw_stage *stage,
 }
 
 
-static void twoside_line( struct draw_stage *stage,
-		       struct prim_header *header )
-{
-   /* pass-through */
-   stage->next->line( stage->next, header );
-}
-
-
-static void twoside_point( struct draw_stage *stage,
-			struct prim_header *header )
-{
-   /* pass-through */
-   stage->next->point( stage->next, header );
-}
-
 
 static void twoside_first_tri( struct draw_stage *stage, 
 			       struct prim_header *header )
@@ -187,17 +172,29 @@ static void twoside_destroy( struct draw_stage *stage )
 struct draw_stage *draw_twoside_stage( struct draw_context *draw )
 {
    struct twoside_stage *twoside = CALLOC_STRUCT(twoside_stage);
+   if (twoside == NULL)
+      goto fail;
 
-   draw_alloc_temp_verts( &twoside->stage, 3 );
+   if (!draw_alloc_temp_verts( &twoside->stage, 3 ))
+      goto fail;
 
    twoside->stage.draw = draw;
    twoside->stage.next = NULL;
-   twoside->stage.point = twoside_point;
-   twoside->stage.line = twoside_line;
+   twoside->stage.point = draw_pipe_passthrough_point;
+   twoside->stage.line = draw_pipe_passthrough_line;
    twoside->stage.tri = twoside_first_tri;
    twoside->stage.flush = twoside_flush;
    twoside->stage.reset_stipple_counter = twoside_reset_stipple_counter;
    twoside->stage.destroy = twoside_destroy;
 
    return &twoside->stage;
+
+ fail:
+   /* Every jump to this label happens before stage.destroy is assigned,
+    * so call the destructor by name rather than through the stage.
+    */
+   if (twoside)
+      twoside_destroy( &twoside->stage );
+
+   return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_unfilled.c b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
index b07860cd9e..eeb2bc43f9 100644
--- a/src/gallium/auxiliary/draw/draw_unfilled.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_unfilled.c
@@ -36,6 +36,7 @@
 #include "pipe/p_util.h"
 #include "pipe/p_defines.h"
 #include "draw_private.h"
+#include "draw_pipe.h"
 
 
 struct unfilled_stage {
@@ -147,19 +148,6 @@ static void unfilled_first_tri( struct draw_stage *stage,
 }
 
 
-static void unfilled_line( struct draw_stage *stage,
-                           struct prim_header *header )
-{
-   stage->next->line( stage->next, header );
-}
-
-
-static void unfilled_point( struct draw_stage *stage,
-                            struct prim_header *header )
-{
-   stage->next->point( stage->next, header );
-}
-
 
 static void unfilled_flush( struct draw_stage *stage,
 			    unsigned flags )
@@ -189,18 +177,30 @@ static void unfilled_destroy( struct draw_stage *stage )
 struct draw_stage *draw_unfilled_stage( struct draw_context *draw )
 {
    struct unfilled_stage *unfilled = CALLOC_STRUCT(unfilled_stage);
+   if (unfilled == NULL)
+      goto fail;
 
-   draw_alloc_temp_verts( &unfilled->stage, 0 );
+   if (!draw_alloc_temp_verts( &unfilled->stage, 0 ))
+      goto fail;
 
    unfilled->stage.draw = draw;
    unfilled->stage.next = NULL;
    unfilled->stage.tmp = NULL;
-   unfilled->stage.point = unfilled_point;
-   unfilled->stage.line = unfilled_line;
+   unfilled->stage.point = draw_pipe_passthrough_point;
+   unfilled->stage.line = draw_pipe_passthrough_line;
    unfilled->stage.tri = unfilled_first_tri;
    unfilled->stage.flush = unfilled_flush;
    unfilled->stage.reset_stipple_counter = unfilled_reset_stipple_counter;
    unfilled->stage.destroy = unfilled_destroy;
 
    return &unfilled->stage;
+
+ fail:
+   /* Every jump to this label happens before stage.destroy is assigned,
+    * so call the destructor by name rather than through the stage.
+    */
+   if (unfilled)
+      unfilled_destroy( &unfilled->stage );
+
+   return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pipe_util.c b/src/gallium/auxiliary/draw/draw_pipe_util.c
new file mode 100644
index 0000000000..04438f4dd0
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pipe_util.c
@@ -0,0 +1,162 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "pipe/p_util.h"
+#include "draw/draw_private.h"
+#include "draw/draw_pipe.h"
+
+
+
+/**
+ * Pass-through handlers for stages that do not process a given primitive
+ * type: each forwards the primitive, unchanged, to the next stage.
+ */
+void
+draw_pipe_passthrough_point(struct draw_stage *stage, struct prim_header *header)
+{
+   stage->next->point(stage->next, header);
+}
+
+void
+draw_pipe_passthrough_line(struct draw_stage *stage, struct prim_header *header)
+{
+   stage->next->line(stage->next, header);
+}
+
+void
+draw_pipe_passthrough_tri(struct draw_stage *stage, struct prim_header *header)
+{
+   stage->next->tri(stage->next, header);
+}
+
+
+
+
+
+/* This is only used for temporary verts.
+ */
+#define MAX_VERTEX_SIZE ((2 + PIPE_MAX_SHADER_OUTPUTS) * 4 * sizeof(float))
+
+
+/**
+ * Allocate space for temporary post-transform vertices, such as for clipping.
+ *
+ * All nr vertices share one allocation of nr * MAX_VERTEX_SIZE bytes;
+ * stage->tmp is a separately allocated array of pointers into it, so
+ * stage->tmp[0] is also the base address of the vertex storage.
+ *
+ * \param stage  stage to receive the vertices
+ * \param nr     number of temporary vertices; 0 allocates nothing
+ * \return TRUE on success.  On failure, FALSE, with stage->tmp == NULL
+ *         and stage->nr_tmps == 0.
+ */
+boolean draw_alloc_temp_verts( struct draw_stage *stage, unsigned nr )
+{
+   assert(!stage->tmp);
+
+   /* nr_tmps is only set once both allocations have succeeded, so that
+    * code walking tmp[0 .. nr_tmps) never sees a count without storage.
+    */
+   stage->tmp = NULL;
+   stage->nr_tmps = 0;
+
+   if (nr != 0)
+   {
+      unsigned i;
+      ubyte *store = (ubyte *) MALLOC( MAX_VERTEX_SIZE * nr );
+
+      if (store == NULL)
+         return FALSE;
+
+      stage->tmp = (struct vertex_header **) MALLOC( sizeof(struct vertex_header *) * nr );
+      if (stage->tmp == NULL) {
+         FREE(store);
+         return FALSE;
+      }
+
+      for (i = 0; i < nr; i++)
+         stage->tmp[i] = (struct vertex_header *)(store + i * MAX_VERTEX_SIZE);
+
+      stage->nr_tmps = nr;
+   }
+
+   return TRUE;
+}
+
+
+/**
+ * Release the storage set up by draw_alloc_temp_verts().  Does nothing
+ * if the stage has no temporary vertices.
+ */
+void draw_free_temp_verts( struct draw_stage *stage )
+{
+   if (stage->tmp) {
+      FREE( stage->tmp[0] );   /* base of the shared vertex storage */
+      FREE( stage->tmp );
+      stage->tmp = NULL;
+      stage->nr_tmps = 0;
+   }
+}
+
+
+/* Reset vertex ids.  This is basically a type of flush.
+ *
+ * Called only from draw_pipe_vbuf.c
+ */
+void draw_reset_vertex_ids(struct draw_context *draw)
+{
+   struct draw_stage *stage = draw->pipeline.first;
+   
+   /* Temporary vertices owned by each stage in the chain. */
+   while (stage) {
+      unsigned i;
+
+      for (i = 0; i < stage->nr_tmps; i++)
+         stage->tmp[i]->vertex_id = UNDEFINED_VERTEX_ID;
+
+      stage = stage->next;
+   }
+
+   /* Vertices in the pipeline's vertex array, vertex_stride bytes apart. */
+   if (draw->pipeline.verts)
+   {
+      unsigned i;
+      char *verts = draw->pipeline.verts;
+      unsigned stride = draw->pipeline.vertex_stride;
+
+      for (i = 0; i < draw->pipeline.vertex_count; i++) {
+         ((struct vertex_header *)verts)->vertex_id = UNDEFINED_VERTEX_ID;
+         verts += stride;
+      }
+   }
+}
+
diff --git a/src/gallium/auxiliary/draw/draw_validate.c b/src/gallium/auxiliary/draw/draw_pipe_validate.c
index e163e078f0..6be1d369c3 100644
--- a/src/gallium/auxiliary/draw/draw_validate.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_validate.c
@@ -31,6 +31,8 @@
 #include "pipe/p_util.h"
 #include "pipe/p_defines.h"
 #include "draw_private.h"
+#include "draw_pipe.h"
+#include "draw_context.h"
 
 static boolean points( unsigned prim )
 {
@@ -56,7 +58,8 @@ static boolean triangles( unsigned prim )
  * pipeline stages.
  */
 boolean
-draw_need_pipeline(const struct draw_context *draw, 
+draw_need_pipeline(const struct draw_context *draw,
+                   const struct pipe_rasterizer_state *rasterizer,
                    unsigned int prim )
 {
    /* Don't have to worry about triangles turning into lines/points
@@ -66,30 +69,30 @@ draw_need_pipeline(const struct draw_context *draw,
    if (lines(prim)) 
    {
       /* line stipple */
-      if (draw->rasterizer->line_stipple_enable && draw->line_stipple)
+      if (rasterizer->line_stipple_enable && draw->pipeline.line_stipple)
          return TRUE;
 
       /* wide lines */
-      if (draw->rasterizer->line_width > draw->wide_line_threshold)
+      if (rasterizer->line_width > draw->pipeline.wide_line_threshold)
          return TRUE;
 
       /* AA lines */
-      if (draw->rasterizer->line_smooth && draw->pipeline.aaline)
+      if (rasterizer->line_smooth && draw->pipeline.aaline)
          return TRUE;
    }
 
    if (points(prim))
    {
       /* large points */
-      if (draw->rasterizer->point_size > draw->wide_point_threshold)
+      if (rasterizer->point_size > draw->pipeline.wide_point_threshold)
          return TRUE;
 
       /* AA points */
-      if (draw->rasterizer->point_smooth && draw->pipeline.aapoint)
+      if (rasterizer->point_smooth && draw->pipeline.aapoint)
          return TRUE;
 
       /* point sprites */
-      if (draw->rasterizer->point_sprite && draw->point_sprite)
+      if (rasterizer->point_sprite && draw->pipeline.point_sprite)
          return TRUE;
    }
 
@@ -97,20 +100,20 @@ draw_need_pipeline(const struct draw_context *draw,
    if (triangles(prim)) 
    {
       /* polygon stipple */
-      if (draw->rasterizer->poly_stipple_enable && draw->pipeline.pstipple)
+      if (rasterizer->poly_stipple_enable && draw->pipeline.pstipple)
          return TRUE;
 
       /* unfilled polygons */
-      if (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
-          draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL)
+      if (rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
+          rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL)
          return TRUE;
       
       /* polygon offset */
-      if (draw->rasterizer->offset_cw || draw->rasterizer->offset_ccw)
+      if (rasterizer->offset_cw || rasterizer->offset_ccw)
          return TRUE;
 
       /* two-side lighting */
-      if (draw->rasterizer->light_twoside)
+      if (rasterizer->light_twoside)
          return TRUE;
    }
 
@@ -119,7 +122,7 @@ draw_need_pipeline(const struct draw_context *draw,
     * 
     * Generally this isn't a reason to require the pipeline, though.
     *
-   if (draw->rasterizer->cull_mode)
+   if (rasterizer->cull_mode)
       return TRUE;
     */
 
@@ -145,15 +148,15 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
    stage->next = next;
 
    /* drawing wide lines? */
-   wide_lines = (draw->rasterizer->line_width > draw->wide_line_threshold
+   wide_lines = (draw->rasterizer->line_width > draw->pipeline.wide_line_threshold
                  && !draw->rasterizer->line_smooth);
 
    /* drawing large points? */
-   if (draw->rasterizer->point_sprite && draw->point_sprite)
+   if (draw->rasterizer->point_sprite && draw->pipeline.point_sprite)
       wide_points = TRUE;
    else if (draw->rasterizer->point_smooth && draw->pipeline.aapoint)
       wide_points = FALSE;
-   else if (draw->rasterizer->point_size > draw->wide_point_threshold)
+   else if (draw->rasterizer->point_size > draw->pipeline.wide_point_threshold)
       wide_points = TRUE;
    else
       wide_points = FALSE;
@@ -186,7 +189,7 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
       next = draw->pipeline.wide_point;
    }
 
-   if (draw->rasterizer->line_stipple_enable && draw->line_stipple) {
+   if (draw->rasterizer->line_stipple_enable && draw->pipeline.line_stipple) {
       draw->pipeline.stipple->next = next;
       next = draw->pipeline.stipple;
       precalc_flat = 1;		/* only needed for lines really */
@@ -238,7 +241,7 @@ static struct draw_stage *validate_pipeline( struct draw_stage *stage )
 
    /* Clip stage
     */
-   if (!draw->rasterizer->bypass_clipping)
+   if (!draw->bypass_clipping)
    {
       draw->pipeline.clip->next = next;
       next = draw->pipeline.clip;
@@ -298,6 +301,8 @@ static void validate_destroy( struct draw_stage *stage )
 struct draw_stage *draw_validate_stage( struct draw_context *draw )
 {
    struct draw_stage *stage = CALLOC_STRUCT(draw_stage);
+   if (stage == NULL)
+      return NULL;
 
    stage->draw = draw;
    stage->next = NULL;
diff --git a/src/gallium/auxiliary/draw/draw_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index e3216ff711..afd5f5544d 100644
--- a/src/gallium/auxiliary/draw/draw_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -40,7 +40,8 @@
 #include "draw_vbuf.h"
 #include "draw_private.h"
 #include "draw_vertex.h"
-#include "draw_vf.h"
+#include "draw_pipe.h"
+#include "translate/translate.h"
 
 
 /**
@@ -56,7 +57,7 @@ struct vbuf_stage {
    /** Vertex size in bytes */
    unsigned vertex_size;
 
-   struct draw_vertex_fetch *vf;
+   struct translate *translate;
    
    /* FIXME: we have no guarantee that 'unsigned' is 32bit */
 
@@ -71,8 +72,9 @@ struct vbuf_stage {
    unsigned max_indices;
    unsigned nr_indices;
 
-   /** Pipe primitive */
-   unsigned prim;
+   /* Cache point size somewhere its address won't change:
+    */
+   float point_size;
 };
 
 
@@ -113,61 +115,6 @@ check_space( struct vbuf_stage *vbuf, unsigned nr )
 }
 
 
-#if 0
-static INLINE void
-dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
-{
-   assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
-   unsigned i, j, k;
-
-   for (i = 0; i < vinfo->num_attribs; i++) {
-      j = vinfo->src_index[i];
-      switch (vinfo->emit[i]) {
-      case EMIT_OMIT:
-         debug_printf("EMIT_OMIT:");
-         break;
-      case EMIT_1F:
-         debug_printf("EMIT_1F:\t");
-         debug_printf("%f ", *(float *)data); data += sizeof(float);
-         break;
-      case EMIT_1F_PSIZE:
-         debug_printf("EMIT_1F_PSIZE:\t");
-         debug_printf("%f ", *(float *)data); data += sizeof(float);
-         break;
-      case EMIT_2F:
-         debug_printf("EMIT_2F:\t");
-         debug_printf("%f ", *(float *)data); data += sizeof(float);
-         debug_printf("%f ", *(float *)data); data += sizeof(float);
-         break;
-      case EMIT_3F:
-         debug_printf("EMIT_3F:\t");
-         debug_printf("%f ", *(float *)data); data += sizeof(float);
-         debug_printf("%f ", *(float *)data); data += sizeof(float);
-         debug_printf("%f ", *(float *)data); data += sizeof(float);
-         data += sizeof(float);
-         break;
-      case EMIT_4F:
-         debug_printf("EMIT_4F:\t");
-         debug_printf("%f ", *(float *)data); data += sizeof(float);
-         debug_printf("%f ", *(float *)data); data += sizeof(float);
-         debug_printf("%f ", *(float *)data); data += sizeof(float);
-         debug_printf("%f ", *(float *)data); data += sizeof(float);
-         break;
-      case EMIT_4UB:
-         debug_printf("EMIT_4UB:\t");
-         debug_printf("%u ", *data++);
-         debug_printf("%u ", *data++);
-         debug_printf("%u ", *data++);
-         debug_printf("%u ", *data++);
-         break;
-      default:
-         assert(0);
-      }
-      debug_printf("\n");
-   }
-   debug_printf("\n");
-}
-#endif
 
 
 /**
@@ -177,96 +124,25 @@ dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
  * have a couple of slots at the beginning (1-dword header, 4-dword
  * clip pos) that we ignore here.  We only use the vertex->data[] fields.
  */
-static INLINE void 
+static INLINE ushort 
 emit_vertex( struct vbuf_stage *vbuf,
              struct vertex_header *vertex )
 {
-#if 0
-   debug_printf("emit vertex %d to %p\n", 
-           vbuf->nr_vertices, vbuf->vertex_ptr);
-#endif
-
-   if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
-      if(vertex->vertex_id < vbuf->nr_vertices)
-	 return;
-      else
-	 debug_printf("Bad vertex id 0x%04x (>= 0x%04x)\n", 
-	         vertex->vertex_id, vbuf->nr_vertices);
-      return;
-   }
-      
-   vertex->vertex_id = vbuf->nr_vertices++;
-
-   if(!vbuf->vf) {
-      const struct vertex_info *vinfo = vbuf->vinfo;
-      uint i;
-      uint count = 0;  /* for debug/sanity */
+   if(vertex->vertex_id == UNDEFINED_VERTEX_ID) {      
+      /* Hmm - vertices are emitted one at a time - better make sure
+       * set_buffer is efficient.  Consider a special one-shot mode for
+       * translate.
+       */
+      vbuf->translate->set_buffer(vbuf->translate, 0, vertex->data[0], 0);
+      vbuf->translate->run(vbuf->translate, 0, 1, vbuf->vertex_ptr);
+
+      if (0) draw_dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr);
       
-      assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
-
-      for (i = 0; i < vinfo->num_attribs; i++) {
-         uint j = vinfo->src_index[i];
-         switch (vinfo->emit[i]) {
-         case EMIT_OMIT:
-            /* no-op */
-            break;
-         case EMIT_1F:
-            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-            count++;
-            break;
-         case EMIT_1F_PSIZE:
-            *vbuf->vertex_ptr++ = fui(vbuf->stage.draw->rasterizer->point_size);
-            count++;
-            break;
-         case EMIT_2F:
-            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
-            count += 2;
-            break;
-         case EMIT_3F:
-            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
-            *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
-            count += 3;
-            break;
-         case EMIT_4F:
-            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
-            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
-            *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
-            *vbuf->vertex_ptr++ = fui(vertex->data[j][3]);
-            count += 4;
-            break;
-         case EMIT_4UB:
-            *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
-                                           float_to_ubyte( vertex->data[j][1] ),
-                                           float_to_ubyte( vertex->data[j][0] ),
-                                           float_to_ubyte( vertex->data[j][3] ));
-            count += 1;
-            break;
-         default:
-            assert(0);
-         }
-      }
-      assert(count == vinfo->size);
-#if 0
-      {
-	 static float data[256]; 
-	 draw_vf_emit_vertex(vbuf->vf, vertex, data);
-	 if(memcmp((uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size, data, vbuf->vertex_size)) {
-            debug_printf("With VF:\n");
-            dump_emitted_vertex(vbuf->vinfo, (uint8_t *)data);
-	    debug_printf("Without VF:\n");
-	    dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size);
-	    assert(0);
-	 }
-      }
-#endif
-   }
-   else {
-      draw_vf_emit_vertex(vbuf->vf, vertex, vbuf->vertex_ptr);
-   
       vbuf->vertex_ptr += vbuf->vertex_size/4;
+      vertex->vertex_id = vbuf->nr_vertices++;
    }
+
+   return vertex->vertex_id;
 }
 
 
@@ -280,9 +156,7 @@ vbuf_tri( struct draw_stage *stage,
    check_space( vbuf, 3 );
 
    for (i = 0; i < 3; i++) {
-      emit_vertex( vbuf, prim->v[i] );
-      
-      vbuf->indices[vbuf->nr_indices++] = (ushort) prim->v[i]->vertex_id;
+      vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[i] );
    }
 }
 
@@ -297,9 +171,7 @@ vbuf_line( struct draw_stage *stage,
    check_space( vbuf, 2 );
 
    for (i = 0; i < 2; i++) {
-      emit_vertex( vbuf, prim->v[i] );
-
-      vbuf->indices[vbuf->nr_indices++] = (ushort) prim->v[i]->vertex_id;
+      vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[i] );
    }   
 }
 
@@ -312,43 +184,112 @@ vbuf_point( struct draw_stage *stage,
 
    check_space( vbuf, 1 );
 
-   emit_vertex( vbuf, prim->v[0] );
-   
-   vbuf->indices[vbuf->nr_indices++] = (ushort) prim->v[0]->vertex_id;
+   vbuf->indices[vbuf->nr_indices++] = emit_vertex( vbuf, prim->v[0] );
 }
 
 
+
+
 /**
  * Set the prim type for subsequent vertices.
  * This may result in a new vertex size.  The existing vbuffer (if any)
  * will be flushed if needed and a new one allocated.
  */
 static void
-vbuf_set_prim( struct vbuf_stage *vbuf, uint newprim )
+vbuf_set_prim( struct vbuf_stage *vbuf, uint prim )
 {
-   const struct vertex_info *vinfo;
-   unsigned vertex_size;
-
-   assert(newprim == PIPE_PRIM_POINTS ||
-          newprim == PIPE_PRIM_LINES ||
-          newprim == PIPE_PRIM_TRIANGLES);
+   struct translate_key hw_key;
+   unsigned dst_offset;
+   unsigned i;
 
-   vbuf->prim = newprim;
-   vbuf->render->set_primitive(vbuf->render, newprim);
+   vbuf->render->set_primitive(vbuf->render, prim);
 
-   vinfo = vbuf->render->get_vertex_info(vbuf->render);
-   vertex_size = vinfo->size * sizeof(float);
+   /* Must do this after set_primitive() above:
+    * 
+    * XXX: need some state management to track when this needs to be
+    * recalculated.  The driver should tell us whether there was a
+    * state change.
+    */
+   vbuf->vinfo = vbuf->render->get_vertex_info(vbuf->render);
 
-   if (vertex_size != vbuf->vertex_size)
+   if (vbuf->vertex_size != vbuf->vinfo->size * sizeof(float)) {
       vbuf_flush_vertices(vbuf);
+      vbuf->vertex_size = vbuf->vinfo->size * sizeof(float);
+   }
 
-   vbuf->vinfo = vinfo;
-   vbuf->vertex_size = vertex_size;
-   if(vbuf->vf)
-      draw_vf_set_vertex_info(vbuf->vf, 
-                              vbuf->vinfo,
-                              vbuf->stage.draw->rasterizer->point_size);
-   
+   /* Translate from pipeline vertices to hw vertices.
+    */
+   dst_offset = 0;
+   memset(&hw_key, 0, sizeof(hw_key));
+
+   for (i = 0; i < vbuf->vinfo->num_attribs; i++) {
+      unsigned emit_sz = 0;
+      unsigned src_buffer = 0;
+      unsigned output_format;
+      unsigned src_offset = (vbuf->vinfo->src_index[i] * 4 * sizeof(float) );
+
+      switch (vbuf->vinfo->emit[i]) {
+      case EMIT_4F:
+	 output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+	 emit_sz = 4 * sizeof(float);
+	 break;
+      case EMIT_3F:
+	 output_format = PIPE_FORMAT_R32G32B32_FLOAT;
+	 emit_sz = 3 * sizeof(float);
+	 break;
+      case EMIT_2F:
+	 output_format = PIPE_FORMAT_R32G32_FLOAT;
+	 emit_sz = 2 * sizeof(float);
+	 break;
+      case EMIT_1F:
+	 output_format = PIPE_FORMAT_R32_FLOAT;
+	 emit_sz = 1 * sizeof(float);
+	 break;
+      case EMIT_1F_PSIZE:
+	 output_format = PIPE_FORMAT_R32_FLOAT;
+	 emit_sz = 1 * sizeof(float);
+	 src_buffer = 1;
+	 src_offset = 0;
+	 break;
+      case EMIT_4UB:
+	 output_format = PIPE_FORMAT_B8G8R8A8_UNORM;
+	 emit_sz = 4 * sizeof(ubyte);
+	 break;		/* was missing: fell through into assert(0) below */
+      default:
+	 assert(0);
+	 output_format = PIPE_FORMAT_NONE;	/* emit_sz keeps its 0 initializer */
+	 break;
+      }
+      
+      hw_key.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+      hw_key.element[i].input_buffer = src_buffer;
+      hw_key.element[i].input_offset = src_offset;
+      hw_key.element[i].output_format = output_format;
+      hw_key.element[i].output_offset = dst_offset;
+
+      dst_offset += emit_sz;
+   }
+
+   hw_key.nr_elements = vbuf->vinfo->num_attribs;
+   hw_key.output_stride = vbuf->vinfo->size * 4;
+
+   /* Don't bother with caching at this stage:
+    */
+   if (!vbuf->translate ||
+       memcmp(&vbuf->translate->key, &hw_key, sizeof(hw_key)) != 0) 
+   {
+      if (vbuf->translate)
+	 vbuf->translate->release(vbuf->translate);
+
+      vbuf->translate = translate_create( &hw_key );
+
+      vbuf->translate->set_buffer(vbuf->translate, 1, &vbuf->point_size, 0);
+   }
+
+   vbuf->point_size = vbuf->stage.draw->rasterizer->point_size;
+
+   /* Allocate new buffer?
+    */
    if (!vbuf->vertices)
       vbuf_alloc_vertices(vbuf);
 }
@@ -402,29 +343,9 @@ vbuf_flush_indices( struct vbuf_stage *vbuf )
    assert((uint) (vbuf->vertex_ptr - vbuf->vertices) == 
           vbuf->nr_vertices * vbuf->vertex_size / sizeof(unsigned));
 
-   switch(vbuf->prim) {
-   case PIPE_PRIM_POINTS:
-      break;
-   case PIPE_PRIM_LINES:
-      assert(vbuf->nr_indices % 2 == 0);
-      break;
-   case PIPE_PRIM_TRIANGLES:
-      assert(vbuf->nr_indices % 3 == 0);
-      break;
-   default:
-      assert(0);
-   }
-   
    vbuf->render->draw(vbuf->render, vbuf->indices, vbuf->nr_indices);
    
    vbuf->nr_indices = 0;
-
-   /* don't need to reset point/line/tri functions */
-#if 0
-   stage->point = vbuf_first_point;
-   stage->line = vbuf_first_line;
-   stage->tri = vbuf_first_tri;
-#endif
 }
 
 
@@ -466,8 +387,8 @@ vbuf_alloc_vertices( struct vbuf_stage *vbuf )
    /* Allocate a new vertex buffer */
    vbuf->max_vertices = vbuf->render->max_vertex_buffer_bytes / vbuf->vertex_size;
    vbuf->vertices = (uint *) vbuf->render->allocate_vertices(vbuf->render,
-                                                    (ushort) vbuf->vertex_size,
-                                                    (ushort) vbuf->max_vertices);
+							     (ushort) vbuf->vertex_size,
+							     (ushort) vbuf->max_vertices);
    vbuf->vertex_ptr = vbuf->vertices;
 }
 
@@ -505,8 +426,8 @@ static void vbuf_destroy( struct draw_stage *stage )
    if(vbuf->indices)
       align_free( vbuf->indices );
    
-   if(vbuf->vf)
-      draw_vf_destroy( vbuf->vf );
+   if(vbuf->translate)
+      vbuf->translate->release( vbuf->translate );
 
    if (vbuf->render)
       vbuf->render->destroy( vbuf->render );
@@ -522,9 +443,8 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
                                     struct vbuf_render *render )
 {
    struct vbuf_stage *vbuf = CALLOC_STRUCT(vbuf_stage);
-
-   if(!vbuf)
-      return NULL;
+   if (vbuf == NULL)
+      goto fail;
    
    vbuf->stage.draw = draw;
    vbuf->stage.point = vbuf_first_point;
@@ -535,21 +455,22 @@ struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
    vbuf->stage.destroy = vbuf_destroy;
    
    vbuf->render = render;
+   vbuf->max_indices = MIN2(render->max_indices, UNDEFINED_VERTEX_ID-1);
 
-   assert(render->max_indices < UNDEFINED_VERTEX_ID);
-   vbuf->max_indices = render->max_indices;
-   vbuf->indices = (ushort *)
-      align_malloc( vbuf->max_indices * sizeof(vbuf->indices[0]), 16 );
-   if(!vbuf->indices)
-      vbuf_destroy(&vbuf->stage);
+   vbuf->indices = (ushort *) align_malloc( vbuf->max_indices * 
+					    sizeof(vbuf->indices[0]), 
+					    16 );
+   if (!vbuf->indices)
+      goto fail;
    
    vbuf->vertices = NULL;
    vbuf->vertex_ptr = vbuf->vertices;
-
-   vbuf->prim = ~0;
-   
-   if(!GETENV("GALLIUM_NOVF"))
-      vbuf->vf = draw_vf_create();
    
    return &vbuf->stage;
+
+ fail:
+   if (vbuf)
+      vbuf_destroy(&vbuf->stage);
+   
+   return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_wide_line.c b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
index 9a168ce8bd..452732e662 100644
--- a/src/gallium/auxiliary/draw/draw_wide_line.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_line.c
@@ -32,6 +32,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw_private.h"
+#include "draw_pipe.h"
 
 
 struct wideline_stage {
@@ -48,19 +49,6 @@ static INLINE struct wideline_stage *wideline_stage( struct draw_stage *stage )
 }
 
 
-static void wideline_point( struct draw_stage *stage,
-                               struct prim_header *header )
-{
-   stage->next->point( stage->next, header );
-}
-
-
-static void wideline_tri( struct draw_stage *stage,
-                             struct prim_header *header )
-{
-   stage->next->tri(stage->next, header);
-}
-
 
 /**
  * Draw a wide line by drawing a quad (two triangles).
@@ -179,9 +167,9 @@ struct draw_stage *draw_wide_line_stage( struct draw_context *draw )
 
    wide->stage.draw = draw;
    wide->stage.next = NULL;
-   wide->stage.point = wideline_point;
+   wide->stage.point = draw_pipe_passthrough_point;
    wide->stage.line = wideline_line;
-   wide->stage.tri = wideline_tri;
+   wide->stage.tri = draw_pipe_passthrough_tri;
    wide->stage.flush = wideline_flush;
    wide->stage.reset_stipple_counter = wideline_reset_stipple_counter;
    wide->stage.destroy = wideline_destroy;
diff --git a/src/gallium/auxiliary/draw/draw_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
index 6fc7c9fcd7..ed08573382 100644
--- a/src/gallium/auxiliary/draw/draw_wide_point.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
@@ -31,7 +31,8 @@
 #include "pipe/p_util.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_shader_tokens.h"
-#include "draw_private.h"
+#include "draw_vs.h"
+#include "draw_pipe.h"
 
 
 struct widepoint_stage {
@@ -60,23 +61,6 @@ widepoint_stage( struct draw_stage *stage )
 }
 
 
-static void passthrough_point( struct draw_stage *stage,
-                             struct prim_header *header )
-{
-   stage->next->point( stage->next, header );
-}
-
-static void widepoint_line( struct draw_stage *stage,
-                            struct prim_header *header )
-{
-   stage->next->line(stage->next, header);
-}
-
-static void widepoint_tri( struct draw_stage *stage,
-                           struct prim_header *header )
-{
-   stage->next->tri(stage->next, header);
-}
 
 
 /**
@@ -199,16 +183,16 @@ static void widepoint_first_point( struct draw_stage *stage,
    wide->ybias = 0.0;
 
    if (draw->rasterizer->gl_rasterization_rules) {
-      wide->ybias = -0.125;
+      wide->xbias = 0.125;
    }
 
    /* XXX we won't know the real size if it's computed by the vertex shader! */
-   if ((draw->rasterizer->point_size > draw->wide_point_threshold) ||
-       (draw->rasterizer->point_sprite && draw->point_sprite)) {
+   if ((draw->rasterizer->point_size > draw->pipeline.wide_point_threshold) ||
+       (draw->rasterizer->point_sprite && draw->pipeline.point_sprite)) {
       stage->point = widepoint_point;
    }
    else {
-      stage->point = passthrough_point;
+      stage->point = draw_pipe_passthrough_point;
    }
 
    if (draw->rasterizer->point_sprite) {
@@ -265,17 +249,26 @@ static void widepoint_destroy( struct draw_stage *stage )
 struct draw_stage *draw_wide_point_stage( struct draw_context *draw )
 {
    struct widepoint_stage *wide = CALLOC_STRUCT(widepoint_stage);
+   if (wide == NULL)
+      goto fail;
 
-   draw_alloc_temp_verts( &wide->stage, 4 );
+   if (!draw_alloc_temp_verts( &wide->stage, 4 ))
+      goto fail;
 
    wide->stage.draw = draw;
    wide->stage.next = NULL;
    wide->stage.point = widepoint_first_point;
-   wide->stage.line = widepoint_line;
-   wide->stage.tri = widepoint_tri;
+   wide->stage.line = draw_pipe_passthrough_line;
+   wide->stage.tri = draw_pipe_passthrough_tri;
    wide->stage.flush = widepoint_flush;
    wide->stage.reset_stipple_counter = widepoint_reset_stipple_counter;
    wide->stage.destroy = widepoint_destroy;
 
    return &wide->stage;
+
+ fail:
+   if (wide)
+      widepoint_destroy( &wide->stage );	/* stage.destroy may still be NULL here */
+   
+   return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_prim.c b/src/gallium/auxiliary/draw/draw_prim.c
deleted file mode 100644
index 51b6950334..0000000000
--- a/src/gallium/auxiliary/draw/draw_prim.c
+++ /dev/null
@@ -1,523 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "pipe/p_debug.h"
-#include "pipe/p_util.h"
-
-#include "draw_private.h"
-#include "draw_context.h"
-
-
-
-#define RP_NONE  0
-#define RP_POINT 1
-#define RP_LINE  2
-#define RP_TRI   3
-
-
-static unsigned reduced_prim[PIPE_PRIM_POLYGON + 1] = {
-   RP_POINT,
-   RP_LINE,
-   RP_LINE,
-   RP_LINE,
-   RP_TRI,
-   RP_TRI,
-   RP_TRI,
-   RP_TRI,
-   RP_TRI,
-   RP_TRI
-};
-
-
-static void draw_prim_queue_flush( struct draw_context *draw )
-{
-   unsigned i;
-
-   if (0)
-      debug_printf("Flushing with %d prims, %d verts\n",
-                   draw->pq.queue_nr, draw->vs.queue_nr);
-
-   assert (draw->pq.queue_nr != 0);
-
-   /* NOTE: we cannot save draw->pipeline->first in a local var because
-    * draw->pipeline->first is often changed by the first call to tri(),
-    * line(), etc.
-    */
-   if (draw->rasterizer->line_stipple_enable) {
-      switch (draw->reduced_prim) {
-      case RP_TRI:
-	 for (i = 0; i < draw->pq.queue_nr; i++) {
-	    if (draw->pq.queue[i].reset_line_stipple)
-	       draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-	    
-	    draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
-	 }
-	 break;
-      case RP_LINE:
-	 for (i = 0; i < draw->pq.queue_nr; i++) {
-	    if (draw->pq.queue[i].reset_line_stipple)
-	       draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-	    
-	    draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
-	 }
-	 break;
-      case RP_POINT:
-	 draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
-	 for (i = 0; i < draw->pq.queue_nr; i++)
-	    draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
-	 break;
-      }
-   }
-   else {
-      switch (draw->reduced_prim) {
-      case RP_TRI:
-	 for (i = 0; i < draw->pq.queue_nr; i++) 
-	    draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
-	 break;
-      case RP_LINE:
-	 for (i = 0; i < draw->pq.queue_nr; i++) 
-	    draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
-	 break;
-      case RP_POINT:
-	 for (i = 0; i < draw->pq.queue_nr; i++)
-	    draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
-	 break;
-      }
-   }
-
-   draw->pq.queue_nr = 0;   
-   draw->vs.post_nr = 0;   
-   draw_vertex_cache_unreference( draw );
-}
-
-void draw_do_flush( struct draw_context *draw, unsigned flags )
-{
-   if (0)
-      debug_printf("Flushing with %d verts, %d prims\n",
-                   draw->vs.queue_nr,
-                   draw->pq.queue_nr );
-
-   if (draw->flushing)
-      return;
-
-   draw->flushing = TRUE;
-
-   if (flags >= DRAW_FLUSH_SHADER_QUEUE) {
-      if (draw->vs.queue_nr) {
-         (*draw->shader_queue_flush)(draw);
-      }
-
-      if (flags >= DRAW_FLUSH_PRIM_QUEUE) {
-	 if (draw->pq.queue_nr)
-	    draw_prim_queue_flush(draw);
-
-	 if (flags >= DRAW_FLUSH_VERTEX_CACHE) {
-            draw_vertex_cache_invalidate(draw);
-
-	    if (flags >= DRAW_FLUSH_STATE_CHANGE) {
-               draw->pipeline.first->flush( draw->pipeline.first, flags );
-               draw->pipeline.first = draw->pipeline.validate;
-               draw->reduced_prim = ~0;
-	    }
-	 }
-      }    
-   }
-
-   draw->flushing = FALSE;
-}
-
-
-
-/* Return a pointer to a freshly queued primitive header.  Ensure that
- * there is room in the vertex cache for a maximum of "nr_verts" new
- * vertices.  Flush primitive and/or vertex queues if necessary to
- * make space.
- */
-static struct prim_header *get_queued_prim( struct draw_context *draw,
-					    unsigned nr_verts )
-{
-   if (!draw_vertex_cache_check_space( draw, nr_verts )) {
-//      debug_printf("v");
-      draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE );
-   }
-   else if (draw->pq.queue_nr == PRIM_QUEUE_LENGTH) {
-//      debug_printf("p");
-      draw_do_flush( draw, DRAW_FLUSH_PRIM_QUEUE );
-   }
-
-   assert(draw->pq.queue_nr < PRIM_QUEUE_LENGTH);
-
-   return &draw->pq.queue[draw->pq.queue_nr++];
-}
-
-
-
-/**
- * Add a point to the primitive queue.
- * \param i0  index into user's vertex arrays
- */
-static void do_point( struct draw_context *draw,
-		      unsigned i0 )
-{
-   struct prim_header *prim = get_queued_prim( draw, 1 );
-   
-   prim->reset_line_stipple = 0;
-   prim->edgeflags = 1;
-   prim->pad = 0;
-   prim->v[0] = draw->vcache.get_vertex( draw, i0 );
-}
-
-
-/**
- * Add a line to the primitive queue.
- * \param i0  index into user's vertex arrays
- * \param i1  index into user's vertex arrays
- */
-static void do_line( struct draw_context *draw,
-		     boolean reset_stipple,
-		     unsigned i0,
-		     unsigned i1 )
-{
-   struct prim_header *prim = get_queued_prim( draw, 2 );
-   
-   prim->reset_line_stipple = reset_stipple;
-   prim->edgeflags = 1;
-   prim->pad = 0;
-   prim->v[0] = draw->vcache.get_vertex( draw, i0 );
-   prim->v[1] = draw->vcache.get_vertex( draw, i1 );
-}
-
-/**
- * Add a triangle to the primitive queue.
- */
-static void do_triangle( struct draw_context *draw,
-			 unsigned i0,
-			 unsigned i1,
-			 unsigned i2 )
-{
-   struct prim_header *prim = get_queued_prim( draw, 3 );
-   
-//   _mesa_printf("tri %d %d %d\n", i0, i1, i2);
-   prim->reset_line_stipple = 1;
-   prim->edgeflags = ~0;
-   prim->pad = 0;
-   prim->v[0] = draw->vcache.get_vertex( draw, i0 );
-   prim->v[1] = draw->vcache.get_vertex( draw, i1 );
-   prim->v[2] = draw->vcache.get_vertex( draw, i2 );
-}
-			  
-static void do_ef_triangle( struct draw_context *draw,
-			    boolean reset_stipple,
-			    unsigned ef_mask,
-			    unsigned i0,
-			    unsigned i1,
-			    unsigned i2 )
-{
-   struct prim_header *prim = get_queued_prim( draw, 3 );
-   struct vertex_header *v0 = draw->vcache.get_vertex( draw, i0 );
-   struct vertex_header *v1 = draw->vcache.get_vertex( draw, i1 );
-   struct vertex_header *v2 = draw->vcache.get_vertex( draw, i2 );
-
-   prim->reset_line_stipple = reset_stipple;
-
-   prim->edgeflags = ef_mask & ((v0->edgeflag << 0) | 
-				(v1->edgeflag << 1) | 
-				(v2->edgeflag << 2));
-   prim->pad = 0;
-   prim->v[0] = v0;
-   prim->v[1] = v1;
-   prim->v[2] = v2;
-}
-
-
-static void do_ef_quad( struct draw_context *draw,
-		     unsigned v0,
-		     unsigned v1,
-		     unsigned v2,
-		     unsigned v3 )
-{
-   const unsigned omitEdge2 = ~(1 << 1);
-   const unsigned omitEdge3 = ~(1 << 2);
-   do_ef_triangle( draw, 1, omitEdge2, v0, v1, v3 );
-   do_ef_triangle( draw, 0, omitEdge3, v1, v2, v3 );
-}
-
-static void do_quad( struct draw_context *draw,
-		     unsigned v0,
-		     unsigned v1,
-		     unsigned v2,
-		     unsigned v3 )
-{
-   do_triangle( draw, v0, v1, v3 );
-   do_triangle( draw, v1, v2, v3 );
-}
-
-
-/**
- * Main entrypoint to draw some number of points/lines/triangles
- */
-static void
-draw_prim( struct draw_context *draw, 
-	   unsigned prim, unsigned start, unsigned count )
-{
-   unsigned i;
-   boolean unfilled = (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
-		       draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL);
-   boolean flatfirst =
-      (draw->rasterizer->flatshade & draw->rasterizer->flatshade_first) ? TRUE : FALSE;
-
-//   debug_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
-
-   switch (prim) {
-   case PIPE_PRIM_POINTS:
-      for (i = 0; i < count; i ++) {
-	 do_point( draw,
-		   start + i );
-      }
-      break;
-
-   case PIPE_PRIM_LINES:
-      for (i = 0; i+1 < count; i += 2) {
-         do_line( draw, 
-                  TRUE,
-                  start + i + 0,
-                  start + i + 1);
-      }
-      break;
-
-   case PIPE_PRIM_LINE_LOOP:  
-      if (count >= 2) {
-	 for (i = 1; i < count; i++) {
-	    do_line( draw, 
-		     i == 1, 	/* XXX: only if vb not split */
-		     start + i - 1,
-		     start + i );
-	 }
-
-	 do_line( draw, 
-		  0,
-		  start + count - 1,
-		  start + 0 );
-      }
-      break;
-
-   case PIPE_PRIM_LINE_STRIP:
-      for (i = 1; i < count; i++) {
-         do_line( draw,
-                  i == 1,
-                  start + i - 1,
-                  start + i );
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLES:
-      if (unfilled) {
-         for (i = 0; i+2 < count; i += 3) {
-            do_ef_triangle( draw,
-                            1, 
-                            ~0,
-                            start + i + 0,
-                            start + i + 1,
-                            start + i + 2 );
-         }
-      } 
-      else {
-         for (i = 0; i+2 < count; i += 3) {
-            do_triangle( draw,
-                         start + i + 0,
-                         start + i + 1,
-                         start + i + 2 );
-         }
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_STRIP:
-      if (flatfirst) {
-         for (i = 0; i+2 < count; i++) {
-            if (i & 1) {
-               do_triangle( draw,
-                  start + i + 0,
-                  start + i + 2,
-                  start + i + 1 );
-            }
-            else {
-               do_triangle( draw,
-                  start + i + 0,
-                  start + i + 1,
-                  start + i + 2 );
-            }
-         }
-      }
-      else {
-         for (i = 0; i+2 < count; i++) {
-            if (i & 1) {
-               do_triangle( draw,
-                  start + i + 1,
-                  start + i + 0,
-                  start + i + 2 );
-            }
-            else {
-               do_triangle( draw,
-                  start + i + 0,
-                  start + i + 1,
-                  start + i + 2 );
-            }
-         }
-      }
-      break;
-
-   case PIPE_PRIM_TRIANGLE_FAN:
-      if (count >= 3) {
-         if (flatfirst) {
-            for (i = 0; i+2 < count; i++) {
-               do_triangle( draw,
-                  start + i + 1,
-                  start + i + 2,
-                  start + 0 );
-            }
-         }
-         else {
-            for (i = 0; i+2 < count; i++) {
-               do_triangle( draw,
-                  start + 0,
-                  start + i + 1,
-                  start + i + 2 );
-            }
-         }
-      }
-      break;
-
-
-   case PIPE_PRIM_QUADS:
-      if (unfilled) {
-	 for (i = 0; i+3 < count; i += 4) {
-	    do_ef_quad( draw,
-			start + i + 0,
-			start + i + 1,
-			start + i + 2,
-			start + i + 3);
-	 }
-      }
-      else {
-	 for (i = 0; i+3 < count; i += 4) {
-	    do_quad( draw,
-		     start + i + 0,
-		     start + i + 1,
-		     start + i + 2,
-		     start + i + 3);
-	 }
-      }
-      break;
-
-   case PIPE_PRIM_QUAD_STRIP:
-      if (unfilled) {
-	 for (i = 0; i+3 < count; i += 2) {
-	    do_ef_quad( draw,
-			start + i + 2,
-			start + i + 0,
-			start + i + 1,
-			start + i + 3);
-	 }
-      }
-      else {
-	 for (i = 0; i+3 < count; i += 2) {
-	    do_quad( draw,
-		     start + i + 2,
-		     start + i + 0,
-		     start + i + 1,
-		     start + i + 3);
-	 }
-      }
-      break;
-
-   case PIPE_PRIM_POLYGON:
-      if (unfilled) {
-	 unsigned ef_mask = (1<<2) | (1<<0);
-
-	 for (i = 0; i+2 < count; i++) {
-
-            if (i + 3 >= count)
-	       ef_mask |= (1<<1);
-
-	    do_ef_triangle( draw,
-			    i == 0,
-			    ef_mask,
-			    start + i + 1,
-			    start + i + 2,
-			    start + 0);
-
-	    ef_mask &= ~(1<<2);
-	 }
-      }
-      else {
-	 for (i = 0; i+2 < count; i++) {
-	    do_triangle( draw,
-			 start + i + 1,
-			 start + i + 2,
-			 start + 0);
-	 }
-      }
-      break;
-
-   default:
-      assert(0);
-      break;
-   }
-}
-
-
-
-
-/**
- * Draw vertex arrays
- * This is the main entrypoint into the drawing module.
- * \param prim  one of PIPE_PRIM_x
- * \param start  index of first vertex to draw
- * \param count  number of vertices to draw
- */
-void
-draw_arrays(struct draw_context *draw, unsigned prim,
-            unsigned start, unsigned count)
-{
-   if (reduced_prim[prim] != draw->reduced_prim) {
-      draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
-      draw->reduced_prim = reduced_prim[prim];
-   }
-
-   /* drawing done here: */
-   if (!draw_pt_arrays(draw, prim, start, count)) {
-      /* we have to run the whole pipeline */
-      draw_prim(draw, prim, start, count);
-   }
-}
-
-
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index c8cb96c8ba..39aa81b43c 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -44,7 +44,6 @@
 #include "pipe/p_state.h"
 #include "pipe/p_defines.h"
 
-#include "rtasm/rtasm_x86sse.h"
 #include "tgsi/exec/tgsi_exec.h"
 #include "tgsi/util/tgsi_scan.h"
 
@@ -52,11 +51,11 @@
 struct pipe_context;
 struct gallivm_prog;
 struct gallivm_cpu_engine;
+struct draw_vertex_shader;
+struct draw_context;
+struct draw_stage;
+struct vbuf_render;
 
-struct draw_pt_middle_end;
-struct draw_pt_front_end;
-
-#define MAX_SHADER_VERTICES 128
 
 /**
  * Basic vertex info.
@@ -70,17 +69,14 @@ struct vertex_header {
 
    float clip[4];
 
-   float data[][4];		/* Note variable size */
+   /* This will probably become float (*data)[4] soon:
+    */
+   float data[][4];
 };
 
 /* NOTE: It should match vertex_id size above */
 #define UNDEFINED_VERTEX_ID 0xffff
 
-/* XXX This is too large */
-#define MAX_VERTEX_SIZE ((2 + PIPE_MAX_SHADER_OUTPUTS) * 4 * sizeof(float))
-#define MAX_VERTEX_ALLOCATION ((MAX_VERTEX_SIZE + 0x0f) & ~0x0f)
-
-
 
 /**
  * Basic info for a point/line/triangle primitive.
@@ -95,92 +91,11 @@ struct prim_header {
 
 
 
-struct draw_context;
-
-/**
- * Base class for all primitive drawing stages.
- */
-struct draw_stage
-{
-   struct draw_context *draw;   /**< parent context */
-
-   struct draw_stage *next;     /**< next stage in pipeline */
-
-   struct vertex_header **tmp;  /**< temp vert storage, such as for clipping */
-   unsigned nr_tmps;
-
-   void (*point)( struct draw_stage *,
-		  struct prim_header * );
-
-   void (*line)( struct draw_stage *,
-		 struct prim_header * );
-
-   void (*tri)( struct draw_stage *,
-		struct prim_header * );
-
-   void (*flush)( struct draw_stage *,
-		  unsigned flags );
-
-   void (*reset_stipple_counter)( struct draw_stage * );
 
-   void (*destroy)( struct draw_stage * );
-};
-
-
-#define PRIM_QUEUE_LENGTH      32
-#define VCACHE_SIZE            32
-#define VCACHE_OVERFLOW        4
-#define VS_QUEUE_LENGTH        (VCACHE_SIZE + VCACHE_OVERFLOW + 1)	/* can never fill up */
-
-/**
- * Private version of the compiled vertex_shader
- */
-struct draw_vertex_shader {
-
-   /* This member will disappear shortly:
-    */
-   struct pipe_shader_state   state;
-
-   struct tgsi_shader_info info;
-
-   void (*prepare)( struct draw_vertex_shader *shader,
-		    struct draw_context *draw );
-
-   /* Run the shader - this interface will get cleaned up in the
-    * future:
-    */
-   boolean (*run)( struct draw_vertex_shader *shader,
-                   struct draw_context *draw,
-                   const unsigned *elts,
-                   unsigned count,
-                   void *out,
-                   unsigned vertex_size);
-
-
-   void (*delete)( struct draw_vertex_shader * );
-};
-
-
-/* Internal function for vertex fetch.
- */
-typedef void (*fetch_func)(const void *ptr, float *attrib);
-
-fetch_func draw_get_fetch_func( enum pipe_format format );
-
-
-
-typedef void (*full_fetch_func)( struct draw_context *draw,
-				 struct tgsi_exec_machine *machine,
-				 const unsigned *elts,
-				 unsigned count );
-
-typedef void (*pt_fetch_func)( struct draw_context *draw,
-			      float *out,
-			      unsigned start,
-			       unsigned count );
-
-
-struct vbuf_render;
+#define PT_SHADE      0x1
+#define PT_CLIPTEST   0x2
+#define PT_PIPELINE   0x4
+#define PT_MAX_MIDDLE 0x8
 
 /**
  * Private context for the drawing module.
@@ -207,6 +122,17 @@ struct draw_context
       struct draw_stage *wide_line;
       struct draw_stage *wide_point;
       struct draw_stage *rasterize;
+
+      float wide_point_threshold; /**< convert pnts to tris if larger than this */
+      float wide_line_threshold;  /**< convert lines to tris if wider than this */
+      boolean line_stipple;       /**< do line stipple? */
+      boolean point_sprite;       /**< convert points to quads for sprites? */
+
+      /* Temporary storage while the pipeline is being run:
+       */
+      char *verts;
+      unsigned vertex_stride;
+      unsigned vertex_count;
    } pipeline;
 
 
@@ -215,71 +141,63 @@ struct draw_context
    /* Support prototype passthrough path:
     */
    struct {
-      unsigned prim;           /* XXX: to be removed */
-      unsigned hw_vertex_size; /* XXX: to be removed */
-
       struct {
          struct draw_pt_middle_end *fetch_emit;
-         struct draw_pt_middle_end *fetch_pipeline;
-         struct draw_pt_middle_end *fetch_shade_emit;
-         struct draw_pt_middle_end *fetch_shade_cliptest_pipeline_or_emit;
+         struct draw_pt_middle_end *general;
       } middle;
 
       struct {
-         struct draw_pt_front_end *noop;
-         struct draw_pt_front_end *split_arrays;
          struct draw_pt_front_end *vcache;
       } front;
 
+      struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
+      unsigned nr_vertex_buffers;
+
+      struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
+      unsigned nr_vertex_elements;
+
+      /* user-space vertex data, buffers */
       struct {
-         char *verts;
-         unsigned vertex_stride;
-         unsigned vertex_count;
-      } pipeline;
+         const unsigned *edgeflag;
+
+         /** vertex element/index buffer (ex: glDrawElements) */
+         const void *elts;
+         /** bytes per index (0, 1, 2 or 4) */
+         unsigned eltSize;
+         
+         /** vertex arrays */
+         const void *vbuffer[PIPE_MAX_ATTRIBS];
+         
+         /** constant buffer (for vertex shader) */
+         const void *constants;
+      } user;
 
    } pt;
 
+   struct {
+      boolean bypass_clipping;
+   } driver;
+
    boolean flushing;
+   boolean vcache_flushing;
+   boolean bypass_clipping;     /* set if either api or driver bypass_clipping true */
 
    /* pipe state that we need: */
    const struct pipe_rasterizer_state *rasterizer;
    struct pipe_viewport_state viewport;
-   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
-   struct pipe_vertex_element vertex_element[PIPE_MAX_ATTRIBS];
+
    struct draw_vertex_shader *vertex_shader;
 
    boolean identity_viewport;
 
    uint num_vs_outputs;  /**< convenience, from vertex_shader */
 
-   /* user-space vertex data, buffers */
-   struct {
-      const unsigned *edgeflag;
-
-      /** vertex element/index buffer (ex: glDrawElements) */
-      const void *elts;
-      /** bytes per index (0, 1, 2 or 4) */
-      unsigned eltSize;
-
-      /** vertex arrays */
-      const void *vbuffer[PIPE_MAX_ATTRIBS];
-
-      /** constant buffer (for vertex shader) */
-      const void *constants;
-   } user;
 
    /* Clip derived state:
     */
    float plane[12][4];
    unsigned nr_planes;
 
-   float wide_point_threshold; /**< convert pnts to tris if larger than this */
-   float wide_line_threshold;  /**< convert lines to tris if wider than this */
-   boolean line_stipple;       /**< do line stipple? */
-   boolean point_sprite;       /**< convert points to quads for sprites? */
-   boolean use_sse;
-   boolean use_pt_shaders;	/* temporary flag to switch on pt shader paths */
-
    /* If a prim stage introduces new vertex attributes, they'll be stored here
     */
    struct {
@@ -293,59 +211,6 @@ struct draw_context
    /** TGSI program interpreter runtime state */
    struct tgsi_exec_machine machine;
 
-   /* Vertex fetch internal state
-    */
-   struct {
-      const ubyte *src_ptr[PIPE_MAX_ATTRIBS];
-      unsigned pitch[PIPE_MAX_ATTRIBS];
-      fetch_func fetch[PIPE_MAX_ATTRIBS];
-      unsigned nr_attrs;
-      full_fetch_func fetch_func;
-      pt_fetch_func pt_fetch;
-   } vertex_fetch;
-
-   /* Post-tnl vertex cache:
-    */
-   struct {
-      unsigned referenced;  /**< bitfield */
-
-      struct {
-	 unsigned in;		/* client array element */
-	 unsigned out;		/* index in vs queue/array */
-      } idx[VCACHE_SIZE + VCACHE_OVERFLOW];
-
-      unsigned overflow;
-
-      /** To find space in the vertex cache: */
-      struct vertex_header *(*get_vertex)( struct draw_context *draw,
-                                           unsigned i );
-   } vcache;
-
-   /* Vertex shader queue:
-    */
-   struct {
-      unsigned elts[VS_QUEUE_LENGTH];   /**< index into the user's vertex arrays */
-      char *vertex_cache;
-      unsigned queue_nr;
-      unsigned post_nr;
-   } vs;
-
-   /**
-    * Run the vertex shader on all vertices in the vertex queue.
-    */
-   void (*shader_queue_flush)(struct draw_context *draw);
-
-   /* Prim pipeline queue:
-    */
-   struct {
-      /* Need to queue up primitives until their vertices have been
-       * transformed by a vs queue flush.
-       */
-      struct prim_header queue[PRIM_QUEUE_LENGTH];
-      unsigned queue_nr;
-   } pq;
-
-
    /* This (and the tgsi_exec_machine struct) probably need to be moved somewhere private.
     */
    struct gallivm_cpu_engine *engine;   
@@ -354,107 +219,49 @@ struct draw_context
 
 
 
-extern struct draw_stage *draw_unfilled_stage( struct draw_context *context );
-extern struct draw_stage *draw_twoside_stage( struct draw_context *context );
-extern struct draw_stage *draw_offset_stage( struct draw_context *context );
-extern struct draw_stage *draw_clip_stage( struct draw_context *context );
-extern struct draw_stage *draw_flatshade_stage( struct draw_context *context );
-extern struct draw_stage *draw_cull_stage( struct draw_context *context );
-extern struct draw_stage *draw_stipple_stage( struct draw_context *context );
-extern struct draw_stage *draw_wide_line_stage( struct draw_context *context );
-extern struct draw_stage *draw_wide_point_stage( struct draw_context *context );
-extern struct draw_stage *draw_validate_stage( struct draw_context *context );
 
 
-extern void draw_free_temp_verts( struct draw_stage *stage );
 
-extern void draw_alloc_temp_verts( struct draw_stage *stage, unsigned nr );
 
-extern void draw_reset_vertex_ids( struct draw_context *draw );
+/*******************************************************************************
+ * Vertex processing (was passthrough) code:
+ */
+boolean draw_pt_init( struct draw_context *draw );
+void draw_pt_destroy( struct draw_context *draw );
+void draw_pt_reset_vertex_ids( struct draw_context *draw );
 
 
-extern int draw_vertex_cache_check_space( struct draw_context *draw, 
-					  unsigned nr_verts );
+/*******************************************************************************
+ * Primitive processing (pipeline) code: 
+ */
 
-extern void draw_vertex_cache_invalidate( struct draw_context *draw );
-extern void draw_vertex_cache_unreference( struct draw_context *draw );
-extern void draw_vertex_cache_reset_vertex_ids( struct draw_context *draw );
+boolean draw_pipeline_init( struct draw_context *draw );
+void draw_pipeline_destroy( struct draw_context *draw );
 
-extern void draw_vertex_shader_queue_flush( struct draw_context *draw );
+void draw_pipeline_run( struct draw_context *draw,
+                        unsigned prim,
+                        struct vertex_header *vertices,
+                        unsigned vertex_count,
+                        unsigned stride,
+                        const ushort *elts,
+                        unsigned count );
 
-extern void draw_update_vertex_fetch( struct draw_context *draw );
+void draw_pipeline_flush( struct draw_context *draw, 
+                          unsigned flags );
 
-extern boolean draw_need_pipeline(const struct draw_context *draw,
-                                  unsigned prim );
 
 
-/* Passthrough mode (second attempt):
+/*******************************************************************************
+ * Flushing 
  */
-boolean draw_pt_init( struct draw_context *draw );
-void draw_pt_destroy( struct draw_context *draw );
-boolean draw_pt_arrays( struct draw_context *draw,
-                        unsigned prim,
-                        unsigned start,
-                        unsigned count );
 
-void draw_pt_reset_vertex_ids( struct draw_context *draw );
-void draw_pt_run_pipeline( struct draw_context *draw,
-                           unsigned prim,
-                           char *verts,
-                           unsigned vertex_stride,
-                           unsigned vertex_count,
-                           const ushort *elts,
-                           unsigned count );
-
-
-#define DRAW_FLUSH_SHADER_QUEUE              0x1 /* sized not to overflow, never raised */
-#define DRAW_FLUSH_PRIM_QUEUE                0x2
-#define DRAW_FLUSH_VERTEX_CACHE              0x4
 #define DRAW_FLUSH_STATE_CHANGE              0x8
 #define DRAW_FLUSH_BACKEND                   0x10
 
 
 void draw_do_flush( struct draw_context *draw, unsigned flags );
 
-boolean draw_get_edgeflag( struct draw_context *draw,
-                           unsigned idx );
 
 
-/**
- * Get a writeable copy of a vertex.
- * \param stage  drawing stage info
- * \param vert  the vertex to copy (source)
- * \param idx  index into stage's tmp[] array to put the copy (dest)
- * \return  pointer to the copied vertex
- */
-static INLINE struct vertex_header *
-dup_vert( struct draw_stage *stage,
-	  const struct vertex_header *vert,
-	  unsigned idx )
-{   
-   struct vertex_header *tmp = stage->tmp[idx];
-   const uint vsize = sizeof(struct vertex_header)
-      + stage->draw->num_vs_outputs * 4 * sizeof(float);
-   memcpy(tmp, vert, vsize);
-   tmp->vertex_id = UNDEFINED_VERTEX_ID;
-   return tmp;
-}
-
-static INLINE float
-dot4(const float *a, const float *b)
-{
-   float result = (a[0]*b[0] +
-                   a[1]*b[1] +
-                   a[2]*b[2] +
-                   a[3]*b[3]);
-
-   return result;
-}
-
-static INLINE struct vertex_header *
-draw_header_from_block(char *block, int size, int num)
-{
-   return (struct vertex_header*)(block + num * size);
-}
 
 #endif /* DRAW_PRIVATE_H */
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 3d2e7bf7b8..f5a3bf390e 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -36,146 +36,53 @@
 #include "draw/draw_pt.h"
 
 
-#if 0
-static boolean too_many_elts( struct draw_context *draw,
-                              unsigned elts )
-{
-   return elts > (8 * 1024);
-}
-#endif
-
-static INLINE unsigned reduced_prim(unsigned prim)
-{
-   /*FIXME*/
-   return prim;
-}
 
-static INLINE boolean good_prim(unsigned prim)
-{
-   /*FIXME*/
-   return FALSE;
-}
 
-boolean
+/* Overall we split things into:
+ *     - frontend -- prepare fetch_elts, draw_elts - eg vcache
+ *     - middle   -- fetch, shade, cliptest, viewport
+ *     - pipeline -- the prim pipeline: clipping, wide lines, etc 
+ *     - backend  -- the vbuf_render provided by the driver.
+ */
+static boolean
 draw_pt_arrays(struct draw_context *draw, 
                unsigned prim,
                unsigned start, 
                unsigned count)
 {
-   const boolean pipeline = draw_need_pipeline(draw, prim);
-   const boolean cliptest = !draw->rasterizer->bypass_clipping;
-   const boolean shading  = !draw->rasterizer->bypass_vs;
    struct draw_pt_front_end *frontend = NULL;
    struct draw_pt_middle_end *middle = NULL;
+   unsigned opt = 0;
 
-   if (!draw->render)
-      return FALSE;
-   /*debug_printf("XXXXXXXXXX needs_pipeline = %d\n", pipeline);*/
+   if (!draw->render) {
+      opt |= PT_PIPELINE;
+   }
 
-   /* Overall we do:
-    *     - frontend -- prepare fetch_elts, draw_elts - eg vcache
-    *     - middle   -- fetch, shade, cliptest, viewport
-    *     - pipeline -- the prim pipeline: clipping, wide lines, etc 
-    *     - backend  -- the vbuf_render provided by the driver.
-    */
+   if (draw_need_pipeline(draw, 
+                          draw->rasterizer,
+                          prim)) {
+      opt |= PT_PIPELINE;
+   }
 
-   if (shading && !draw->use_pt_shaders)
-      return FALSE;
+   if (!draw->bypass_clipping) {
+      opt |= PT_CLIPTEST;
+   }
 
+   if (!draw->rasterizer->bypass_vs) {
+      opt |= PT_SHADE;
+   }
 
-   if (!cliptest && !pipeline && !shading) {
-      /* This is the 'passthrough' path:
-       */
-      /* Fetch user verts, emit hw verts:
-       */
+   if (opt)
+      middle = draw->pt.middle.general;
+   else
       middle = draw->pt.middle.fetch_emit;
-   }
-   else if (!cliptest && !shading) {
-      /* This is the 'passthrough' path targetting the pipeline backend.
-       */
-      /* Fetch user verts, emit pipeline verts, run pipeline:
-       */
-      middle = draw->pt.middle.fetch_pipeline;
-   }
-   else if (!cliptest && !pipeline) {
-      /* Fetch user verts, run vertex shader, emit hw verts:
-       */
-      middle = draw->pt.middle.fetch_shade_emit;
-   }
-   else if (!pipeline) {
-      /* Even though !pipeline, we have to run it to get clipping.  We
-       * do know that the pipeline is just the clipping operation, but
-       * that probably doesn't help much.
-       *
-       * This is going to be the most important path for a lot of
-       * swtnl cards.
-       */
-      /* Fetch user verts, 
-       *    run vertex shader, 
-       *    cliptest and viewport trasform
-       *    if no clipped vertices,
-       *        emit hw verts
-       *    else
-       *        run pipline
-       */
-      middle = draw->pt.middle.fetch_shade_cliptest_pipeline_or_emit;
-   }
-   else {
-      /* This is what we're currently always doing:
-       */
-      /* Fetch user verts, run vertex shader, cliptest, run pipeline
-       * or
-       * Fetch user verts, run vertex shader, run pipeline
-       */
-      middle = draw->pt.middle.fetch_shade_cliptest_pipeline_or_emit;
-   }
 
 
-   /* If !pipeline, need to make sure we respect the driver's limited
-    * capabilites to receive blocks of vertex data and elements.
+   /* May create a short-circuited version of this for small primitives:
     */
-#if 0
-   if (!pipeline) {
-      unsigned vertex_mode = passthrough;
-      unsigned nr_verts = count_vertices( draw, start, count );
-      unsigned hw_prim = prim;
-
-      if (is_elts(draw)) {
-         frontend = draw->pt.front.vcache;
-         hw_prim = reduced_prim(prim);
-      }
-#if 0
-      if (too_many_verts(nr_verts)) {
-         /* if (is_verts(draw) && can_split(prim)) {
-            draw = draw_arrays_split;
-         }
-         else */ {
-            frontend = draw->pt.front.vcache;
-            hw_prim = reduced_prim(prim);
-         }
-      }
-#endif
-
-      if (too_many_elts(count)) {
-
-         /* if (is_elts(draw) && can_split(prim)) {
-            draw = draw_elts_split;
-         }
-         else */ {
-            frontend = draw->pt.front.vcache;
-            hw_prim = reduced_prim(prim);
-         }
-      }
-
-      if (!good_prim(hw_prim)) {
-         frontend = draw->pt.front.vcache;
-      }
-   }
-#else
    frontend = draw->pt.front.vcache;
-#endif   
 
-   frontend->prepare( frontend, prim, middle );
+   frontend->prepare( frontend, prim, middle, opt );
 
    frontend->run( frontend,
                   draw_pt_elt_func( draw ),
@@ -190,21 +97,16 @@ draw_pt_arrays(struct draw_context *draw,
 
 boolean draw_pt_init( struct draw_context *draw )
 {
-   draw->pt.middle.fetch_emit = draw_pt_fetch_emit( draw );
-   if (!draw->pt.middle.fetch_emit)
-      return FALSE;
-
-   draw->pt.middle.fetch_pipeline = draw_pt_fetch_pipeline( draw );
-   if (!draw->pt.middle.fetch_pipeline)
+   draw->pt.front.vcache = draw_pt_vcache( draw );
+   if (!draw->pt.front.vcache)
       return FALSE;
 
-   draw->pt.middle.fetch_shade_cliptest_pipeline_or_emit =
-      draw_pt_fetch_pipeline_or_emit( draw );
-   if (!draw->pt.middle.fetch_shade_cliptest_pipeline_or_emit)
+   draw->pt.middle.fetch_emit = draw_pt_fetch_emit( draw );
+   if (!draw->pt.middle.fetch_emit)
       return FALSE;
 
-   draw->pt.front.vcache = draw_pt_vcache( draw );
-   if (!draw->pt.front.vcache)
+   draw->pt.middle.general = draw_pt_fetch_pipeline_or_emit( draw );
+   if (!draw->pt.middle.general)
       return FALSE;
 
    return TRUE;
@@ -213,24 +115,63 @@ boolean draw_pt_init( struct draw_context *draw )
 
 void draw_pt_destroy( struct draw_context *draw )
 {
+   if (draw->pt.middle.general) {
+      draw->pt.middle.general->destroy( draw->pt.middle.general );
+      draw->pt.middle.general = NULL;
+   }
+
    if (draw->pt.middle.fetch_emit) {
       draw->pt.middle.fetch_emit->destroy( draw->pt.middle.fetch_emit );
       draw->pt.middle.fetch_emit = NULL;
    }
 
-   if (draw->pt.middle.fetch_pipeline) {
-      draw->pt.middle.fetch_pipeline->destroy( draw->pt.middle.fetch_pipeline );
-      draw->pt.middle.fetch_pipeline = NULL;
-   }
-
-   if (draw->pt.middle.fetch_shade_cliptest_pipeline_or_emit) {
-      draw->pt.middle.fetch_shade_cliptest_pipeline_or_emit->destroy(
-         draw->pt.middle.fetch_shade_cliptest_pipeline_or_emit );
-      draw->pt.middle.fetch_shade_cliptest_pipeline_or_emit = NULL;
-   }
-
    if (draw->pt.front.vcache) {
       draw->pt.front.vcache->destroy( draw->pt.front.vcache );
       draw->pt.front.vcache = NULL;
    }
 }
+
+
+
+static unsigned reduced_prim[PIPE_PRIM_POLYGON + 1] = {   /* indexed by the prim passed to draw_arrays() */
+   PIPE_PRIM_POINTS,       /* entry 0 reduces to points */
+   PIPE_PRIM_LINES,        /* entries 1..3 reduce to lines */
+   PIPE_PRIM_LINES,
+   PIPE_PRIM_LINES,
+   PIPE_PRIM_TRIANGLES,    /* entries 4..9 reduce to triangles */
+   PIPE_PRIM_TRIANGLES,
+   PIPE_PRIM_TRIANGLES,
+   PIPE_PRIM_TRIANGLES,
+   PIPE_PRIM_TRIANGLES,
+   PIPE_PRIM_TRIANGLES     /* last entry: index PIPE_PRIM_POLYGON */
+};
+
+
+/**
+ * Main entrypoint of the drawing module: draw a range of vertices.
+ * Flushes with DRAW_FLUSH_STATE_CHANGE first if the reduced primitive changes.
+ * \param prim  one of PIPE_PRIM_x; used as an index into reduced_prim[]
+ * \param start  index of first vertex to draw
+ * \param count  number of vertices to draw
+ */
+void
+draw_arrays(struct draw_context *draw, unsigned prim,
+            unsigned start, unsigned count)
+{
+   const unsigned reduced = reduced_prim[prim];
+
+   if (draw->reduced_prim != reduced) {
+      draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+      draw->reduced_prim = reduced;
+   }
+   draw_pt_arrays(draw, prim, start, count);   /* boolean result is ignored */
+}
+
+boolean draw_pt_get_edgeflag( struct draw_context *draw,
+                              unsigned idx )
+{  /* Edgeflag of vertex idx: bit idx%32 of word idx/32; 1 when no array is set. */
+   if (!draw->pt.user.edgeflag)
+      return 1;
+   /* 1u, not 1: left-shifting a signed 1 into bit 31 is undefined behaviour. */
+   return (draw->pt.user.edgeflag[idx/32] & (1u << (idx%32))) != 0;
+}
diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h
index 48413b648a..fd0d158fcf 100644
--- a/src/gallium/auxiliary/draw/draw_pt.h
+++ b/src/gallium/auxiliary/draw/draw_pt.h
@@ -50,6 +50,12 @@ struct draw_context;
 #define DRAW_PT_FLAG_MASK     (3<<30)
 
 
+#define PT_SHADE      0x1
+#define PT_CLIPTEST   0x2
+#define PT_PIPELINE   0x4
+#define PT_MAX_MIDDLE 0x8
+
+
 /* The "front end" - prepare sets of fetch, draw elements for the
  * middle end.
  *
@@ -64,7 +70,8 @@ struct draw_context;
 struct draw_pt_front_end {
    void (*prepare)( struct draw_pt_front_end *,
                     unsigned prim,
-                    struct draw_pt_middle_end * );
+                    struct draw_pt_middle_end *,
+		    unsigned opt );
 
    void (*run)( struct draw_pt_front_end *,
                 pt_elt_func elt_func,
@@ -82,15 +89,11 @@ struct draw_pt_front_end {
  * Currently two versions of this:
  *     - fetch, vertex shade, cliptest, prim-pipeline
  *     - fetch, emit (ie passthrough)
- * Later:
- *     - fetch, vertex shade, cliptest, maybe-pipeline, maybe-emit
- *     - fetch, vertex shade, emit
- *
- * Currenly only using the passthrough version.
  */
 struct draw_pt_middle_end {
    void (*prepare)( struct draw_pt_middle_end *,
-                    unsigned prim );
+                    unsigned prim,
+		    unsigned opt );
 
    void (*run)( struct draw_pt_middle_end *,
                 const unsigned *fetch_elts,
@@ -104,12 +107,9 @@ struct draw_pt_middle_end {
 
 
 /* The "back end" - supplied by the driver, defined in draw_vbuf.h.
- *
- * Not sure whether to wrap the prim pipeline up as an alternate
- * backend.  Would be a win for everything except pure passthrough
- * mode...  
  */
 struct vbuf_render;
+struct vertex_header;
 
 
 /* Helper functions.
@@ -118,12 +118,88 @@ pt_elt_func draw_pt_elt_func( struct draw_context *draw );
 const void *draw_pt_elt_ptr( struct draw_context *draw,
                              unsigned start );
 
-/* Implementations:
+/* Frontends: 
+ *
+ * Currently only the general-purpose vcache implementation, could add
+ * a special case for tiny vertex buffers.
  */
 struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw );
+
+/* Middle-ends:
+ *
+ * Currently one general-purpose case which can do all possibilities,
+ * at the slight expense of creating a vertex_header in some cases
+ * unnecessarily.
+ *
+ * The special case fetch_emit code avoids pipeline vertices
+ * altogether and builds hardware vertices directly from API
+ * vertex_elements.
+ */
 struct draw_pt_middle_end *draw_pt_fetch_emit( struct draw_context *draw );
-struct draw_pt_middle_end *draw_pt_fetch_pipeline( struct draw_context *draw );
 struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit(struct draw_context *draw);
 
 
+/* More helpers:
+ */
+boolean draw_pt_get_edgeflag( struct draw_context *draw,
+                              unsigned idx );
+
+
+/*******************************************************************************
+ * HW vertex emit:
+ */
+struct pt_emit;
+
+void draw_pt_emit_prepare( struct pt_emit *emit,
+			   unsigned prim );
+
+void draw_pt_emit( struct pt_emit *emit,
+		   const float (*vertex_data)[4],
+		   unsigned vertex_count,
+		   unsigned stride,
+		   const ushort *elts,
+		   unsigned count );
+
+void draw_pt_emit_destroy( struct pt_emit *emit );
+
+struct pt_emit *draw_pt_emit_create( struct draw_context *draw );
+
+
+/*******************************************************************************
+ * API vertex fetch:
+ */
+
+struct pt_fetch;
+void draw_pt_fetch_prepare( struct pt_fetch *fetch,
+			    unsigned vertex_size );
+
+void draw_pt_fetch_run( struct pt_fetch *fetch,
+			const unsigned *elts,
+			unsigned count,
+			char *verts );
+
+void draw_pt_fetch_destroy( struct pt_fetch *fetch );
+
+struct pt_fetch *draw_pt_fetch_create( struct draw_context *draw );
+
+/*******************************************************************************
+ * Post-VS: cliptest, rhw, viewport
+ */
+struct pt_post_vs;
+
+boolean draw_pt_post_vs_run( struct pt_post_vs *pvs,
+			     struct vertex_header *pipeline_verts,
+			     unsigned stride,
+			     unsigned count );
+
+void draw_pt_post_vs_prepare( struct pt_post_vs *pvs,
+			      boolean bypass_clipping,
+			      boolean identity_viewport,
+			      boolean opengl );
+
+struct pt_post_vs *draw_pt_post_vs_create( struct draw_context *draw );
+
+void draw_pt_post_vs_destroy( struct pt_post_vs *pvs );
+
+
 #endif
diff --git a/src/gallium/auxiliary/draw/draw_pt_elts.c b/src/gallium/auxiliary/draw/draw_pt_elts.c
index d49770e7b2..2094c081ed 100644
--- a/src/gallium/auxiliary/draw/draw_pt_elts.c
+++ b/src/gallium/auxiliary/draw/draw_pt_elts.c
@@ -59,7 +59,7 @@ static unsigned elt_vert( const void *elts, unsigned idx )
 
 pt_elt_func draw_pt_elt_func( struct draw_context *draw )
 {
-   switch (draw->user.eltSize) {
+   switch (draw->pt.user.eltSize) {
    case 0: return elt_vert;
    case 1: return elt_ubyte;
    case 2: return elt_ushort; 
@@ -71,9 +71,9 @@ pt_elt_func draw_pt_elt_func( struct draw_context *draw )
 const void *draw_pt_elt_ptr( struct draw_context *draw,
                              unsigned start )
 {
-   const char *elts = draw->user.elts;
+   const char *elts = draw->pt.user.elts;
 
-   switch (draw->user.eltSize) {
+   switch (draw->pt.user.eltSize) {
    case 0: 
       return (const void *)(((const ubyte *)NULL) + start);
    case 1: 
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
new file mode 100644
index 0000000000..d35329aba0
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -0,0 +1,252 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "pipe/p_util.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pt.h"
+#include "translate/translate.h"
+
+#include "cso_cache/cso_cache.h"
+#include "cso_cache/cso_hash.h"
+
+struct pt_emit {
+   struct draw_context *draw;      /* set once in draw_pt_emit_create() */
+
+   struct translate *translate;    /* chosen by draw_pt_emit_prepare(); released via the hash, not directly */
+
+   struct cso_hash *hash;          /* translate objects, keyed by create_key() of their translate_key */
+};
+
+static INLINE unsigned translate_hash_key_size(struct translate_key *key)
+{
+   /* Bytes of translate_key minus the element slots beyond nr_elements. */
+   const unsigned unused = PIPE_MAX_ATTRIBS - key->nr_elements;
+   return sizeof(struct translate_key) - sizeof(struct translate_element) * unused;
+}
+
+/* Hash value for a translate_key, computed over translate_hash_key_size() bytes. */
+static INLINE unsigned create_key(struct translate_key *key)
+{
+   const unsigned nbytes = translate_hash_key_size(key);
+
+   /*debug_printf("key size = %d, (els = %d)\n",
+     nbytes, key->nr_elements);*/
+   return cso_construct_key(key, nbytes);
+}
+
+/* Look up (or create and remember) the translate object for 'key'.
+ * Returns NULL if translate_create() returns NULL; such a failure is not cached.
+ * NOTE(review): lookup is by hash value only -- colliding keys would alias. */
+static struct translate *cached_translate(struct pt_emit *emit,
+                                          struct translate_key *key)
+{
+   unsigned hash_key = create_key(key);
+   struct cso_hash_iter iter = cso_hash_find(emit->hash, hash_key);
+   struct translate *translate;
+
+   if (!cso_hash_iter_is_null(iter))
+      return cso_hash_iter_data(iter);
+
+   translate = translate_create(key);
+   if (translate)   /* don't pin a failed creation in the cache */
+      cso_hash_insert(emit->hash, hash_key, translate);
+   return translate;
+}
+
+
+/* Release every translate object stored in emit->hash; NULL entries are skipped. */
+static INLINE void delete_translates(struct pt_emit *emit)
+{
+   struct cso_hash_iter it = cso_hash_first_node(emit->hash);
+
+   while (!cso_hash_iter_is_null(it)) {
+      struct translate *xlate = (struct translate*)cso_hash_iter_data(it);
+      it = cso_hash_iter_next(it);   /* advance before releasing the entry */
+      if (xlate)
+         xlate->release(xlate);
+   }
+}
+
+/* Select prim on the vbuf_render backend, then pick the translate object that
+ * converts pipeline vertices (float[4] attributes) to that backend's layout. */
+void draw_pt_emit_prepare( struct pt_emit *emit,
+			   unsigned prim )
+{
+   struct draw_context *draw = emit->draw;
+   const struct vertex_info *vinfo;
+   unsigned dst_offset;
+   struct translate_key hw_key;
+   unsigned i;
+   boolean ok;
+
+   ok = draw->render->set_primitive(draw->render, prim);
+   if (!ok) {
+      assert(0);
+      return;
+   }
+
+   /* Must do this after set_primitive() above: */
+   vinfo = draw->render->get_vertex_info(draw->render);
+
+   /* Zero the whole key: it is memcmp'd at full size below, so the element
+    * slots past nr_elements must not be left as uninitialised stack. */
+   memset(&hw_key, 0, sizeof(hw_key));
+
+   /* Translate from pipeline vertices to hw vertices. */
+   dst_offset = 0;
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      unsigned emit_sz = 0;
+      unsigned src_buffer = 0;
+      unsigned output_format;
+      unsigned src_offset = (vinfo->src_index[i] * 4 * sizeof(float) );
+
+      switch (vinfo->emit[i]) {
+      case EMIT_4F:
+	 output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+	 emit_sz = 4 * sizeof(float);
+	 break;
+      case EMIT_3F:
+	 output_format = PIPE_FORMAT_R32G32B32_FLOAT;
+	 emit_sz = 3 * sizeof(float);
+	 break;
+      case EMIT_2F:
+	 output_format = PIPE_FORMAT_R32G32_FLOAT;
+	 emit_sz = 2 * sizeof(float);
+	 break;
+      case EMIT_1F:
+	 output_format = PIPE_FORMAT_R32_FLOAT;
+	 emit_sz = 1 * sizeof(float);
+	 break;
+      case EMIT_1F_PSIZE:
+	 output_format = PIPE_FORMAT_R32_FLOAT;
+	 emit_sz = 1 * sizeof(float);
+	 src_buffer = 1;   /* buffer 1 is bound to the point size in draw_pt_emit() */
+	 src_offset = 0;
+	 break;
+      case EMIT_4UB:
+	 output_format = PIPE_FORMAT_B8G8R8A8_UNORM;
+	 emit_sz = 4 * sizeof(ubyte);
+	 break;   /* must not fall into the default's assert(0) */
+      default:
+	 assert(0);
+	 output_format = PIPE_FORMAT_NONE;
+	 emit_sz = 0;
+	 break;
+      }
+      hw_key.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+      hw_key.element[i].input_buffer = src_buffer;
+      hw_key.element[i].input_offset = src_offset;
+      hw_key.element[i].output_format = output_format;
+      hw_key.element[i].output_offset = dst_offset;
+
+      dst_offset += emit_sz;
+   }
+
+   hw_key.nr_elements = vinfo->num_attribs;
+   hw_key.output_stride = vinfo->size * 4;
+
+   /* Keep the current translate if the key is unchanged; otherwise look it
+    * up in (or add it to) the hash cache. */
+   if (!emit->translate ||
+       memcmp(&emit->translate->key, &hw_key, sizeof(hw_key)) != 0) 
+   {
+      emit->translate = cached_translate(emit, &hw_key);
+   }
+}
+
+
+/* Convert vertex_count pipeline vertices to hw vertices and draw them.
+ * elts/count are the indices handed to render->draw(); the hw vertex store
+ * is sized by vertex_count, which is what translate->run() writes.
+ */
+void draw_pt_emit( struct pt_emit *emit,
+		   const float (*vertex_data)[4],
+		   unsigned vertex_count,
+		   unsigned stride,
+		   const ushort *elts,
+		   unsigned count )
+{
+   struct draw_context *draw = emit->draw;
+   struct translate *translate = emit->translate;
+   struct vbuf_render *render = draw->render;
+   void *hw_verts;
+
+   /* XXX: need to flush to get prim_vbuf.c to release its allocation?? 
+    */
+   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+
+   /* Allocate one slot per vertex (not per index): this must match the
+    * translate->run() and release_vertices() calls below.
+    */
+   hw_verts = render->allocate_vertices(render,
+					(ushort)translate->key.output_stride,
+					(ushort)vertex_count);
+   if (!hw_verts) {
+      assert(0);
+      return;
+   }
+
+   /* Buffer 0: the pipeline vertices.  Buffer 1: the rasterizer point size
+    * with stride 0, so every vertex reads the same value.
+    */
+   translate->set_buffer(translate, 0, vertex_data, stride );
+   translate->set_buffer(translate, 1, &draw->rasterizer->point_size, 0);
+
+   translate->run( translate, 0, vertex_count, hw_verts );
+
+   render->draw(render,
+		elts,
+		count);
+
+   render->release_vertices(render,
+			    hw_verts,
+			    translate->key.output_stride,
+			    vertex_count);
+}
+
+
+/* Allocate a pt_emit bound to 'draw' with a fresh hash; NULL if allocation fails. */
+struct pt_emit *draw_pt_emit_create( struct draw_context *draw )
+{
+   struct pt_emit *emit = CALLOC_STRUCT(pt_emit);
+
+   if (emit) {
+      emit->draw = draw;
+      emit->hash = cso_hash_create();
+   }
+   return emit;
+}
+
+/* Tear down 'emit': release the cached translates, then the hash, then emit. */
+void draw_pt_emit_destroy( struct pt_emit *emit )
+{
+   struct cso_hash *cache = emit->hash;
+   delete_translates(emit);
+   cso_hash_delete(cache);
+   FREE(emit);
+}
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c
new file mode 100644
index 0000000000..93da811ed8
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c
@@ -0,0 +1,223 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "pipe/p_util.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pt.h"
+#include "translate/translate.h"
+
+#include "cso_cache/cso_cache.h"
+#include "cso_cache/cso_hash.h"
+
+struct pt_fetch {
+   struct draw_context *draw;     /* context given to draw_pt_fetch_create() */
+
+   struct translate *translate;   /* translate selected by the last prepare */
+
+   unsigned vertex_size;          /* vertex_size given to the last prepare */
+
+   struct cso_hash *hash;         /* created translates, by hash of their key */
+};
+
+/* Key size with the (PIPE_MAX_ATTRIBS - nr_elements) unused element slots cut off. */
+static INLINE unsigned translate_hash_key_size(struct translate_key *key)
+{
+   const unsigned unused = PIPE_MAX_ATTRIBS - key->nr_elements;
+   return sizeof(struct translate_key) - unused * sizeof(struct translate_element);
+}
+
+/* Hash the first translate_hash_key_size(key) bytes of 'key' with
+ * cso_construct_key() and return the resulting hash value.
+ */
+static INLINE unsigned create_key(struct translate_key *key)
+{
+   const unsigned nbytes = translate_hash_key_size(key);
+
+   return cso_construct_key(key, nbytes);
+}
+
+/* Return the cached translate whose key equals 'key', creating and caching
+ * one on a miss; the key compare rejects entries that only share the hash. */
+static struct translate *cached_translate(struct pt_fetch *fetch,
+                                          struct translate_key *key)
+{
+   unsigned hash_key = create_key(key);
+   struct cso_hash_iter iter = cso_hash_find(fetch->hash, hash_key);
+   struct translate *translate;
+   for (; !cso_hash_iter_is_null(iter); iter = cso_hash_iter_next(iter)) {
+      translate = (struct translate *)cso_hash_iter_data(iter);
+      if (translate && memcmp(&translate->key, key, sizeof(*key)) == 0)
+         return translate;
+   }
+   translate = translate_create(key);
+   if (translate)  /* never cache a failed creation */
+      cso_hash_insert(fetch->hash, hash_key, translate);
+   return translate;
+}
+
+/* Call release() on every non-NULL translate stored in fetch->hash; the
+ * hash nodes themselves are not removed here. */
+static INLINE void delete_translates(struct pt_fetch *fetch)
+{
+   struct cso_hash_iter it = cso_hash_first_node(fetch->hash);
+   while (!cso_hash_iter_is_null(it)) {
+      struct translate *tr = (struct translate *)cso_hash_iter_data(it);
+      it = cso_hash_iter_next(it);
+      if (tr != NULL)
+         tr->release(tr);
+   }
+}
+
+/* Set up the fetch from API vertex elements & vertex buffers to a
+ * contiguous set of float[4] attributes, as required by the
+ * vertex_shader->run_linear() method.  This only builds the translate
+ * key for a vertex of 'vertex_size' bytes and selects the matching
+ * translate; the fetch itself is done by draw_pt_fetch_run().
+ *
+ * This is used in all cases except pure passthrough
+ * (draw_pt_fetch_emit.c) which translates directly to hw vertices.
+ */
+void draw_pt_fetch_prepare( struct pt_fetch *fetch,
+			    unsigned vertex_size )
+{
+   struct draw_context *draw = fetch->draw;
+   unsigned i, nr = 0;
+   unsigned dst_offset = 0;
+   struct translate_key key;
+
+   fetch->vertex_size = vertex_size;
+
+   memset(&key, 0, sizeof(key));   /* whole key is memcmp'd further down */
+
+   /* Always emit/leave space for a vertex header.
+    *
+    * It's worth considering whether the vertex headers should contain
+    * a pointer to the 'data', rather than having it inline.
+    * Something to look at after we've fully switched over to the pt
+    * paths.
+    */
+   {
+      /* The header is fetched as a single R32 word from the extra buffer
+       * slot 'nr_vertex_buffers', bound below to a constant header. */
+      key.element[nr].input_format = PIPE_FORMAT_R32_FLOAT;
+      key.element[nr].input_buffer = draw->pt.nr_vertex_buffers;
+      key.element[nr].input_offset = 0;
+      key.element[nr].output_format = PIPE_FORMAT_R32_FLOAT;
+      key.element[nr].output_offset = dst_offset;
+      dst_offset += 1 * sizeof(float);
+      nr++;
+
+
+      /* Just leave the clip[] array untouched.
+       */
+      dst_offset += 4 * sizeof(float);
+   }
+      
+
+   for (i = 0; i < draw->pt.nr_vertex_elements; i++) {
+      key.element[nr].input_format = draw->pt.vertex_element[i].src_format;
+      key.element[nr].input_buffer = draw->pt.vertex_element[i].vertex_buffer_index;
+      key.element[nr].input_offset = draw->pt.vertex_element[i].src_offset;
+      key.element[nr].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+      key.element[nr].output_offset = dst_offset;
+
+      dst_offset += 4 * sizeof(float);
+      nr++;
+   }
+
+   assert(dst_offset <= vertex_size);
+
+   key.nr_elements = nr;
+   key.output_stride = vertex_size;
+
+
+   /* Only switch translates when the key changed; cached_translate()
+    * reuses an existing translate from fetch->hash where possible. */
+   if (!fetch->translate ||
+       memcmp(&fetch->translate->key, &key, sizeof(key)) != 0) 
+   {
+      fetch->translate = cached_translate(fetch, &key);
+
+      {
+	 static struct vertex_header vh = { 0, 0, 0, 0xffff };
+	 fetch->translate->set_buffer(fetch->translate, 
+				      draw->pt.nr_vertex_buffers, 
+				      &vh,
+				      0);
+      }
+   }
+}
+
+
+
+
+/* Point the current translate at every vertex buffer (user pointer plus
+ * buffer_offset, with that buffer's pitch), then run it over 'elts' with
+ * 'count' and 'verts' as given.
+ */
+void draw_pt_fetch_run( struct pt_fetch *fetch,
+			const unsigned *elts,
+			unsigned count,
+			char *verts )
+{
+   struct translate *xlate = fetch->translate;
+   struct draw_context *dc = fetch->draw;
+   unsigned buf;
+
+   for (buf = 0; buf < dc->pt.nr_vertex_buffers; buf++) {
+      char *base = (char *)dc->pt.user.vbuffer[buf] +
+                   dc->pt.vertex_buffer[buf].buffer_offset;
+
+      xlate->set_buffer(xlate, buf, base, dc->pt.vertex_buffer[buf].pitch);
+   }
+
+   xlate->run_elts(xlate, elts, count, verts);
+}
+
+
+/* Allocate a pt_fetch bound to 'draw' with a new cso hash; NULL on failure. */
+struct pt_fetch *draw_pt_fetch_create( struct draw_context *draw )
+{
+   struct pt_fetch *pf = CALLOC_STRUCT(pt_fetch);
+   if (pf != NULL) {
+      pf->draw = draw;
+      pf->hash = cso_hash_create();
+   }
+   return pf;
+}
+
+void draw_pt_fetch_destroy( struct pt_fetch *fetch )
+{
+   delete_translates(fetch);      /* release() each translate in the hash */
+   cso_hash_delete(fetch->hash);  /* then the hash that held them */
+
+   FREE(fetch);
+}
+
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index 3a26a5d712..68b2c5b1e3 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -36,6 +36,7 @@
 #include "draw/draw_vbuf.h"
 #include "draw/draw_vertex.h"
 #include "draw/draw_pt.h"
+#include "translate/translate.h"
 
 /* The simplest 'middle end' in the new vertex code.  
  * 
@@ -72,105 +73,29 @@
 struct fetch_emit_middle_end {
    struct draw_pt_middle_end base;
    struct draw_context *draw;
-
-   struct {
-      const ubyte *ptr;
-      unsigned pitch;
-      void (*fetch)( const void *from, float *attrib);
-      void (*emit)( const float *attrib, float **out );
-   } fetch[PIPE_MAX_ATTRIBS];
    
-   unsigned nr_fetch;
-   unsigned hw_vertex_size;
-};
-
-
-
-static void fetch_R32_FLOAT( const void *from,
-                             float *attrib )
-{
-   float *f = (float *) from;
-   attrib[0] = f[0];
-   attrib[1] = 0.0;
-   attrib[2] = 0.0;
-   attrib[3] = 1.0;
-}
-
-
-static void emit_R32_FLOAT( const float *attrib,
-                            float **out )
-{
-   (*out)[0] = attrib[0];
-   (*out) += 1;
-}
+   struct translate *translate;
+   const struct vertex_info *vinfo;
 
-static void emit_R32G32_FLOAT( const float *attrib,
-                               float **out )
-{
-   (*out)[0] = attrib[0];
-   (*out)[1] = attrib[1];
-   (*out) += 2;
-}
+   /* Cache point size somewhere its address won't change:
+    */
+   float point_size;
 
-static void emit_R32G32B32_FLOAT( const float *attrib,
-                                  float **out )
-{
-   (*out)[0] = attrib[0];
-   (*out)[1] = attrib[1];
-   (*out)[2] = attrib[2];
-   (*out) += 3;
-}
+};
 
-static void emit_R32G32B32A32_FLOAT( const float *attrib,
-                                     float **out )
-{
-   (*out)[0] = attrib[0];
-   (*out)[1] = attrib[1];
-   (*out)[2] = attrib[2];
-   (*out)[3] = attrib[3];
-   (*out) += 4;
-}
-
-
-/**
- * General-purpose fetch from user's vertex arrays, emit to driver's
- * vertex buffer.
- *
- * XXX this is totally temporary.
- */
-static void
-fetch_store_general( struct fetch_emit_middle_end *feme,
-                     void *out_ptr,
-                     const unsigned *fetch_elts,
-                     unsigned count )
-{
-   float *out = (float *)out_ptr;
-   uint i, j;
-
-   for (i = 0; i < count; i++) {
-      unsigned elt = fetch_elts[i] & ~DRAW_PT_FLAG_MASK;
-      
-      for (j = 0; j < feme->nr_fetch; j++) {
-         float attrib[4];
-         const ubyte *from = (feme->fetch[j].ptr +
-                              feme->fetch[j].pitch * elt);
-         
-         feme->fetch[j].fetch( from, attrib );
-         feme->fetch[j].emit( attrib, &out );
-      }
-   }
-}
 
 
 
 static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
-                                unsigned prim )
+                                unsigned prim,
+				unsigned opt )
 {
    struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle;
    struct draw_context *draw = feme->draw;
    const struct vertex_info *vinfo;
-   unsigned i;
+   unsigned i, dst_offset;
    boolean ok;
+   struct translate_key key;
 
 
    ok = draw->render->set_primitive( draw->render, 
@@ -182,49 +107,93 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
    
    /* Must do this after set_primitive() above:
     */
-   vinfo = draw->render->get_vertex_info(draw->render);
+   vinfo = feme->vinfo = draw->render->get_vertex_info(draw->render);
+   
+   
 
-   for (i = 0; i < vinfo->num_attribs; i++) {
-      unsigned src_element = vinfo->src_index[i];
-      unsigned src_buffer = draw->vertex_element[src_element].vertex_buffer_index;
-         
-      feme->fetch[i].ptr = ((const ubyte *)draw->user.vbuffer[src_buffer] + 
-                            draw->vertex_buffer[src_buffer].buffer_offset + 
-                            draw->vertex_element[src_element].src_offset);
+   /* Transform from API vertices to HW vertices, skipping the
+    * pipeline_vertex intermediate step.
+    */
+   dst_offset = 0;
+   memset(&key, 0, sizeof(key));
 
-      feme->fetch[i].pitch = draw->vertex_buffer[src_buffer].pitch;
-         
-      feme->fetch[i].fetch = draw_get_fetch_func(draw->vertex_element[src_element].src_format);
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      const struct pipe_vertex_element *src = &draw->pt.vertex_element[vinfo->src_index[i]];
 
+      unsigned emit_sz = 0;
+      unsigned input_format = src->src_format;
+      unsigned input_buffer = src->vertex_buffer_index;
+      unsigned input_offset = src->src_offset;
+      unsigned output_format;
 
       switch (vinfo->emit[i]) {
       case EMIT_4F:
-         feme->fetch[i].emit = emit_R32G32B32A32_FLOAT;
+	 output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+	 emit_sz = 4 * sizeof(float);
          break;
       case EMIT_3F:
-         feme->fetch[i].emit = emit_R32G32B32_FLOAT;
+	 output_format = PIPE_FORMAT_R32G32B32_FLOAT;
+	 emit_sz = 3 * sizeof(float);
          break;
       case EMIT_2F:
-         feme->fetch[i].emit = emit_R32G32_FLOAT;
+	 output_format = PIPE_FORMAT_R32G32_FLOAT;
+	 emit_sz = 2 * sizeof(float);
          break;
       case EMIT_1F:
-         feme->fetch[i].emit = emit_R32_FLOAT;
+	 output_format = PIPE_FORMAT_R32_FLOAT;
+	 emit_sz = 1 * sizeof(float);
          break;
       case EMIT_1F_PSIZE:
-         feme->fetch[i].ptr = (const ubyte *)&feme->draw->rasterizer->point_size;
-         feme->fetch[i].pitch = 0;
-         feme->fetch[i].fetch = fetch_R32_FLOAT;
-         feme->fetch[i].emit = emit_R32_FLOAT;
+	 input_format = PIPE_FORMAT_R32_FLOAT;
+	 input_buffer = draw->pt.nr_vertex_buffers;
+	 input_offset = 0;
+	 output_format = PIPE_FORMAT_R32_FLOAT;
+	 emit_sz = 1 * sizeof(float);
          break;
       default:
          assert(0);
-         feme->fetch[i].emit = NULL;
-         break;
+	 output_format = PIPE_FORMAT_NONE;
+	 emit_sz = 0;
+	 continue;
       }
+
+      key.element[i].input_format = input_format;
+      key.element[i].input_buffer = input_buffer;
+      key.element[i].input_offset = input_offset;
+      key.element[i].output_format = output_format;
+      key.element[i].output_offset = dst_offset;
+      
+      dst_offset += emit_sz;
    }
 
-   feme->nr_fetch = vinfo->num_attribs;
-   feme->hw_vertex_size = vinfo->size * 4;
+   key.nr_elements = vinfo->num_attribs;
+   key.output_stride = vinfo->size * 4;
+
+   /* Don't bother with caching at this stage:
+    */
+   if (!feme->translate ||
+       memcmp(&feme->translate->key, &key, sizeof(key)) != 0) 
+   {
+      if (feme->translate)
+	 feme->translate->release(feme->translate);
+
+      feme->translate = translate_create( &key );
+
+      feme->translate->set_buffer(feme->translate, 
+				  draw->pt.nr_vertex_buffers, 
+				  &feme->point_size,
+				  0);
+   }
+   
+   feme->point_size = draw->rasterizer->point_size;
+
+   for (i = 0; i < draw->pt.nr_vertex_buffers; i++) {
+      feme->translate->set_buffer(feme->translate, 
+                                  i, 
+                                  ((char *)draw->pt.user.vbuffer[i] + 
+                                   draw->pt.vertex_buffer[i].buffer_offset),
+                                  draw->pt.vertex_buffer[i].pitch );
+   }
 }
 
 
@@ -246,7 +215,7 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
    hw_verts = draw->render->allocate_vertices( draw->render,
-                                               (ushort)feme->hw_vertex_size,
+                                               (ushort)feme->translate->key.output_stride,
                                                (ushort)fetch_count );
    if (!hw_verts) {
       assert(0);
@@ -256,10 +225,19 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
 					
    /* Single routine to fetch vertices and emit HW verts.
     */
-   fetch_store_general( feme, 
-                        hw_verts,
-                        fetch_elts,
-                        fetch_count );
+   feme->translate->run_elts( feme->translate, 
+			      fetch_elts,
+			      fetch_count,
+			      hw_verts );
+
+   if (0) {
+      unsigned i;
+      for (i = 0; i < fetch_count; i++) {
+         debug_printf("\n\nvertex %d:\n", i);
+         draw_dump_emitted_vertex( feme->vinfo, 
+                                   (const uint8_t *)hw_verts + feme->vinfo->size * 4 * i );
+      }
+   }
 
    /* XXX: Draw arrays path to avoid re-emitting index list again and
     * again.
@@ -272,7 +250,7 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
     */
    draw->render->release_vertices( draw->render, 
                                    hw_verts, 
-                                   feme->hw_vertex_size, 
+                                   feme->translate->key.output_stride, 
                                    fetch_count );
 
 }
@@ -286,6 +264,11 @@ static void fetch_emit_finish( struct draw_pt_middle_end *middle )
 
 static void fetch_emit_destroy( struct draw_pt_middle_end *middle )
 {
+   struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle;
+
+   if (feme->translate)
+      feme->translate->release( feme->translate );
+   
    FREE(middle);
 }
 
@@ -293,6 +276,8 @@ static void fetch_emit_destroy( struct draw_pt_middle_end *middle )
 struct draw_pt_middle_end *draw_pt_fetch_emit( struct draw_context *draw )
 {
    struct fetch_emit_middle_end *fetch_emit = CALLOC_STRUCT( fetch_emit_middle_end );
+   if (fetch_emit == NULL)
+      return NULL;
  
    fetch_emit->base.prepare = fetch_emit_prepare;
    fetch_emit->base.run     = fetch_emit_run;
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_pipeline.c
deleted file mode 100644
index a70d129c93..0000000000
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_pipeline.c
+++ /dev/null
@@ -1,326 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "pipe/p_util.h"
-#include "draw/draw_context.h"
-#include "draw/draw_private.h"
-#include "draw/draw_vertex.h"
-#include "draw/draw_pt.h"
-
-/* The simplest 'middle end' in the new vertex code.  
- * 
- * The responsibilities of a middle end are to:
- *  - perform vertex fetch using
- *       - draw vertex element/buffer state
- *       - a list of fetch indices we received as an input
- *  - run the vertex shader
- *  - cliptest, 
- *  - clip coord calculation 
- *  - viewport transformation
- *  - if necessary, run the primitive pipeline, passing it:
- *       - a linear array of vertex_header vertices constructed here
- *       - a set of draw indices we received as an input
- *  - otherwise, drive the hw backend,
- *       - allocate space for hardware format vertices
- *       - translate the vertex-shader output vertices to hw format
- *       - calling the backend draw functions.
- *
- * For convenience, we provide a helper function to drive the hardware
- * backend given similar inputs to those required to run the pipeline.
- *
- * In the case of passthrough mode, many of these actions are disabled
- * or noops, so we end up doing:
- *
- *  - perform vertex fetch
- *  - drive the hw backend
- *
- * IE, basically just vertex fetch to post-vs-format vertices,
- * followed by a call to the backend helper function.
- */
-
-
-struct fetch_pipeline_middle_end {
-   struct draw_pt_middle_end base;
-   struct draw_context *draw;
-
-   void (*header)( unsigned idx, float **out);
-
-   struct {
-      const ubyte *ptr;
-      unsigned pitch;
-      void (*fetch)( const void *from, float *attrib);
-      void (*emit)( const float *attrib, float **out );
-   } fetch[PIPE_MAX_ATTRIBS];
-   
-   unsigned nr_fetch;
-   unsigned pipeline_vertex_size;
-   unsigned prim;
-};
-
-
-#if 0
-static void emit_R32_FLOAT( const float *attrib,
-                            float **out )
-{
-   (*out)[0] = attrib[0];
-   (*out) += 1;
-}
-
-static void emit_R32G32_FLOAT( const float *attrib,
-                               float **out )
-{
-   (*out)[0] = attrib[0];
-   (*out)[1] = attrib[1];
-   (*out) += 2;
-}
-
-static void emit_R32G32B32_FLOAT( const float *attrib,
-                                  float **out )
-{
-   (*out)[0] = attrib[0];
-   (*out)[1] = attrib[1];
-   (*out)[2] = attrib[2];
-   (*out) += 3;
-}
-#endif
-static void emit_R32G32B32A32_FLOAT( const float *attrib,
-                                     float **out )
-{
-   (*out)[0] = attrib[0];
-   (*out)[1] = attrib[1];
-   (*out)[2] = attrib[2];
-   (*out)[3] = attrib[3];
-   (*out) += 4;
-}
-
-static void header( unsigned idx,
-                    float **out )
-{
-   struct vertex_header *header = (struct vertex_header *) (*out);
-
-   header->clipmask = 0;
-   header->edgeflag = 1;
-   header->pad = 0;
-   header->vertex_id = UNDEFINED_VERTEX_ID;
-
-   (*out)[1] = 0;
-   (*out)[2] = 0;
-   (*out)[3] = 0;
-   (*out)[3] = 1;
-   (*out) += 5;
-}
-
-
-static void header_ef( unsigned idx,
-                       float **out )
-{
-   struct vertex_header *header = (struct vertex_header *) (*out);
-
-   /* XXX: need a reset_stipple flag in the vertex header too? 
-    */
-   header->clipmask = 0;
-   header->edgeflag = (idx & DRAW_PT_EDGEFLAG) != 0;
-   header->pad = 0;
-   header->vertex_id = UNDEFINED_VERTEX_ID;
-
-   (*out)[1] = 0;
-   (*out)[2] = 0;
-   (*out)[3] = 0;
-   (*out)[3] = 1;
-   (*out) += 5;
-}
-
-
-/**
- * General-purpose fetch from user's vertex arrays, emit to driver's
- * vertex buffer.
- *
- * XXX this is totally temporary.
- */
-static void
-fetch_store_general( struct fetch_pipeline_middle_end *fpme,
-                     void *out_ptr,
-                     const unsigned *fetch_elts,
-                     unsigned count )
-{
-   float *out = (float *)out_ptr;
-   uint i, j;
-
-   for (i = 0; i < count; i++) {
-      unsigned elt = fetch_elts[i];
-      
-      fpme->header( elt, &out );
-      elt &= ~DRAW_PT_FLAG_MASK;
-
-      for (j = 0; j < fpme->nr_fetch; j++) {
-         float attrib[4];
-         const ubyte *from = (fpme->fetch[j].ptr +
-                              fpme->fetch[j].pitch * elt);
-         
-         fpme->fetch[j].fetch( from, attrib );
-         fpme->fetch[j].emit( attrib, &out );
-      }
-   }
-}
-
-
-/* We aren't running a vertex shader, but are running the pipeline.
- * That means the vertices we need to build look like:
- *
- * dw0: vertex header (zero?)
- * dw1: clip coord 0
- * dw2: clip coord 1
- * dw3: clip coord 2
- * dw4: clip coord 4
- * dw5: screen coord 0
- * dw6: screen coord 0
- * dw7: screen coord 0
- * dw8: screen coord 0
- * dw9: other attribs...
- *
- */
-static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
-                                    unsigned prim )
-{
-   struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
-   struct draw_context *draw = fpme->draw;
-   unsigned i, nr = 0;
-
-   fpme->prim = prim;
-
-   /* Emit the vertex header and empty clipspace coord field:
-    */
-   if (draw->user.edgeflag) {
-      fpme->header = header_ef;
-   }
-   else {
-      fpme->header = header;
-   }
-   
-
-   /* Need to look at vertex shader inputs (we know it is a
-    * passthrough shader, so these define the outputs too).  If we
-    * were running a shader, we'd still be looking at the inputs at
-    * this point.
-    */
-   for (i = 0; i < draw->vertex_shader->info.num_inputs; i++) {
-      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
-      enum pipe_format format  = draw->vertex_element[i].src_format;
-
-      fpme->fetch[nr].ptr = ((const ubyte *) draw->user.vbuffer[buf] + 
-                            draw->vertex_buffer[buf].buffer_offset + 
-                            draw->vertex_element[i].src_offset);
-
-      fpme->fetch[nr].pitch = draw->vertex_buffer[buf].pitch;
-      fpme->fetch[nr].fetch = draw_get_fetch_func( format );
-
-      /* Always do this -- somewhat redundant...
-       */
-      fpme->fetch[nr].emit = emit_R32G32B32A32_FLOAT;
-      nr++;
-   }
-
-   fpme->nr_fetch = nr;
-   fpme->pipeline_vertex_size = sizeof(struct vertex_header) + nr * 4 * sizeof(float);
-}
-
-
-
-
-static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
-                            const unsigned *fetch_elts,
-                            unsigned fetch_count,
-                            const ushort *draw_elts,
-                            unsigned draw_count )
-{
-   struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
-   char *pipeline_verts;
-   
-   pipeline_verts = MALLOC( fpme->pipeline_vertex_size * 
-                            fetch_count );
-   if (!pipeline_verts) {
-      assert(0);
-      return;
-   }
-         
-					
-   /* Single routine to fetch vertices and emit pipeline verts.
-    */
-   fetch_store_general( fpme, 
-                        pipeline_verts,
-                        fetch_elts,
-                        fetch_count );
-
-   
-   /* Run the pipeline
-    */
-   draw_pt_run_pipeline( fpme->draw,
-                         fpme->prim,
-                         pipeline_verts,
-                         fpme->pipeline_vertex_size,
-                         fetch_count,
-                         draw_elts,
-                         draw_count );
-                 
-
-   /* Done -- that was easy, wasn't it: 
-    */
-   FREE( pipeline_verts );
-}
-
-
-
-static void fetch_pipeline_finish( struct draw_pt_middle_end *middle )
-{
-   /* nothing to do */
-}
-
-static void fetch_pipeline_destroy( struct draw_pt_middle_end *middle )
-{
-   FREE(middle);
-}
-
-
-struct draw_pt_middle_end *draw_pt_fetch_pipeline( struct draw_context *draw )
-{
-   struct fetch_pipeline_middle_end *fetch_pipeline = CALLOC_STRUCT( fetch_pipeline_middle_end );
- 
-   fetch_pipeline->base.prepare = fetch_pipeline_prepare;
-   fetch_pipeline->base.run     = fetch_pipeline_run;
-   fetch_pipeline->base.finish  = fetch_pipeline_finish;
-   fetch_pipeline->base.destroy = fetch_pipeline_destroy;
-
-   fetch_pipeline->draw = draw;
-     
-   return &fetch_pipeline->base;
-}
-
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index 04b3d2c4cf..f0763dad8d 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -27,109 +27,73 @@
 
 #include "pipe/p_util.h"
 #include "draw/draw_context.h"
-#include "draw/draw_private.h"
 #include "draw/draw_vbuf.h"
 #include "draw/draw_vertex.h"
 #include "draw/draw_pt.h"
+#include "draw/draw_vs.h"
+#include "translate/translate.h"
+
 
 struct fetch_pipeline_middle_end {
    struct draw_pt_middle_end base;
    struct draw_context *draw;
 
-   struct {
-      const ubyte *ptr;
-      unsigned pitch;
-      void (*fetch)( const void *from, float *attrib);
-      void (*emit)( const float *attrib, float **out );
-   } fetch[PIPE_MAX_ATTRIBS];
+   struct pt_emit *emit;
+   struct pt_fetch *fetch;
+   struct pt_post_vs *post_vs;
 
-   unsigned nr_fetch;
-   unsigned pipeline_vertex_size;
-   unsigned hw_vertex_size;
+   unsigned vertex_data_offset;
+   unsigned vertex_size;
    unsigned prim;
+   unsigned opt;
 };
 
-#if 0
-static void emit_R32_FLOAT( const float *attrib,
-                            float **out )
-{
-   (*out)[0] = attrib[0];
-   (*out) += 1;
-}
-
-static void emit_R32G32_FLOAT( const float *attrib,
-                               float **out )
-{
-   (*out)[0] = attrib[0];
-   (*out)[1] = attrib[1];
-   (*out) += 2;
-}
-
-static void emit_R32G32B32_FLOAT( const float *attrib,
-                                  float **out )
-{
-   (*out)[0] = attrib[0];
-   (*out)[1] = attrib[1];
-   (*out)[2] = attrib[2];
-   (*out) += 3;
-}
-#endif
-static void emit_R32G32B32A32_FLOAT( const float *attrib,
-                                     float **out )
-{
-   (*out)[0] = attrib[0];
-   (*out)[1] = attrib[1];
-   (*out)[2] = attrib[2];
-   (*out)[3] = attrib[3];
-   (*out) += 4;
-}
 
 static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
-                                    unsigned prim )
+                                    unsigned prim,
+				    unsigned opt )
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_context *draw = fpme->draw;
-   unsigned i, nr = 0;
-   boolean ok;
-   const struct vertex_info *vinfo;
+   struct draw_vertex_shader *vs = draw->vertex_shader;
+
+   /* Add one to num_outputs because the pipeline occasionally tags on
+    * an additional texcoord, eg for AA lines.
+    */
+   unsigned nr = MAX2( vs->info.num_inputs,
+		       vs->info.num_outputs + 1 );
 
    fpme->prim = prim;
+   fpme->opt = opt;
 
-   ok = draw->render->set_primitive(draw->render, prim);
-   if (!ok) {
-      assert(0);
-      return;
-   }
-   /* Must do this after set_primitive() above:
+   /* Always leave room for the vertex header whether we need it or
+    * not.  It's hard to get rid of it in particular because of the
+    * viewport code in draw_pt_post_vs.c.  
     */
-   vinfo = draw->render->get_vertex_info(draw->render);
+   fpme->vertex_size = sizeof(struct vertex_header) + nr * 4 * sizeof(float);
 
-   /* Need to look at vertex shader inputs (we know it is a
-    * passthrough shader, so these define the outputs too).  If we
-    * were running a shader, we'd still be looking at the inputs at
-    * this point.
-    */
-   for (i = 0; i < draw->vertex_shader->info.num_inputs; i++) {
-      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
-      enum pipe_format format  = draw->vertex_element[i].src_format;
+   
 
-      fpme->fetch[nr].ptr = ((const ubyte *) draw->user.vbuffer[buf] +
-                            draw->vertex_buffer[buf].buffer_offset +
-                            draw->vertex_element[i].src_offset);
+   draw_pt_fetch_prepare( fpme->fetch, 
+			  fpme->vertex_size );
 
-      fpme->fetch[nr].pitch = draw->vertex_buffer[buf].pitch;
-      fpme->fetch[nr].fetch = draw_get_fetch_func( format );
+   /* XXX: it's not really gl rasterization rules we care about here,
+    * but gl vs dx9 clip spaces.
+    */
+   draw_pt_post_vs_prepare( fpme->post_vs,
+			    draw->bypass_clipping,
+			    draw->identity_viewport,
+			    draw->rasterizer->gl_rasterization_rules );
+			    
 
-      /* Always do this -- somewhat redundant...
-       */
-      fpme->fetch[nr].emit = emit_R32G32B32A32_FLOAT;
-      nr++;
-   }
+   if (!(opt & PT_PIPELINE)) 
+      draw_pt_emit_prepare( fpme->emit, 
+			    prim );
+
+   /* No need to prepare the shader.
+    */
+   vs->prepare(vs, draw);
 
-   fpme->nr_fetch = nr;
-   //fpme->pipeline_vertex_size = sizeof(struct vertex_header) + nr * 4 * sizeof(float);
-   fpme->pipeline_vertex_size = MAX_VERTEX_ALLOCATION;
-   fpme->hw_vertex_size = vinfo->size * 4;
 }
 
 
@@ -144,71 +108,67 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_context *draw = fpme->draw;
    struct draw_vertex_shader *shader = draw->vertex_shader;
-   char *pipeline_verts;
+   unsigned opt = fpme->opt;
 
-   pipeline_verts = MALLOC(fpme->pipeline_vertex_size *
-			   fetch_count);
+   struct vertex_header *pipeline_verts = 
+      (struct vertex_header *)MALLOC(fpme->vertex_size * fetch_count);
 
    if (!pipeline_verts) {
+      /* Not much we can do here - just skip the rendering.
+       */
       assert(0);
       return;
    }
 
+   /* Fetch into our vertex buffer
+    */
+   draw_pt_fetch_run( fpme->fetch,
+		      fetch_elts, 
+		      fetch_count,
+		      (char *)pipeline_verts );
+
+   /* Run the shader, note that this overwrites the data[] parts of
+    * the pipeline verts.  If there is no shader, ie a bypass shader,
+    * then the inputs == outputs, and are already in the correct
+    * place.
+    */
+   if (opt & PT_SHADE)
+   {
+      shader->run_linear(shader, 
+			 (const float (*)[4])pipeline_verts->data,
+			 (      float (*)[4])pipeline_verts->data,
+			 (const float (*)[4])draw->pt.user.constants,
+			 fetch_count,
+			 fpme->vertex_size,
+			 fpme->vertex_size);
+   }
+
+   if (draw_pt_post_vs_run( fpme->post_vs,
+			    pipeline_verts,
+			    fetch_count,
+			    fpme->vertex_size ))
+   {
+      opt |= PT_PIPELINE;
+   }
 
-   /* Shade
+   /* Do we need to run the pipeline?
     */
-   shader->prepare(shader, draw);
-   if (shader->run(shader, draw, fetch_elts, fetch_count, pipeline_verts,
-                   fpme->pipeline_vertex_size)) {
-      /* Run the pipeline */
-      draw_pt_run_pipeline( fpme->draw,
-                            fpme->prim,
-                            pipeline_verts,
-                            fpme->pipeline_vertex_size,
-                            fetch_count,
-                            draw_elts,
-                            draw_count );
-   } else {
-      unsigned i, j;
-      void *hw_verts;
-      float *out;
-
-      /* XXX: need to flush to get prim_vbuf.c to release its allocation?? 
-       */
-      draw_do_flush( draw, DRAW_FLUSH_BACKEND );
-
-      hw_verts = draw->render->allocate_vertices(draw->render,
-                                                 (ushort)fpme->hw_vertex_size,
-                                                 (ushort)fetch_count);
-      if (!hw_verts) {
-         assert(0);
-         return;
-      }
-
-      out = (float *)hw_verts;
-      for (i = 0; i < fetch_count; i++) {
-         struct vertex_header *header =
-            (struct vertex_header*)(pipeline_verts + (fpme->pipeline_vertex_size * i));
-
-         for (j = 0; j < fpme->nr_fetch; j++) {
-            float *attrib = header->data[j];
-            /*debug_printf("emiting [%f, %f, %f, %f]\n",
-                         attrib[0], attrib[1],
-                         attrib[2], attrib[3]);*/
-            fpme->fetch[j].emit(attrib, &out);
-         }
-      }
-      /* XXX: Draw arrays path to avoid re-emitting index list again and
-       * again.
-       */
-      draw->render->draw(draw->render,
+   if (opt & PT_PIPELINE) {
+      draw_pipeline_run( fpme->draw,
+                         fpme->prim,
+                         pipeline_verts,
+                         fetch_count,
+                         fpme->vertex_size,
                          draw_elts,
-                         draw_count);
-
-      draw->render->release_vertices(draw->render,
-                                     hw_verts,
-                                     fpme->hw_vertex_size,
-                                     fetch_count);
+                         draw_count );
+   } 
+   else {
+      draw_pt_emit( fpme->emit,
+		    (const float (*)[4])pipeline_verts->data,
+		    fetch_count,
+		    fpme->vertex_size,
+		    draw_elts,
+		    draw_count );
    }
 
 
@@ -224,20 +184,51 @@ static void fetch_pipeline_finish( struct draw_pt_middle_end *middle )
 
 static void fetch_pipeline_destroy( struct draw_pt_middle_end *middle )
 {
+   struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
+
+   if (fpme->fetch)   /* NULL checks: create()'s fail path destroys a partly-built fpme */
+      draw_pt_fetch_destroy( fpme->fetch );
+
+   if (fpme->emit)
+      draw_pt_emit_destroy( fpme->emit );
+
+   if (fpme->post_vs)
+      draw_pt_post_vs_destroy( fpme->post_vs );
+
    FREE(middle);
 }
 
 
 struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit( struct draw_context *draw )
 {
-   struct fetch_pipeline_middle_end *fetch_pipeline = CALLOC_STRUCT( fetch_pipeline_middle_end );
+   struct fetch_pipeline_middle_end *fpme = CALLOC_STRUCT( fetch_pipeline_middle_end );
+   if (!fpme)
+      goto fail;
+
+   fpme->base.prepare = fetch_pipeline_prepare;
+   fpme->base.run     = fetch_pipeline_run;
+   fpme->base.finish  = fetch_pipeline_finish;
+   fpme->base.destroy = fetch_pipeline_destroy;
+
+   fpme->draw = draw;
+
+   fpme->fetch = draw_pt_fetch_create( draw );
+   if (!fpme->fetch)
+      goto fail;
+
+   fpme->post_vs = draw_pt_post_vs_create( draw );
+   if (!fpme->post_vs)
+      goto fail;
+
+   fpme->emit = draw_pt_emit_create( draw );
+   if (!fpme->emit) 
+      goto fail;
 
-   fetch_pipeline->base.prepare = fetch_pipeline_prepare;
-   fetch_pipeline->base.run     = fetch_pipeline_run;
-   fetch_pipeline->base.finish  = fetch_pipeline_finish;
-   fetch_pipeline->base.destroy = fetch_pipeline_destroy;
+   return &fpme->base;
 
-   fetch_pipeline->draw = draw;
+ fail:   /* reached with fpme NULL or with only some sub-stages created */
+   if (fpme)
+      fetch_pipeline_destroy( &fpme->base );   /* destroys the created sub-stages and frees fpme */
 
-   return &fetch_pipeline->base;
+   return NULL;
 }
diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
new file mode 100644
index 0000000000..f98e130ed6
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c
@@ -0,0 +1,215 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "pipe/p_util.h"
+#include "pipe/p_context.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pt.h"
+
+struct pt_post_vs {
+   struct draw_context *draw;   /* assigned in draw_pt_post_vs_create() */
+   /* Active post-VS routine; assigned by draw_pt_post_vs_prepare(). */
+   boolean (*run)( struct pt_post_vs *pvs,
+		struct vertex_header *vertices,
+		unsigned count,
+		unsigned stride );
+};
+
+
+/* Four-component dot product of a[0..3] and b[0..3]. */
+static INLINE float
+dot4(const float *a, const float *b)
+{
+   return (a[0]*b[0] +
+           a[1]*b[1] +
+           a[2]*b[2] +
+           a[3]*b[3]);
+}
+
+/* Return a mask with bit i set when the plane-i test on 'clip' is negative:
+ * bits 0-5 are the hardwired tests below, bits 6..nr-1 use plane[6..nr-1]. */
+static INLINE unsigned
+compute_clipmask_gl(const float *clip, /*const*/ float plane[][4], unsigned nr)
+{
+   unsigned mask = 0x0;
+   unsigned i;
+
+   /* Do the hardwired planes first:
+    */
+   if (-clip[0] + clip[3] < 0) mask |= (1<<0);
+   if ( clip[0] + clip[3] < 0) mask |= (1<<1);
+   if (-clip[1] + clip[3] < 0) mask |= (1<<2);
+   if ( clip[1] + clip[3] < 0) mask |= (1<<3);
+   if ( clip[2] + clip[3] < 0) mask |= (1<<4); /* match mesa clipplane numbering - for now */
+   if (-clip[2] + clip[3] < 0) mask |= (1<<5); /* match mesa clipplane numbering - for now */
+
+   /* Followed by any remaining ones:
+    */
+   for (i = 6; i < nr; i++) {
+      if (dot4(clip, plane[i]) < 0) 
+         mask |= (1<<i);
+   }
+
+   return mask;
+}
+
+
+/* The normal case - cliptest, rhw divide, viewport transform.
+ *
+ * Also handle identity viewport here at the expense of a few wasted
+ * instructions.  Returns TRUE if any vertex has a non-zero clipmask.
+ */
+static boolean post_vs_cliptest_viewport_gl( struct pt_post_vs *pvs,
+					  struct vertex_header *vertices,
+					  unsigned count,
+					  unsigned stride )
+{
+   struct vertex_header *out = vertices;
+   const float *scale = pvs->draw->viewport.scale;
+   const float *trans = pvs->draw->viewport.translate;
+   unsigned clipped = 0;
+   unsigned j;
+
+   if (0) debug_printf("%s\n", __FUNCTION__);
+
+   for (j = 0; j < count; j++) {
+      out->clip[0] = out->data[0][0];
+      out->clip[1] = out->data[0][1];
+      out->clip[2] = out->data[0][2];
+      out->clip[3] = out->data[0][3];
+
+      out->vertex_id = 0xffff;
+      out->edgeflag = 1;
+      out->clipmask = compute_clipmask_gl(out->clip,
+					  pvs->draw->plane,
+					  pvs->draw->nr_planes);
+      clipped |= out->clipmask;   /* OR, not add: a running sum could wrap to zero */
+
+      if (out->clipmask == 0)   /* clipped vertices keep their clip-space data[0] */
+      {
+	 /* divide by w */
+	 float w = 1.0f / out->data[0][3];
+
+	 /* Viewport mapping */
+	 out->data[0][0] = out->data[0][0] * w * scale[0] + trans[0];
+	 out->data[0][1] = out->data[0][1] * w * scale[1] + trans[1];
+	 out->data[0][2] = out->data[0][2] * w * scale[2] + trans[2];
+	 out->data[0][3] = w;
+      }
+
+      out = (struct vertex_header *)( (char *)out + stride );   /* stride is in bytes */
+   }
+
+   return clipped != 0;
+}
+
+
+
+/* If bypass_clipping is set, skip cliptest and rhw divide: only x/y/z of
+ * data[0] are scaled and translated for each vertex.  Always returns FALSE. */
+static boolean post_vs_viewport( struct pt_post_vs *pvs,
+			      struct vertex_header *vertices,
+			      unsigned count,
+			      unsigned stride )
+{
+   struct vertex_header *out = vertices;
+   const float *scale = pvs->draw->viewport.scale;
+   const float *trans = pvs->draw->viewport.translate;
+   unsigned j;
+
+   if (0) debug_printf("%s\n", __FUNCTION__);
+   for (j = 0; j < count; j++) {
+      /* Viewport mapping only, no cliptest/rhw divide
+       */
+      out->data[0][0] = out->data[0][0] * scale[0] + trans[0];
+      out->data[0][1] = out->data[0][1] * scale[1] + trans[1];
+      out->data[0][2] = out->data[0][2] * scale[2] + trans[2];
+
+      out = (struct vertex_header *)((char *)out + stride);   /* stride is in bytes */
+   }
+   
+   return FALSE;
+}
+
+
+/* If bypass_clipping is set and we have an identity viewport, nothing
+ * to do: the vertices are left untouched and FALSE is returned.
+ */
+static boolean post_vs_none( struct pt_post_vs *pvs,
+			     struct vertex_header *vertices,
+			     unsigned count,
+			     unsigned stride )
+{
+   if (0) debug_printf("%s\n", __FUNCTION__);
+   return FALSE;
+}
+/* Call pvs->run (assigned only by draw_pt_post_vs_prepare()) and return its result. */
+boolean draw_pt_post_vs_run( struct pt_post_vs *pvs,
+			     struct vertex_header *pipeline_verts,
+			     unsigned count,
+			     unsigned stride )
+{
+   return pvs->run( pvs, pipeline_verts, count, stride );
+}
+
+/* Select pvs->run from the clipping/viewport flags; 'opengl' is currently unused. */
+void draw_pt_post_vs_prepare( struct pt_post_vs *pvs,
+			      boolean bypass_clipping,
+			      boolean identity_viewport,
+			      boolean opengl )
+{
+   if (bypass_clipping) {
+      if (identity_viewport)
+	 pvs->run = post_vs_none;
+      else
+	 pvs->run = post_vs_viewport;
+   }
+   else {
+      //if (opengl) 
+      pvs->run = post_vs_cliptest_viewport_gl;
+   }
+}
+
+/* Allocate a post-VS stage bound to 'draw'; returns NULL if allocation fails. */
+struct pt_post_vs *draw_pt_post_vs_create( struct draw_context *draw )
+{
+   struct pt_post_vs *pvs = CALLOC_STRUCT( pt_post_vs );
+   if (!pvs)
+      return NULL;
+
+   pvs->draw = draw;   /* pvs->run is not set here; see draw_pt_post_vs_prepare() */
+   
+   return pvs;
+}
+/* Release a stage obtained from draw_pt_post_vs_create(). */
+void draw_pt_post_vs_destroy( struct pt_post_vs *pvs )
+{
+   FREE(pvs);
+}
diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c
index 107dcfc269..afcff41043 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vcache.c
+++ b/src/gallium/auxiliary/draw/draw_pt_vcache.c
@@ -63,6 +63,7 @@ struct vcache_frontend {
 
 static void vcache_flush( struct vcache_frontend *vcache )
 {
+   vcache->draw->vcache_flushing = TRUE;
    if (vcache->draw_count) {
       vcache->middle->run( vcache->middle,
                            vcache->fetch_elts,
@@ -74,6 +75,7 @@ static void vcache_flush( struct vcache_frontend *vcache )
    memset(vcache->in, ~0, sizeof(vcache->in));
    vcache->fetch_count = 0;
    vcache->draw_count = 0;
+   vcache->draw->vcache_flushing = FALSE;
 }
 
 static void vcache_check_flush( struct vcache_frontend *vcache )
@@ -106,7 +108,7 @@ static unsigned add_edgeflag( struct vcache_frontend *vcache,
                               unsigned idx, 
                               unsigned mask )
 {
-   if (mask && draw_get_edgeflag(vcache->draw, idx)) 
+   if (0 && mask && draw_pt_get_edgeflag(vcache->draw, idx)) 
       return idx | DRAW_PT_EDGEFLAG;
    else
       return idx;
@@ -116,7 +118,7 @@ static unsigned add_edgeflag( struct vcache_frontend *vcache,
 static unsigned add_reset_stipple( unsigned idx,
                                    unsigned reset )
 {
-   if (reset)
+   if (0 && reset)
       return idx | DRAW_PT_RESET_STIPPLE;
    else
       return idx;
@@ -128,9 +130,9 @@ static void vcache_triangle( struct vcache_frontend *vcache,
                              unsigned i1,
                              unsigned i2 )
 {
-   vcache_elt(vcache, i0 | DRAW_PT_EDGEFLAG | DRAW_PT_RESET_STIPPLE);
-   vcache_elt(vcache, i1 | DRAW_PT_EDGEFLAG);
-   vcache_elt(vcache, i2 | DRAW_PT_EDGEFLAG);
+   vcache_elt(vcache, i0 /* | DRAW_PT_EDGEFLAG | DRAW_PT_RESET_STIPPLE */ );
+   vcache_elt(vcache, i1 /* | DRAW_PT_EDGEFLAG */);
+   vcache_elt(vcache, i2 /* | DRAW_PT_EDGEFLAG */);
    vcache_check_flush(vcache);
 }
 
@@ -142,11 +144,12 @@ static void vcache_ef_triangle( struct vcache_frontend *vcache,
                                 unsigned i1,
                                 unsigned i2 )
 {
+/*
    i0 = add_edgeflag( vcache, i0, (ef_mask >> 0) & 1 );
    i1 = add_edgeflag( vcache, i1, (ef_mask >> 1) & 1 );
    i2 = add_edgeflag( vcache, i2, (ef_mask >> 2) & 1 );
-
    i0 = add_reset_stipple( i0, reset_stipple );
+*/
 
    vcache_elt(vcache, i0);
    vcache_elt(vcache, i1);
@@ -448,7 +451,8 @@ static unsigned reduced_prim[PIPE_PRIM_POLYGON + 1] = {
 
 static void vcache_prepare( struct draw_pt_front_end *frontend,
                             unsigned prim,
-                            struct draw_pt_middle_end *middle )
+                            struct draw_pt_middle_end *middle,
+			    unsigned opt )
 {
    struct vcache_frontend *vcache = (struct vcache_frontend *)frontend;
 
@@ -464,7 +468,7 @@ static void vcache_prepare( struct draw_pt_front_end *frontend,
    vcache->output_prim = reduced_prim[prim];
 
    vcache->middle = middle;
-   middle->prepare( middle, vcache->output_prim );
+   middle->prepare( middle, vcache->output_prim, opt );
 }
 
 
@@ -486,6 +490,8 @@ static void vcache_destroy( struct draw_pt_front_end *frontend )
 struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw )
 {
    struct vcache_frontend *vcache = CALLOC_STRUCT( vcache_frontend );
+   if (vcache == NULL)
+      return NULL;
  
    vcache->base.prepare = vcache_prepare;
    vcache->base.run     = NULL;
diff --git a/src/gallium/auxiliary/draw/draw_vertex.c b/src/gallium/auxiliary/draw/draw_vertex.c
index 168036eee8..1446f785c5 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.c
+++ b/src/gallium/auxiliary/draw/draw_vertex.c
@@ -72,6 +72,58 @@ draw_compute_vertex_size(struct vertex_info *vinfo)
          assert(0);
       }
    }
+}
+
 
-   assert(vinfo->size * 4 <= MAX_VERTEX_SIZE);
+void
+draw_dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
+{
+   /* Debug helper: print each emitted attribute of one vertex, in order. */
+   unsigned i;
+
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      switch (vinfo->emit[i]) {   /* 'data' advances past each value that is printed */
+      case EMIT_OMIT:
+         debug_printf("EMIT_OMIT:");
+         break;
+      case EMIT_1F:
+         debug_printf("EMIT_1F:\t");
+         debug_printf("%f ", *(const float *)data); data += sizeof(float);
+         break;
+      case EMIT_1F_PSIZE:
+         debug_printf("EMIT_1F_PSIZE:\t");
+         debug_printf("%f ", *(const float *)data); data += sizeof(float);
+         break;
+      case EMIT_2F:
+         debug_printf("EMIT_2F:\t");
+         debug_printf("%f ", *(const float *)data); data += sizeof(float);
+         debug_printf("%f ", *(const float *)data); data += sizeof(float);
+         break;
+      case EMIT_3F:
+         debug_printf("EMIT_3F:\t");
+         debug_printf("%f ", *(const float *)data); data += sizeof(float);
+         debug_printf("%f ", *(const float *)data); data += sizeof(float);
+         debug_printf("%f ", *(const float *)data); data += sizeof(float);
+         data += sizeof(float);   /* NOTE(review): extra skip after the 3 floats -- confirm 3F is padded to 4 */
+         break;
+      case EMIT_4F:
+         debug_printf("EMIT_4F:\t");
+         debug_printf("%f ", *(const float *)data); data += sizeof(float);
+         debug_printf("%f ", *(const float *)data); data += sizeof(float);
+         debug_printf("%f ", *(const float *)data); data += sizeof(float);
+         debug_printf("%f ", *(const float *)data); data += sizeof(float);
+         break;
+      case EMIT_4UB:
+         debug_printf("EMIT_4UB:\t");
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         break;
+      default:
+         assert(0);
+      }
+      debug_printf("\n");
+   }
+   debug_printf("\n");
+}
diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
index 65818463ca..6d8bac5138 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.h
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -106,5 +106,7 @@ draw_emit_vertex_attr(struct vertex_info *vinfo,
 
 extern void draw_compute_vertex_size(struct vertex_info *vinfo);
 
+void draw_dump_emitted_vertex(const struct vertex_info *vinfo, 
+                              const uint8_t *data);
 
 #endif /* DRAW_VERTEX_H */
diff --git a/src/gallium/auxiliary/draw/draw_vertex_cache.c b/src/gallium/auxiliary/draw/draw_vertex_cache.c
deleted file mode 100644
index 730c18bcb3..0000000000
--- a/src/gallium/auxiliary/draw/draw_vertex_cache.c
+++ /dev/null
@@ -1,219 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "pipe/p_util.h"
-#include "draw_private.h"
-#include "draw_context.h"
-
-
-void draw_vertex_cache_invalidate( struct draw_context *draw )
-{
-   assert(draw->pq.queue_nr == 0);
-   assert(draw->vs.queue_nr == 0);
-   assert(draw->vcache.referenced == 0);
-
-   /* There's an error somewhere in the vcache code that requires this
-    * memset.  The bug is exposed in q3demo demo001, but probably
-    * elsewhere as well.  Will track it down later.
-    */
-   memset(draw->vcache.idx, ~0, sizeof(draw->vcache.idx));
-}
-
-
-/**
- * Check if vertex is in cache, otherwise add it.  It won't go through
- * VS yet, not until there is a flush operation or the VS queue fills up.  
- *
- * Note that cache entries are basically just two pointers: the first
- * an index into the user's vertex arrays, the second a location in
- * the vertex shader cache for the post-transformed vertex.
- *
- * \return pointer to location of (post-transformed) vertex header in the cache
- */
-static struct vertex_header *get_vertex( struct draw_context *draw,
-					 unsigned i )
-{
-   unsigned slot = (i + (i>>5)) % VCACHE_SIZE;
-   
-   assert(slot < 32); /* so we don't exceed the bitfield size below */
-
-   if (draw->vcache.referenced & (1<<slot))
-   {
-      /* Cache hit?
-       */
-      if (draw->vcache.idx[slot].in == i) {
-	 /*debug_printf("HIT %d %d\n", slot, i);*/
-	 assert(draw->vcache.idx[slot].out < draw->vs.queue_nr);
-	 return draw_header_from_block(draw->vs.vertex_cache,
-                                       MAX_VERTEX_ALLOCATION,
-                                       draw->vcache.idx[slot].out);
-      }
-
-      /* Otherwise a collision
-       */
-      slot = VCACHE_SIZE + draw->vcache.overflow++;
-      /*debug_printf("XXX %d --> %d\n", i, slot);*/
-   }
-
-   /* Deal with the cache miss: 
-    */
-   {
-      unsigned out;
-      struct vertex_header *header;
-
-      assert(slot < Elements(draw->vcache.idx));
-
-      /*debug_printf("NEW %d %d\n", slot, i);*/
-      draw->vcache.idx[slot].in = i;
-      draw->vcache.idx[slot].out = out = draw->vs.queue_nr++;
-      draw->vcache.referenced |= (1 << slot);
-
-
-      /* Add to vertex shader queue:
-       */
-      assert(draw->vs.queue_nr < VS_QUEUE_LENGTH);
-
-      header = draw_header_from_block(draw->vs.vertex_cache, MAX_VERTEX_ALLOCATION,
-                                      out);
-      draw->vs.elts[out] = i;
-      header->clipmask = 0;
-      header->edgeflag = draw_get_edgeflag(draw, i);
-      header->pad = 0;
-      header->vertex_id = UNDEFINED_VERTEX_ID;
-
-      /* Need to set the vertex's edge flag here.  If we're being called
-       * by do_ef_triangle(), that function needs edge flag info!
-       */
-
-      return draw_header_from_block(draw->vs.vertex_cache,
-                                    MAX_VERTEX_ALLOCATION,
-                                    draw->vcache.idx[slot].out);
-   }
-}
-
-
-static struct vertex_header *get_uint_elt_vertex( struct draw_context *draw,
-                                                  unsigned i )
-{
-   const unsigned *elts = (const unsigned *) draw->user.elts;
-   return get_vertex( draw, elts[i] );
-}
-
-
-static struct vertex_header *get_ushort_elt_vertex( struct draw_context *draw,
-						    unsigned i )
-{
-   const ushort *elts = (const ushort *) draw->user.elts;
-   return get_vertex( draw, elts[i] );
-}
-
-
-static struct vertex_header *get_ubyte_elt_vertex( struct draw_context *draw,
-                                                   unsigned i )
-{
-   const ubyte *elts = (const ubyte *) draw->user.elts;
-   return get_vertex( draw, elts[i] );
-}
-
-
-void draw_vertex_cache_reset_vertex_ids( struct draw_context *draw )
-{
-   unsigned i;
-
-   for (i = 0; i < draw->vs.post_nr; i++) {
-      struct vertex_header * header =
-         draw_header_from_block(draw->vs.vertex_cache,
-                                MAX_VERTEX_ALLOCATION, i);
-      header->vertex_id = UNDEFINED_VERTEX_ID;
-   }
-}
-
-
-void draw_vertex_cache_unreference( struct draw_context *draw )
-{
-   draw->vcache.referenced = 0;
-   draw->vcache.overflow = 0;
-}
-
-
-int draw_vertex_cache_check_space( struct draw_context *draw,
-				   unsigned nr_verts )
-{
-   if (draw->vcache.overflow + nr_verts < VCACHE_OVERFLOW) {
-      /* The vs queue is sized so that this can never happen:
-       */
-      assert(draw->vs.queue_nr + nr_verts < VS_QUEUE_LENGTH);
-      return TRUE;
-   }
-   else
-      return FALSE;
-}
-
-
-
-/**
- * Tell the drawing context about the index/element buffer to use
- * (ala glDrawElements)
- * If no element buffer is to be used (i.e. glDrawArrays) then this
- * should be called with eltSize=0 and elements=NULL.
- *
- * \param draw  the drawing context
- * \param eltSize  size of each element (1, 2 or 4 bytes)
- * \param elements  the element buffer ptr
- */
-void
-draw_set_mapped_element_buffer( struct draw_context *draw,
-                                unsigned eltSize, void *elements )
-{
-//   draw_statechange( draw );
-
-   /* choose the get_vertex() function to use */
-   switch (eltSize) {
-   case 0:
-      draw->vcache.get_vertex = get_vertex;
-      break;
-   case 1:
-      draw->vcache.get_vertex = get_ubyte_elt_vertex;
-      break;
-   case 2:
-      draw->vcache.get_vertex = get_ushort_elt_vertex;
-      break;
-   case 4:
-      draw->vcache.get_vertex = get_uint_elt_vertex;
-      break;
-   default:
-      assert(0);
-   }
-   draw->user.elts = elements;
-   draw->user.eltSize = eltSize;
-}
-
diff --git a/src/gallium/auxiliary/draw/draw_vertex_fetch.c b/src/gallium/auxiliary/draw/draw_vertex_fetch.c
deleted file mode 100644
index 9041041006..0000000000
--- a/src/gallium/auxiliary/draw/draw_vertex_fetch.c
+++ /dev/null
@@ -1,528 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Keith Whitwell <keith@tungstengraphics.com>
-  */
-
-#include "pipe/p_util.h"
-#include "pipe/p_shader_tokens.h"
-#include "draw_private.h"
-#include "draw_context.h"
-
-
-#define DRAW_DBG 0
-
-
-/**
- * Fetch a float[4] vertex attribute from memory, doing format/type
- * conversion as needed.
- *
- * This is probably needed/dupliocated elsewhere, eg format
- * conversion, texture sampling etc.
- */
-#define FETCH_ATTRIB( NAME, SZ, CVT )			\
-static void						\
-fetch_##NAME(const void *ptr, float *attrib)		\
-{							\
-   static const float defaults[4] = { 0,0,0,1 };	\
-   int i;						\
-							\
-   for (i = 0; i < SZ; i++) {				\
-      attrib[i] = CVT(i);                              \
-   }							\
-							\
-   for (; i < 4; i++) {					\
-      attrib[i] = defaults[i];				\
-   }							\
-}
-
-#define CVT_64_FLOAT(i)   (float) ((double *) ptr)[i]
-#define CVT_32_FLOAT(i)   ((float *) ptr)[i]
-
-#define CVT_8_USCALED(i)  (float) ((unsigned char *) ptr)[i]
-#define CVT_16_USCALED(i) (float) ((unsigned short *) ptr)[i]
-#define CVT_32_USCALED(i) (float) ((unsigned int *) ptr)[i]
-
-#define CVT_8_SSCALED(i)  (float) ((char *) ptr)[i]
-#define CVT_16_SSCALED(i) (float) ((short *) ptr)[i]
-#define CVT_32_SSCALED(i) (float) ((int *) ptr)[i]
-
-#define CVT_8_UNORM(i)    (float) ((unsigned char *) ptr)[i] / 255.0f
-#define CVT_16_UNORM(i)   (float) ((unsigned short *) ptr)[i] / 65535.0f
-#define CVT_32_UNORM(i)   (float) ((unsigned int *) ptr)[i] / 4294967295.0f
-
-#define CVT_8_SNORM(i)    (float) ((char *) ptr)[i] / 127.0f
-#define CVT_16_SNORM(i)   (float) ((short *) ptr)[i] / 32767.0f
-#define CVT_32_SNORM(i)   (float) ((int *) ptr)[i] / 2147483647.0f
-
-FETCH_ATTRIB( R64G64B64A64_FLOAT,   4, CVT_64_FLOAT )
-FETCH_ATTRIB( R64G64B64_FLOAT,      3, CVT_64_FLOAT )
-FETCH_ATTRIB( R64G64_FLOAT,         2, CVT_64_FLOAT )
-FETCH_ATTRIB( R64_FLOAT,            1, CVT_64_FLOAT )
-
-FETCH_ATTRIB( R32G32B32A32_FLOAT,   4, CVT_32_FLOAT )
-FETCH_ATTRIB( R32G32B32_FLOAT,      3, CVT_32_FLOAT )
-FETCH_ATTRIB( R32G32_FLOAT,         2, CVT_32_FLOAT )
-FETCH_ATTRIB( R32_FLOAT,            1, CVT_32_FLOAT )
-
-FETCH_ATTRIB( R32G32B32A32_USCALED, 4, CVT_32_USCALED )
-FETCH_ATTRIB( R32G32B32_USCALED,    3, CVT_32_USCALED )
-FETCH_ATTRIB( R32G32_USCALED,       2, CVT_32_USCALED )
-FETCH_ATTRIB( R32_USCALED,          1, CVT_32_USCALED )
-
-FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
-FETCH_ATTRIB( R32G32B32_SSCALED,    3, CVT_32_SSCALED )
-FETCH_ATTRIB( R32G32_SSCALED,       2, CVT_32_SSCALED )
-FETCH_ATTRIB( R32_SSCALED,          1, CVT_32_SSCALED )
-
-FETCH_ATTRIB( R32G32B32A32_UNORM, 4, CVT_32_UNORM )
-FETCH_ATTRIB( R32G32B32_UNORM,    3, CVT_32_UNORM )
-FETCH_ATTRIB( R32G32_UNORM,       2, CVT_32_UNORM )
-FETCH_ATTRIB( R32_UNORM,          1, CVT_32_UNORM )
-
-FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM )
-FETCH_ATTRIB( R32G32B32_SNORM,    3, CVT_32_SNORM )
-FETCH_ATTRIB( R32G32_SNORM,       2, CVT_32_SNORM )
-FETCH_ATTRIB( R32_SNORM,          1, CVT_32_SNORM )
-
-FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED )
-FETCH_ATTRIB( R16G16B16_USCALED,    3, CVT_16_USCALED )
-FETCH_ATTRIB( R16G16_USCALED,       2, CVT_16_USCALED )
-FETCH_ATTRIB( R16_USCALED,          1, CVT_16_USCALED )
-
-FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED )
-FETCH_ATTRIB( R16G16B16_SSCALED,    3, CVT_16_SSCALED )
-FETCH_ATTRIB( R16G16_SSCALED,       2, CVT_16_SSCALED )
-FETCH_ATTRIB( R16_SSCALED,          1, CVT_16_SSCALED )
-
-FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM )
-FETCH_ATTRIB( R16G16B16_UNORM,    3, CVT_16_UNORM )
-FETCH_ATTRIB( R16G16_UNORM,       2, CVT_16_UNORM )
-FETCH_ATTRIB( R16_UNORM,          1, CVT_16_UNORM )
-
-FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM )
-FETCH_ATTRIB( R16G16B16_SNORM,    3, CVT_16_SNORM )
-FETCH_ATTRIB( R16G16_SNORM,       2, CVT_16_SNORM )
-FETCH_ATTRIB( R16_SNORM,          1, CVT_16_SNORM )
-
-FETCH_ATTRIB( R8G8B8A8_USCALED,   4, CVT_8_USCALED )
-FETCH_ATTRIB( R8G8B8_USCALED,     3, CVT_8_USCALED )
-FETCH_ATTRIB( R8G8_USCALED,       2, CVT_8_USCALED )
-FETCH_ATTRIB( R8_USCALED,         1, CVT_8_USCALED )
-
-FETCH_ATTRIB( R8G8B8A8_SSCALED,  4, CVT_8_SSCALED )
-FETCH_ATTRIB( R8G8B8_SSCALED,    3, CVT_8_SSCALED )
-FETCH_ATTRIB( R8G8_SSCALED,      2, CVT_8_SSCALED )
-FETCH_ATTRIB( R8_SSCALED,        1, CVT_8_SSCALED )
-
-FETCH_ATTRIB( R8G8B8A8_UNORM,  4, CVT_8_UNORM )
-FETCH_ATTRIB( R8G8B8_UNORM,    3, CVT_8_UNORM )
-FETCH_ATTRIB( R8G8_UNORM,      2, CVT_8_UNORM )
-FETCH_ATTRIB( R8_UNORM,        1, CVT_8_UNORM )
-
-FETCH_ATTRIB( R8G8B8A8_SNORM,  4, CVT_8_SNORM )
-FETCH_ATTRIB( R8G8B8_SNORM,    3, CVT_8_SNORM )
-FETCH_ATTRIB( R8G8_SNORM,      2, CVT_8_SNORM )
-FETCH_ATTRIB( R8_SNORM,        1, CVT_8_SNORM )
-
-FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
-//FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
-
-
-
-static void
-fetch_B8G8R8A8_UNORM(const void *ptr, float *attrib)
-{
-   attrib[2] = CVT_8_UNORM(0);
-   attrib[1] = CVT_8_UNORM(1);
-   attrib[0] = CVT_8_UNORM(2);
-   attrib[3] = CVT_8_UNORM(3);
-}
-
-
-fetch_func draw_get_fetch_func( enum pipe_format format )
-{
-#if 0
-   {
-      char tmp[80];
-      pf_sprint_name(tmp, format);
-      debug_printf("%s: %s\n", __FUNCTION__, tmp);
-   }
-#endif
-
-   switch (format) {
-   case PIPE_FORMAT_R64_FLOAT:
-      return fetch_R64_FLOAT;
-   case PIPE_FORMAT_R64G64_FLOAT:
-      return fetch_R64G64_FLOAT;
-   case PIPE_FORMAT_R64G64B64_FLOAT:
-      return fetch_R64G64B64_FLOAT;
-   case PIPE_FORMAT_R64G64B64A64_FLOAT:
-      return fetch_R64G64B64A64_FLOAT;
-
-   case PIPE_FORMAT_R32_FLOAT:
-      return fetch_R32_FLOAT;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      return fetch_R32G32_FLOAT;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      return fetch_R32G32B32_FLOAT;
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      return fetch_R32G32B32A32_FLOAT;
-
-   case PIPE_FORMAT_R32_UNORM:
-      return fetch_R32_UNORM;
-   case PIPE_FORMAT_R32G32_UNORM:
-      return fetch_R32G32_UNORM;
-   case PIPE_FORMAT_R32G32B32_UNORM:
-      return fetch_R32G32B32_UNORM;
-   case PIPE_FORMAT_R32G32B32A32_UNORM:
-      return fetch_R32G32B32A32_UNORM;
-
-   case PIPE_FORMAT_R32_USCALED:
-      return fetch_R32_USCALED;
-   case PIPE_FORMAT_R32G32_USCALED:
-      return fetch_R32G32_USCALED;
-   case PIPE_FORMAT_R32G32B32_USCALED:
-      return fetch_R32G32B32_USCALED;
-   case PIPE_FORMAT_R32G32B32A32_USCALED:
-      return fetch_R32G32B32A32_USCALED;
-
-   case PIPE_FORMAT_R32_SNORM:
-      return fetch_R32_SNORM;
-   case PIPE_FORMAT_R32G32_SNORM:
-      return fetch_R32G32_SNORM;
-   case PIPE_FORMAT_R32G32B32_SNORM:
-      return fetch_R32G32B32_SNORM;
-   case PIPE_FORMAT_R32G32B32A32_SNORM:
-      return fetch_R32G32B32A32_SNORM;
-
-   case PIPE_FORMAT_R32_SSCALED:
-      return fetch_R32_SSCALED;
-   case PIPE_FORMAT_R32G32_SSCALED:
-      return fetch_R32G32_SSCALED;
-   case PIPE_FORMAT_R32G32B32_SSCALED:
-      return fetch_R32G32B32_SSCALED;
-   case PIPE_FORMAT_R32G32B32A32_SSCALED:
-      return fetch_R32G32B32A32_SSCALED;
-
-   case PIPE_FORMAT_R16_UNORM:
-      return fetch_R16_UNORM;
-   case PIPE_FORMAT_R16G16_UNORM:
-      return fetch_R16G16_UNORM;
-   case PIPE_FORMAT_R16G16B16_UNORM:
-      return fetch_R16G16B16_UNORM;
-   case PIPE_FORMAT_R16G16B16A16_UNORM:
-      return fetch_R16G16B16A16_UNORM;
-
-   case PIPE_FORMAT_R16_USCALED:
-      return fetch_R16_USCALED;
-   case PIPE_FORMAT_R16G16_USCALED:
-      return fetch_R16G16_USCALED;
-   case PIPE_FORMAT_R16G16B16_USCALED:
-      return fetch_R16G16B16_USCALED;
-   case PIPE_FORMAT_R16G16B16A16_USCALED:
-      return fetch_R16G16B16A16_USCALED;
-
-   case PIPE_FORMAT_R16_SNORM:
-      return fetch_R16_SNORM;
-   case PIPE_FORMAT_R16G16_SNORM:
-      return fetch_R16G16_SNORM;
-   case PIPE_FORMAT_R16G16B16_SNORM:
-      return fetch_R16G16B16_SNORM;
-   case PIPE_FORMAT_R16G16B16A16_SNORM:
-      return fetch_R16G16B16A16_SNORM;
-
-   case PIPE_FORMAT_R16_SSCALED:
-      return fetch_R16_SSCALED;
-   case PIPE_FORMAT_R16G16_SSCALED:
-      return fetch_R16G16_SSCALED;
-   case PIPE_FORMAT_R16G16B16_SSCALED:
-      return fetch_R16G16B16_SSCALED;
-   case PIPE_FORMAT_R16G16B16A16_SSCALED:
-      return fetch_R16G16B16A16_SSCALED;
-
-   case PIPE_FORMAT_R8_UNORM:
-      return fetch_R8_UNORM;
-   case PIPE_FORMAT_R8G8_UNORM:
-      return fetch_R8G8_UNORM;
-   case PIPE_FORMAT_R8G8B8_UNORM:
-      return fetch_R8G8B8_UNORM;
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      return fetch_R8G8B8A8_UNORM;
-
-   case PIPE_FORMAT_R8_USCALED:
-      return fetch_R8_USCALED;
-   case PIPE_FORMAT_R8G8_USCALED:
-      return fetch_R8G8_USCALED;
-   case PIPE_FORMAT_R8G8B8_USCALED:
-      return fetch_R8G8B8_USCALED;
-   case PIPE_FORMAT_R8G8B8A8_USCALED:
-      return fetch_R8G8B8A8_USCALED;
-
-   case PIPE_FORMAT_R8_SNORM:
-      return fetch_R8_SNORM;
-   case PIPE_FORMAT_R8G8_SNORM:
-      return fetch_R8G8_SNORM;
-   case PIPE_FORMAT_R8G8B8_SNORM:
-      return fetch_R8G8B8_SNORM;
-   case PIPE_FORMAT_R8G8B8A8_SNORM:
-      return fetch_R8G8B8A8_SNORM;
-
-   case PIPE_FORMAT_R8_SSCALED:
-      return fetch_R8_SSCALED;
-   case PIPE_FORMAT_R8G8_SSCALED:
-      return fetch_R8G8_SSCALED;
-   case PIPE_FORMAT_R8G8B8_SSCALED:
-      return fetch_R8G8B8_SSCALED;
-   case PIPE_FORMAT_R8G8B8A8_SSCALED:
-      return fetch_R8G8B8A8_SSCALED;
-
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      return fetch_A8R8G8B8_UNORM;
-
-
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return fetch_B8G8R8A8_UNORM;
-
-   case 0:
-      return NULL;		/* not sure why this is needed */
-
-   default:
-      /* This can get hit because draw-state-validation is too eager,
-         and can jump in here validating stuff before the state tracker has set
-         up everything.
-      */
-      /* assert(0); */
-      return NULL;
-   }
-}
-
-
-static void 
-transpose_4x4( float *out, const float *in )
-{
-   /* This can be achieved in 12 sse instructions, plus the final
-    * stores I guess.  This is probably a bit more than that - maybe
-    * 32 or so?
-    */
-   out[0] = in[0];  out[1] = in[4];  out[2] = in[8];   out[3] = in[12];
-   out[4] = in[1];  out[5] = in[5];  out[6] = in[9];   out[7] = in[13];
-   out[8] = in[2];  out[9] = in[6];  out[10] = in[10]; out[11] = in[14];
-   out[12] = in[3]; out[13] = in[7]; out[14] = in[11]; out[15] = in[15];
-}
-
-
-
-static void fetch_xyz_rgb( struct draw_context *draw,
-			   struct tgsi_exec_machine *machine,
-			   const unsigned *elts,
-			   unsigned count )
-{
-   const unsigned *pitch   = draw->vertex_fetch.pitch;
-   const ubyte **src       = draw->vertex_fetch.src_ptr;
-   int i;
-
-   assert(count <= 4);
-
-//   debug_printf("%s\n", __FUNCTION__);
-
-   /* loop over vertex attributes (vertex shader inputs)
-    */
-
-   for (i = 0; i < 4; i++) {
-      {
-	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
-	 float *out = &machine->Inputs[0].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
-
-      {
-	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
-   }
-}
-
-
-
-
-static void fetch_xyz_rgb_st( struct draw_context *draw,
-			      struct tgsi_exec_machine *machine,
-			      const unsigned *elts,
-			      unsigned count )
-{
-   const unsigned *pitch   = draw->vertex_fetch.pitch;
-   const ubyte **src       = draw->vertex_fetch.src_ptr;
-   int i;
-
-   assert(count <= 4);
-
-   /* loop over vertex attributes (vertex shader inputs)
-    */
-
-   for (i = 0; i < 4; i++) {
-      {
-	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
-	 float *out = &machine->Inputs[0].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
-
-      {
-	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
-	 float *out = &machine->Inputs[1].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = in[2];
- 	 out[12] = 1.0f;
-      }
-
-      {
-	 const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
-	 float *out = &machine->Inputs[2].xyzw[0].f[i];
-	 out[0] = in[0];
-	 out[4] = in[1];
-	 out[8] = 0.0f;
- 	 out[12] = 1.0f;
-      }
-   }
-}
-
-
-
-
-/**
- * Fetch vertex attributes for 'count' vertices.
- */
-static void generic_vertex_fetch( struct draw_context *draw,
-				  struct tgsi_exec_machine *machine,
-				  const unsigned *elts,
-				  unsigned count )
-{
-   unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
-   unsigned attr;
-
-   assert(count <= 4);
-
-//   debug_printf("%s %d\n", __FUNCTION__, count);
-
-   /* loop over vertex attributes (vertex shader inputs)
-    */
-   for (attr = 0; attr < nr_attrs; attr++) {
-
-      const unsigned pitch   = draw->vertex_fetch.pitch[attr];
-      const ubyte *src = draw->vertex_fetch.src_ptr[attr];
-      const fetch_func fetch = draw->vertex_fetch.fetch[attr];
-      unsigned i;
-      float p[4][4];
-
-
-      /* Fetch four attributes for four vertices.  
-       * 
-       * Could fetch directly into AOS format, but this is meant to be
-       * a prototype for an sse implementation, which would have
-       * difficulties doing that.
-       */
-      for (i = 0; i < count; i++) 
-	 fetch( src + elts[i] * pitch, p[i] );
-
-      /* Be nice and zero out any missing vertices: 
-       */
-      for ( ; i < 4; i++) 
-	 p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
-      
-      /* Transpose/swizzle into sse-friendly format.  Currently
-       * assuming that all vertex shader inputs are float[4], but this
-       * isn't true -- if the vertex shader only wants tex0.xy, we
-       * could optimize for that.
-       *
-       * To do so fully without codegen would probably require an
-       * excessive number of fetch functions, but we could at least
-       * minimize the transpose step:
-       */
-      transpose_4x4( (float *)&machine->Inputs[attr].xyzw[0].f[0], (float *)p );
-   }
-}
-
-
-			       
-void draw_update_vertex_fetch( struct draw_context *draw )
-{
-   unsigned nr_attrs, i;
-
-//   debug_printf("%s\n", __FUNCTION__);
-   
-   /* this may happend during context init */
-   if (!draw->vertex_shader)
-      return;
-
-   nr_attrs = draw->vertex_shader->info.num_inputs;
-
-   for (i = 0; i < nr_attrs; i++) {
-      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
-      enum pipe_format format  = draw->vertex_element[i].src_format;
-
-      draw->vertex_fetch.src_ptr[i] = (const ubyte *) draw->user.vbuffer[buf] + 
-						       draw->vertex_buffer[buf].buffer_offset + 
-						       draw->vertex_element[i].src_offset;
-
-      draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch;
-      draw->vertex_fetch.fetch[i] = draw_get_fetch_func( format );
-   }
-
-   draw->vertex_fetch.nr_attrs = nr_attrs;
-
-   draw->vertex_fetch.fetch_func = generic_vertex_fetch;
-
-   switch (nr_attrs) {
-   case 2:
-      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
-	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT)
-	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
-      break;
-   case 3:
-      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
-	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
-	  draw->vertex_element[2].src_format == PIPE_FORMAT_R32G32_FLOAT)
-	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
-      break;
-   default:
-      break;
-   }
-
-}
diff --git a/src/gallium/auxiliary/draw/draw_vf.c b/src/gallium/auxiliary/draw/draw_vf.c
deleted file mode 100644
index 9d0154c50d..0000000000
--- a/src/gallium/auxiliary/draw/draw_vf.c
+++ /dev/null
@@ -1,378 +0,0 @@
-/*
- * Copyright 2003 Tungsten Graphics, inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Keith Whitwell <keithw@tungstengraphics.com>
- */
-
-
-#include <stddef.h>
-
-#include "pipe/p_compiler.h"
-#include "pipe/p_util.h"
-#include "rtasm/rtasm_execmem.h"
-
-#include "draw_vf.h"
-
-
-#define DRAW_VF_DBG 0
-
-
-static boolean match_fastpath( struct draw_vertex_fetch *vf,
-				 const struct draw_vf_fastpath *fp)
-{
-   unsigned j;
-
-   if (vf->attr_count != fp->attr_count) 
-      return FALSE;
-
-   for (j = 0; j < vf->attr_count; j++) 
-      if (vf->attr[j].format != fp->attr[j].format ||
-	  vf->attr[j].inputsize != fp->attr[j].size ||
-	  vf->attr[j].vertoffset != fp->attr[j].offset) 
-	 return FALSE;
-      
-   if (fp->match_strides) {
-      if (vf->vertex_stride != fp->vertex_stride)
-	 return FALSE;
-
-      for (j = 0; j < vf->attr_count; j++) 
-	 if (vf->attr[j].inputstride != fp->attr[j].stride) 
-	    return FALSE;
-   }
-   
-   return TRUE;
-}
-
-static boolean search_fastpath_emit( struct draw_vertex_fetch *vf )
-{
-   struct draw_vf_fastpath *fp = vf->fastpath;
-
-   for ( ; fp ; fp = fp->next) {
-      if (match_fastpath(vf, fp)) {
-         vf->emit = fp->func;
-	 return TRUE;
-      }
-   }
-
-   return FALSE;
-}
-
-void draw_vf_register_fastpath( struct draw_vertex_fetch *vf,
-			     boolean match_strides )
-{
-   struct draw_vf_fastpath *fastpath = CALLOC_STRUCT(draw_vf_fastpath);
-   unsigned i;
-
-   fastpath->vertex_stride = vf->vertex_stride;
-   fastpath->attr_count = vf->attr_count;
-   fastpath->match_strides = match_strides;
-   fastpath->func = vf->emit;
-   fastpath->attr = (struct draw_vf_attr_type *)
-      MALLOC(vf->attr_count * sizeof(fastpath->attr[0]));
-
-   for (i = 0; i < vf->attr_count; i++) {
-      fastpath->attr[i].format = vf->attr[i].format;
-      fastpath->attr[i].stride = vf->attr[i].inputstride;
-      fastpath->attr[i].size = vf->attr[i].inputsize;
-      fastpath->attr[i].offset = vf->attr[i].vertoffset;
-   }
-
-   fastpath->next = vf->fastpath;
-   vf->fastpath = fastpath;
-}
-
-
-
-
-/***********************************************************************
- * Build codegen functions or return generic ones:
- */
-static void choose_emit_func( struct draw_vertex_fetch *vf, 
-			      unsigned count, 
-			      uint8_t *dest)
-{
-   vf->emit = NULL;
-   
-   /* Does this match an existing (hardwired, codegen or known-bad)
-    * fastpath?
-    */
-   if (search_fastpath_emit(vf)) {
-      /* Use this result.  If it is null, then it is already known
-       * that the current state will fail for codegen and there is no
-       * point trying again.
-       */
-   }
-   else if (vf->codegen_emit) {
-      vf->codegen_emit( vf );
-   }
-
-   if (!vf->emit) {
-      draw_vf_generate_hardwired_emit(vf);
-   }
-
-   /* Otherwise use the generic version:
-    */
-   if (!vf->emit)
-      vf->emit = draw_vf_generic_emit;
-
-   vf->emit( vf, count, dest );
-}
-
-
-
-
-
-/***********************************************************************
- * Public entrypoints, mostly dispatch to the above:
- */
-
-
-
-static unsigned 
-draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf, 
-                               const struct draw_vf_attr_map *map,
-                               unsigned nr, 
-                               unsigned vertex_stride )
-{
-   unsigned offset = 0;
-   unsigned i, j;
-
-   assert(nr < PIPE_MAX_ATTRIBS);
-
-   for (j = 0, i = 0; i < nr; i++) {
-      const unsigned format = map[i].format;
-      if (format == DRAW_EMIT_PAD) {
-#if (DRAW_VF_DBG)
-	    debug_printf("%d: pad %d, offset %d\n", i,  
-			 map[i].offset, offset);  
-#endif
-
-	 offset += map[i].offset;
-
-      }
-      else {
-	 vf->attr[j].attrib = map[i].attrib;
-	 vf->attr[j].format = format;
-	 vf->attr[j].insert = draw_vf_format_info[format].insert;
-	 vf->attr[j].vertattrsize = draw_vf_format_info[format].attrsize;
-	 vf->attr[j].vertoffset = offset;
-	 vf->attr[j].isconst = draw_vf_format_info[format].isconst;
-	 if(vf->attr[j].isconst)
-	    memcpy(vf->attr[j].data, &map[i].data, vf->attr[j].vertattrsize);
-	 
-#if (DRAW_VF_DBG)
-	    debug_printf("%d: %s, offset %d\n", i,  
-			 draw_vf_format_info[format].name,
-			 vf->attr[j].vertoffset);   
-#endif
-
-	 offset += draw_vf_format_info[format].attrsize;
-	 j++;
-      }
-   }
-
-   vf->attr_count = j;
-   vf->vertex_stride = vertex_stride ? vertex_stride : offset;
-   vf->emit = choose_emit_func;
-
-   assert(vf->vertex_stride >= offset);
-   return vf->vertex_stride;
-}
-
-
-void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf, 
-                              const struct vertex_info *vinfo,
-                              float point_size )
-{
-   unsigned i, j;
-   struct draw_vf_attr *a = vf->attr;
-   struct draw_vf_attr_map attrs[PIPE_MAX_SHADER_INPUTS];
-   unsigned count = 0;  /* for debug/sanity */
-   unsigned nr_attrs = 0;
-   
-   for (i = 0; i < vinfo->num_attribs; i++) {
-      j = vinfo->src_index[i];
-      switch (vinfo->emit[i]) {
-      case EMIT_OMIT:
-         /* no-op */
-         break;
-      case EMIT_1F:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_1F;
-	 attrs[nr_attrs].offset = 0;
-	 nr_attrs++;
-         count++;
-         break;
-      case EMIT_1F_PSIZE:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_1F_CONST;
-	 attrs[nr_attrs].offset = 0;
-	 attrs[nr_attrs].data.f[0] = point_size;
-	 nr_attrs++;
-         count++;
-         break;
-      case EMIT_2F:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_2F;
-	 attrs[nr_attrs].offset = 0;
-	 nr_attrs++;
-         count += 2;
-         break;
-      case EMIT_3F:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_3F;
-	 attrs[nr_attrs].offset = 0;
-	 nr_attrs++;
-         count += 3;
-         break;
-      case EMIT_4F:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_4F;
-	 attrs[nr_attrs].offset = 0;
-	 nr_attrs++;
-         count += 4;
-         break;
-      case EMIT_4UB:
-	 attrs[nr_attrs].attrib = j;
-	 attrs[nr_attrs].format = DRAW_EMIT_4UB_4F_BGRA;
-	 attrs[nr_attrs].offset = 0;
-	 nr_attrs++;
-         count += 1;
-         break;
-      default:
-         assert(0);
-      }
-   }
-   
-   assert(count == vinfo->size);  
-   
-   draw_vf_set_vertex_attributes(vf, 
-                                 attrs, 
-                                 nr_attrs, 
-                                 vinfo->size * sizeof(float) );
-
-   for (j = 0; j < vf->attr_count; j++) {
-      a[j].inputsize = 4;
-      a[j].do_insert = a[j].insert[4 - 1];
-      if(a[j].isconst) {
-	 a[j].inputptr = a[j].data;
-	 a[j].inputstride = 0;
-      }
-   }
-}
-
-
-#if 0
-/* Set attribute pointers, adjusted for start position:
- */
-void draw_vf_set_sources( struct draw_vertex_fetch *vf,
-		     GLvector4f * const sources[],
-		     unsigned start )
-{
-   struct draw_vf_attr *a = vf->attr;
-   unsigned j;
-   
-   for (j = 0; j < vf->attr_count; j++) {
-      const GLvector4f *vptr = sources[a[j].attrib];
-      
-      if ((a[j].inputstride != vptr->stride) ||
-	  (a[j].inputsize != vptr->size))
-	 vf->emit = choose_emit_func;
-      
-      a[j].inputstride = vptr->stride;
-      a[j].inputsize = vptr->size;
-      a[j].do_insert = a[j].insert[vptr->size - 1]; 
-      a[j].inputptr = ((uint8_t *)vptr->data) + start * vptr->stride;
-   }
-}
-#endif
-
-
-/**
- * Emit a vertex to dest.  
- */
-void draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
-                          struct vertex_header *vertex,
-                          void *dest )
-{
-   struct draw_vf_attr *a = vf->attr;
-   unsigned j;
-   
-   for (j = 0; j < vf->attr_count; j++) {
-      if (!a[j].isconst) {
-	 a[j].inputptr = (uint8_t *)&vertex->data[a[j].attrib][0];
-	 a[j].inputstride = 0; /* XXX: one-vertex-max ATM */
-      }
-   }
-   
-   vf->emit( vf, 1, (uint8_t*) dest );
-}
-
-
-
-struct draw_vertex_fetch *draw_vf_create( void )
-{
-   struct draw_vertex_fetch *vf = CALLOC_STRUCT(draw_vertex_fetch);
-   unsigned i;
-
-   for (i = 0; i < PIPE_MAX_ATTRIBS; i++)
-      vf->attr[i].vf = vf;
-
-   vf->identity[0] = 0.0;
-   vf->identity[1] = 0.0;
-   vf->identity[2] = 0.0;
-   vf->identity[3] = 1.0;
-
-   vf->codegen_emit = NULL;
-
-#ifdef USE_SSE_ASM
-   if (!GETENV("GALLIUM_NO_CODEGEN"))
-      vf->codegen_emit = draw_vf_generate_sse_emit;
-#endif
-
-   return vf;
-}
-
-
-void draw_vf_destroy( struct draw_vertex_fetch *vf )
-{
-   struct draw_vf_fastpath *fp, *tmp;
-
-   for (fp = vf->fastpath ; fp ; fp = tmp) {
-      tmp = fp->next;
-      FREE(fp->attr);
-
-      /* KW: At the moment, fp->func is constrained to be allocated by
-       * rtasm_exec_alloc(), as the hardwired fastpaths in
-       * t_vertex_generic.c are handled specially.  It would be nice
-       * to unify them, but this probably won't change until this
-       * module gets another overhaul.
-       */
-      //rtasm_exec_free((void *) fp->func);
-      FREE(fp);
-   }
-   
-   vf->fastpath = NULL;
-   FREE(vf);
-}
diff --git a/src/gallium/auxiliary/draw/draw_vf.h b/src/gallium/auxiliary/draw/draw_vf.h
deleted file mode 100644
index 0ef98d6257..0000000000
--- a/src/gallium/auxiliary/draw/draw_vf.h
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright 2008 Tungsten Graphics, inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-/**
- * Vertex fetch/store/convert code.  This functionality is used in two places:
- * 1. Vertex fetch/convert - to grab vertex data from incoming vertex
- *    arrays and convert to format needed by vertex shaders.
- * 2. Vertex store/emit - to convert simple float[][4] vertex attributes
- *    (which is the organization used throughout the draw/prim pipeline) to
- *    hardware-specific formats and emit into hardware vertex buffers.
- *
- *
- * Authors:
- *    Keith Whitwell <keithw@tungstengraphics.com>
- */
-
-#ifndef DRAW_VF_H
-#define DRAW_VF_H
-
-
-#include "pipe/p_compiler.h"
-#include "pipe/p_state.h"
-
-#include "draw_vertex.h"
-#include "draw_private.h" /* for vertex_header */
-
-
-enum draw_vf_attr_format {
-   DRAW_EMIT_1F,
-   DRAW_EMIT_2F,
-   DRAW_EMIT_3F,
-   DRAW_EMIT_4F,
-   DRAW_EMIT_3F_XYW,			/**< for projective texture */
-   DRAW_EMIT_1UB_1F,			/**< for fog coordinate */
-   DRAW_EMIT_3UB_3F_RGB,		/**< for specular color */
-   DRAW_EMIT_3UB_3F_BGR,		/**< for specular color */
-   DRAW_EMIT_4UB_4F_RGBA,		/**< for color */
-   DRAW_EMIT_4UB_4F_BGRA,		/**< for color */
-   DRAW_EMIT_4UB_4F_ARGB,		/**< for color */
-   DRAW_EMIT_4UB_4F_ABGR,		/**< for color */
-   DRAW_EMIT_1F_CONST,
-   DRAW_EMIT_2F_CONST,
-   DRAW_EMIT_3F_CONST,
-   DRAW_EMIT_4F_CONST,
-   DRAW_EMIT_PAD,			/**< leave a hole of 'offset' bytes */
-   DRAW_EMIT_MAX
-};
-
-struct draw_vf_attr_map 
-{
-   /** Input attribute number */
-   unsigned attrib;
-   
-   enum draw_vf_attr_format format;
-   
-   unsigned offset;
-   
-   /** 
-    * Constant data for DRAW_EMIT_*_CONST 
-    */
-   union {
-      uint8_t ub[4];
-      float f[4];
-   } data;
-};
-
-struct draw_vertex_fetch;
-
-
-
-#if 0
-unsigned 
-draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
-                               const struct draw_vf_attr_map *map,
-                               unsigned nr, 
-                               unsigned vertex_stride );
-#endif
-
-void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf, 
-                              const struct vertex_info *vinfo,
-                              float point_size );
-
-#if 0
-void 
-draw_vf_set_sources( struct draw_vertex_fetch *vf,
-		     GLvector4f * const attrib[],
-		     unsigned start );
-#endif
-
-void 
-draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
-                     struct vertex_header *vertex,
-                     void *dest );
-
-struct draw_vertex_fetch *
-draw_vf_create( void );
-
-void 
-draw_vf_destroy( struct draw_vertex_fetch *vf );
-
-
-
-/***********************************************************************
- * Internal functions and structs:
- */
-
-struct draw_vf_attr;
-
-
-typedef void (*draw_vf_insert_func)( const struct draw_vf_attr *a, 
-				     uint8_t *v, 
-				     const float *in );
-
-typedef void (*draw_vf_emit_func)( struct draw_vertex_fetch *vf,
-      				   unsigned count, 
-      				   uint8_t *dest );
-
-
-
-/**
- * Describes how to convert/move a vertex attribute from a vertex
- * array to a vertex structure.
- */
-struct draw_vf_attr
-{
-   struct draw_vertex_fetch *vf;
-
-   unsigned format;
-   unsigned inputsize;
-   unsigned inputstride;
-   unsigned vertoffset;      /**< position of the attrib in the vertex struct */
-   
-   boolean isconst;              /**< read from const data below */
-   uint8_t data[16];
-
-   unsigned attrib;          /**< which vertex attrib (0=position, etc) */
-   unsigned vertattrsize;    /**< size of the attribute in bytes */
-
-   uint8_t *inputptr;
-   const draw_vf_insert_func *insert;
-   draw_vf_insert_func do_insert;
-};
-
-struct draw_vertex_fetch
-{
-   struct draw_vf_attr attr[PIPE_MAX_ATTRIBS];
-   unsigned attr_count;
-   unsigned vertex_stride;
-
-   draw_vf_emit_func emit;
-
-   /* Parameters and constants for codegen:
-    */
-   float identity[4];
-
-   struct draw_vf_fastpath *fastpath;
-   
-   void (*codegen_emit)( struct draw_vertex_fetch *vf );
-};
-
-
-struct draw_vf_attr_type {
-   unsigned format;
-   unsigned size;
-   unsigned stride;
-   unsigned offset;
-};
-
-/** XXX this could be moved into draw_vf.c */
-struct draw_vf_fastpath {
-   unsigned vertex_stride;
-   unsigned attr_count;
-   boolean match_strides;
-
-   struct draw_vf_attr_type *attr;
-
-   draw_vf_emit_func func;
-   struct draw_vf_fastpath *next;
-};
-
-
-void 
-draw_vf_register_fastpath( struct draw_vertex_fetch *vtx,
-                           boolean match_strides );
-
-void 
-draw_vf_generic_emit( struct draw_vertex_fetch *vf,
-                      unsigned count,
-                      uint8_t *v );
-
-void 
-draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf );
-
-void 
-draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf );
-
-
-/** XXX this type and function could probably be moved into draw_vf.c */
-struct draw_vf_format_info {
-   const char *name;
-   draw_vf_insert_func insert[4];
-   const unsigned attrsize;
-   const boolean isconst;
-};
-
-extern const struct draw_vf_format_info 
-draw_vf_format_info[DRAW_EMIT_MAX];
-
-
-#endif
diff --git a/src/gallium/auxiliary/draw/draw_vf_generic.c b/src/gallium/auxiliary/draw/draw_vf_generic.c
deleted file mode 100644
index 7a60a9db9c..0000000000
--- a/src/gallium/auxiliary/draw/draw_vf_generic.c
+++ /dev/null
@@ -1,585 +0,0 @@
-
-/*
- * Copyright 2003 Tungsten Graphics, inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Keith Whitwell <keithw@tungstengraphics.com>
- */
-
-
-#include "pipe/p_compiler.h"
-#include "pipe/p_debug.h"
-#include "pipe/p_util.h"
-
-#include "draw_vf.h"
-
-
-
-static INLINE void insert_4f_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
-{
-   float *out = (float *)(v);
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = in[2];
-   out[3] = in[3];
-}
-
-static INLINE void insert_4f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
-{
-   float *out = (float *)(v);
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = in[2];
-   out[3] = 1;
-}
-
-static INLINE void insert_4f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
-{
-   float *out = (float *)(v);
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = 0;
-   out[3] = 1;
-}
-
-static INLINE void insert_4f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
-{
-   float *out = (float *)(v);
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = 0;
-   out[2] = 0;
-   out[3] = 1;
-}
-
-static INLINE void insert_3f_xyw_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
-{
-   float *out = (float *)(v);
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = in[3];
-}
-
-static INLINE void insert_3f_xyw_err( const struct draw_vf_attr *a, uint8_t *v, const float *in )
-{
-   (void) a; (void) v; (void) in;
-   assert(0);
-}
-
-static INLINE void insert_3f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
-{
-   float *out = (float *)(v);
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = in[2];
-}
-
-static INLINE void insert_3f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
-{
-   float *out = (float *)(v);
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-   out[2] = 0;
-}
-
-static INLINE void insert_3f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
-{
-   float *out = (float *)(v);
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = 0;
-   out[2] = 0;
-}
-
-
-static INLINE void insert_2f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
-{
-   float *out = (float *)(v);
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = in[1];
-}
-
-static INLINE void insert_2f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
-{
-   float *out = (float *)(v);
-   (void) a;
-   
-   out[0] = in[0];
-   out[1] = 0;
-}
-
-static INLINE void insert_1f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
-{
-   float *out = (float *)(v);
-   (void) a;
-
-   out[0] = in[0];
-}
-
-static INLINE void insert_null( const struct draw_vf_attr *a, uint8_t *v, const float *in )
-{
-   (void) a; (void) v; (void) in;
-}
-
-static INLINE void insert_4ub_4f_rgba_4( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
-}
-
-static INLINE void insert_4ub_4f_rgba_3( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
-   v[3] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_rgba_2( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   v[2] = 0;
-   v[3] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_rgba_1( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-   v[1] = 0;
-   v[2] = 0;
-   v[3] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_bgra_4( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
-}
-
-static INLINE void insert_4ub_4f_bgra_3( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
-   v[3] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_bgra_2( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   v[0] = 0;
-   v[3] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_bgra_1( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
-   v[1] = 0;
-   v[0] = 0;
-   v[3] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_argb_4( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
-}
-
-static INLINE void insert_4ub_4f_argb_3( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
-   v[0] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_argb_2( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
-   v[3] = 0x00;
-   v[0] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_argb_1( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
-   v[2] = 0x00;
-   v[3] = 0x00;
-   v[0] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_abgr_4( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
-}
-
-static INLINE void insert_4ub_4f_abgr_3( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
-   v[0] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_abgr_2( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
-   v[1] = 0x00;
-   v[0] = 0xff;
-}
-
-static INLINE void insert_4ub_4f_abgr_1( const struct draw_vf_attr *a, uint8_t *v, 
-					 const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
-   v[2] = 0x00;
-   v[1] = 0x00;
-   v[0] = 0xff;
-}
-
-static INLINE void insert_3ub_3f_rgb_3( const struct draw_vf_attr *a, uint8_t *v, 
-					const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
-}
-
-static INLINE void insert_3ub_3f_rgb_2( const struct draw_vf_attr *a, uint8_t *v, 
-					const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   v[2] = 0;
-}
-
-static INLINE void insert_3ub_3f_rgb_1( const struct draw_vf_attr *a, uint8_t *v, 
-					const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-   v[1] = 0;
-   v[2] = 0;
-}
-
-static INLINE void insert_3ub_3f_bgr_3( const struct draw_vf_attr *a, uint8_t *v, 
-					const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
-}
-
-static INLINE void insert_3ub_3f_bgr_2( const struct draw_vf_attr *a, uint8_t *v, 
-					const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
-   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
-   v[0] = 0;
-}
-
-static INLINE void insert_3ub_3f_bgr_1( const struct draw_vf_attr *a, uint8_t *v, 
-					const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
-   v[1] = 0;
-   v[0] = 0;
-}
-
-
-static INLINE void insert_1ub_1f_1( const struct draw_vf_attr *a, uint8_t *v, 
-				    const float *in )
-{
-   (void) a;
-   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
-}
-
-
-const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX] = 
-{
-   { "1f",
-     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
-     sizeof(float), FALSE },
-
-   { "2f",
-     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
-     2 * sizeof(float), FALSE },
-
-   { "3f",
-     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
-     3 * sizeof(float), FALSE },
-
-   { "4f",
-     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
-     4 * sizeof(float), FALSE },
-
-   { "3f_xyw",
-     { insert_3f_xyw_err, insert_3f_xyw_err, insert_3f_xyw_err, 
-       insert_3f_xyw_4 },
-     3 * sizeof(float), FALSE },
-
-   { "1ub_1f",
-     { insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1 },
-     sizeof(uint8_t), FALSE },
-
-   { "3ub_3f_rgb",
-     { insert_3ub_3f_rgb_1, insert_3ub_3f_rgb_2, insert_3ub_3f_rgb_3,
-       insert_3ub_3f_rgb_3 },
-     3 * sizeof(uint8_t), FALSE },
-
-   { "3ub_3f_bgr",
-     { insert_3ub_3f_bgr_1, insert_3ub_3f_bgr_2, insert_3ub_3f_bgr_3,
-       insert_3ub_3f_bgr_3 },
-     3 * sizeof(uint8_t), FALSE },
-
-   { "4ub_4f_rgba",
-     { insert_4ub_4f_rgba_1, insert_4ub_4f_rgba_2, insert_4ub_4f_rgba_3, 
-       insert_4ub_4f_rgba_4 },
-     4 * sizeof(uint8_t), FALSE },
-
-   { "4ub_4f_bgra",
-     { insert_4ub_4f_bgra_1, insert_4ub_4f_bgra_2, insert_4ub_4f_bgra_3,
-       insert_4ub_4f_bgra_4 },
-     4 * sizeof(uint8_t), FALSE },
-
-   { "4ub_4f_argb",
-     { insert_4ub_4f_argb_1, insert_4ub_4f_argb_2, insert_4ub_4f_argb_3,
-       insert_4ub_4f_argb_4 },
-     4 * sizeof(uint8_t), FALSE },
-
-   { "4ub_4f_abgr",
-     { insert_4ub_4f_abgr_1, insert_4ub_4f_abgr_2, insert_4ub_4f_abgr_3,
-       insert_4ub_4f_abgr_4 },
-     4 * sizeof(uint8_t), FALSE },
-
-   { "1f_const",
-     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
-     sizeof(float), TRUE },
-   
-   { "2f_const",
-     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
-     2 * sizeof(float), TRUE },
-   
-   { "3f_const",
-     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
-     3 * sizeof(float), TRUE },
-   
-   { "4f_const",
-     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
-     4 * sizeof(float), TRUE },
-
-   { "pad",
-     { NULL, NULL, NULL, NULL },
-     0, FALSE },
-
-};
-
-
-
-    
-/***********************************************************************
- * Hardwired fastpaths for emitting whole vertices or groups of
- * vertices
- */
-#define EMIT5(NR, F0, F1, F2, F3, F4, NAME)				\
-static void NAME( struct draw_vertex_fetch *vf,				\
-		  unsigned count,						\
-		  uint8_t *v )						\
-{									\
-   struct draw_vf_attr *a = vf->attr;				\
-   unsigned i;								\
-									\
-   for (i = 0 ; i < count ; i++, v += vf->vertex_stride) {		\
-      if (NR > 0) {							\
-	 F0( &a[0], v + a[0].vertoffset, (float *)a[0].inputptr );	\
-	 a[0].inputptr += a[0].inputstride;				\
-      }									\
-      									\
-      if (NR > 1) {							\
-	 F1( &a[1], v + a[1].vertoffset, (float *)a[1].inputptr );	\
-	 a[1].inputptr += a[1].inputstride;				\
-      }									\
-      									\
-      if (NR > 2) {							\
-	 F2( &a[2], v + a[2].vertoffset, (float *)a[2].inputptr );	\
-	 a[2].inputptr += a[2].inputstride;				\
-      }									\
-      									\
-      if (NR > 3) {							\
-	 F3( &a[3], v + a[3].vertoffset, (float *)a[3].inputptr );	\
-	 a[3].inputptr += a[3].inputstride;				\
-      }									\
-									\
-      if (NR > 4) {							\
-	 F4( &a[4], v + a[4].vertoffset, (float *)a[4].inputptr );	\
-	 a[4].inputptr += a[4].inputstride;				\
-      }									\
-   }									\
-}
-
-   
-#define EMIT2(F0, F1, NAME) EMIT5(2, F0, F1, insert_null, \
-				  insert_null, insert_null, NAME)
-
-#define EMIT3(F0, F1, F2, NAME) EMIT5(3, F0, F1, F2, insert_null, \
-				      insert_null, NAME)
-   
-#define EMIT4(F0, F1, F2, F3, NAME) EMIT5(4, F0, F1, F2, F3, \
-				          insert_null, NAME)
-   
-
-EMIT2(insert_3f_3, insert_4ub_4f_rgba_4, emit_xyz3_rgba4)
-
-EMIT3(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_xyzw4_rgba4_st2)
-
-EMIT4(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_xyzw4_rgba4_st2_st2)
-
-
-/* Use the codegen paths to select one of a number of hardwired
- * fastpaths.
- */
-void draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf )
-{
-   draw_vf_emit_func func = NULL;
-
-   /* Does it fit a hardwired fastpath?  Help! this is growing out of
-    * control!
-    */
-   switch (vf->attr_count) {
-   case 2:
-      if (vf->attr[0].do_insert == insert_3f_3 &&
-	  vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
- 	 func = emit_xyz3_rgba4; 
-      }
-      break;
-   case 3:
-      if (vf->attr[2].do_insert == insert_2f_2) {
-	 if (vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
-	    if (vf->attr[0].do_insert == insert_4f_4) 
-	       func = emit_xyzw4_rgba4_st2;
-	 }
-      }
-      break;
-   case 4:
-      if (vf->attr[2].do_insert == insert_2f_2 &&
-	  vf->attr[3].do_insert == insert_2f_2) {
-	 if (vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
-	    if (vf->attr[0].do_insert == insert_4f_4) 
-	       func = emit_xyzw4_rgba4_st2_st2;
-	 }
-      }
-      break;
-   }
-
-   vf->emit = func;
-}
-
-/***********************************************************************
- * Generic (non-codegen) functions for whole vertices or groups of
- * vertices
- */
-
-void draw_vf_generic_emit( struct draw_vertex_fetch *vf,
-		      unsigned count,
-		      uint8_t *v )
-{
-   struct draw_vf_attr *a = vf->attr;
-   const unsigned attr_count = vf->attr_count;
-   const unsigned stride = vf->vertex_stride;
-   unsigned i, j;
-
-   for (i = 0 ; i < count ; i++, v += stride) {
-      for (j = 0; j < attr_count; j++) {
-	 float *in = (float *)a[j].inputptr;
-	 a[j].inputptr += a[j].inputstride;
-	 a[j].do_insert( &a[j], v + a[j].vertoffset, in );
-      }
-   }
-}
-
-
diff --git a/src/gallium/auxiliary/draw/draw_vf_sse.c b/src/gallium/auxiliary/draw/draw_vf_sse.c
deleted file mode 100644
index aff4ffd985..0000000000
--- a/src/gallium/auxiliary/draw/draw_vf_sse.c
+++ /dev/null
@@ -1,613 +0,0 @@
-/*
- * Copyright 2003 Tungsten Graphics, inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Keith Whitwell <keithw@tungstengraphics.com>
- */
-
-
-#include "pipe/p_compiler.h"
-#include "util/u_simple_list.h"
-
-#include "draw_vf.h"
-
-
-#if defined(USE_SSE_ASM)
-
-#include "rtasm/rtasm_cpu.h"
-#include "rtasm/rtasm_x86sse.h"
-
-
-#define X    0
-#define Y    1
-#define Z    2
-#define W    3
-
-
-struct x86_program {
-   struct x86_function func;
-
-   struct draw_vertex_fetch *vf;
-   boolean inputs_safe;
-   boolean outputs_safe;
-   boolean have_sse2;
-   
-   struct x86_reg identity;
-   struct x86_reg chan0;
-};
-
-
-static struct x86_reg get_identity( struct x86_program *p )
-{
-   return p->identity;
-}
-
-static void emit_load4f_4( struct x86_program *p, 			   
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   sse_movups(&p->func, dest, arg0);
-}
-
-static void emit_load4f_3( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   /* Have to jump through some hoops:
-    *
-    * c 0 0 0
-    * c 0 0 1
-    * 0 0 c 1
-    * a b c 1
-    */
-   sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
-   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
-   sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
-   sse_movlps(&p->func, dest, arg0);
-}
-
-static void emit_load4f_2( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   /* Initialize from identity, then pull in low two words:
-    */
-   sse_movups(&p->func, dest, get_identity(p));
-   sse_movlps(&p->func, dest, arg0);
-}
-
-static void emit_load4f_1( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   /* Pull in low word, then swizzle in identity */
-   sse_movss(&p->func, dest, arg0);
-   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
-}
-
-
-
-static void emit_load3f_3( struct x86_program *p, 			   
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   /* Over-reads by 1 dword - potential SEGV if input is a vertex
-    * array.
-    */
-   if (p->inputs_safe) {
-      sse_movups(&p->func, dest, arg0);
-   } 
-   else {
-      /* c 0 0 0
-       * c c c c
-       * a b c c 
-       */
-      sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
-      sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
-      sse_movlps(&p->func, dest, arg0);
-   }
-}
-
-static void emit_load3f_2( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   emit_load4f_2(p, dest, arg0);
-}
-
-static void emit_load3f_1( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   emit_load4f_1(p, dest, arg0);
-}
-
-static void emit_load2f_2( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   sse_movlps(&p->func, dest, arg0);
-}
-
-static void emit_load2f_1( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   emit_load4f_1(p, dest, arg0);
-}
-
-static void emit_load1f_1( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   sse_movss(&p->func, dest, arg0);
-}
-
-static void (*load[4][4])( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 ) = {
-   { emit_load1f_1, 
-     emit_load1f_1, 
-     emit_load1f_1, 
-     emit_load1f_1 },
-
-   { emit_load2f_1, 
-     emit_load2f_2, 
-     emit_load2f_2, 
-     emit_load2f_2 },
-
-   { emit_load3f_1, 
-     emit_load3f_2, 
-     emit_load3f_3, 
-     emit_load3f_3 },
-
-   { emit_load4f_1, 
-     emit_load4f_2, 
-     emit_load4f_3, 
-     emit_load4f_4 } 
-};
-
-static void emit_load( struct x86_program *p,
-		       struct x86_reg dest,
-		       unsigned sz,
-		       struct x86_reg src,
-		       unsigned src_sz)
-{
-   load[sz-1][src_sz-1](p, dest, src);
-}
-
-static void emit_store4f( struct x86_program *p, 			   
-			  struct x86_reg dest,
-			  struct x86_reg arg0 )
-{
-   sse_movups(&p->func, dest, arg0);
-}
-
-static void emit_store3f( struct x86_program *p, 
-			  struct x86_reg dest,
-			  struct x86_reg arg0 )
-{
-   if (p->outputs_safe) {
-      /* Emit the extra dword anyway.  This may hurt writecombining,
-       * may cause other problems.
-       */
-      sse_movups(&p->func, dest, arg0);
-   }
-   else {
-      /* Alternate strategy - emit two, shuffle, emit one.
-       */
-      sse_movlps(&p->func, dest, arg0);
-      sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
-      sse_movss(&p->func, x86_make_disp(dest,8), arg0);
-   }
-}
-
-static void emit_store2f( struct x86_program *p, 
-			   struct x86_reg dest,
-			   struct x86_reg arg0 )
-{
-   sse_movlps(&p->func, dest, arg0);
-}
-
-static void emit_store1f( struct x86_program *p, 
-			  struct x86_reg dest,
-			  struct x86_reg arg0 )
-{
-   sse_movss(&p->func, dest, arg0);
-}
-
-
-static void (*store[4])( struct x86_program *p, 
-			 struct x86_reg dest,
-			 struct x86_reg arg0 ) = 
-{
-   emit_store1f, 
-   emit_store2f, 
-   emit_store3f, 
-   emit_store4f 
-};
-
-static void emit_store( struct x86_program *p,
-			struct x86_reg dest,
-			unsigned sz,
-			struct x86_reg temp )
-
-{
-   store[sz-1](p, dest, temp);
-}
-
-static void emit_pack_store_4ub( struct x86_program *p,
-				 struct x86_reg dest,
-				 struct x86_reg temp )
-{
-   /* Scale by 255.0
-    */
-   sse_mulps(&p->func, temp, p->chan0);
-
-   if (p->have_sse2) {
-      sse2_cvtps2dq(&p->func, temp, temp);
-      sse2_packssdw(&p->func, temp, temp);
-      sse2_packuswb(&p->func, temp, temp);
-      sse_movss(&p->func, dest, temp);
-   }
-   else {
-      struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
-      struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
-      sse_cvtps2pi(&p->func, mmx0, temp);
-      sse_movhlps(&p->func, temp, temp);
-      sse_cvtps2pi(&p->func, mmx1, temp);
-      mmx_packssdw(&p->func, mmx0, mmx1);
-      mmx_packuswb(&p->func, mmx0, mmx0);
-      mmx_movd(&p->func, dest, mmx0);
-   }
-}
-
-static int get_offset( const void *a, const void *b )
-{
-   return (const char *)b - (const char *)a;
-}
-
-/* Not much happens here.  Eventually use this function to try and
- * avoid saving/reloading the source pointers each vertex (if some of
- * them can fit in registers).
- */
-static void get_src_ptr( struct x86_program *p,
-			 struct x86_reg srcREG,
-			 struct x86_reg vfREG,
-			 struct draw_vf_attr *a )
-{
-   struct draw_vertex_fetch *vf = p->vf;
-   struct x86_reg ptr_to_src = x86_make_disp(vfREG, get_offset(vf, &a->inputptr));
-
-   /* Load current a[j].inputptr
-    */
-   x86_mov(&p->func, srcREG, ptr_to_src);
-}
-
-static void update_src_ptr( struct x86_program *p,
-			 struct x86_reg srcREG,
-			 struct x86_reg vfREG,
-			 struct draw_vf_attr *a )
-{
-   if (a->inputstride) {
-      struct draw_vertex_fetch *vf = p->vf;
-      struct x86_reg ptr_to_src = x86_make_disp(vfREG, get_offset(vf, &a->inputptr));
-
-      /* add a[j].inputstride (hardcoded value - could just as easily
-       * pull the stride value from memory each time).
-       */
-      x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));
-      
-      /* save new value of a[j].inputptr 
-       */
-      x86_mov(&p->func, ptr_to_src, srcREG);
-   }
-}
-
-
-/* Lots of hardcoding
- *
- * EAX -- pointer to current output vertex
- * ECX -- pointer to current attribute 
- * 
- */
-static boolean build_vertex_emit( struct x86_program *p )
-{
-   struct draw_vertex_fetch *vf = p->vf;
-   unsigned j = 0;
-
-   struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
-   struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
-   struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
-   struct x86_reg vfESI = x86_make_reg(file_REG32, reg_SI);
-   struct x86_reg temp = x86_make_reg(file_XMM, 0);
-   uint8_t *fixup, *label;
-
-   /* Push a few regs?
-    */
-   x86_push(&p->func, countEBP);
-   x86_push(&p->func, vfESI);
-
-
-   /* Get vertex count, compare to zero
-    */
-   x86_xor(&p->func, srcECX, srcECX);
-   x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
-   x86_cmp(&p->func, countEBP, srcECX);
-   fixup = x86_jcc_forward(&p->func, cc_E);
-
-   /* Initialize destination register. 
-    */
-   x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));
-
-   /* Move argument 1 (vf) into a reg:
-    */
-   x86_mov(&p->func, vfESI, x86_fn_arg(&p->func, 1));
-
-   
-   /* always load, needed or not:
-    */
-   sse_movups(&p->func, p->identity, x86_make_disp(vfESI, get_offset(vf, &vf->identity[0])));
-
-   /* Note address for loop jump */
-   label = x86_get_label(&p->func);
-
-   /* Emit code for each of the attributes.  Currently routes
-    * everything through SSE registers, even when it might be more
-    * efficient to stick with regular old x86.  No optimization or
-    * other tricks - enough new ground to cover here just getting
-    * things working.
-    */
-   while (j < vf->attr_count) {
-      struct draw_vf_attr *a = &vf->attr[j];
-      struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);
-
-      /* Now, load an XMM reg from src, perhaps transform, then save.
-       * Could be shortcircuited in specific cases:
-       */
-      switch (a->format) {
-      case DRAW_EMIT_1F:
-      case DRAW_EMIT_1F_CONST:
-	 get_src_ptr(p, srcECX, vfESI, a);
-	 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
-	 emit_store(p, dest, 1, temp);
-	 update_src_ptr(p, srcECX, vfESI, a);
-	 break;
-      case DRAW_EMIT_2F:
-      case DRAW_EMIT_2F_CONST:
-	 get_src_ptr(p, srcECX, vfESI, a);
-	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
-	 emit_store(p, dest, 2, temp);
-	 update_src_ptr(p, srcECX, vfESI, a);
-	 break;
-      case DRAW_EMIT_3F:
-      case DRAW_EMIT_3F_CONST:
-	 /* Potentially the worst case - hardcode 2+1 copying:
-	  */
-	 if (0) {
-	    get_src_ptr(p, srcECX, vfESI, a);
-	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
-	    emit_store(p, dest, 3, temp);
-	    update_src_ptr(p, srcECX, vfESI, a);
-	 }
-	 else {
-	    get_src_ptr(p, srcECX, vfESI, a);
-	    emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
-	    emit_store(p, dest, 2, temp);
-	    if (a->inputsize > 2) {
-	       emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
-	       emit_store(p, x86_make_disp(dest,8), 1, temp);
-	    }
-	    else {
-	       sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
-	    }
-	    update_src_ptr(p, srcECX, vfESI, a);
-	 }
-	 break;
-      case DRAW_EMIT_4F:
-      case DRAW_EMIT_4F_CONST:
-	 get_src_ptr(p, srcECX, vfESI, a);
-	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	 emit_store(p, dest, 4, temp);
-	 update_src_ptr(p, srcECX, vfESI, a);
-	 break;
-      case DRAW_EMIT_3F_XYW:
-	 get_src_ptr(p, srcECX, vfESI, a);
-	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	 sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
-	 emit_store(p, dest, 3, temp);
-	 update_src_ptr(p, srcECX, vfESI, a);
-	 break;
-
-      case DRAW_EMIT_1UB_1F:	 
-	 /* Test for PAD3 + 1UB:
-	  */
-	 if (j > 0 &&
-	     a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
-	 {
-	    get_src_ptr(p, srcECX, vfESI, a);
-	    emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
-	    sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
-	    emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
-	    update_src_ptr(p, srcECX, vfESI, a);
-	 }
-	 else {
-	    debug_printf("Can't emit 1ub %x %x %d\n", 
-	            a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
-	    return FALSE;
-	 }
-	 break;
-      case DRAW_EMIT_3UB_3F_RGB:
-      case DRAW_EMIT_3UB_3F_BGR:
-	 /* Test for 3UB + PAD1:
-	  */
-	 if (j == vf->attr_count - 1 ||
-	     a[1].vertoffset >= a->vertoffset + 4) {
-	    get_src_ptr(p, srcECX, vfESI, a);
-	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
-	    if (a->format == DRAW_EMIT_3UB_3F_BGR)
-	       sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
-	    emit_pack_store_4ub(p, dest, temp);
-	    update_src_ptr(p, srcECX, vfESI, a);
-	 }
-	 /* Test for 3UB + 1UB:
-	  */
-	 else if (j < vf->attr_count - 1 &&
-		  a[1].format == DRAW_EMIT_1UB_1F &&
-		  a[1].vertoffset == a->vertoffset + 3) {
-	    get_src_ptr(p, srcECX, vfESI, a);
-	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
-	    update_src_ptr(p, srcECX, vfESI, a);
-
-	    /* Make room for incoming value:
-	     */
-	    sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
-
-	    get_src_ptr(p, srcECX, vfESI, &a[1]);
-	    emit_load(p, temp, 1, x86_deref(srcECX), a[1].inputsize);
-	    update_src_ptr(p, srcECX, vfESI, &a[1]);
-
-	    /* Rearrange and possibly do BGR conversion:
-	     */
-	    if (a->format == DRAW_EMIT_3UB_3F_BGR)
-	       sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
-	    else
-	       sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
-
-	    emit_pack_store_4ub(p, dest, temp);
-	    j++;		/* NOTE: two attrs consumed */
-	 }
-	 else {
-	    debug_printf("Can't emit 3ub\n");
-	 }
-	 return FALSE;	/* add this later */
-	 break;
-
-      case DRAW_EMIT_4UB_4F_RGBA:
-	 get_src_ptr(p, srcECX, vfESI, a);
-	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	 emit_pack_store_4ub(p, dest, temp);
-	 update_src_ptr(p, srcECX, vfESI, a);
-	 break;
-      case DRAW_EMIT_4UB_4F_BGRA:
-	 get_src_ptr(p, srcECX, vfESI, a);
-	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
-	 emit_pack_store_4ub(p, dest, temp);
-	 update_src_ptr(p, srcECX, vfESI, a);
-	 break;
-      case DRAW_EMIT_4UB_4F_ARGB:
-	 get_src_ptr(p, srcECX, vfESI, a);
-	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
-	 emit_pack_store_4ub(p, dest, temp);
-	 update_src_ptr(p, srcECX, vfESI, a);
-	 break;
-      case DRAW_EMIT_4UB_4F_ABGR:
-	 get_src_ptr(p, srcECX, vfESI, a);
-	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
-	 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
-	 emit_pack_store_4ub(p, dest, temp);
-	 update_src_ptr(p, srcECX, vfESI, a);
-	 break;
-      default:
-	 debug_printf("unknown a[%d].format %d\n", j, a->format);
-	 return FALSE;	/* catch any new opcodes */
-      }
-      
-      /* Increment j by at least 1 - may have been incremented above also:
-       */
-      j++;
-   }
-
-   /* Next vertex:
-    */
-   x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vf->vertex_stride));
-
-   /* decr count, loop if not zero
-    */
-   x86_dec(&p->func, countEBP);
-   x86_test(&p->func, countEBP, countEBP); 
-   x86_jcc(&p->func, cc_NZ, label);
-
-   /* Exit mmx state?
-    */
-   if (p->func.need_emms)
-      mmx_emms(&p->func);
-
-   /* Land forward jump here:
-    */
-   x86_fixup_fwd_jump(&p->func, fixup);
-
-   /* Pop regs and return
-    */
-   x86_pop(&p->func, x86_get_base_reg(vfESI));
-   x86_pop(&p->func, countEBP);
-   x86_ret(&p->func);
-
-   vf->emit = (draw_vf_emit_func)x86_get_func(&p->func);
-   return TRUE;
-}
-
-
-
-void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
-{
-   struct x86_program p;   
-
-   if (!rtasm_cpu_has_sse()) {
-      vf->codegen_emit = NULL;
-      return;
-   }
-
-   memset(&p, 0, sizeof(p));
-
-   p.vf = vf;
-   p.inputs_safe = 0;		/* for now */
-   p.outputs_safe = 1;		/* for now */
-   p.have_sse2 = rtasm_cpu_has_sse2();
-   p.identity = x86_make_reg(file_XMM, 6);
-   p.chan0 = x86_make_reg(file_XMM, 7);
-
-   x86_init_func(&p.func);
-
-   if (build_vertex_emit(&p)) {
-      draw_vf_register_fastpath( vf, TRUE );
-   }
-   else {
-      /* Note the failure so that we don't keep trying to codegen an
-       * impossible state:
-       */
-      draw_vf_register_fastpath( vf, FALSE );
-      x86_release_func(&p.func);
-   }
-}
-
-#else
-
-void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
-{
-   /* Dummy version for when USE_SSE_ASM not defined */
-}
-
-#endif
diff --git a/src/gallium/auxiliary/draw/draw_vertex_shader.c b/src/gallium/auxiliary/draw/draw_vs.c
index 8572a6d40c..03fe00a951 100644
--- a/src/gallium/auxiliary/draw/draw_vertex_shader.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -37,49 +37,6 @@
 #include "draw_context.h"
 #include "draw_vs.h"
 
-/**
- * Run the vertex shader on all vertices in the vertex queue.
- * Called by the draw module when the vertx cache needs to be flushed.
- */
-void
-draw_vertex_shader_queue_flush(struct draw_context *draw)
-{
-   struct draw_vertex_shader *shader = draw->vertex_shader;
-   unsigned i;
-
-   assert(draw->vs.queue_nr != 0);
-
-   /* XXX: do this on statechange: 
-    */
-   shader->prepare( shader, draw );
-
-//   fprintf(stderr, "%s %d\n", __FUNCTION__, draw->vs.queue_nr );
-
-   /* run vertex shader on vertex cache entries, four per invokation */
-   for (i = 0; i < draw->vs.queue_nr; i += MAX_SHADER_VERTICES) {
-      unsigned elts[MAX_SHADER_VERTICES];
-      int j, n = MIN2(MAX_SHADER_VERTICES, draw->vs.queue_nr  - i);
-      struct vertex_header *dests =
-         draw_header_from_block(draw->vs.vertex_cache,
-                                MAX_VERTEX_ALLOCATION, i);
-
-      for (j = 0; j < n; j++) {
-         elts[j] = draw->vs.elts[i + j];
-      }
-
-      for ( ; j < MAX_SHADER_VERTICES; j++) {
-	 elts[j] = elts[0];
-      }
-
-      assert(n > 0);
-      assert(n <= MAX_SHADER_VERTICES);
-
-      shader->run(shader, draw, elts, n, dests, MAX_VERTEX_ALLOCATION);
-   }
-
-   draw->vs.post_nr = draw->vs.queue_nr;
-   draw->vs.queue_nr = 0;
-}
 
 
 struct draw_vertex_shader *
@@ -95,10 +52,8 @@ draw_create_vertex_shader(struct draw_context *draw,
          vs = draw_create_vs_exec( draw, shader );
       }
    }
-   assert(vs);
-
-   tgsi_scan_shader(shader->tokens, &vs->info);
 
+   assert(vs);
    return vs;
 }
 
@@ -113,9 +68,6 @@ draw_bind_vertex_shader(struct draw_context *draw,
    {
       draw->vertex_shader = dvs;
       draw->num_vs_outputs = dvs->info.num_outputs;
-
-      tgsi_exec_machine_init(&draw->machine);
-
       dvs->prepare( dvs, draw );
    }
    else {
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index 33ce1e335e..f9772b83b8 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -35,10 +35,39 @@
 #include "draw_private.h"
 
 
-struct draw_vertex_shader;
 struct draw_context;
 struct pipe_shader_state;
 
+/**
+ * Private version of the compiled vertex_shader
+ */
+struct draw_vertex_shader {
+
+   /* This member will disappear shortly:
+    */
+   struct pipe_shader_state   state;
+
+   struct tgsi_shader_info info;
+
+   void (*prepare)( struct draw_vertex_shader *shader,
+		    struct draw_context *draw );
+
+   /* Run the shader - this interface will get cleaned up in the
+    * future:
+    */
+   void (*run_linear)( struct draw_vertex_shader *shader,
+		       const float (*input)[4],
+		       float (*output)[4],
+		       const float (*constants)[4],
+		       unsigned count,
+		       unsigned input_stride,
+		       unsigned output_stride );
+
+
+   void (*delete)( struct draw_vertex_shader * );
+};
+
+
 struct draw_vertex_shader *
 draw_create_vs_exec(struct draw_context *draw,
 		    const struct pipe_shader_state *templ);
@@ -52,32 +81,7 @@ draw_create_vs_llvm(struct draw_context *draw,
 		    const struct pipe_shader_state *templ);
 
 
-/* Should be part of the generated shader:
- */
-static INLINE unsigned
-compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
-{
-   unsigned mask = 0x0;
-   unsigned i;
-
-   /* Do the hardwired planes first:
-    */
-   if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT;
-   if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT;
-   if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT;
-   if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT;
-   if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT;
-   if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT;
-
-   /* Followed by any remaining ones:
-    */
-   for (i = 6; i < nr; i++) {
-      if (dot4(clip, plane[i]) < 0) 
-         mask |= (1<<i);
-   }
-
-   return mask;
-}
-
+#define MAX_TGSI_VERTICES 4
+   
 
 #endif
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 5c88c2e24e..54a2b2ab04 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -40,145 +40,118 @@
 
 #include "tgsi/util/tgsi_parse.h"
 
-#define MAX_TGSI_VERTICES 4
 
+struct exec_vertex_shader {
+   struct draw_vertex_shader base;
+   struct tgsi_exec_machine *machine;
+};
+
+static struct exec_vertex_shader *exec_vertex_shader( struct draw_vertex_shader *vs )
+{
+   return (struct exec_vertex_shader *)vs;
+}
+
+
+/* Not required for run_linear.
+ */
 static void
 vs_exec_prepare( struct draw_vertex_shader *shader,
 		 struct draw_context *draw )
 {
+   struct exec_vertex_shader *evs = exec_vertex_shader(shader);
+
    /* specify the vertex program to interpret/execute */
-   tgsi_exec_machine_bind_shader(&draw->machine,
+   tgsi_exec_machine_bind_shader(evs->machine,
 				 shader->state.tokens,
 				 PIPE_MAX_SAMPLERS,
 				 NULL /*samplers*/ );
 
-   draw_update_vertex_fetch( draw );
 }
 
 
-/**
- * Transform vertices with the current vertex program/shader
- * Up to four vertices can be shaded at a time.
- * \param vbuffer  the input vertex data
- * \param elts  indexes of four input vertices
- * \param count  number of vertices to shade [1..4]
- * \param vOut  array of pointers to four output vertices
+
+
+/* Simplified vertex shader interface for the pt paths.  Given the
+ * complexity of code-generating all the above operations together,
+ * it's time to try doing all the other stuff separately.
  */
-static boolean
-vs_exec_run( struct draw_vertex_shader *shader,
-	     struct draw_context *draw,
-	     const unsigned *elts, 
-	     unsigned count,
-	     void *vOut,
-             unsigned vertex_size)
+static void
+vs_exec_run_linear( struct draw_vertex_shader *shader,
+		    const float (*input)[4],
+		    float (*output)[4],
+		    const float (*constants)[4],
+		    unsigned count,
+		    unsigned input_stride,
+		    unsigned output_stride )
 {
-   struct tgsi_exec_machine *machine = &draw->machine;
+   struct exec_vertex_shader *evs = exec_vertex_shader(shader);
+   struct tgsi_exec_machine *machine = evs->machine;
    unsigned int i, j;
-   unsigned int clipped = 0;
+   unsigned slot;
 
-   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
-   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
-   const float *scale = draw->viewport.scale;
-   const float *trans = draw->viewport.translate;
-
-   assert(draw->vertex_shader->info.output_semantic_name[0]
-          == TGSI_SEMANTIC_POSITION);
-
-   machine->Consts = (float (*)[4]) draw->user.constants;
-   machine->Inputs = ALIGN16_ASSIGN(inputs);
-   if (draw->rasterizer->bypass_vs) {
-      /* outputs are just the inputs */
-      machine->Outputs = machine->Inputs;
-   }
-   else {
-      machine->Outputs = ALIGN16_ASSIGN(outputs);
-   }
+   machine->Consts = constants;
 
    for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
       unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
-      draw->vertex_fetch.fetch_func( draw, machine, &elts[i], max_vertices );
-
-      if (!draw->rasterizer->bypass_vs) {
-         /* run interpreter */
-         tgsi_exec_machine_run( machine );
-      }
 
-      /* store machine results */
+      /* Swizzle inputs.  
+       */
       for (j = 0; j < max_vertices; j++) {
-         unsigned slot;
-         float x, y, z, w;
-         struct vertex_header *out =
-            draw_header_from_block(vOut, vertex_size, i + j);
-
-         /* Handle attr[0] (position) specially:
-          *
-          * XXX: Computing the clipmask should be done in the vertex
-          * program as a set of DP4 instructions appended to the
-          * user-provided code.
-          */
-         x = out->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-         y = out->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-         z = out->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-         w = out->clip[3] = machine->Outputs[0].xyzw[3].f[j];
-
-         if (!draw->rasterizer->bypass_clipping) {
-            out->clipmask = compute_clipmask(out->clip, draw->plane,
-                                             draw->nr_planes);
-            clipped += out->clipmask;
-
-            /* divide by w */
-            w = 1.0f / w;
-            x *= w;
-            y *= w;
-            z *= w;
+#if 0
+         debug_printf("%d) Input vert:\n", i + j);
+         for (slot = 0; slot < shader->info.num_inputs; slot++) {
+            debug_printf("\t%d: %f %f %f %f\n", slot,
+			 input[slot][0],
+			 input[slot][1],
+			 input[slot][2],
+			 input[slot][3]);
          }
-         else {
-            out->clipmask = 0;
-         }
-         out->edgeflag = 1;
-	 out->vertex_id = UNDEFINED_VERTEX_ID;
-
-         if (!draw->identity_viewport) {
-            /* Viewport mapping */
-            out->data[0][0] = x * scale[0] + trans[0];
-            out->data[0][1] = y * scale[1] + trans[1];
-            out->data[0][2] = z * scale[2] + trans[2];
-            out->data[0][3] = w;
-         }
-         else {
-            out->data[0][0] = x;
-            out->data[0][1] = y;
-            out->data[0][2] = z;
-            out->data[0][3] = w;
+#endif
+
+         for (slot = 0; slot < shader->info.num_inputs; slot++) {
+            machine->Inputs[slot].xyzw[0].f[j] = input[slot][0];
+            machine->Inputs[slot].xyzw[1].f[j] = input[slot][1];
+            machine->Inputs[slot].xyzw[2].f[j] = input[slot][2];
+            machine->Inputs[slot].xyzw[3].f[j] = input[slot][3];
          }
 
-         /* Remaining attributes are packed into sequential post-transform
-          * vertex attrib slots.
-          */
-         for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-            out->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-            out->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-            out->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-            out->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+	 input = (const float (*)[4])((const char *)input + input_stride);
+      } 
+
+      /* run interpreter */
+      tgsi_exec_machine_run( machine );
+
+      /* Unswizzle all output results.  
+       */
+      for (j = 0; j < max_vertices; j++) {
+         for (slot = 0; slot < shader->info.num_outputs; slot++) {
+            output[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+            output[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+            output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+            output[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+
          }
 
-#if 0 /*DEBUG*/
-         printf("%d) Post xform vert:\n", i + j);
-         for (slot = 0; slot < draw->num_vs_outputs; slot++) {
-            printf("\t%d: %f %f %f %f\n", slot,
-                   out->data[slot][0],
-                   out->data[slot][1],
-                   out->data[slot][2],
-                   out->data[slot][3]);
+#if 0
+	 debug_printf("%d) Post xform vert:\n", i + j);
+	 for (slot = 0; slot < shader->info.num_outputs; slot++) {
+	    debug_printf("\t%d: %f %f %f %f\n", slot,
+			 output[slot][0],
+			 output[slot][1],
+			 output[slot][2],
+			 output[slot][3]);
          }
 #endif
-      } /* loop over vertices */
+
+	 output = (float (*)[4])((char *)output + output_stride);
+      } 
+
    }
-   return clipped != 0;
 }
 
 
 
+
 static void
 vs_exec_delete( struct draw_vertex_shader *dvs )
 {
@@ -191,17 +164,22 @@ struct draw_vertex_shader *
 draw_create_vs_exec(struct draw_context *draw,
 		    const struct pipe_shader_state *state)
 {
-   struct draw_vertex_shader *vs = CALLOC_STRUCT( draw_vertex_shader );
+   struct exec_vertex_shader *vs = CALLOC_STRUCT( exec_vertex_shader );
    uint nt = tgsi_num_tokens(state->tokens);
 
    if (vs == NULL) 
       return NULL;
 
    /* we make a private copy of the tokens */
-   vs->state.tokens = mem_dup(state->tokens, nt * sizeof(state->tokens[0]));
-   vs->prepare = vs_exec_prepare;
-   vs->run = vs_exec_run;
-   vs->delete = vs_exec_delete;
+   vs->base.state.tokens = mem_dup(state->tokens, nt * sizeof(state->tokens[0]));
+   tgsi_scan_shader(state->tokens, &vs->base.info);
+
+
+   vs->base.prepare = vs_exec_prepare;
+   vs->base.run_linear = vs_exec_run_linear;
+   vs->base.delete = vs_exec_delete;
+   vs->machine = &draw->machine;
+
 
-   return vs;
+   return &vs->base;
 }
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index 73076d2467..dcada66514 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -47,6 +47,7 @@
 struct draw_llvm_vertex_shader {
    struct draw_vertex_shader base;
    struct gallivm_prog *llvm_prog;
+   struct tgsi_exec_machine *machine;
 };
 
 
@@ -54,121 +55,68 @@ static void
 vs_llvm_prepare( struct draw_vertex_shader *base,
 		 struct draw_context *draw )
 {
-   draw_update_vertex_fetch( draw );
 }
 
 
 
-/**
- * Transform vertices with the current vertex program/shader
- * Up to four vertices can be shaded at a time.
- * \param vbuffer  the input vertex data
- * \param elts  indexes of four input vertices
- * \param count  number of vertices to shade [1..4]
- * \param vOut  array of pointers to four output vertices
- */
-static boolean
-vs_llvm_run( struct draw_vertex_shader *base,
-	     struct draw_context *draw,
-	     const unsigned *elts,
-	     unsigned count,
-	     void *vOut )
+
+static void
+vs_llvm_run_linear( struct draw_vertex_shader *base,
+		   const float (*input)[4],
+		   float (*output)[4],
+		   const float (*constants)[4],
+		   unsigned count,
+		   unsigned input_stride,
+		   unsigned output_stride )
 {
    struct draw_llvm_vertex_shader *shader =
       (struct draw_llvm_vertex_shader *)base;
 
-   struct tgsi_exec_machine *machine = &draw->machine;
-   unsigned int j;
-   unsigned int clipped = 0;
-
-   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
-   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
-   const float *scale = draw->viewport.scale;
-   const float *trans = draw->viewport.translate;
-
-
-   assert(count <= 4);
-   assert(draw->vertex_shader->state->output_semantic_name[0]
-          == TGSI_SEMANTIC_POSITION);
+   struct tgsi_exec_machine *machine = shader->machine;
+   unsigned int i, j;
+   unsigned slot;
 
-   /* Consts does not require 16 byte alignment. */
-   machine->Consts = (float (*)[4]) draw->user.constants;
 
-   machine->Inputs = ALIGN16_ASSIGN(inputs);
-   if (draw->rasterizer->bypass_vs) {
-      /* outputs are just the inputs */
-      machine->Outputs = machine->Inputs;
-   }
-   else {
-      machine->Outputs = ALIGN16_ASSIGN(outputs);
-   }
+   for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
+      unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
 
+      /* Swizzle inputs.
+       */
+      for (j = 0; j < max_vertices; j++) {
+	 for (slot = 0; slot < base->info.num_inputs; slot++) {
+	    machine->Inputs[slot].xyzw[0].f[j] = input[slot][0];
+	    machine->Inputs[slot].xyzw[1].f[j] = input[slot][1];
+	    machine->Inputs[slot].xyzw[2].f[j] = input[slot][2];
+	    machine->Inputs[slot].xyzw[3].f[j] = input[slot][3];
+	 }
 
-   draw->vertex_fetch.fetch_func( draw, machine, elts, count );
+	 input = (const float (*)[4])((const char *)input + input_stride);
+      } 
 
-   if (!draw->rasterizer->bypass_vs) {
       /* run shader */
       gallivm_cpu_vs_exec(shader->llvm_prog,
                           machine->Inputs,
                           machine->Outputs,
-                          machine->Consts,
+			  (float (*)[4]) constants,
                           machine->Temps);
-   }
 
-   /* store machine results */
-   for (j = 0; j < count; j++) {
-      unsigned slot;
-      float x, y, z, w;
-
-      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
-
-      if (!draw->rasterizer->bypass_clipping) {
-         vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane,
-                                              draw->nr_planes);
-         clipped += vOut[j]->clipmask;
-
-         /* divide by w */
-         w = 1.0f / w;
-         x *= w;
-         y *= w;
-         z *= w;
-      }
-      else {
-         vOut[j]->clipmask = 0;
-      }
-      vOut[j]->edgeflag = 1;
-      vOut[j]->vertex_id = UNDEFINED_VERTEX_ID;
-
-      if (!draw->identity_viewport) {
-         /* Viewport mapping */
-         vOut[j]->data[0][0] = x * scale[0] + trans[0];
-         vOut[j]->data[0][1] = y * scale[1] + trans[1];
-         vOut[j]->data[0][2] = z * scale[2] + trans[2];
-         vOut[j]->data[0][3] = w;
-      }
-      else {
-         vOut[j]->data[0][0] = x;
-         vOut[j]->data[0][1] = y;
-         vOut[j]->data[0][2] = z;
-         vOut[j]->data[0][3] = w;
-      }
 
-      /* Remaining attributes are packed into sequential post-transform
-       * vertex attrib slots.
+      /* Unswizzle all output results
        */
-      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+      for (j = 0; j < max_vertices; j++) {
+         for (slot = 0; slot < base->info.num_outputs; slot++) {
+            output[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+            output[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+            output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+            output[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+         }
+         output = (float (*)[4])((char *)output + output_stride);
       }
-   } /* loop over vertices */
-   return clipped != 0;
+   }
 }
 
+
+
 static void
 vs_llvm_delete( struct draw_vertex_shader *base )
 {
@@ -198,15 +146,19 @@ draw_create_vs_llvm(struct draw_context *draw,
 
    /* we make a private copy of the tokens */
    vs->base.state.tokens = mem_dup(templ->tokens, nt * sizeof(templ->tokens[0]));
+
+   tgsi_scan_shader(vs->base.state.tokens, &vs->base.info);
+
    vs->base.prepare = vs_llvm_prepare;
-   vs->base.run = vs_llvm_run;
+   vs->base.run_linear = vs_llvm_run_linear;
    vs->base.delete = vs_llvm_delete;
+   vs->machine = &draw->machine;
 
    {
       struct gallivm_ir *ir = gallivm_ir_new(GALLIVM_VS);
       gallivm_ir_set_layout(ir, GALLIVM_SOA);
       gallivm_ir_set_components(ir, 4);
-      gallivm_ir_fill_from_tgsi(ir, vs->base.state->tokens);
+      gallivm_ir_fill_from_tgsi(ir, vs->base.state.tokens);
       vs->llvm_prog = gallivm_ir_compile(ir);
       gallivm_ir_delete(ir);
    }
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index ee0a3105b9..b1e9f67114 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -41,6 +41,7 @@
 #include "draw_private.h"
 #include "draw_context.h"
 
+#include "rtasm/rtasm_cpu.h"
 #include "rtasm/rtasm_x86sse.h"
 #include "tgsi/exec/tgsi_sse2.h"
 #include "tgsi/util/tgsi_parse.h"
@@ -58,7 +59,11 @@ typedef void (XSTDCALL *codegen_function) (
 struct draw_sse_vertex_shader {
    struct draw_vertex_shader base;
    struct x86_function sse2_program;
+
    codegen_function func;
+   
+   struct tgsi_exec_machine *machine;
+
    float immediates[TGSI_EXEC_NUM_IMMEDIATES][4];
 };
 
@@ -67,140 +72,71 @@ static void
 vs_sse_prepare( struct draw_vertex_shader *base,
 		struct draw_context *draw )
 {
-   draw_update_vertex_fetch( draw );
 }
 
-/**
- * Transform vertices with the current vertex program/shader
- * Up to four vertices can be shaded at a time.
- * \param vbuffer  the input vertex data
- * \param elts  indexes of four input vertices
- * \param count  number of vertices to shade [1..4]
- * \param vOut  array of pointers to four output vertices
+
+
+/* Simplified vertex shader interface for the pt paths.  Given the
+ * complexity of code-generating all the above operations together,
+ * it's time to try doing all the other stuff separately.
  */
-static boolean
-vs_sse_run( struct draw_vertex_shader *base,
-	    struct draw_context *draw, 
-	    const unsigned *elts, 
-	    unsigned count,
-	    void *vOut,
-            unsigned vertex_size )
+static void
+vs_sse_run_linear( struct draw_vertex_shader *base,
+		   const float (*input)[4],
+		   float (*output)[4],
+		   const float (*constants)[4],
+		   unsigned count,
+		   unsigned input_stride,
+		   unsigned output_stride )
 {
    struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
-   struct tgsi_exec_machine *machine = &draw->machine;
+   struct tgsi_exec_machine *machine = shader->machine;
    unsigned int i, j;
-   unsigned int clipped = 0;
-
-   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_MAX_ATTRIBS);
-   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_MAX_ATTRIBS);
-   const float *scale = draw->viewport.scale;
-   const float *trans = draw->viewport.translate;
-
-   assert(draw->vertex_shader->info.output_semantic_name[0]
-          == TGSI_SEMANTIC_POSITION);
-
-   /* Consts does not require 16 byte alignment. */
-   machine->Consts = (float (*)[4]) draw->user.constants;
-   machine->Inputs = ALIGN16_ASSIGN(inputs);
-   if (draw->rasterizer->bypass_vs) {
-      /* outputs are just the inputs */
-      machine->Outputs = machine->Inputs;
-   }
-   else {
-      machine->Outputs = ALIGN16_ASSIGN(outputs);
-   }
+   unsigned slot;
 
-   for (i = 0; i < count; i += SSE_MAX_VERTICES) {
-      unsigned int max_vertices = MIN2(SSE_MAX_VERTICES, count - i);
-      /* Fetch vertices.  This may at some point be integrated into the
-       * compiled shader -- that would require a reorganization where
-       * multiple versions of the compiled shader might exist,
-       * specialized for each fetch state.
-       */
-      draw->vertex_fetch.fetch_func(draw, machine, &elts[i], max_vertices);
-
-      if (!draw->rasterizer->bypass_vs) {
-         /* run compiled shader
-          */
-         shader->func(machine->Inputs,
-                      machine->Outputs,
-                      machine->Consts,
-                      machine->Temps,
-                      shader->immediates);
-      }
-
-      /* XXX: Computing the clipmask and emitting results should be done
-       *      in the vertex program as a set of instructions appended to
-       *      the user-provided code.
+   for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
+      unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
+
+      /* Swizzle inputs.  
        */
       for (j = 0; j < max_vertices; j++) {
-         unsigned slot;
-         float x, y, z, w;
-         struct vertex_header *out =
-            draw_header_from_block(vOut, vertex_size, i + j);
-
-         x = out->clip[0] = machine->Outputs[0].xyzw[0].f[j];
-         y = out->clip[1] = machine->Outputs[0].xyzw[1].f[j];
-         z = out->clip[2] = machine->Outputs[0].xyzw[2].f[j];
-         w = out->clip[3] = machine->Outputs[0].xyzw[3].f[j];
-
-         if (!draw->rasterizer->bypass_clipping) {
-            out->clipmask = compute_clipmask(out->clip, draw->plane,
-                                             draw->nr_planes);
-            clipped += out->clipmask;
-
-            /* divide by w */
-            w = 1.0f / w;
-            x *= w;
-            y *= w;
-            z *= w;
-         }
-         else {
-            out->clipmask = 0;
-         }
-         out->edgeflag = 1;
-	 out->vertex_id = UNDEFINED_VERTEX_ID;
-
-         if (!draw->identity_viewport) {
-            /* Viewport mapping */
-            out->data[0][0] = x * scale[0] + trans[0];
-            out->data[0][1] = y * scale[1] + trans[1];
-            out->data[0][2] = z * scale[2] + trans[2];
-            out->data[0][3] = w;
-         }
-         else {
-            out->data[0][0] = x;
-            out->data[0][1] = y;
-            out->data[0][2] = z;
-            out->data[0][3] = w;
+         for (slot = 0; slot < base->info.num_inputs; slot++) {
+            machine->Inputs[slot].xyzw[0].f[j] = input[slot][0];
+            machine->Inputs[slot].xyzw[1].f[j] = input[slot][1];
+            machine->Inputs[slot].xyzw[2].f[j] = input[slot][2];
+            machine->Inputs[slot].xyzw[3].f[j] = input[slot][3];
          }
 
-         /* Remaining attributes are packed into sequential post-transform
-          * vertex attrib slots.
-          */
-         for (slot = 1; slot < draw->num_vs_outputs; slot++) {
-            out->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-            out->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-            out->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-            out->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
-         }
-#if 0 /*DEBUG*/
-         printf("%d) Post xform vert:\n", i + j);
-         for (slot = 0; slot < draw->num_vs_outputs; slot++) {
-            printf("\t%d: %f %f %f %f\n", slot,
-                   out->data[slot][0],
-                   out->data[slot][1],
-                   out->data[slot][2],
-                   out->data[slot][3]);
+	 input = (const float (*)[4])((const char *)input + input_stride);
+      } 
+
+      /* run compiled shader
+       */
+      shader->func(machine->Inputs,
+		   machine->Outputs,
+		   (float (*)[4])constants,
+		   machine->Temps,
+		   shader->immediates);
+
+
+      /* Unswizzle all output results.  
+       */
+      for (j = 0; j < max_vertices; j++) {
+         for (slot = 0; slot < base->info.num_outputs; slot++) {
+            output[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+            output[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+            output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+            output[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
          }
-#endif
-      }
+
+	 output = (float (*)[4])((char *)output + output_stride);
+      } 
    }
-   return clipped != 0;
 }
 
 
 
+
 static void
 vs_sse_delete( struct draw_vertex_shader *base )
 {
@@ -220,7 +156,7 @@ draw_create_vs_sse(struct draw_context *draw,
    struct draw_sse_vertex_shader *vs;
    uint nt = tgsi_num_tokens(templ->tokens);
 
-   if (!draw->use_sse) 
+   if (!rtasm_cpu_has_sse2())
       return NULL;
 
    vs = CALLOC_STRUCT( draw_sse_vertex_shader );
@@ -229,9 +165,13 @@ draw_create_vs_sse(struct draw_context *draw,
 
    /* we make a private copy of the tokens */
    vs->base.state.tokens = mem_dup(templ->tokens, nt * sizeof(templ->tokens[0]));
+
+   tgsi_scan_shader(templ->tokens, &vs->base.info);
+
    vs->base.prepare = vs_sse_prepare;
-   vs->base.run = vs_sse_run;
+   vs->base.run_linear = vs_sse_run_linear;
    vs->base.delete = vs_sse_delete;
+   vs->machine = &draw->machine;
    
    x86_init_func( &vs->sse2_program );
 
diff --git a/src/gallium/auxiliary/draw/draw_wide_prims.c b/src/gallium/auxiliary/draw/draw_wide_prims.c
deleted file mode 100644
index d6bff110b4..0000000000
--- a/src/gallium/auxiliary/draw/draw_wide_prims.c
+++ /dev/null
@@ -1,366 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
- */
-
-#include "pipe/p_util.h"
-#include "pipe/p_defines.h"
-#include "pipe/p_shader_tokens.h"
-#include "draw_private.h"
-
-
-struct wide_stage {
-   struct draw_stage stage;
-
-   float half_line_width;
-   float half_point_size;
-
-   uint texcoord_slot[PIPE_MAX_SHADER_OUTPUTS];
-   uint texcoord_mode[PIPE_MAX_SHADER_OUTPUTS];
-   uint num_texcoords;
-
-   int psize_slot;
-};
-
-
-
-static INLINE struct wide_stage *wide_stage( struct draw_stage *stage )
-{
-   return (struct wide_stage *)stage;
-}
-
-
-static void passthrough_point( struct draw_stage *stage,
-                               struct prim_header *header )
-{
-   stage->next->point( stage->next, header );
-}
-
-static void passthrough_line( struct draw_stage *stage,
-                              struct prim_header *header )
-{
-   stage->next->line(stage->next, header);
-}
-
-static void passthrough_tri( struct draw_stage *stage,
-                             struct prim_header *header )
-{
-   stage->next->tri(stage->next, header);
-}
-
-
-/**
- * Draw a wide line by drawing a quad (two triangles).
- * XXX need to disable polygon stipple.
- */
-static void wide_line( struct draw_stage *stage,
-		       struct prim_header *header )
-{
-   const struct wide_stage *wide = wide_stage(stage);
-   const float half_width = wide->half_line_width;
-
-   struct prim_header tri;
-
-   struct vertex_header *v0 = dup_vert(stage, header->v[0], 0);
-   struct vertex_header *v1 = dup_vert(stage, header->v[0], 1);
-   struct vertex_header *v2 = dup_vert(stage, header->v[1], 2);
-   struct vertex_header *v3 = dup_vert(stage, header->v[1], 3);
-
-   float *pos0 = v0->data[0];
-   float *pos1 = v1->data[0];
-   float *pos2 = v2->data[0];
-   float *pos3 = v3->data[0];
-
-   const float dx = FABSF(pos0[0] - pos2[0]);
-   const float dy = FABSF(pos0[1] - pos2[1]);
-   
-   /*
-    * Draw wide line as a quad (two tris) by "stretching" the line along
-    * X or Y.
-    * We need to tweak coords in several ways to be conformant here.
-    */
-
-   if (dx > dy) {
-      /* x-major line */
-      pos0[1] = pos0[1] - half_width - 0.25f;
-      pos1[1] = pos1[1] + half_width - 0.25f;
-      pos2[1] = pos2[1] - half_width - 0.25f;
-      pos3[1] = pos3[1] + half_width - 0.25f;
-      if (pos0[0] < pos2[0]) {
-         /* left to right line */
-         pos0[0] -= 0.5f;
-         pos1[0] -= 0.5f;
-         pos2[0] -= 0.5f;
-         pos3[0] -= 0.5f;
-      }
-      else {
-         /* right to left line */
-         pos0[0] += 0.5f;
-         pos1[0] += 0.5f;
-         pos2[0] += 0.5f;
-         pos3[0] += 0.5f;
-      }
-   }
-   else {
-      /* y-major line */
-      pos0[0] = pos0[0] - half_width + 0.25f;
-      pos1[0] = pos1[0] + half_width + 0.25f;
-      pos2[0] = pos2[0] - half_width + 0.25f;
-      pos3[0] = pos3[0] + half_width + 0.25f;
-      if (pos0[1] < pos2[1]) {
-         /* top to bottom line */
-         pos0[1] -= 0.5f;
-         pos1[1] -= 0.5f;
-         pos2[1] -= 0.5f;
-         pos3[1] -= 0.5f;
-      }
-      else {
-         /* bottom to top line */
-         pos0[1] += 0.5f;
-         pos1[1] += 0.5f;
-         pos2[1] += 0.5f;
-         pos3[1] += 0.5f;
-      }
-   }
-
-   tri.det = header->det;  /* only the sign matters */
-   tri.v[0] = v0;
-   tri.v[1] = v2;
-   tri.v[2] = v3;
-   stage->next->tri( stage->next, &tri );
-
-   tri.v[0] = v0;
-   tri.v[1] = v3;
-   tri.v[2] = v1;
-   stage->next->tri( stage->next, &tri );
-}
-
-
-/**
- * Set the vertex texcoords for sprite mode.
- * Coords may be left untouched or set to a right-side-up or upside-down
- * orientation.
- */
-static void set_texcoords(const struct wide_stage *wide,
-                          struct vertex_header *v, const float tc[4])
-{
-   uint i;
-   for (i = 0; i < wide->num_texcoords; i++) {
-      if (wide->texcoord_mode[i] != PIPE_SPRITE_COORD_NONE) {
-         uint j = wide->texcoord_slot[i];
-         v->data[j][0] = tc[0];
-         if (wide->texcoord_mode[i] == PIPE_SPRITE_COORD_LOWER_LEFT)
-            v->data[j][1] = 1.0f - tc[1];
-         else
-            v->data[j][1] = tc[1];
-         v->data[j][2] = tc[2];
-         v->data[j][3] = tc[3];
-      }
-   }
-}
-
-
-/* If there are lots of sprite points (and why wouldn't there be?) it
- * would probably be more sensible to change hardware setup to
- * optimize this rather than doing the whole thing in software like
- * this.
- */
-static void wide_point( struct draw_stage *stage,
-			struct prim_header *header )
-{
-   const struct wide_stage *wide = wide_stage(stage);
-   const boolean sprite = (boolean) stage->draw->rasterizer->point_sprite;
-   float half_size;
-   float left_adj, right_adj;
-
-   struct prim_header tri;
-
-   /* four dups of original vertex */
-   struct vertex_header *v0 = dup_vert(stage, header->v[0], 0);
-   struct vertex_header *v1 = dup_vert(stage, header->v[0], 1);
-   struct vertex_header *v2 = dup_vert(stage, header->v[0], 2);
-   struct vertex_header *v3 = dup_vert(stage, header->v[0], 3);
-
-   float *pos0 = v0->data[0];
-   float *pos1 = v1->data[0];
-   float *pos2 = v2->data[0];
-   float *pos3 = v3->data[0];
-
-   /* point size is either per-vertex or fixed size */
-   if (wide->psize_slot >= 0) {
-      half_size = 0.5f * header->v[0]->data[wide->psize_slot][0];
-   }
-   else {
-      half_size = wide->half_point_size;
-   }
-
-   left_adj = -half_size; /* + 0.25f;*/
-   right_adj = half_size; /* + 0.25f;*/
-
-   pos0[0] += left_adj;
-   pos0[1] -= half_size;
-
-   pos1[0] += left_adj;
-   pos1[1] += half_size;
-
-   pos2[0] += right_adj;
-   pos2[1] -= half_size;
-
-   pos3[0] += right_adj;
-   pos3[1] += half_size;
-
-   if (sprite) {
-      static const float tex00[4] = { 0, 0, 0, 1 };
-      static const float tex01[4] = { 0, 1, 0, 1 };
-      static const float tex11[4] = { 1, 1, 0, 1 };
-      static const float tex10[4] = { 1, 0, 0, 1 };
-      set_texcoords( wide, v0, tex00 );
-      set_texcoords( wide, v1, tex01 );
-      set_texcoords( wide, v2, tex10 );
-      set_texcoords( wide, v3, tex11 );
-   }
-
-   tri.det = header->det;  /* only the sign matters */
-   tri.v[0] = v0;
-   tri.v[1] = v2;
-   tri.v[2] = v3;
-   stage->next->tri( stage->next, &tri );
-
-   tri.v[0] = v0;
-   tri.v[1] = v3;
-   tri.v[2] = v1;
-   stage->next->tri( stage->next, &tri );
-}
-
-
-static void wide_first_point( struct draw_stage *stage, 
-			      struct prim_header *header )
-{
-   struct wide_stage *wide = wide_stage(stage);
-   struct draw_context *draw = stage->draw;
-
-   wide->half_point_size = 0.5f * draw->rasterizer->point_size;
-
-   /* XXX we won't know the real size if it's computed by the vertex shader! */
-   if (draw->rasterizer->point_size > draw->wide_point_threshold) {
-      stage->point = wide_point;
-   }
-   else {
-      stage->point = passthrough_point;
-   }
-
-   if (draw->rasterizer->point_sprite) {
-      /* find vertex shader texcoord outputs */
-      const struct draw_vertex_shader *vs = draw->vertex_shader;
-      uint i, j = 0;
-      for (i = 0; i < vs->info.num_outputs; i++) {
-         if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_GENERIC) {
-            wide->texcoord_slot[j] = i;
-            wide->texcoord_mode[j] = draw->rasterizer->sprite_coord_mode[j];
-            j++;
-         }
-      }
-      wide->num_texcoords = j;
-   }
-
-   wide->psize_slot = -1;
-
-   if (draw->rasterizer->point_size_per_vertex) {
-      /* find PSIZ vertex output */
-      const struct draw_vertex_shader *vs = draw->vertex_shader;
-      uint i;
-      for (i = 0; i < vs->info.num_outputs; i++) {
-         if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_PSIZE) {
-            wide->psize_slot = i;
-            break;
-         }
-      }
-   }
-   
-   stage->point( stage, header );
-}
-
-
-
-static void wide_first_line( struct draw_stage *stage,
-			     struct prim_header *header )
-{
-   struct wide_stage *wide = wide_stage(stage);
-   struct draw_context *draw = stage->draw;
-
-   wide->half_line_width = 0.5f * draw->rasterizer->line_width;
-
-   if (draw->rasterizer->line_width != 1.0) {
-      wide->stage.line = wide_line;
-   }
-   else {
-      wide->stage.line = passthrough_line;
-   }
-   
-   stage->line( stage, header );
-}
-
-
-static void wide_flush( struct draw_stage *stage, unsigned flags )
-{
-   stage->line = wide_first_line;
-   stage->point = wide_first_point;
-   stage->next->flush( stage->next, flags );
-}
-
-
-static void wide_reset_stipple_counter( struct draw_stage *stage )
-{
-   stage->next->reset_stipple_counter( stage->next );
-}
-
-
-static void wide_destroy( struct draw_stage *stage )
-{
-   draw_free_temp_verts( stage );
-   FREE( stage );
-}
-
-
-struct draw_stage *draw_wide_stage( struct draw_context *draw )
-{
-   struct wide_stage *wide = CALLOC_STRUCT(wide_stage);
-
-   draw_alloc_temp_verts( &wide->stage, 4 );
-
-   wide->stage.draw = draw;
-   wide->stage.next = NULL;
-   wide->stage.point = wide_first_point;
-   wide->stage.line = wide_first_line;
-   wide->stage.tri = passthrough_tri;
-   wide->stage.flush = wide_flush;
-   wide->stage.reset_stipple_counter = wide_reset_stipple_counter;
-   wide->stage.destroy = wide_destroy;
-
-   return &wide->stage;
-}
diff --git a/src/gallium/auxiliary/gallivm/Makefile b/src/gallium/auxiliary/gallivm/Makefile
index c24e19e062..c3f7bfba93 100644
--- a/src/gallium/auxiliary/gallivm/Makefile
+++ b/src/gallium/auxiliary/gallivm/Makefile
@@ -65,10 +65,14 @@ depend: $(C_SOURCES) $(CPP_SOURCES) $(ASM_SOURCES) $(INC_SOURCES)
 
 
 gallivm_builtins.cpp: llvm_builtins.c
-	clang --emit-llvm < $< |llvm-as|opt -std-compile-opts|llvm2cpp -gen-contents -o=$@ -f -for=shader -funcname=createGallivmBuiltins
+	clang --emit-llvm < $< |llvm-as|opt -std-compile-opts > temp1.bin
+	(echo "static const unsigned char llvm_builtins_data[] = {"; od -txC temp1.bin | sed -e "s/^[0-9]*//" -e s"/ \([0-9a-f][0-9a-f]\)/0x\1,/g" -e"\$$d" | sed -e"\$$s/,$$/};/") >$@
+	rm temp1.bin
 
 gallivmsoabuiltins.cpp: soabuiltins.c
-	clang --emit-llvm < $< |llvm-as|opt -std-compile-opts|llvm2cpp -gen-module -o=$@ -f -for=shader -funcname=createSoaBuiltins
+	clang --emit-llvm < $< |llvm-as|opt -std-compile-opts > temp2.bin
+	(echo "static const unsigned char soabuiltins_data[] = {"; od -txC temp2.bin | sed -e "s/^[0-9]*//" -e s"/ \([0-9a-f][0-9a-f]\)/0x\1,/g" -e"\$$d" | sed -e"\$$s/,$$/};/") >$@
+	rm temp2.bin
 
 # Emacs tags
 tags:
diff --git a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
index 1796f0a177..a6f8cd043b 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
@@ -1,567 +1,141 @@
-// Generated by llvm2cpp - DO NOT MODIFY!
-
-
-Module* createGallivmBuiltins(Module *mod) {
-
-mod->setModuleIdentifier("shader");
-
-// Type Definitions
-ArrayType* ArrayTy_0 = ArrayType::get(IntegerType::get(8), 25);
-
-PointerType* PointerTy_1 = PointerType::get(ArrayTy_0, 0);
-
-std::vector<const Type*>FuncTy_2_args;
-FuncTy_2_args.push_back(Type::FloatTy);
-FuncTy_2_args.push_back(Type::FloatTy);
-FunctionType* FuncTy_2 = FunctionType::get(
-  /*Result=*/Type::FloatTy,
-  /*Params=*/FuncTy_2_args,
-  /*isVarArg=*/false);
-
-PointerType* PointerTy_3 = PointerType::get(FuncTy_2, 0);
-
-VectorType* VectorTy_4 = VectorType::get(Type::FloatTy, 4);
-
-std::vector<const Type*>FuncTy_5_args;
-FuncTy_5_args.push_back(VectorTy_4);
-FunctionType* FuncTy_5 = FunctionType::get(
-  /*Result=*/VectorTy_4,
-  /*Params=*/FuncTy_5_args,
-  /*isVarArg=*/false);
-
-std::vector<const Type*>FuncTy_6_args;
-FuncTy_6_args.push_back(VectorTy_4);
-FuncTy_6_args.push_back(VectorTy_4);
-FuncTy_6_args.push_back(VectorTy_4);
-FunctionType* FuncTy_6 = FunctionType::get(
-  /*Result=*/VectorTy_4,
-  /*Params=*/FuncTy_6_args,
-  /*isVarArg=*/false);
-
-VectorType* VectorTy_7 = VectorType::get(IntegerType::get(32), 4);
-
-std::vector<const Type*>FuncTy_9_args;
-FunctionType* FuncTy_9 = FunctionType::get(
-  /*Result=*/IntegerType::get(32),
-  /*Params=*/FuncTy_9_args,
-  /*isVarArg=*/true);
-
-PointerType* PointerTy_8 = PointerType::get(FuncTy_9, 0);
-
-PointerType* PointerTy_10 = PointerType::get(IntegerType::get(8), 0);
-
-std::vector<const Type*>FuncTy_12_args;
-FuncTy_12_args.push_back(Type::FloatTy);
-FunctionType* FuncTy_12 = FunctionType::get(
-  /*Result=*/Type::FloatTy,
-  /*Params=*/FuncTy_12_args,
-  /*isVarArg=*/false);
-
-PointerType* PointerTy_11 = PointerType::get(FuncTy_12, 0);
-
-std::vector<const Type*>FuncTy_13_args;
-FuncTy_13_args.push_back(VectorTy_4);
-FunctionType* FuncTy_13 = FunctionType::get(
-  /*Result=*/IntegerType::get(32),
-  /*Params=*/FuncTy_13_args,
-  /*isVarArg=*/false);
-
-
-// Function Declarations
-
-Function* func_approx = new Function(
-  /*Type=*/FuncTy_2,
-  /*Linkage=*/GlobalValue::WeakLinkage,
-  /*Name=*/"approx", mod); 
-func_approx->setCallingConv(CallingConv::C);
-const ParamAttrsList *func_approx_PAL = 0;
-func_approx->setParamAttrs(func_approx_PAL);
-
-Function* func_powf = new Function(
-  /*Type=*/FuncTy_2,
-  /*Linkage=*/GlobalValue::ExternalLinkage,
-  /*Name=*/"powf", mod); // (external, no body)
-func_powf->setCallingConv(CallingConv::C);
-const ParamAttrsList *func_powf_PAL = 0;
-func_powf->setParamAttrs(func_powf_PAL);
-
-Function* func_lit = new Function(
-  /*Type=*/FuncTy_5,
-  /*Linkage=*/GlobalValue::WeakLinkage,
-  /*Name=*/"lit", mod); 
-func_lit->setCallingConv(CallingConv::C);
-const ParamAttrsList *func_lit_PAL = 0;
-func_lit->setParamAttrs(func_lit_PAL);
-
-Function* func_cmp = new Function(
-  /*Type=*/FuncTy_6,
-  /*Linkage=*/GlobalValue::WeakLinkage,
-  /*Name=*/"cmp", mod); 
-func_cmp->setCallingConv(CallingConv::C);
-const ParamAttrsList *func_cmp_PAL = 0;
-{
-  ParamAttrsVector Attrs;
-  ParamAttrsWithIndex PAWI;
-  PAWI.index = 0; PAWI.attrs = 0  | ParamAttr::NoUnwind;
-  Attrs.push_back(PAWI);
-  func_cmp_PAL = ParamAttrsList::get(Attrs);
-  
-}
-func_cmp->setParamAttrs(func_cmp_PAL);
-
-Function* func_vcos = new Function(
-  /*Type=*/FuncTy_5,
-  /*Linkage=*/GlobalValue::WeakLinkage,
-  /*Name=*/"vcos", mod); 
-func_vcos->setCallingConv(CallingConv::C);
-const ParamAttrsList *func_vcos_PAL = 0;
-func_vcos->setParamAttrs(func_vcos_PAL);
-
-Function* func_printf = new Function(
-  /*Type=*/FuncTy_9,
-  /*Linkage=*/GlobalValue::ExternalLinkage,
-  /*Name=*/"printf", mod); // (external, no body)
-func_printf->setCallingConv(CallingConv::C);
-const ParamAttrsList *func_printf_PAL = 0;
-func_printf->setParamAttrs(func_printf_PAL);
-
-Function* func_cosf = new Function(
-  /*Type=*/FuncTy_12,
-  /*Linkage=*/GlobalValue::ExternalLinkage,
-  /*Name=*/"cosf", mod); // (external, no body)
-func_cosf->setCallingConv(CallingConv::C);
-const ParamAttrsList *func_cosf_PAL = 0;
-func_cosf->setParamAttrs(func_cosf_PAL);
-
-Function* func_scs = new Function(
-  /*Type=*/FuncTy_5,
-  /*Linkage=*/GlobalValue::WeakLinkage,
-  /*Name=*/"scs", mod); 
-func_scs->setCallingConv(CallingConv::C);
-const ParamAttrsList *func_scs_PAL = 0;
-func_scs->setParamAttrs(func_scs_PAL);
-
-Function* func_sinf = new Function(
-  /*Type=*/FuncTy_12,
-  /*Linkage=*/GlobalValue::ExternalLinkage,
-  /*Name=*/"sinf", mod); // (external, no body)
-func_sinf->setCallingConv(CallingConv::C);
-const ParamAttrsList *func_sinf_PAL = 0;
-func_sinf->setParamAttrs(func_sinf_PAL);
-
-Function* func_vsin = new Function(
-  /*Type=*/FuncTy_5,
-  /*Linkage=*/GlobalValue::WeakLinkage,
-  /*Name=*/"vsin", mod); 
-func_vsin->setCallingConv(CallingConv::C);
-const ParamAttrsList *func_vsin_PAL = 0;
-func_vsin->setParamAttrs(func_vsin_PAL);
-
-Function* func_kilp = new Function(
-  /*Type=*/FuncTy_13,
-  /*Linkage=*/GlobalValue::WeakLinkage,
-  /*Name=*/"kilp", mod); 
-func_kilp->setCallingConv(CallingConv::C);
-const ParamAttrsList *func_kilp_PAL = 0;
-{
-  ParamAttrsVector Attrs;
-  ParamAttrsWithIndex PAWI;
-  PAWI.index = 0; PAWI.attrs = 0  | ParamAttr::NoUnwind;
-  Attrs.push_back(PAWI);
-  func_kilp_PAL = ParamAttrsList::get(Attrs);
-  
-}
-func_kilp->setParamAttrs(func_kilp_PAL);
-
-// Global Variable Declarations
-
-
-GlobalVariable* gvar_array__str = new GlobalVariable(
-/*Type=*/ArrayTy_0,
-/*isConstant=*/true,
-/*Linkage=*/GlobalValue::InternalLinkage,
-/*Initializer=*/0, // has initializer, specified below
-/*Name=*/".str",
-mod);
-
-GlobalVariable* gvar_array__str1 = new GlobalVariable(
-/*Type=*/ArrayTy_0,
-/*isConstant=*/true,
-/*Linkage=*/GlobalValue::InternalLinkage,
-/*Initializer=*/0, // has initializer, specified below
-/*Name=*/".str1",
-mod);
-
-// Constant Definitions
-Constant* const_array_14 = ConstantArray::get("VEC IN   is %f %f %f %f\x0A", true);
-Constant* const_array_15 = ConstantArray::get("VEC OUT  is %f %f %f %f\x0A", true);
-ConstantFP* const_float_16 = ConstantFP::get(Type::FloatTy, APFloat(-1.280000e+02f));
-ConstantFP* const_float_17 = ConstantFP::get(Type::FloatTy, APFloat(1.280000e+02f));
-Constant* const_float_18 = Constant::getNullValue(Type::FloatTy);
-Constant* const_int32_19 = Constant::getNullValue(IntegerType::get(32));
-std::vector<Constant*> const_packed_20_elems;
-ConstantFP* const_float_21 = ConstantFP::get(Type::FloatTy, APFloat(1.000000e+00f));
-const_packed_20_elems.push_back(const_float_21);
-UndefValue* const_float_22 = UndefValue::get(Type::FloatTy);
-const_packed_20_elems.push_back(const_float_22);
-const_packed_20_elems.push_back(const_float_22);
-const_packed_20_elems.push_back(const_float_21);
-Constant* const_packed_20 = ConstantVector::get(VectorTy_4, const_packed_20_elems);
-ConstantInt* const_int32_23 = ConstantInt::get(APInt(32,  "1", 10));
-ConstantInt* const_int32_24 = ConstantInt::get(APInt(32,  "3", 10));
-ConstantInt* const_int32_25 = ConstantInt::get(APInt(32,  "2", 10));
-std::vector<Constant*> const_packed_26_elems;
-const_packed_26_elems.push_back(const_float_21);
-const_packed_26_elems.push_back(const_float_18);
-const_packed_26_elems.push_back(const_float_18);
-const_packed_26_elems.push_back(const_float_21);
-Constant* const_packed_26 = ConstantVector::get(VectorTy_4, const_packed_26_elems);
-Constant* const_double_27 = Constant::getNullValue(Type::DoubleTy);
-std::vector<Constant*> const_packed_28_elems;
-const_packed_28_elems.push_back(const_int32_19);
-ConstantInt* const_int32_29 = ConstantInt::get(APInt(32,  "5", 10));
-const_packed_28_elems.push_back(const_int32_29);
-const_packed_28_elems.push_back(const_int32_25);
-const_packed_28_elems.push_back(const_int32_24);
-Constant* const_packed_28 = ConstantVector::get(VectorTy_7, const_packed_28_elems);
-std::vector<Constant*> const_packed_30_elems;
-const_packed_30_elems.push_back(const_int32_19);
-const_packed_30_elems.push_back(const_int32_23);
-ConstantInt* const_int32_31 = ConstantInt::get(APInt(32,  "6", 10));
-const_packed_30_elems.push_back(const_int32_31);
-const_packed_30_elems.push_back(const_int32_24);
-Constant* const_packed_30 = ConstantVector::get(VectorTy_7, const_packed_30_elems);
-std::vector<Constant*> const_packed_32_elems;
-const_packed_32_elems.push_back(const_int32_19);
-const_packed_32_elems.push_back(const_int32_23);
-const_packed_32_elems.push_back(const_int32_25);
-ConstantInt* const_int32_33 = ConstantInt::get(APInt(32,  "7", 10));
-const_packed_32_elems.push_back(const_int32_33);
-Constant* const_packed_32 = ConstantVector::get(VectorTy_7, const_packed_32_elems);
-std::vector<Constant*> const_ptr_34_indices;
-const_ptr_34_indices.push_back(const_int32_19);
-const_ptr_34_indices.push_back(const_int32_19);
-Constant* const_ptr_34 = ConstantExpr::getGetElementPtr(gvar_array__str, &const_ptr_34_indices[0], const_ptr_34_indices.size() );
-UndefValue* const_packed_35 = UndefValue::get(VectorTy_4);
-std::vector<Constant*> const_ptr_36_indices;
-const_ptr_36_indices.push_back(const_int32_19);
-const_ptr_36_indices.push_back(const_int32_19);
-Constant* const_ptr_36 = ConstantExpr::getGetElementPtr(gvar_array__str1, &const_ptr_36_indices[0], const_ptr_36_indices.size() );
-
-// Global Variable Definitions
-gvar_array__str->setInitializer(const_array_14);
-gvar_array__str1->setInitializer(const_array_15);
-
-// Function Definitions
-
-// Function: approx (func_approx)
-{
-  Function::arg_iterator args = func_approx->arg_begin();
-  Value* float_a = args++;
-  float_a->setName("a");
-  Value* float_b = args++;
-  float_b->setName("b");
-  
-  BasicBlock* label_entry = new BasicBlock("entry",func_approx,0);
-  
-  // Block entry (label_entry)
-  FCmpInst* int1_cmp = new FCmpInst(FCmpInst::FCMP_OLT, float_b, const_float_16, "cmp", label_entry);
-  SelectInst* float_b_addr_0 = new SelectInst(int1_cmp, const_float_16, float_b, "b.addr.0", label_entry);
-  FCmpInst* int1_cmp3 = new FCmpInst(FCmpInst::FCMP_OGT, float_b_addr_0, const_float_17, "cmp3", label_entry);
-  SelectInst* float_b_addr_1 = new SelectInst(int1_cmp3, const_float_17, float_b_addr_0, "b.addr.1", label_entry);
-  FCmpInst* int1_cmp7 = new FCmpInst(FCmpInst::FCMP_OLT, float_a, const_float_18, "cmp7", label_entry);
-  SelectInst* float_a_addr_0 = new SelectInst(int1_cmp7, const_float_18, float_a, "a.addr.0", label_entry);
-  std::vector<Value*> float_call_params;
-  float_call_params.push_back(float_a_addr_0);
-  float_call_params.push_back(float_b_addr_1);
-  CallInst* float_call = new CallInst(func_powf, float_call_params.begin(), float_call_params.end(), "call", label_entry);
-  float_call->setCallingConv(CallingConv::C);
-  float_call->setTailCall(true);const ParamAttrsList *float_call_PAL = 0;
-  float_call->setParamAttrs(float_call_PAL);
-  
-  new ReturnInst(float_call, label_entry);
-  
-}
-
-// Function: lit (func_lit)
-{
-  Function::arg_iterator args = func_lit->arg_begin();
-  Value* packed_tmp = args++;
-  packed_tmp->setName("tmp");
-  
-  BasicBlock* label_entry_38 = new BasicBlock("entry",func_lit,0);
-  BasicBlock* label_ifthen = new BasicBlock("ifthen",func_lit,0);
-  BasicBlock* label_UnifiedReturnBlock = new BasicBlock("UnifiedReturnBlock",func_lit,0);
-  
-  // Block entry (label_entry_38)
-  ExtractElementInst* float_tmp6 = new ExtractElementInst(packed_tmp, const_int32_19, "tmp6", label_entry_38);
-  FCmpInst* int1_cmp_39 = new FCmpInst(FCmpInst::FCMP_OGT, float_tmp6, const_float_18, "cmp", label_entry_38);
-  new BranchInst(label_ifthen, label_UnifiedReturnBlock, int1_cmp_39, label_entry_38);
-  
-  // Block ifthen (label_ifthen)
-  InsertElementInst* packed_tmp10 = new InsertElementInst(const_packed_20, float_tmp6, const_int32_23, "tmp10", label_ifthen);
-  ExtractElementInst* float_tmp12 = new ExtractElementInst(packed_tmp, const_int32_23, "tmp12", label_ifthen);
-  ExtractElementInst* float_tmp14 = new ExtractElementInst(packed_tmp, const_int32_24, "tmp14", label_ifthen);
-  std::vector<Value*> float_call_41_params;
-  float_call_41_params.push_back(float_tmp12);
-  float_call_41_params.push_back(float_tmp14);
-  CallInst* float_call_41 = new CallInst(func_approx, float_call_41_params.begin(), float_call_41_params.end(), "call", label_ifthen);
-  float_call_41->setCallingConv(CallingConv::C);
-  float_call_41->setTailCall(true);const ParamAttrsList *float_call_41_PAL = 0;
-  float_call_41->setParamAttrs(float_call_41_PAL);
-  
-  InsertElementInst* packed_tmp16 = new InsertElementInst(packed_tmp10, float_call_41, const_int32_25, "tmp16", label_ifthen);
-  new ReturnInst(packed_tmp16, label_ifthen);
-  
-  // Block UnifiedReturnBlock (label_UnifiedReturnBlock)
-  new ReturnInst(const_packed_26, label_UnifiedReturnBlock);
-  
-}
-
-// Function: cmp (func_cmp)
-{
-  Function::arg_iterator args = func_cmp->arg_begin();
-  Value* packed_tmp0 = args++;
-  packed_tmp0->setName("tmp0");
-  Value* packed_tmp1 = args++;
-  packed_tmp1->setName("tmp1");
-  Value* packed_tmp2 = args++;
-  packed_tmp2->setName("tmp2");
-  
-  BasicBlock* label_entry_44 = new BasicBlock("entry",func_cmp,0);
-  BasicBlock* label_cond__14 = new BasicBlock("cond.?14",func_cmp,0);
-  BasicBlock* label_cond_cont20 = new BasicBlock("cond.cont20",func_cmp,0);
-  BasicBlock* label_cond__28 = new BasicBlock("cond.?28",func_cmp,0);
-  BasicBlock* label_cond_cont34 = new BasicBlock("cond.cont34",func_cmp,0);
-  BasicBlock* label_cond__42 = new BasicBlock("cond.?42",func_cmp,0);
-  BasicBlock* label_cond_cont48 = new BasicBlock("cond.cont48",func_cmp,0);
-  
-  // Block entry (label_entry_44)
-  ExtractElementInst* float_tmp3 = new ExtractElementInst(packed_tmp0, const_int32_19, "tmp3", label_entry_44);
-  CastInst* double_conv = new FPExtInst(float_tmp3, Type::DoubleTy, "conv", label_entry_44);
-  FCmpInst* int1_cmp_45 = new FCmpInst(FCmpInst::FCMP_OLT, double_conv, const_double_27, "cmp", label_entry_44);
-  ExtractElementInst* float_tmp11 = new ExtractElementInst(packed_tmp0, const_int32_23, "tmp11", label_entry_44);
-  CastInst* double_conv12 = new FPExtInst(float_tmp11, Type::DoubleTy, "conv12", label_entry_44);
-  FCmpInst* int1_cmp13 = new FCmpInst(FCmpInst::FCMP_OLT, double_conv12, const_double_27, "cmp13", label_entry_44);
-  SelectInst* packed_tmp1_tmp2 = new SelectInst(int1_cmp_45, packed_tmp1, packed_tmp2, "tmp1.tmp2", label_entry_44);
-  new BranchInst(label_cond__14, label_cond_cont20, int1_cmp13, label_entry_44);
-  
-  // Block cond.?14 (label_cond__14)
-  ShuffleVectorInst* packed_tmp233 = new ShuffleVectorInst(packed_tmp1_tmp2, packed_tmp1, const_packed_28, "tmp233", label_cond__14);
-  ExtractElementInst* float_tmp254 = new ExtractElementInst(packed_tmp0, const_int32_25, "tmp254", label_cond__14);
-  CastInst* double_conv265 = new FPExtInst(float_tmp254, Type::DoubleTy, "conv265", label_cond__14);
-  FCmpInst* int1_cmp276 = new FCmpInst(FCmpInst::FCMP_OLT, double_conv265, const_double_27, "cmp276", label_cond__14);
-  new BranchInst(label_cond__28, label_cond_cont34, int1_cmp276, label_cond__14);
-  
-  // Block cond.cont20 (label_cond_cont20)
-  ShuffleVectorInst* packed_tmp23 = new ShuffleVectorInst(packed_tmp1_tmp2, packed_tmp2, const_packed_28, "tmp23", label_cond_cont20);
-  ExtractElementInst* float_tmp25 = new ExtractElementInst(packed_tmp0, const_int32_25, "tmp25", label_cond_cont20);
-  CastInst* double_conv26 = new FPExtInst(float_tmp25, Type::DoubleTy, "conv26", label_cond_cont20);
-  FCmpInst* int1_cmp27 = new FCmpInst(FCmpInst::FCMP_OLT, double_conv26, const_double_27, "cmp27", label_cond_cont20);
-  new BranchInst(label_cond__28, label_cond_cont34, int1_cmp27, label_cond_cont20);
-  
-  // Block cond.?28 (label_cond__28)
-  PHINode* packed_tmp23_reg2mem_0 = new PHINode(VectorTy_4, "tmp23.reg2mem.0", label_cond__28);
-  packed_tmp23_reg2mem_0->reserveOperandSpace(2);
-  packed_tmp23_reg2mem_0->addIncoming(packed_tmp233, label_cond__14);
-  packed_tmp23_reg2mem_0->addIncoming(packed_tmp23, label_cond_cont20);
-  
-  ShuffleVectorInst* packed_tmp378 = new ShuffleVectorInst(packed_tmp23_reg2mem_0, packed_tmp1, const_packed_30, "tmp378", label_cond__28);
-  ExtractElementInst* float_tmp399 = new ExtractElementInst(packed_tmp0, const_int32_24, "tmp399", label_cond__28);
-  CastInst* double_conv4010 = new FPExtInst(float_tmp399, Type::DoubleTy, "conv4010", label_cond__28);
-  FCmpInst* int1_cmp4111 = new FCmpInst(FCmpInst::FCMP_OLT, double_conv4010, const_double_27, "cmp4111", label_cond__28);
-  new BranchInst(label_cond__42, label_cond_cont48, int1_cmp4111, label_cond__28);
-  
-  // Block cond.cont34 (label_cond_cont34)
-  PHINode* packed_tmp23_reg2mem_1 = new PHINode(VectorTy_4, "tmp23.reg2mem.1", label_cond_cont34);
-  packed_tmp23_reg2mem_1->reserveOperandSpace(2);
-  packed_tmp23_reg2mem_1->addIncoming(packed_tmp233, label_cond__14);
-  packed_tmp23_reg2mem_1->addIncoming(packed_tmp23, label_cond_cont20);
-  
-  ShuffleVectorInst* packed_tmp37 = new ShuffleVectorInst(packed_tmp23_reg2mem_1, packed_tmp2, const_packed_30, "tmp37", label_cond_cont34);
-  ExtractElementInst* float_tmp39 = new ExtractElementInst(packed_tmp0, const_int32_24, "tmp39", label_cond_cont34);
-  CastInst* double_conv40 = new FPExtInst(float_tmp39, Type::DoubleTy, "conv40", label_cond_cont34);
-  FCmpInst* int1_cmp41 = new FCmpInst(FCmpInst::FCMP_OLT, double_conv40, const_double_27, "cmp41", label_cond_cont34);
-  new BranchInst(label_cond__42, label_cond_cont48, int1_cmp41, label_cond_cont34);
-  
-  // Block cond.?42 (label_cond__42)
-  PHINode* packed_tmp37_reg2mem_0 = new PHINode(VectorTy_4, "tmp37.reg2mem.0", label_cond__42);
-  packed_tmp37_reg2mem_0->reserveOperandSpace(2);
-  packed_tmp37_reg2mem_0->addIncoming(packed_tmp378, label_cond__28);
-  packed_tmp37_reg2mem_0->addIncoming(packed_tmp37, label_cond_cont34);
-  
-  ShuffleVectorInst* packed_tmp5113 = new ShuffleVectorInst(packed_tmp37_reg2mem_0, packed_tmp1, const_packed_32, "tmp5113", label_cond__42);
-  new ReturnInst(packed_tmp5113, label_cond__42);
-  
-  // Block cond.cont48 (label_cond_cont48)
-  PHINode* packed_tmp37_reg2mem_1 = new PHINode(VectorTy_4, "tmp37.reg2mem.1", label_cond_cont48);
-  packed_tmp37_reg2mem_1->reserveOperandSpace(2);
-  packed_tmp37_reg2mem_1->addIncoming(packed_tmp378, label_cond__28);
-  packed_tmp37_reg2mem_1->addIncoming(packed_tmp37, label_cond_cont34);
-  
-  ShuffleVectorInst* packed_tmp51 = new ShuffleVectorInst(packed_tmp37_reg2mem_1, packed_tmp2, const_packed_32, "tmp51", label_cond_cont48);
-  new ReturnInst(packed_tmp51, label_cond_cont48);
-  
-}
-
-// Function: vcos (func_vcos)
-{
-  Function::arg_iterator args = func_vcos->arg_begin();
-  Value* packed_val = args++;
-  packed_val->setName("val");
-  
-  BasicBlock* label_entry_53 = new BasicBlock("entry",func_vcos,0);
-  
-  // Block entry (label_entry_53)
-  ExtractElementInst* float_tmp1 = new ExtractElementInst(packed_val, const_int32_19, "tmp1", label_entry_53);
-  CastInst* double_conv_54 = new FPExtInst(float_tmp1, Type::DoubleTy, "conv", label_entry_53);
-  ExtractElementInst* float_tmp3_55 = new ExtractElementInst(packed_val, const_int32_23, "tmp3", label_entry_53);
-  CastInst* double_conv4 = new FPExtInst(float_tmp3_55, Type::DoubleTy, "conv4", label_entry_53);
-  ExtractElementInst* float_tmp6_56 = new ExtractElementInst(packed_val, const_int32_25, "tmp6", label_entry_53);
-  CastInst* double_conv7 = new FPExtInst(float_tmp6_56, Type::DoubleTy, "conv7", label_entry_53);
-  ExtractElementInst* float_tmp9 = new ExtractElementInst(packed_val, const_int32_24, "tmp9", label_entry_53);
-  CastInst* double_conv10 = new FPExtInst(float_tmp9, Type::DoubleTy, "conv10", label_entry_53);
-  std::vector<Value*> int32_call_params;
-  int32_call_params.push_back(const_ptr_34);
-  int32_call_params.push_back(double_conv_54);
-  int32_call_params.push_back(double_conv4);
-  int32_call_params.push_back(double_conv7);
-  int32_call_params.push_back(double_conv10);
-  CallInst* int32_call = new CallInst(func_printf, int32_call_params.begin(), int32_call_params.end(), "call", label_entry_53);
-  int32_call->setCallingConv(CallingConv::C);
-  int32_call->setTailCall(true);const ParamAttrsList *int32_call_PAL = 0;
-  int32_call->setParamAttrs(int32_call_PAL);
-  
-  CallInst* float_call13 = new CallInst(func_cosf, float_tmp1, "call13", label_entry_53);
-  float_call13->setCallingConv(CallingConv::C);
-  float_call13->setTailCall(true);const ParamAttrsList *float_call13_PAL = 0;
-  float_call13->setParamAttrs(float_call13_PAL);
-  
-  InsertElementInst* packed_tmp15 = new InsertElementInst(const_packed_35, float_call13, const_int32_19, "tmp15", label_entry_53);
-  CallInst* float_call18 = new CallInst(func_cosf, float_tmp1, "call18", label_entry_53);
-  float_call18->setCallingConv(CallingConv::C);
-  float_call18->setTailCall(true);const ParamAttrsList *float_call18_PAL = 0;
-  float_call18->setParamAttrs(float_call18_PAL);
-  
-  InsertElementInst* packed_tmp20 = new InsertElementInst(packed_tmp15, float_call18, const_int32_23, "tmp20", label_entry_53);
-  CallInst* float_call23 = new CallInst(func_cosf, float_tmp1, "call23", label_entry_53);
-  float_call23->setCallingConv(CallingConv::C);
-  float_call23->setTailCall(true);const ParamAttrsList *float_call23_PAL = 0;
-  float_call23->setParamAttrs(float_call23_PAL);
-  
-  InsertElementInst* packed_tmp25 = new InsertElementInst(packed_tmp20, float_call23, const_int32_25, "tmp25", label_entry_53);
-  CallInst* float_call28 = new CallInst(func_cosf, float_tmp1, "call28", label_entry_53);
-  float_call28->setCallingConv(CallingConv::C);
-  float_call28->setTailCall(true);const ParamAttrsList *float_call28_PAL = 0;
-  float_call28->setParamAttrs(float_call28_PAL);
-  
-  InsertElementInst* packed_tmp30 = new InsertElementInst(packed_tmp25, float_call28, const_int32_24, "tmp30", label_entry_53);
-  CastInst* double_conv33 = new FPExtInst(float_call13, Type::DoubleTy, "conv33", label_entry_53);
-  CastInst* double_conv36 = new FPExtInst(float_call18, Type::DoubleTy, "conv36", label_entry_53);
-  CastInst* double_conv39 = new FPExtInst(float_call23, Type::DoubleTy, "conv39", label_entry_53);
-  CastInst* double_conv42 = new FPExtInst(float_call28, Type::DoubleTy, "conv42", label_entry_53);
-  std::vector<Value*> int32_call43_params;
-  int32_call43_params.push_back(const_ptr_36);
-  int32_call43_params.push_back(double_conv33);
-  int32_call43_params.push_back(double_conv36);
-  int32_call43_params.push_back(double_conv39);
-  int32_call43_params.push_back(double_conv42);
-  CallInst* int32_call43 = new CallInst(func_printf, int32_call43_params.begin(), int32_call43_params.end(), "call43", label_entry_53);
-  int32_call43->setCallingConv(CallingConv::C);
-  int32_call43->setTailCall(true);const ParamAttrsList *int32_call43_PAL = 0;
-  int32_call43->setParamAttrs(int32_call43_PAL);
-  
-  new ReturnInst(packed_tmp30, label_entry_53);
-  
-}
-
-// Function: scs (func_scs)
-{
-  Function::arg_iterator args = func_scs->arg_begin();
-  Value* packed_val_58 = args++;
-  packed_val_58->setName("val");
-  
-  BasicBlock* label_entry_59 = new BasicBlock("entry",func_scs,0);
-  
-  // Block entry (label_entry_59)
-  ExtractElementInst* float_tmp2 = new ExtractElementInst(packed_val_58, const_int32_19, "tmp2", label_entry_59);
-  CallInst* float_call_60 = new CallInst(func_cosf, float_tmp2, "call", label_entry_59);
-  float_call_60->setCallingConv(CallingConv::C);
-  float_call_60->setTailCall(true);const ParamAttrsList *float_call_60_PAL = 0;
-  float_call_60->setParamAttrs(float_call_60_PAL);
-  
-  InsertElementInst* packed_tmp5 = new InsertElementInst(const_packed_35, float_call_60, const_int32_19, "tmp5", label_entry_59);
-  CallInst* float_call7 = new CallInst(func_sinf, float_tmp2, "call7", label_entry_59);
-  float_call7->setCallingConv(CallingConv::C);
-  float_call7->setTailCall(true);const ParamAttrsList *float_call7_PAL = 0;
-  float_call7->setParamAttrs(float_call7_PAL);
-  
-  InsertElementInst* packed_tmp9 = new InsertElementInst(packed_tmp5, float_call7, const_int32_23, "tmp9", label_entry_59);
-  new ReturnInst(packed_tmp9, label_entry_59);
-  
-}
-
-// Function: vsin (func_vsin)
-{
-  Function::arg_iterator args = func_vsin->arg_begin();
-  Value* packed_val_62 = args++;
-  packed_val_62->setName("val");
-  
-  BasicBlock* label_entry_63 = new BasicBlock("entry",func_vsin,0);
-  
-  // Block entry (label_entry_63)
-  ExtractElementInst* float_tmp2_64 = new ExtractElementInst(packed_val_62, const_int32_19, "tmp2", label_entry_63);
-  CallInst* float_call_65 = new CallInst(func_sinf, float_tmp2_64, "call", label_entry_63);
-  float_call_65->setCallingConv(CallingConv::C);
-  float_call_65->setTailCall(true);const ParamAttrsList *float_call_65_PAL = 0;
-  float_call_65->setParamAttrs(float_call_65_PAL);
-  
-  InsertElementInst* packed_tmp6 = new InsertElementInst(const_packed_35, float_call_65, const_int32_19, "tmp6", label_entry_63);
-  InsertElementInst* packed_tmp9_66 = new InsertElementInst(packed_tmp6, float_call_65, const_int32_23, "tmp9", label_entry_63);
-  InsertElementInst* packed_tmp12 = new InsertElementInst(packed_tmp9_66, float_call_65, const_int32_25, "tmp12", label_entry_63);
-  InsertElementInst* packed_tmp15_67 = new InsertElementInst(packed_tmp12, float_call_65, const_int32_24, "tmp15", label_entry_63);
-  new ReturnInst(packed_tmp15_67, label_entry_63);
-  
-}
-
-// Function: kilp (func_kilp)
-{
-  Function::arg_iterator args = func_kilp->arg_begin();
-  Value* packed_val_69 = args++;
-  packed_val_69->setName("val");
-  
-  BasicBlock* label_entry_70 = new BasicBlock("entry",func_kilp,0);
-  BasicBlock* label_lor_rhs = new BasicBlock("lor_rhs",func_kilp,0);
-  BasicBlock* label_lor_rhs5 = new BasicBlock("lor_rhs5",func_kilp,0);
-  BasicBlock* label_lor_rhs11 = new BasicBlock("lor_rhs11",func_kilp,0);
-  BasicBlock* label_UnifiedReturnBlock_71 = new BasicBlock("UnifiedReturnBlock",func_kilp,0);
-  
-  // Block entry (label_entry_70)
-  ExtractElementInst* float_tmp1_72 = new ExtractElementInst(packed_val_69, const_int32_19, "tmp1", label_entry_70);
-  FCmpInst* int1_cmp_73 = new FCmpInst(FCmpInst::FCMP_OLT, float_tmp1_72, const_float_18, "cmp", label_entry_70);
-  new BranchInst(label_UnifiedReturnBlock_71, label_lor_rhs, int1_cmp_73, label_entry_70);
-  
-  // Block lor_rhs (label_lor_rhs)
-  ExtractElementInst* float_tmp3_75 = new ExtractElementInst(packed_val_69, const_int32_23, "tmp3", label_lor_rhs);
-  FCmpInst* int1_cmp4 = new FCmpInst(FCmpInst::FCMP_OLT, float_tmp3_75, const_float_18, "cmp4", label_lor_rhs);
-  new BranchInst(label_UnifiedReturnBlock_71, label_lor_rhs5, int1_cmp4, label_lor_rhs);
-  
-  // Block lor_rhs5 (label_lor_rhs5)
-  ExtractElementInst* float_tmp7 = new ExtractElementInst(packed_val_69, const_int32_25, "tmp7", label_lor_rhs5);
-  FCmpInst* int1_cmp8 = new FCmpInst(FCmpInst::FCMP_OLT, float_tmp7, const_float_18, "cmp8", label_lor_rhs5);
-  new BranchInst(label_UnifiedReturnBlock_71, label_lor_rhs11, int1_cmp8, label_lor_rhs5);
-  
-  // Block lor_rhs11 (label_lor_rhs11)
-  ExtractElementInst* float_tmp13 = new ExtractElementInst(packed_val_69, const_int32_24, "tmp13", label_lor_rhs11);
-  FCmpInst* int1_cmp14 = new FCmpInst(FCmpInst::FCMP_OLT, float_tmp13, const_float_18, "cmp14", label_lor_rhs11);
-  CastInst* int32_retval = new ZExtInst(int1_cmp14, IntegerType::get(32), "retval", label_lor_rhs11);
-  new ReturnInst(int32_retval, label_lor_rhs11);
-  
-  // Block UnifiedReturnBlock (label_UnifiedReturnBlock_71)
-  new ReturnInst(const_int32_23, label_UnifiedReturnBlock_71);
-  
-}
-
-return mod;
-
-}
+static const unsigned char llvm_builtins_data[] = {
+0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x2b,0x02,0x00,0x00,0x01,0x10,0x00,0x00,
+0x10,0x00,0x00,0x00,0x07,0x81,0x23,0x91,0x41,0xc8,0x04,0x49,0x06,0x10,0x32,0x39,
+0x92,0x01,0x84,0x0c,0x25,0x05,0x08,0x19,0x1e,0x04,0x8b,0x62,0x80,0x14,0x45,0x02,
+0x42,0x92,0x0b,0x42,0xa4,0x10,0x32,0x14,0x38,0x08,0x18,0x49,0x0a,0x32,0x44,0x24,
+0x48,0x0a,0x90,0x21,0x23,0x44,0x72,0x80,0x8c,0x14,0x21,0x86,0x0a,0x8a,0x0a,0x64,
+0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x02,0x00,0x00,0x00,0x0b,0x04,0x00,0x0c,
+0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x13,0x00,0x00,0x00,0x32,0x22,0x48,0x09,
+0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45,0xc6,0x05,0x42,0x52,
+0x26,0x08,0xb0,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83,0x21,0x80,0x39,0x82,
+0x60,0x0a,0x80,0x2e,0xd5,0x61,0x04,0x42,0x20,0x49,0x90,0x22,0x4d,0xa2,0x73,0x04,
+0x08,0xb9,0x32,0x00,0x00,0x8a,0x10,0xc2,0x65,0xb8,0x42,0x84,0x10,0x42,0x0d,0x44,
+0x11,0x00,0x18,0x01,0x28,0x82,0x08,0x00,0x13,0xa2,0x74,0xb0,0x03,0x3c,0xb0,0x83,
+0x36,0x80,0x87,0x71,0x68,0x03,0x76,0x48,0x07,0x77,0xa8,0x07,0x7c,0x68,0x83,0x73,
+0x70,0x87,0x7a,0xd8,0x70,0x0f,0xe5,0xd0,0x06,0xf0,0xa0,0x07,0x73,0x20,0x07,0x7a,
+0x30,0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x71,0xa0,0x07,0x78,0xa0,
+0x07,0x78,0xd0,0x06,0xe9,0x80,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e,
+0x71,0x60,0x07,0x7a,0x10,0x07,0x76,0xa0,0x07,0x71,0x60,0x07,0x6d,0x90,0x0e,0x73,
+0x20,0x07,0x7a,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x76,0x40,
+0x07,0x7a,0x30,0x07,0x72,0xa0,0x07,0x76,0x40,0x07,0x6d,0x60,0x0e,0x73,0x20,0x07,
+0x7a,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x6d,0x60,0x0e,0x76,0x40,0x07,0x7a,
+0x30,0x07,0x72,0xa0,0x07,0x76,0x40,0x07,0x6d,0x60,0x0f,0x76,0x40,0x07,0x7a,0x60,
+0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x6d,0x60,0x0f,0x71,0x20,0x07,0x78,0xa0,0x07,
+0x71,0x20,0x07,0x78,0xa0,0x07,0x71,0x20,0x07,0x78,0xd0,0x06,0xe1,0x00,0x07,0x7a,
+0x00,0x07,0x7a,0x60,0x07,0x74,0xd0,0x06,0xe6,0x80,0x07,0x70,0xa0,0x07,0x71,0x20,
+0x07,0x78,0xa0,0x07,0x71,0x20,0x07,0x78,0xa0,0xf3,0x40,0x88,0x04,0x32,0x32,0x02,
+0x04,0x60,0x76,0xc6,0xfc,0x6c,0x48,0xa2,0x00,0x40,0x00,0x00,0x00,0x00,0x0c,0x49,
+0x14,0x20,0x00,0x00,0x00,0x00,0x80,0x21,0xc9,0x02,0x00,0x01,0x00,0x00,0x00,0x30,
+0x24,0x61,0x00,0x20,0x08,0x00,0x00,0x00,0x86,0x24,0x0b,0x00,0x04,0x00,0x00,0x00,
+0xc0,0x90,0xa4,0x01,0x02,0x00,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,
+0x00,0x00,0x43,0x92,0x05,0x00,0x02,0x00,0x00,0x00,0x60,0x48,0x72,0x00,0x01,0x00,
+0x00,0x00,0x00,0x0c,0x49,0x16,0x00,0x08,0x00,0x00,0x00,0x80,0x21,0x89,0x01,0x00,
+0x41,0x00,0x00,0x00,0x90,0x05,0x02,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,
+0x19,0x11,0x4c,0x90,0x8c,0x09,0x26,0x47,0xc6,0x04,0x43,0x92,0x8a,0x59,0x8b,0x43,
+0x50,0xd2,0x09,0x02,0x81,0xd2,0x73,0x50,0xc9,0x0c,0x2a,0x99,0x41,0x25,0x33,0xa8,
+0x64,0x56,0x28,0x66,0x2d,0x0e,0x41,0xcf,0x2a,0x15,0x04,0x4a,0xcf,0x41,0x25,0x33,
+0xa8,0x64,0x06,0x95,0xcc,0xa0,0x92,0x59,0x01,0x00,0x00,0x00,0x53,0x82,0x26,0x0c,
+0x04,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,
+0x05,0x00,0x00,0x00,0x04,0xc6,0x08,0x40,0x10,0x04,0xe1,0x70,0x18,0x23,0x00,0x41,
+0x10,0x84,0xc3,0x60,0x04,0x00,0x00,0x00,0x93,0x0c,0xce,0x43,0x4c,0x31,0x3c,0x8e,
+0x34,0xc9,0x30,0x41,0xc2,0x14,0x03,0x34,0x51,0x93,0x0c,0x4d,0x44,0x4c,0x31,0x44,
+0x8d,0x35,0x56,0x01,0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,
+0x46,0x41,0x08,0xcc,0x73,0x9b,0x05,0x21,0x30,0xcf,0x6e,0x18,0x84,0x00,0x2c,0x8b,
+0x35,0x04,0x80,0x39,0x04,0x81,0x5d,0x20,0x80,0x0f,0x0c,0x43,0xe4,0xd3,0x36,0x81,
+0x04,0x3e,0x30,0x0c,0x91,0x4f,0x5b,0x05,0x12,0xf8,0xc0,0x30,0x44,0x7e,0x7d,0x00,
+0x05,0xd1,0x4c,0x11,0x66,0x12,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+0x61,0x20,0x00,0x00,0x2a,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,
+0x08,0x00,0x00,0x00,0x24,0x8a,0xa0,0x0c,0x46,0x00,0x4a,0x80,0xc2,0x1c,0x84,0x55,
+0x55,0xd6,0x1c,0x84,0x45,0x51,0x16,0x81,0x19,0x80,0x11,0x80,0x31,0x02,0x10,0x04,
+0x41,0xfc,0x03,0x00,0x63,0x08,0x0d,0x34,0xc9,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,
+0x73,0x0c,0xd3,0x15,0x8d,0x21,0x34,0xd1,0x18,0x42,0xf3,0x8c,0x55,0x00,0x81,0xa0,
+0x6d,0x73,0x0c,0x19,0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x13,0x00,0x00,0x00,
+0x17,0x60,0x20,0xc5,0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d,0x14,0x13,
+0xf3,0xd4,0xb8,0x69,0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4,0xba,0x35,
+0x0c,0x13,0xf3,0x9c,0x80,0xe4,0x36,0x48,0x81,0x10,0xc3,0x4a,0x4c,0x54,0xd4,0x6c,
+0x8b,0x23,0x28,0x76,0x41,0x4c,0xcc,0xa3,0x1b,0x07,0x21,0x00,0xcb,0x72,0x00,0x05,
+0xd1,0x4c,0x11,0x66,0x18,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+0x61,0x20,0x00,0x00,0x82,0x00,0x00,0x00,0x13,0x04,0x47,0x2c,0x10,0x00,0x00,0x00,
+0x08,0x00,0x00,0x00,0x24,0x46,0x00,0x8a,0xa0,0x0c,0x4a,0xa0,0x14,0x8a,0xa1,0x1c,
+0x68,0x8c,0x00,0x10,0x9a,0x83,0x80,0xa8,0x48,0x9a,0x83,0x80,0xa6,0x4a,0x9a,0x83,
+0x80,0xa6,0xc8,0x02,0x63,0x08,0x0d,0x64,0xdb,0xc0,0x49,0x06,0xee,0x22,0xc6,0x10,
+0x9a,0xc9,0xbc,0x81,0x93,0x0c,0xdf,0x45,0x4c,0x31,0x38,0x4f,0x37,0xcb,0x10,0x08,
+0x60,0x30,0xc8,0x10,0x06,0x0e,0x36,0x86,0xd0,0x44,0x36,0x06,0x03,0x27,0x19,0xc8,
+0xe0,0x22,0x66,0x19,0x06,0xa2,0x0c,0x06,0x19,0xc2,0xe0,0xc1,0xc6,0x10,0x9a,0xc8,
+0xce,0x60,0xe0,0x24,0x03,0x1a,0x5c,0xc4,0x2c,0xc3,0x40,0xa4,0xc1,0x40,0x45,0x20,
+0x06,0x81,0x19,0x08,0x83,0x0c,0x6a,0xe0,0x64,0x63,0x08,0x8d,0x64,0x6c,0x30,0x70,
+0x92,0xa1,0x0d,0x2e,0x62,0x96,0xa1,0x30,0xdc,0x60,0xa0,0x22,0x10,0x83,0xc0,0x0c,
+0x84,0x41,0x86,0x37,0x78,0xb2,0x31,0x84,0x46,0xb2,0x38,0x18,0x38,0xc9,0x20,0x07,
+0x17,0x31,0xcb,0x50,0x18,0x73,0x30,0x50,0x11,0xac,0xc1,0x00,0x07,0xc4,0x20,0x03,
+0x1d,0x38,0x1a,0xd6,0xc1,0x40,0x45,0xb0,0x06,0x03,0x1c,0x10,0x83,0x0c,0x76,0xf0,
+0x68,0x78,0x07,0xe1,0x40,0x00,0x00,0x00,0x4c,0x00,0x00,0x00,0x56,0x62,0x08,0xcc,
+0x63,0xef,0x3a,0xa9,0x00,0x19,0x7b,0x73,0x23,0x73,0xf9,0xa1,0x91,0x31,0x98,0x62,
+0x62,0x9e,0x7b,0xb7,0x06,0x62,0x62,0x1e,0xda,0x1c,0x88,0x89,0x79,0x6a,0x7b,0x20,
+0x26,0xe6,0xb1,0x6d,0x83,0x98,0x98,0xe7,0x36,0x92,0x43,0x70,0x9a,0xca,0xd6,0x73,
+0xa3,0x79,0x26,0xe6,0xb9,0x77,0x3f,0x22,0x0c,0x9b,0x21,0x18,0x9f,0xb6,0x90,0x64,
+0x62,0x9e,0xda,0x9f,0x98,0xc7,0x36,0x9b,0x67,0x62,0x9e,0x7b,0xf7,0x23,0xc2,0xb0,
+0x19,0x82,0xf1,0x6b,0x53,0x79,0x26,0xe6,0xb1,0x6f,0x3f,0x22,0x0c,0x9b,0x21,0x18,
+0x9f,0xb6,0x98,0x62,0x62,0x9e,0xbb,0xb7,0x97,0x67,0x62,0x1e,0xfb,0xf6,0x23,0xc2,
+0xb0,0x19,0x82,0xf1,0x6b,0x43,0x31,0x04,0xa7,0xa9,0x6c,0xdd,0x66,0x0a,0x81,0x79,
+0xf0,0xfa,0x08,0x16,0xc1,0x69,0x06,0x5f,0x70,0x9a,0xe9,0xc6,0x49,0x01,0xc8,0xd8,
+0x9b,0x1b,0x99,0xcb,0x4f,0x0c,0x8d,0xad,0x18,0x13,0xf3,0xdc,0x3b,0x6f,0x35,0xc7,
+0xc4,0x3c,0x79,0x5d,0xdf,0x06,0x52,0x08,0xcc,0x53,0xdf,0x26,0x62,0x4c,0xcc,0x63,
+0xdf,0xb7,0xb9,0x1c,0x02,0xf3,0xe0,0x75,0x5d,0x5b,0xc7,0x20,0x30,0x8f,0x79,0x14,
+0x13,0xf3,0xd4,0xf5,0x19,0x2c,0x82,0xd3,0x0c,0xbe,0xe0,0x34,0x13,0xce,0x5b,0x0b,
+0x22,0x38,0x4d,0x85,0xd3,0x35,0x6d,0x37,0xc5,0xc4,0x3c,0x79,0x4d,0x1a,0x40,0xc6,
+0xde,0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x8c,0x83,0x10,0x9c,0xa6,0xb2,0x94,0x42,0x60,
+0x1e,0x7b,0x37,0x19,0x43,0x70,0x9a,0x0a,0xa7,0xcd,0xa4,0x98,0x98,0xc7,0xbe,0x8d,
+0xc5,0x98,0x98,0xe7,0xee,0x7b,0x3b,0x29,0x26,0xe6,0xb1,0xf3,0x13,0x58,0x04,0xa7,
+0x19,0x7c,0xc1,0x69,0x26,0x9b,0xb6,0x0f,0x43,0x70,0x9a,0xaa,0xb6,0x6d,0xc4,0x98,
+0x98,0xc7,0xce,0xf1,0x03,0x28,0x88,0x66,0x8a,0x30,0x00,0x00,0x00,0x00,0x00,0x00,
+0x61,0x20,0x00,0x00,0x4a,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,
+0x07,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xbd,
+0x61,0x8c,0x04,0x10,0x1e,0xe1,0x19,0xc6,0x48,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,
+0x63,0x08,0xcd,0x63,0xd5,0xc0,0x31,0x84,0x06,0xb2,0x6b,0xe0,0x18,0x42,0x13,0x59,
+0x36,0x70,0x0c,0xa1,0x71,0x6c,0x1b,0x38,0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,
+0x16,0x01,0x04,0x48,0x35,0xc7,0x20,0x79,0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,
+0x07,0x06,0xd0,0x58,0x04,0x10,0x20,0xd5,0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,
+0x04,0x48,0x35,0xc7,0x30,0x06,0x64,0xe0,0x98,0x37,0xd0,0xc0,0x60,0xa0,0x89,0xc1,
+0x40,0x23,0x83,0x81,0x63,0x21,0x40,0x70,0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,
+0x06,0xe1,0x40,0x00,0x25,0x00,0x00,0x00,0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,
+0x4c,0xcc,0x53,0xdb,0x05,0x31,0x31,0xcf,0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,
+0x13,0xf3,0xf4,0xd6,0x41,0x08,0xc0,0xb2,0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,
+0x21,0x38,0x4d,0xb5,0x9b,0x8a,0x21,0x00,0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,
+0xdd,0xb7,0x9d,0x18,0x82,0xd3,0x54,0xb7,0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,
+0x47,0x31,0x31,0x4f,0x9d,0x9b,0x87,0x21,0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,
+0xb2,0xd4,0xbc,0x59,0x10,0x82,0xd3,0x54,0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,
+0x14,0x13,0xf3,0xd8,0xb4,0x8d,0x14,0x13,0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,
+0xf6,0x6d,0x24,0x86,0x00,0x2c,0x8b,0xcd,0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,
+0x30,0x54,0xc0,0x72,0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,
+0x61,0x20,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,
+0x03,0x00,0x00,0x00,0x24,0x4a,0x60,0x04,0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,
+0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0x48,0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,
+0x28,0xd1,0x1c,0xc3,0x44,0x39,0x58,0x85,0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,
+0x16,0x41,0x4c,0xcc,0x63,0xdb,0x04,0x31,0x31,0x4f,0x6e,0x0d,0x43,0x05,0x2c,0x07,
+0x50,0x10,0xcd,0x14,0x61,0x56,0x41,0x4c,0xcc,0xd3,0x1b,0x45,0x21,0x00,0xcb,0xb2,
+0x9b,0x04,0x21,0x00,0xcb,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x1b,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
+0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,
+0x16,0x01,0x04,0xca,0x34,0xc7,0x20,0x51,0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,
+0x15,0xcd,0x31,0x5c,0x94,0x83,0x58,0x38,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
+0x86,0x51,0x4c,0xcc,0x53,0xe7,0x76,0x51,0x4c,0xcc,0x53,0xdb,0x36,0x41,0x4c,0xcc,
+0x63,0x5b,0x05,0x31,0x31,0x8f,0x6e,0x0d,0x43,0x05,0x2c,0x66,0x41,0x4c,0xcc,0xd3,
+0x1f,0x40,0x41,0x34,0x53,0x84,0x19,0x05,0x21,0x00,0xcb,0x02,0x00,0x00,0x00,0x00,
+0x61,0x20,0x00,0x00,0x2f,0x00,0x00,0x00,0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,
+0x03,0x00,0x00,0x00,0x24,0xca,0xa0,0x04,0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,
+0x63,0x08,0x0d,0x34,0xc9,0x30,0x49,0xc4,0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,
+0xc9,0x50,0x49,0xc4,0x2c,0x03,0x21,0x58,0x63,0x08,0x4d,0x34,0xc9,0x70,0x49,0xc4,
+0x2c,0x03,0x31,0x60,0x63,0x08,0x8d,0x33,0xc9,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,
+0x27,0x1c,0x08,0x00,0x1a,0x00,0x00,0x00,0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,
+0x08,0xcc,0x83,0xdb,0x04,0x31,0x31,0x4f,0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,
+0x02,0xf3,0xf0,0x47,0x20,0xb9,0x0d,0x52,0x20,0xc4,0xb0,0x12,0x13,0x15,0x35,0xdb,
+0xe2,0x08,0x8a,0x5d,0x10,0x13,0xf3,0xec,0x37,0x90,0x2c,0x4e,0xf4,0x47,0x87,0x54,
+0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87,0x74,0x02,0xc8,0xe2,0x44,0x7f,0x74,0x48,
+0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d,0x18,0x11,0x31,0x55,0xc0,0x62,0x0d,0x43,
+0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x46,0x31,0x08,0xcc,0x03,0x00,0x00,0x00,
+0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82,
+0x23,0x59,0xc2,0x20,0x09,0x92,0x1d,0x18,0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3,
+0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5,0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c,
+0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73,0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84,
+0x84,0x34,0x85,0x31,0x10,0x0a,0xb2,0x3c,0x56,0x30,0x08,0xcc,0x63,0x0b,0x44,0x25,
+0x21,0x0d,0x00,0x00,0x00,0x00,0x00,0x00};
diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp
index 8919491792..95a670edaf 100644
--- a/src/gallium/auxiliary/gallivm/instructions.cpp
+++ b/src/gallium/auxiliary/gallivm/instructions.cpp
@@ -35,6 +35,8 @@
 
 #include "storage.h"
 
+#include "pipe/p_util.h"
+
 #include <llvm/CallingConv.h>
 #include <llvm/Constants.h>
 #include <llvm/DerivedTypes.h>
@@ -42,7 +44,8 @@
 #include <llvm/InstrTypes.h>
 #include <llvm/Instructions.h>
 #include <llvm/ParameterAttributes.h>
-#include <llvm/ParamAttrsList.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/Bitcode/ReaderWriter.h>
 
 #include <sstream>
 #include <fstream>
@@ -53,7 +56,6 @@ using namespace llvm;
 #include "gallivm_builtins.cpp"
 
 #if 0
-
 llvm::Value *arrayFromChannels(std::vector<llvm::Value*> &vals)
 {
    VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
@@ -84,7 +86,10 @@ Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicB
    m_llvmLit  = 0;
    m_fmtPtr = 0;
 
-   createGallivmBuiltins(m_mod);
+   MemoryBuffer *buffer = MemoryBuffer::getMemBuffer(
+      (const char*)&llvm_builtins_data[0],
+      (const char*)&llvm_builtins_data[Elements(llvm_builtins_data)-1]);
+   m_mod = ParseBitcodeFile(buffer);
 }
 
 llvm::Value * Instructions::add(llvm::Value *in1, llvm::Value *in2)
@@ -134,12 +139,12 @@ llvm::Value *Instructions::callFSqrt(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> fsqrtArgs;
       fsqrtArgs.push_back(Type::FloatTy);
-      ParamAttrsList *fsqrtPal = 0;
+      PAListPtr fsqrtPal;
       FunctionType* fsqrtType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/fsqrtArgs,
          /*isVarArg=*/false);
-      m_llvmFSqrt = new Function(
+      m_llvmFSqrt = Function::Create(
          /*Type=*/fsqrtType,
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"llvm.sqrt.f32", m_mod);
@@ -191,12 +196,12 @@ llvm::Value *Instructions::callFAbs(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> fabsArgs;
       fabsArgs.push_back(Type::FloatTy);
-      ParamAttrsList *fabsPal = 0;
+      PAListPtr fabsPal;
       FunctionType* fabsType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/fabsArgs,
          /*isVarArg=*/false);
-      m_llvmFAbs = new Function(
+      m_llvmFAbs = Function::Create(
          /*Type=*/fabsType,
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"fabs", m_mod);
@@ -234,12 +239,12 @@ llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
       std::vector<const Type*> powArgs;
       powArgs.push_back(Type::FloatTy);
       powArgs.push_back(Type::FloatTy);
-      ParamAttrsList *powPal = 0;
+      PAListPtr powPal;
       FunctionType* powType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/powArgs,
          /*isVarArg=*/false);
-      m_llvmPow = new Function(
+      m_llvmPow = Function::Create(
          /*Type=*/powType,
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"llvm.pow.f32", m_mod);
@@ -333,12 +338,12 @@ llvm::Value * Instructions::callFloor(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> floorArgs;
       floorArgs.push_back(Type::FloatTy);
-      ParamAttrsList *floorPal = 0;
+      PAListPtr floorPal;
       FunctionType* floorType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/floorArgs,
          /*isVarArg=*/false);
-      m_llvmFloor = new Function(
+      m_llvmFloor = Function::Create(
          /*Type=*/floorType,
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"floorf", m_mod);
@@ -376,12 +381,12 @@ llvm::Value * Instructions::callFLog(llvm::Value *val)
       // predeclare the intrinsic
       std::vector<const Type*> flogArgs;
       flogArgs.push_back(Type::FloatTy);
-      ParamAttrsList *flogPal = 0;
+      PAListPtr flogPal;
       FunctionType* flogType = FunctionType::get(
          /*Result=*/Type::FloatTy,
          /*Params=*/flogArgs,
          /*isVarArg=*/false);
-      m_llvmFlog = new Function(
+      m_llvmFlog = Function::Create(
          /*Type=*/flogType,
          /*Linkage=*/GlobalValue::ExternalLinkage,
          /*Name=*/"logf", m_mod);
@@ -504,12 +509,12 @@ void Instructions::printVector(llvm::Value *val)
 llvm::Function * Instructions::declarePrintf()
 {
    std::vector<const Type*> args;
-   ParamAttrsList *params = 0;
+   PAListPtr params;
    FunctionType* funcTy = FunctionType::get(
       /*Result=*/IntegerType::get(32),
       /*Params=*/args,
       /*isVarArg=*/true);
-   Function* func_printf = new Function(
+   Function* func_printf = Function::Create(
       /*Type=*/funcTy,
       /*Linkage=*/GlobalValue::ExternalLinkage,
       /*Name=*/"printf", m_mod);
@@ -633,8 +638,8 @@ llvm::Value * Instructions::abs(llvm::Value *in)
 
 void Instructions::ifop(llvm::Value *in)
 {
-   BasicBlock *ifthen = new BasicBlock(name("ifthen"), m_func,0);
-   BasicBlock *ifend = new BasicBlock(name("ifthenend"), m_func,0);
+   BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0);
+   BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0);
 
    //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0);
    //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0);
@@ -660,7 +665,7 @@ llvm::BasicBlock * Instructions::currentBlock() const
 void Instructions::elseop()
 {
    assert(!m_ifStack.empty());
-   BasicBlock *ifend = new BasicBlock(name("ifend"), m_func,0);
+   BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0);
    m_builder.CreateBr(ifend);
    m_builder.SetInsertPoint(m_ifStack.top());
    currentBlock()->setName(name("ifelse"));
@@ -687,8 +692,8 @@ llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2,
 
 void Instructions::beginLoop()
 {
-   BasicBlock *begin = new BasicBlock(name("loop"), m_func,0);
-   BasicBlock *end = new BasicBlock(name("endloop"), m_func,0);
+   BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0);
+   BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0);
 
    m_builder.CreateBr(begin);
    Loop loop;
@@ -711,7 +716,7 @@ void Instructions::endLoop()
 void Instructions::brk()
 {
    assert(!m_loopStack.empty());
-   BasicBlock *unr = new BasicBlock(name("unreachable"), m_func,0);
+   BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0);
    m_builder.CreateBr(m_loopStack.top().end);
    m_builder.SetInsertPoint(unr);
 }
@@ -760,13 +765,13 @@ llvm::Function * Instructions::declareFunc(int label)
    args.push_back(vecPtr);
    args.push_back(vecPtr);
    args.push_back(vecPtr);
-   ParamAttrsList *params = 0;
+   PAListPtr params;
    FunctionType *funcType = FunctionType::get(
       /*Result=*/Type::VoidTy,
       /*Params=*/args,
       /*isVarArg=*/false);
    std::string name = createFuncName(label);
-   Function *func = new Function(
+   Function *func = Function::Create(
       /*Type=*/funcType,
       /*Linkage=*/GlobalValue::ExternalLinkage,
       /*Name=*/name.c_str(), m_mod);
@@ -784,7 +789,7 @@ void Instructions::bgnSub(unsigned label)
    ptr_INPUT->setName("INPUT");
    m_storage->pushArguments(ptr_INPUT);
 
-   llvm::BasicBlock *entry = new BasicBlock("entry", func, 0);
+   llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0);
 
    m_func = func;
    m_builder.SetInsertPoint(entry);
diff --git a/src/gallium/auxiliary/gallivm/instructions.h b/src/gallium/auxiliary/gallivm/instructions.h
index 9ebc17dd8e..19ca84ddc6 100644
--- a/src/gallium/auxiliary/gallivm/instructions.h
+++ b/src/gallium/auxiliary/gallivm/instructions.h
@@ -36,7 +36,7 @@
 #include <llvm/BasicBlock.h>
 #include <llvm/Module.h>
 #include <llvm/Value.h>
-#include <llvm/Support/LLVMBuilder.h>
+#include <llvm/Support/IRBuilder.h>
 
 #include <map>
 #include <stack>
@@ -125,7 +125,7 @@ private:
    llvm::Module             *m_mod;
    llvm::Function           *m_func;
    char                      m_name[32];
-   llvm::LLVMFoldingBuilder  m_builder;
+   llvm::IRBuilder           m_builder;
    int                       m_idx;
 
    llvm::VectorType *m_floatVecType;
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index 6f83b56a72..f0122802db 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -1,8 +1,35 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
 #include "instructionssoa.h"
 
 #include "storagesoa.h"
 
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_util.h"
 
 #include <llvm/CallingConv.h>
 #include <llvm/Constants.h>
@@ -10,7 +37,11 @@
 #include <llvm/Function.h>
 #include <llvm/Instructions.h>
 #include <llvm/Transforms/Utils/Cloning.h>
-#include <llvm/ParamAttrsList.h>
+#include <llvm/ParameterAttributes.h>
+//#include <llvm/ParamAttrsList.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+
 
 #include <iostream>
 
@@ -183,7 +214,10 @@ llvm::Module * InstructionsSoa::currentModule() const
 
 void InstructionsSoa::createBuiltins()
 {
-   m_builtins = createSoaBuiltins();
+   MemoryBuffer *buffer = MemoryBuffer::getMemBuffer(
+      (const char*)&soabuiltins_data[0],
+      (const char*)&soabuiltins_data[Elements(soabuiltins_data)-1]);
+   m_builtins = ParseBitcodeFile(buffer);
    createDependencies();
 }
 
@@ -204,32 +238,32 @@ llvm::Value * InstructionsSoa::allocaTemp()
    std::vector<Value*> indices;
    indices.push_back(m_storage->constantInt(0));
    indices.push_back(m_storage->constantInt(0));
-   GetElementPtrInst *getElem = new GetElementPtrInst(alloca,
-                                                      indices.begin(),
-                                                      indices.end(),
-                                                      name("allocaPtr"),
-                                                      m_builder.GetInsertBlock());
+   GetElementPtrInst *getElem = GetElementPtrInst::Create(alloca,
+                                                          indices.begin(),
+                                                          indices.end(),
+                                                          name("allocaPtr"),
+                                                          m_builder.GetInsertBlock());
    return getElem;
 }
 
 std::vector<llvm::Value*> InstructionsSoa::allocaToResult(llvm::Value *allocaPtr)
 {
-   GetElementPtrInst *xElemPtr =  new GetElementPtrInst(allocaPtr,
-                                                        m_storage->constantInt(0),
-                                                        name("xPtr"),
-                                                        m_builder.GetInsertBlock());
-   GetElementPtrInst *yElemPtr =  new GetElementPtrInst(allocaPtr,
-                                                        m_storage->constantInt(1),
-                                                        name("yPtr"),
-                                                        m_builder.GetInsertBlock());
-   GetElementPtrInst *zElemPtr =  new GetElementPtrInst(allocaPtr,
-                                                        m_storage->constantInt(2),
-                                                        name("zPtr"),
-                                                        m_builder.GetInsertBlock());
-   GetElementPtrInst *wElemPtr =  new GetElementPtrInst(allocaPtr,
-                                                        m_storage->constantInt(3),
-                                                        name("wPtr"),
-                                                        m_builder.GetInsertBlock());
+   GetElementPtrInst *xElemPtr =  GetElementPtrInst::Create(allocaPtr,
+                                                            m_storage->constantInt(0),
+                                                            name("xPtr"),
+                                                            m_builder.GetInsertBlock());
+   GetElementPtrInst *yElemPtr =  GetElementPtrInst::Create(allocaPtr,
+                                                            m_storage->constantInt(1),
+                                                            name("yPtr"),
+                                                            m_builder.GetInsertBlock());
+   GetElementPtrInst *zElemPtr =  GetElementPtrInst::Create(allocaPtr,
+                                                            m_storage->constantInt(2),
+                                                            name("zPtr"),
+                                                            m_builder.GetInsertBlock());
+   GetElementPtrInst *wElemPtr =  GetElementPtrInst::Create(allocaPtr,
+                                                            m_storage->constantInt(3),
+                                                            name("wPtr"),
+                                                            m_builder.GetInsertBlock());
 
    std::vector<llvm::Value*> res(4);
    res[0] = new LoadInst(xElemPtr, name("xRes"), false, m_builder.GetInsertBlock());
@@ -355,10 +389,10 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
    llvm::Function *func = 0;
    if (originalFunc->isDeclaration()) {
       std::cout << "function decleration" <<std::endl;
-      func = new Function(originalFunc->getFunctionType(), GlobalValue::ExternalLinkage,
-                          originalFunc->getName(), currentModule());
+      func = Function::Create(originalFunc->getFunctionType(), GlobalValue::ExternalLinkage,
+                              originalFunc->getName(), currentModule());
       func->setCallingConv(CallingConv::C);
-      const ParamAttrsList *pal = 0;
+      const PAListPtr pal;
       func->setParamAttrs(pal);
       currentModule()->dump();
    } else {
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h
index b9104ea286..060ee72f2e 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.h
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.h
@@ -29,7 +29,7 @@
 #define INSTRUCTIONSSOA_H
 
 #include <pipe/p_shader_tokens.h>
-#include <llvm/Support/LLVMBuilder.h>
+#include <llvm/Support/IRBuilder.h>
 
 #include <map>
 #include <vector>
@@ -87,7 +87,7 @@ private:
                                          const std::vector<llvm::Value*> in3);
    void injectFunction(llvm::Function *originalFunc, int op = TGSI_OPCODE_LAST);
 private:
-   llvm::LLVMFoldingBuilder  m_builder;
+   llvm::IRBuilder  m_builder;
    StorageSoa *m_storage;
 
    std::map<int, std::string> m_functionsMap;
diff --git a/src/gallium/auxiliary/gallivm/llvm_builtins.c b/src/gallium/auxiliary/gallivm/llvm_builtins.c
index 4f98d754ba..64b5d499a8 100644
--- a/src/gallium/auxiliary/gallivm/llvm_builtins.c
+++ b/src/gallium/auxiliary/gallivm/llvm_builtins.c
@@ -30,7 +30,7 @@
   * Authors:
   *   Zack Rusin zack@tungstengraphics.com
   */
-typedef __attribute__(( ocu_vector_type(4) )) float float4;
+typedef __attribute__(( ext_vector_type(4) )) float float4;
 
 extern float powf(float a, float b);
 
diff --git a/src/gallium/auxiliary/gallivm/soabuiltins.c b/src/gallium/auxiliary/gallivm/soabuiltins.c
index 4d658be520..40addebd8c 100644
--- a/src/gallium/auxiliary/gallivm/soabuiltins.c
+++ b/src/gallium/auxiliary/gallivm/soabuiltins.c
@@ -31,7 +31,7 @@
   * Authors:
   *   Zack Rusin zack@tungstengraphics.com
   */
-typedef __attribute__(( ocu_vector_type(4) )) float float4;
+typedef __attribute__(( ext_vector_type(4) )) float float4;
 
 void dp3(float4 *res,
          float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
diff --git a/src/gallium/auxiliary/gallivm/storage.cpp b/src/gallium/auxiliary/gallivm/storage.cpp
index c4326de8c5..9d9fd12360 100644
--- a/src/gallium/auxiliary/gallivm/storage.cpp
+++ b/src/gallium/auxiliary/gallivm/storage.cpp
@@ -186,26 +186,26 @@ llvm::Value *Storage::maskWrite(llvm::Value *src, int mask, llvm::Value *templ)
    if ((mask & TGSI_WRITEMASK_X)) {
       llvm::Value *x = new ExtractElementInst(src, unsigned(0),
                                               name("x"), m_block);
-      dst = new InsertElementInst(dst, x, unsigned(0),
-                                  name("dstx"), m_block);
+      dst = InsertElementInst::Create(dst, x, unsigned(0),
+                                      name("dstx"), m_block);
    }
    if ((mask & TGSI_WRITEMASK_Y)) {
       llvm::Value *y = new ExtractElementInst(src, unsigned(1),
                                               name("y"), m_block);
-      dst = new InsertElementInst(dst, y, unsigned(1),
-                                  name("dsty"), m_block);
+      dst = InsertElementInst::Create(dst, y, unsigned(1),
+                                      name("dsty"), m_block);
    }
    if ((mask & TGSI_WRITEMASK_Z)) {
       llvm::Value *z = new ExtractElementInst(src, unsigned(2),
                                               name("z"), m_block);
-      dst = new InsertElementInst(dst, z, unsigned(2),
-                                  name("dstz"), m_block);
+      dst = InsertElementInst::Create(dst, z, unsigned(2),
+                                      name("dstz"), m_block);
    }
    if ((mask & TGSI_WRITEMASK_W)) {
       llvm::Value *w = new ExtractElementInst(src, unsigned(3),
                                               name("w"), m_block);
-      dst = new InsertElementInst(dst, w, unsigned(3),
-                                  name("dstw"), m_block);
+      dst = InsertElementInst::Create(dst, w, unsigned(3),
+                                      name("dstw"), m_block);
    }
    return dst;
 }
@@ -308,11 +308,11 @@ llvm::Value * Storage::elemPtr(Args arg)
    std::vector<Value*> indices;
    indices.push_back(constantInt(0));
    indices.push_back(constantInt(static_cast<int>(arg)));
-   GetElementPtrInst *getElem = new GetElementPtrInst(m_INPUT,
-                                                      indices.begin(),
-                                                      indices.end(),
-                                                      name("input_ptr"),
-                                                      m_block);
+   GetElementPtrInst *getElem = GetElementPtrInst::Create(m_INPUT,
+                                                          indices.begin(),
+                                                          indices.end(),
+                                                          name("input_ptr"),
+                                                          m_block);
    return new LoadInst(getElem, name("input_field"), false, m_block);
 }
 
@@ -322,7 +322,7 @@ llvm::Value * Storage::elemIdx(llvm::Value *ptr, int idx,
    GetElementPtrInst *getElem = 0;
 
    if (indIdx) {
-      getElem = new GetElementPtrInst(ptr,
+      getElem = GetElementPtrInst::Create(ptr,
                                       BinaryOperator::create(Instruction::Add,
                                                              indIdx,
                                                              constantInt(idx),
@@ -331,7 +331,7 @@ llvm::Value * Storage::elemIdx(llvm::Value *ptr, int idx,
                                       name("field"),
                                       m_block);
    } else {
-      getElem = new GetElementPtrInst(ptr,
+      getElem = GetElementPtrInst::Create(ptr,
                                       constantInt(idx),
                                       name("field"),
                                       m_block);
@@ -350,7 +350,7 @@ void Storage::setKilElement(llvm::Value *val)
    std::vector<Value*> indices;
    indices.push_back(constantInt(0));
    indices.push_back(constantInt(static_cast<int>(KilArg)));
-   GetElementPtrInst *elem = new GetElementPtrInst(m_INPUT,
+   GetElementPtrInst *elem = GetElementPtrInst::Create(m_INPUT,
                                                    indices.begin(),
                                                    indices.end(),
                                                    name("kil_ptr"),
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp
index bb6fe3d7e1..0e6e68c9d7 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.cpp
+++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp
@@ -207,11 +207,11 @@ llvm::Value * StorageSoa::elementPointer(llvm::Value *ptr, llvm::Value *index,
    indices.push_back(index);
    indices.push_back(constantInt(channel));
 
-   GetElementPtrInst *getElem = new GetElementPtrInst(ptr,
-                                                      indices.begin(),
-                                                      indices.end(),
-                                                      name("ptr"),
-                                                      m_block);
+   GetElementPtrInst *getElem = GetElementPtrInst::Create(ptr,
+                                                          indices.begin(),
+                                                          indices.end(),
+                                                          name("ptr"),
+                                                          m_block);
    return getElem;
 }
 
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index ab9e7a06fb..ab8c851f14 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -1014,7 +1014,7 @@ tgsi_to_llvm(struct gallivm_ir *ir, const struct tgsi_token *tokens)
    Value *ptr_INPUT = args++;
    ptr_INPUT->setName("input");
 
-   BasicBlock *label_entry = new BasicBlock("entry", shader, 0);
+   BasicBlock *label_entry = BasicBlock::Create("entry", shader, 0);
 
    tgsi_parse_init(&parse, tokens);
 
@@ -1085,7 +1085,7 @@ llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
    Value *temps = args++;
    temps->setName("temps");
 
-   BasicBlock *label_entry = new BasicBlock("entry", shader, 0);
+   BasicBlock *label_entry = BasicBlock::Create("entry", shader, 0);
 
    tgsi_parse_init(&parse, tokens);
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
index 4b09c80b2a..49705cb862 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -193,6 +193,17 @@ pb_reference(struct pb_buffer **dst,
 
 
 /**
+ * Check whether a buffer aligned to `provided` bytes satisfies a request for
+ * `requested` bytes; a request of 0 means no constraint (avoids a % by zero).
+ */
+static INLINE int
+pb_check_alignment(size_t requested, size_t provided)
+{
+   return !requested || (requested <= provided && (provided % requested) == 0);
+}
+
+
+/**
  * Malloc-based buffer to store data that can't be used by the graphics 
  * hardware.
  */
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index 65b6584003..27032b0c4c 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -215,15 +215,21 @@ fenced_buffer_serialize(struct fenced_buffer *fenced_buf, unsigned flags)
    struct fenced_buffer_list *fenced_list = fenced_buf->list;
    struct pipe_winsys *winsys = fenced_list->winsys;
 
+   /* Allow concurrent reads */
    if(((fenced_buf->flags | flags) & PIPE_BUFFER_USAGE_WRITE) == 0)
       return PIPE_OK;
 
+   /* Wait for the CPU to finish */
    if(fenced_buf->mapcount) {
-      /* FIXME */
+      /* FIXME: Use thread conditions variables to signal when mapcount 
+       * reaches zero */
       debug_warning("attemp to write concurrently to buffer");
+      /* XXX: we must not fail here in order to support texture mipmap generation
       return PIPE_ERROR_RETRY;
+       */
    }
 
+   /* Wait for the GPU to finish */
    if(fenced_buf->fence) {
       if(winsys->fence_finish(winsys, fenced_buf->fence, 0) != 0)
 	 return PIPE_ERROR_RETRY; 
@@ -353,6 +359,16 @@ buffer_fence(struct pb_buffer *buf,
    /* FIXME: receive this as a parameter */
    unsigned flags = fence ? PIPE_BUFFER_USAGE_GPU_READ_WRITE : 0;
    
+   if(fence == fenced_buf->fence) {
+      /* Handle the same fence case specially, not only because it is a fast 
+       * path, but mostly to avoid serializing two writes with the same fence, 
+       * as that would bring the hardware down to synchronous operation without
+       * any benefit.
+       */
+      fenced_buf->flags |= flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE;
+      return;
+   }
+   
    if(fenced_buffer_serialize(fenced_buf, flags) != PIPE_OK) {
       /* FIXME: propagate error */
       (void)0;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
index b2d2520b67..96f9af3825 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
@@ -118,13 +118,21 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
  * Slab sub-allocator.
  */
 struct pb_manager *
-pb_slab_manager_create(struct pb_manager *provider, 
-                       const struct pb_desc *desc,
-                       size_t smallestSize,
-                       size_t numSizes,
-                       size_t desiredNumBuffers,
-                       size_t maxSlabSize,
-                       size_t pageAlignment);
+pb_slab_manager_create(struct pb_manager *provider,   /* allocates the slabs */
+                       size_t bufSize,   /* the single buffer size this manager serves */
+                       size_t slabSize,   /* size requested from the provider per slab */
+                       const struct pb_desc *desc);   /* copied; used when creating slabs */
+
+/**
+ * Allow a range of buffer sizes, by aggregating multiple slab sub-allocators
+ * with different bucket sizes.
+ */
+struct pb_manager *
+pb_slab_range_manager_create(struct pb_manager *provider,   /* backs every bucket and oversized requests */
+                             size_t minBufSize,   /* buffer size of the smallest bucket */
+                             size_t maxBufSize,   /* bucket sizes double until one reaches this */
+                             size_t slabSize,   /* handed unchanged to each bucket's slab manager */
+                             const struct pb_desc *desc);   /* handed unchanged to each bucket's slab manager */
 
 
 /** 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
index 06de0bb6c3..543fd51253 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
@@ -136,7 +136,7 @@ _pb_cache_buffer_list_check_free(struct pb_cache_manager *mgr)
    while(curr != &mgr->delayed) {
       buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
 
-      if(util_time_timeout(&buf->start, &buf->end, &now) != 0)
+      if(!util_time_timeout(&buf->start, &buf->end, &now))
 	 break;
 	 
       _pb_cache_buffer_destroy(buf);
@@ -202,6 +202,24 @@ pb_cache_buffer_vtbl = {
 };
 
 
+/* Decide whether the cached buffer `buf` can serve a request for `size` bytes
+ * with the alignment given in `desc`. */
+static INLINE boolean
+pb_cache_is_buffer_compat(struct pb_cache_buffer *buf,
+                          size_t size,
+                          const struct pb_desc *desc)
+{
+   /* Sizes must match exactly for now (TODO: be more lenient), and the
+    * alignment is only examined once the size has matched.
+    * XXX: usage flags are not compared. */
+   if(size == buf->base.base.size &&
+      pb_check_alignment(desc->alignment, buf->base.base.alignment))
+      return TRUE;
+
+   return FALSE;
+}
+
+
 static struct pb_buffer *
 pb_cache_manager_create_buffer(struct pb_manager *_mgr, 
                                size_t size,
@@ -209,29 +227,45 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
 {
    struct pb_cache_manager *mgr = pb_cache_manager(_mgr);
    struct pb_cache_buffer *buf;
+   struct pb_cache_buffer *curr_buf;
    struct list_head *curr, *next;
    struct util_time now;
    
-   util_time_get(&now);
+   _glthread_LOCK_MUTEX(mgr->mutex);
+
+   buf = NULL;
    curr = mgr->delayed.next;
    next = curr->next;
+   
+   /* search in the expired buffers, freeing them in the process */
+   util_time_get(&now);
    while(curr != &mgr->delayed) {
-      buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
-
-      if(buf->base.base.size == size &&
-	 buf->base.base.alignment >= desc->alignment &&
-	 (buf->base.base.alignment % desc->alignment) == 0 &&
-	 /* buf->base.base.usage == usage */ 1) {
-	 ++buf->base.base.refcount;
-	 return &buf->base;
-      }
-      
-      if(util_time_timeout(&buf->start, &buf->end, &now) != 0)
-	 _pb_cache_buffer_destroy(buf);
+      curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
+      if(!buf && pb_cache_is_buffer_compat(curr_buf, size, desc))
+	 buf = curr_buf;
+      else if(util_time_timeout(&curr_buf->start, &curr_buf->end, &now))
+	 _pb_cache_buffer_destroy(curr_buf);
+      curr = next; 
+      next = curr->next;
+   }
 
+   /* keep searching in the hot buffers */
+   while(!buf && curr != &mgr->delayed) {
+      curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
+      if(pb_cache_is_buffer_compat(curr_buf, size, desc))
+	 buf = curr_buf;
       curr = next; 
       next = curr->next;
    }
+   
+   if(buf) {
+      LIST_DEL(&buf->head);
+      _glthread_UNLOCK_MUTEX(mgr->mutex);
+      ++buf->base.base.refcount;
+      return &buf->base;
+   }
+   
+   _glthread_UNLOCK_MUTEX(mgr->mutex);
 
    buf = CALLOC_STRUCT(pb_cache_buffer);
    if(!buf)
@@ -243,6 +277,11 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
       return NULL;
    }
    
+   assert(buf->buffer->base.refcount >= 1);
+   assert(pb_check_alignment(desc->alignment, buf->buffer->base.alignment));
+   assert((buf->buffer->base.usage & desc->usage) == desc->usage);
+   assert(buf->buffer->base.size >= size);
+   
    buf->base.base.refcount = 1;
    buf->base.base.alignment = buf->buffer->base.alignment;
    buf->base.base.usage = buf->buffer->base.usage;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
index bffca5b244..9d809e2f9b 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
@@ -30,7 +30,7 @@
  * \file
  * A buffer manager that wraps buffers in fenced buffers.
  * 
- * \author Jos� Fonseca <jrfonseca@tungstengraphics.dot.com>
+ * \author José Fonseca <jrfonseca@tungstengraphics.dot.com>
  */
 
 
@@ -101,7 +101,8 @@ fenced_bufmgr_destroy(struct pb_manager *mgr)
 
    fenced_buffer_list_destroy(fenced_mgr->fenced_list);
 
-   fenced_mgr->provider->destroy(fenced_mgr->provider);
+   if(fenced_mgr->provider)
+      fenced_mgr->provider->destroy(fenced_mgr->provider);
    
    FREE(fenced_mgr);
 }
@@ -113,6 +114,9 @@ fenced_bufmgr_create(struct pb_manager *provider,
 {
    struct fenced_pb_manager *fenced_mgr;
 
+   if(!provider)
+      return NULL;
+   
    fenced_mgr = (struct fenced_pb_manager *)CALLOC(1, sizeof(*fenced_mgr));
    if (!fenced_mgr)
       return NULL;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
index 676e8e29b9..b931455056 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
@@ -70,19 +70,24 @@ struct pb_slab
    size_t numBuffers;
    size_t numFree;
    struct pb_slab_buffer *buffers;
-   struct pb_slab_size_header *header;
+   struct pb_slab_manager *mgr;
    
    struct pb_buffer *bo;
-   size_t pageAlignment;
    void *virtual;   
 };
 
-struct pb_slab_size_header 
+struct pb_slab_manager   /* slab sub-allocator serving buffers of a single size */
 {
+   struct pb_manager base;   /* generic interface; its callbacks are set in pb_slab_manager_create */
+   
+   struct pb_manager *provider;   /* allocates the backing buffer of each slab */
+   size_t bufSize;   /* size of every buffer carved out of a slab */
+   size_t slabSize;   /* size requested from the provider for each slab */
+   struct pb_desc desc;   /* descriptor passed to the provider when a slab is created */
+
    struct list_head slabs;
    struct list_head freeSlabs;
-   struct pb_slab_manager *pool;
-   size_t bufSize;
+   
    _glthread_Mutex mutex;
 };
 
@@ -90,19 +95,18 @@ struct pb_slab_size_header
  * The data of this structure remains constant after
  * initialization and thus needs no mutex protection.
  */
-struct pb_slab_manager 
+struct pb_slab_range_manager   /* aggregates one slab manager per bucket size */
 {
    struct pb_manager base;
 
+   struct pb_manager *provider;   /* used directly when a request exceeds the largest bucket */
+   size_t minBufSize;   /* buffer size of bucket 0; each further bucket doubles it */
+   size_t maxBufSize;   /* bucket count is chosen so the largest bucket reaches this size */
    struct pb_desc desc;
+   
+   unsigned numBuckets;   /* number of entries in buckets[] */
    size_t *bucketSizes;
-   size_t numBuckets;
-   size_t pageSize;
-   struct pb_manager *provider;
-   unsigned pageAlignment;
-   unsigned maxSlabSize;
-   unsigned desiredNumBuffers;
-   struct pb_slab_size_header *headers;
+   struct pb_manager **buckets;   /* buckets[i] is the slab manager for size minBufSize << i */
 };
 
 
@@ -122,8 +126,16 @@ pb_slab_manager(struct pb_manager *mgr)
 }
 
 
+static INLINE struct pb_slab_range_manager *
+pb_slab_range_manager(struct pb_manager *mgr)   /* downcast from the generic manager interface */
+{
+   assert(mgr);
+   return (struct pb_slab_range_manager *)mgr;   /* relies on `base` being the struct's first member */
+}
+
+
 /**
- * Delete a buffer from the slab header delayed list and put
+ * Delete a buffer from the slab delayed list and put
  * it on the slab FREE list.
  */
 static void
@@ -131,10 +143,10 @@ pb_slab_buffer_destroy(struct pb_buffer *_buf)
 {
    struct pb_slab_buffer *buf = pb_slab_buffer(_buf);
    struct pb_slab *slab = buf->slab;
-   struct pb_slab_size_header *header = slab->header;
+   struct pb_slab_manager *mgr = slab->mgr;
    struct list_head *list = &buf->head;
 
-   _glthread_LOCK_MUTEX(header->mutex);
+   _glthread_LOCK_MUTEX(mgr->mutex);
    
    assert(buf->base.base.refcount == 0);
    
@@ -145,21 +157,21 @@ pb_slab_buffer_destroy(struct pb_buffer *_buf)
    slab->numFree++;
 
    if (slab->head.next == &slab->head)
-      LIST_ADDTAIL(&slab->head, &header->slabs);
+      LIST_ADDTAIL(&slab->head, &mgr->slabs);
 
    if (slab->numFree == slab->numBuffers) {
       list = &slab->head;
       LIST_DEL(list);
-      LIST_ADDTAIL(list, &header->freeSlabs);
+      LIST_ADDTAIL(list, &mgr->freeSlabs);
    }
 
-   if (header->slabs.next == &header->slabs || slab->numFree
+   if (mgr->slabs.next == &mgr->slabs || slab->numFree
 	 != slab->numBuffers) {
 
       struct list_head *next;
 
-      for (list = header->freeSlabs.next, next = list->next; list
-	    != &header->freeSlabs; list = next, next = list->next) {
+      for (list = mgr->freeSlabs.next, next = list->next; list
+	    != &mgr->freeSlabs; list = next, next = list->next) {
 
 	 slab = LIST_ENTRY(struct pb_slab, list, head);
 
@@ -170,7 +182,7 @@ pb_slab_buffer_destroy(struct pb_buffer *_buf)
       }
    }
    
-   _glthread_UNLOCK_MUTEX(header->mutex);
+   _glthread_UNLOCK_MUTEX(mgr->mutex);
 }
 
 
@@ -217,15 +229,13 @@ pb_slab_buffer_vtbl = {
 
 
 static enum pipe_error
-pb_slab_create(struct pb_slab_size_header *header)
+pb_slab_create(struct pb_slab_manager *mgr)
 {
-   struct pb_slab_manager *pool = header->pool;
-   size_t size = header->bufSize * pool->desiredNumBuffers;
    struct pb_slab *slab;
    struct pb_slab_buffer *buf;
-   size_t numBuffers;
-   int ret;
+   unsigned numBuffers;
    unsigned i;
+   enum pipe_error ret;
 
    slab = CALLOC_STRUCT(pb_slab);
    if (!slab)
@@ -236,22 +246,23 @@ pb_slab_create(struct pb_slab_size_header *header)
     * to efficiently reuse slabs.
     */
 
-   size = (size <= pool->maxSlabSize) ? size : pool->maxSlabSize;
-   size = (size + pool->pageSize - 1) & ~(pool->pageSize - 1);
-
-   slab->bo = pool->provider->create_buffer(pool->provider, size, &pool->desc);
-   if(!slab->bo)
+   slab->bo = mgr->provider->create_buffer(mgr->provider, mgr->slabSize, &mgr->desc);
+   if(!slab->bo) {
+      ret = PIPE_ERROR_OUT_OF_MEMORY;
       goto out_err0;
+   }
 
    slab->virtual = pb_map(slab->bo, 
-			 PIPE_BUFFER_USAGE_CPU_READ |
-			 PIPE_BUFFER_USAGE_CPU_WRITE);
-   if(!slab->virtual)
+                          PIPE_BUFFER_USAGE_CPU_READ |
+                          PIPE_BUFFER_USAGE_CPU_WRITE);
+   if(!slab->virtual) {
+      ret = PIPE_ERROR_OUT_OF_MEMORY;
       goto out_err1;
+   }
 
    pb_unmap(slab->bo);
 
-   numBuffers = slab->bo->base.size / header->bufSize;
+   numBuffers = slab->bo->base.size / mgr->bufSize;
 
    slab->buffers = CALLOC(numBuffers, sizeof(*slab->buffers));
    if (!slab->buffers) {
@@ -263,17 +274,17 @@ pb_slab_create(struct pb_slab_size_header *header)
    LIST_INITHEAD(&slab->freeBuffers);
    slab->numBuffers = numBuffers;
    slab->numFree = 0;
-   slab->header = header;
+   slab->mgr = mgr;
 
    buf = slab->buffers;
    for (i=0; i < numBuffers; ++i) {
       buf->base.base.refcount = 0;
-      buf->base.base.size = header->bufSize;
+      buf->base.base.size = mgr->bufSize;
       buf->base.base.alignment = 0;
       buf->base.base.usage = 0;
       buf->base.vtbl = &pb_slab_buffer_vtbl;
       buf->slab = slab;
-      buf->start = i* header->bufSize;
+      buf->start = i* mgr->bufSize;
       buf->mapCount = 0;
       _glthread_INIT_COND(buf->event);
       LIST_ADDTAIL(&buf->head, &slab->freeBuffers);
@@ -281,7 +292,7 @@ pb_slab_create(struct pb_slab_size_header *header)
       buf++;
    }
 
-   LIST_ADDTAIL(&slab->head, &header->slabs);
+   LIST_ADDTAIL(&slab->head, &mgr->slabs);
 
    return PIPE_OK;
 
@@ -294,50 +305,47 @@ out_err0:
 
 
 static struct pb_buffer *
-pb_slab_manager_create_buffer(struct pb_manager *_pool,
+pb_slab_manager_create_buffer(struct pb_manager *_mgr,
                               size_t size,
                               const struct pb_desc *desc)
 {
-   struct pb_slab_manager *pool = pb_slab_manager(_pool);
-   struct pb_slab_size_header *header;
-   unsigned i;
+   struct pb_slab_manager *mgr = pb_slab_manager(_mgr);
    static struct pb_slab_buffer *buf;
    struct pb_slab *slab;
    struct list_head *list;
    int count = DRI_SLABPOOL_ALLOC_RETRIES;
 
-   /*
-    * FIXME: Check for compatibility.
-    */
-
-   header = pool->headers;
-   for (i=0; i<pool->numBuckets; ++i) {
-      if (header->bufSize >= size)
-	 break;
-      header++;
-   }
-
-   if (i >= pool->numBuckets)
-      /* Fall back to allocate a buffer object directly from the provider. */
-      return pool->provider->create_buffer(pool->provider, size, desc);
-
+   /* check size */
+   assert(size == mgr->bufSize);
+   if(size != mgr->bufSize)
+      return NULL;
+   
+   /* check if we can provide the requested alignment */
+   assert(pb_check_alignment(desc->alignment, mgr->desc.alignment));
+   if(!pb_check_alignment(desc->alignment, mgr->desc.alignment))
+      return NULL;
+   assert(pb_check_alignment(desc->alignment, mgr->bufSize));
+   if(!pb_check_alignment(desc->alignment, mgr->bufSize))
+      return NULL;
 
-   _glthread_LOCK_MUTEX(header->mutex);
-   while (header->slabs.next == &header->slabs && count > 0) {
-      if (header->slabs.next != &header->slabs)
+   /* XXX: check for compatible buffer usage too? */
+   
+   _glthread_LOCK_MUTEX(mgr->mutex);
+   while (mgr->slabs.next == &mgr->slabs && count > 0) {
+      if (mgr->slabs.next != &mgr->slabs)
 	 break;
 
-      _glthread_UNLOCK_MUTEX(header->mutex);
+      _glthread_UNLOCK_MUTEX(mgr->mutex);
       if (count != DRI_SLABPOOL_ALLOC_RETRIES)
 	 util_time_sleep(1);
-      _glthread_LOCK_MUTEX(header->mutex);
-      (void) pb_slab_create(header);
+      _glthread_LOCK_MUTEX(mgr->mutex);
+      (void) pb_slab_create(mgr);
       count--;
    }
 
-   list = header->slabs.next;
-   if (list == &header->slabs) {
-      _glthread_UNLOCK_MUTEX(header->mutex);
+   list = mgr->slabs.next;
+   if (list == &mgr->slabs) {
+      _glthread_UNLOCK_MUTEX(mgr->mutex);
       return NULL;
    }
    slab = LIST_ENTRY(struct pb_slab, list, head);
@@ -347,83 +355,141 @@ pb_slab_manager_create_buffer(struct pb_manager *_pool,
    list = slab->freeBuffers.next;
    LIST_DELINIT(list);
 
-   _glthread_UNLOCK_MUTEX(header->mutex);
+   _glthread_UNLOCK_MUTEX(mgr->mutex);
    buf = LIST_ENTRY(struct pb_slab_buffer, list, head);
+   
    ++buf->base.base.refcount;
+   buf->base.base.alignment = desc->alignment;
+   buf->base.base.usage = desc->usage;
+   
    return &buf->base;
 }
 
 
 static void
-pb_slab_manager_destroy(struct pb_manager *_pool)
+pb_slab_manager_destroy(struct pb_manager *_mgr)
 {
-   struct pb_slab_manager *pool = pb_slab_manager(_pool);
+   struct pb_slab_manager *mgr = pb_slab_manager(_mgr);
 
-   FREE(pool->headers);
-   FREE(pool->bucketSizes);
-   FREE(pool);
+   /* TODO: cleanup all allocated buffers */
+   FREE(mgr);
 }
 
 
 struct pb_manager *
-pb_slab_manager_create(struct pb_manager *provider, 
-                       const struct pb_desc *desc,
-                       size_t smallestSize,
-                       size_t numSizes,
-                       size_t desiredNumBuffers,
-                       size_t maxSlabSize,
-                       size_t pageAlignment)
+pb_slab_manager_create(struct pb_manager *provider,   /* creates a slab manager for one buffer size */
+                       size_t bufSize,
+                       size_t slabSize,
+                       const struct pb_desc *desc)
 {
-   struct pb_slab_manager *pool;
-   size_t i;
+   struct pb_slab_manager *mgr;
+
+   mgr = CALLOC_STRUCT(pb_slab_manager);
+   if (!mgr)
+      return NULL;   /* allocation failure is the only error reported here */
+
+   mgr->base.destroy = pb_slab_manager_destroy;
+   mgr->base.create_buffer = pb_slab_manager_create_buffer;
+
+   mgr->provider = provider;   /* NOTE(review): stored without a NULL check */
+   mgr->bufSize = bufSize;   /* NOTE(review): not validated; pb_slab_create divides by it */
+   mgr->slabSize = slabSize;
+   mgr->desc = *desc;   /* copied by value, so the caller's descriptor need not outlive this call */
+
+   LIST_INITHEAD(&mgr->slabs);   /* no slab is allocated yet; pb_slab_manager_create_buffer creates them on demand */
+   LIST_INITHEAD(&mgr->freeSlabs);
+   
+   _glthread_INIT_MUTEX(mgr->mutex);
+
+   return &mgr->base;   /* callers only ever see the generic interface */
+}
+
+/* Serve `size` bytes from the smallest bucket able to hold them. */
+static struct pb_buffer *
+pb_slab_range_manager_create_buffer(struct pb_manager *_mgr,
+                                    size_t size,
+                                    const struct pb_desc *desc)
+{
+   struct pb_slab_range_manager *mgr = pb_slab_range_manager(_mgr);
+   size_t bufSize = mgr->minBufSize;
+   unsigned i;
+
+   for (i = 0; i < mgr->numBuckets; ++i) {
+      /* A slab bucket rejects any request that is not exactly its own
+       * bufSize, so ask it for bufSize instead of the caller's size. */
+      if(bufSize >= size)
+         return mgr->buckets[i]->create_buffer(mgr->buckets[i], bufSize, desc);
+      bufSize *= 2;
+   }
+   /* Larger than every bucket: allocate directly from the provider. */
+   return mgr->provider->create_buffer(mgr->provider, size, desc);
+}
 
-   pool = CALLOC_STRUCT(pb_slab_manager);
-   if (!pool)
+/* Destroy each bucket's slab manager in order, then free the range manager. */
+static void
+pb_slab_range_manager_destroy(struct pb_manager *_mgr)
+{
+   struct pb_slab_range_manager *range = pb_slab_range_manager(_mgr);
+   struct pb_manager **bucket = range->buckets;
+   unsigned remaining;
+   for (remaining = range->numBuckets; remaining; --remaining, ++bucket)
+      (*bucket)->destroy(*bucket);
+   FREE(range->buckets);
+   FREE(range->bucketSizes);
+   FREE(range);
+}
+
+
+struct pb_manager *
+pb_slab_range_manager_create(struct pb_manager *provider,   /* builds one slab manager per power-of-two bucket */
+                             size_t minBufSize,
+                             size_t maxBufSize,
+                             size_t slabSize,
+                             const struct pb_desc *desc)
+{
+   struct pb_slab_range_manager *mgr;
+   size_t bufSize;
+   unsigned i;
+
+   mgr = CALLOC_STRUCT(pb_slab_range_manager);
+   if (!mgr)
       goto out_err0;
 
-   pool->bucketSizes = CALLOC(numSizes, sizeof(*pool->bucketSizes));
-   if (!pool->bucketSizes)
-      goto out_err1;
+   mgr->base.destroy = pb_slab_range_manager_destroy;
+   mgr->base.create_buffer = pb_slab_range_manager_create_buffer;
 
-   pool->headers = CALLOC(numSizes, sizeof(*pool->headers));
-   if (!pool->headers)
-      goto out_err2;
-
-   pool->desc = *desc;
-   pool->numBuckets = numSizes;
-#ifdef WIN32
-   pool->pageSize = 4096;
-#else
-   pool->pageSize = getpagesize();
-#endif
-   pool->provider = provider;
-   pool->pageAlignment = pageAlignment;
-   pool->maxSlabSize = maxSlabSize;
-   pool->desiredNumBuffers = desiredNumBuffers;
-
-   for (i=0; i<pool->numBuckets; ++i) {
-      struct pb_slab_size_header *header = &pool->headers[i];
-      
-      pool->bucketSizes[i] = (smallestSize << i);
-      
-      _glthread_INIT_MUTEX(header->mutex);
-
-      LIST_INITHEAD(&header->slabs);
-      LIST_INITHEAD(&header->freeSlabs);
-
-      header->pool = pool;
-      header->bufSize = (smallestSize << i);
+   mgr->provider = provider;   /* kept for requests larger than any bucket */
+   mgr->minBufSize = minBufSize;
+   mgr->maxBufSize = maxBufSize;   /* NOTE(review): mgr->desc and mgr->bucketSizes are never assigned in this function */
+
+   mgr->numBuckets = 1;   /* there is always a bucket for minBufSize itself */
+   bufSize = minBufSize;
+   while(bufSize < maxBufSize) {   /* NOTE(review): never terminates if minBufSize is 0 and maxBufSize is not */
+      bufSize *= 2;
+      ++mgr->numBuckets;
    }
+   
+   mgr->buckets = CALLOC(mgr->numBuckets, sizeof(*mgr->buckets));
+   if (!mgr->buckets)
+      goto out_err1;
 
-   pool->base.destroy = pb_slab_manager_destroy;
-   pool->base.create_buffer = pb_slab_manager_create_buffer;
+   bufSize = minBufSize;
+   for (i = 0; i < mgr->numBuckets; ++i) {   /* bucket i serves buffers of minBufSize doubled i times */
+      mgr->buckets[i] = pb_slab_manager_create(provider, bufSize, slabSize, desc);
+      if(!mgr->buckets[i])
+	 goto out_err2;
+      bufSize *= 2;
+   }
 
-   return &pool->base;
+   return &mgr->base;
 
 out_err2: 
-   FREE(pool->bucketSizes);
+   for (i = 0; i < mgr->numBuckets; ++i)   /* undo the buckets created before the failure */
+      if(mgr->buckets[i])   /* presumably NULL for slots never filled, assuming CALLOC zero-fills -- confirm */
+	    mgr->buckets[i]->destroy(mgr->buckets[i]);
+   FREE(mgr->buckets);
 out_err1: 
-   FREE(pool);
+   FREE(mgr);
 out_err0:
    return NULL;
 }
diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
index eb3359750b..f01e12faa0 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
@@ -26,14 +26,29 @@
  **************************************************************************/
 
 
+#include "pipe/p_debug.h"
 #include "rtasm_cpu.h"
 
 
+/* Report whether SSE may be used, i.e. unless the GALLIUM_NOSSE option
+ * evaluates true.  The option is queried on the first call and then cached. */
+static boolean rtasm_sse_enabled(void)
+{
+   static boolean needs_query = 1;
+   static boolean allowed;
+
+   if (!needs_query)
+      return allowed;
+   allowed = !debug_get_bool_option("GALLIUM_NOSSE", FALSE);
+   needs_query = FALSE;
+   return allowed;
+}
+
 int rtasm_cpu_has_sse(void)
 {
    /* FIXME: actually detect this at run-time */
-#if defined(__i386__) || defined(__386__)
-   return 1;
+#if defined(__i386__) || defined(__386__) || defined(i386)
+   return rtasm_sse_enabled();
 #else
    return 0;
 #endif
@@ -42,8 +57,8 @@ int rtasm_cpu_has_sse(void)
 int rtasm_cpu_has_sse2(void) 
 {
    /* FIXME: actually detect this at run-time */
-#if defined(__i386__) || defined(__386__)
-   return 1;
+#if defined(__i386__) || defined(__386__) || defined(i386)
+   return rtasm_sse_enabled();
 #else
    return 0;
 #endif
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index aea8b28e58..3cd45d7dd9 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -33,15 +33,114 @@
 #define DISASSEM 0
 #define X86_TWOB 0x0f
 
-static unsigned char *cptr( void (*label)() )
-{
-   return (unsigned char *) label;
-}
+
+#define DUMP_SSE  0
+
+#if DUMP_SSE
+/* Debug-print an operand; anything other than mod_REG is shown in brackets. */
+static void
+_print_reg(
+   struct x86_reg reg )
+{
+   if (reg.mod != mod_REG) 
+      debug_printf( "[" );
+   /* Register name, selected by register file. */
+   switch( reg.file ) {
+   case file_REG32:
+      switch( reg.idx ) {
+      case reg_AX: debug_printf( "EAX" ); break;
+      case reg_CX: debug_printf( "ECX" ); break;
+      case reg_DX: debug_printf( "EDX" ); break;
+      case reg_BX: debug_printf( "EBX" ); break;
+      case reg_SP: debug_printf( "ESP" ); break;
+      case reg_BP: debug_printf( "EBP" ); break;
+      case reg_SI: debug_printf( "ESI" ); break;
+      case reg_DI: debug_printf( "EDI" ); break;
+      }
+      break;
+   case file_MMX:
+      debug_printf( "MMX%u", reg.idx );
+      break;
+   case file_XMM:
+      debug_printf( "XMM%u", reg.idx );
+      break;
+   case file_x87:
+      debug_printf( "fp%u", reg.idx );
+      break;
+   }
+   /* The displacement is printed for the DISP8/DISP32 modes only. */
+   if (reg.mod == mod_DISP8 ||
+       reg.mod == mod_DISP32)
+      debug_printf("+%d", reg.disp);
+
+   if (reg.mod != mod_REG) 
+      debug_printf( "]" );
+}
+
+
+#define DUMP_START() debug_printf( "\n" )
+#define DUMP_END() debug_printf( "\n" )
+/* Print the calling function's name, minus everything up to its first '_'. */
+#define DUMP() do {                             \
+   const char *foo = __FUNCTION__;              \
+   while (*foo && *foo != '_')                  \
+      foo++;                                    \
+   if  (*foo)                                   \
+      foo++;                                    \
+   debug_printf( "\n% 15s ", foo );             \
+} while (0)
+
+#define DUMP_I( I ) do {                        \
+   DUMP();                                      \
+   debug_printf( "%u", I );                     \
+} while( 0 )
+
+#define DUMP_R( R0 ) do {                       \
+   DUMP();                                      \
+   _print_reg( R0 );                            \
+} while( 0 )
+
+#define DUMP_RR( R0, R1 ) do {                  \
+   DUMP();                                      \
+   _print_reg( R0 );                            \
+   debug_printf( ", " );                        \
+   _print_reg( R1 );                            \
+} while( 0 )
+
+#define DUMP_RI( R0, I ) do {                   \
+   DUMP();                                      \
+   _print_reg( R0 );                            \
+   debug_printf( ", %u", I );                   \
+} while( 0 )
+
+#define DUMP_RRI( R0, R1, I ) do {              \
+   DUMP();                                      \
+   _print_reg( R0 );                            \
+   debug_printf( ", " );                        \
+   _print_reg( R1 );                            \
+   debug_printf( ", %u", I );                   \
+} while( 0 )
+
+#else
+
+#define DUMP_START()
+#define DUMP_END()
+#define DUMP( )
+#define DUMP_I( I )
+#define DUMP_R( R0 )
+#define DUMP_RR( R0, R1 )
+#define DUMP_RI( R0, I )
+#define DUMP_RRI( R0, R1, I )
+
+#endif
 
 
 static void do_realloc( struct x86_function *p )
 {
-   if (p->size == 0) {
+   if (p->store == p->error_overflow) {
+      p->csr = p->store;
+   }
+   else if (p->size == 0) {
       p->size = 1024;
       p->store = rtasm_exec_malloc(p->size);
       p->csr = p->store;
@@ -51,10 +150,22 @@ static void do_realloc( struct x86_function *p )
       unsigned char *tmp = p->store;
       p->size *= 2;
       p->store = rtasm_exec_malloc(p->size);
-      memcpy(p->store, tmp, used);
-      p->csr = p->store + used;
+
+      if (p->store) {
+         memcpy(p->store, tmp, used);
+         p->csr = p->store + used;
+      }
+      else {
+         p->csr = p->store;
+      }
+
       rtasm_exec_free(tmp);
    }
+
+   if (p->store == NULL) {
+      p->store = p->csr = p->error_overflow;
+      p->size = sizeof(p->error_overflow);
+   }
 }
 
 /* Emit bytes to the instruction stream:
@@ -253,6 +364,7 @@ void x86_jcc( struct x86_function *p,
 	      unsigned char *label )
 {
    intptr_t offset = pointer_to_intptr( label ) - (pointer_to_intptr( x86_get_label(p) ) + 2);
+   DUMP_I(cc);
    
    if (offset <= 127 && offset >= -128) {
       emit_1ub(p, 0x70 + cc);
@@ -270,6 +382,7 @@ void x86_jcc( struct x86_function *p,
 unsigned char *x86_jcc_forward( struct x86_function *p,
 			  enum x86_cc cc )
 {
+   DUMP_I(cc);
    emit_2ub(p, 0x0f, 0x80 + cc);
    emit_1i(p, 0);
    return x86_get_label(p);
@@ -277,6 +390,7 @@ unsigned char *x86_jcc_forward( struct x86_function *p,
 
 unsigned char *x86_jmp_forward( struct x86_function *p)
 {
+   DUMP();
    emit_1ub(p, 0xe9);
    emit_1i(p, 0);
    return x86_get_label(p);
@@ -284,6 +398,8 @@ unsigned char *x86_jmp_forward( struct x86_function *p)
 
 unsigned char *x86_call_forward( struct x86_function *p)
 {
+   DUMP();
+
    emit_1ub(p, 0xe8);
    emit_1i(p, 0);
    return x86_get_label(p);
@@ -299,23 +415,31 @@ void x86_fixup_fwd_jump( struct x86_function *p,
 
 void x86_jmp( struct x86_function *p, unsigned char *label)
 {
+   DUMP_I( label );
    emit_1ub(p, 0xe9);
    emit_1i(p, pointer_to_intptr( label ) - pointer_to_intptr( x86_get_label(p) ) - 4);
 }
 
 #if 0
+static unsigned char *cptr( void (*label)() )
+{
+   return (unsigned char *) label;
+}
+
 /* This doesn't work once we start reallocating & copying the
  * generated code on buffer fills, because the call is relative to the
  * current pc.
  */
 void x86_call( struct x86_function *p, void (*label)())
 {
+   DUMP_I( label );
    emit_1ub(p, 0xe8);
    emit_1i(p, cptr(label) - x86_get_label(p) - 4);
 }
 #else
 void x86_call( struct x86_function *p, struct x86_reg reg)
 {
+   DUMP_R( reg );
    emit_1ub(p, 0xff);
    emit_modrm_noreg(p, 2, reg);
 }
@@ -328,6 +452,7 @@ void x86_call( struct x86_function *p, struct x86_reg reg)
  */
 void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
 {
+   DUMP_RI( dst, imm );
    assert(dst.mod == mod_REG);
    emit_1ub(p, 0xb8 + dst.idx);
    emit_1i(p, imm);
@@ -336,6 +461,7 @@ void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
 void x86_push( struct x86_function *p,
 	       struct x86_reg reg )
 {
+   DUMP_R( reg );
    assert(reg.mod == mod_REG);
    emit_1ub(p, 0x50 + reg.idx);
    p->stack_offset += 4;
@@ -344,6 +470,7 @@ void x86_push( struct x86_function *p,
 void x86_pop( struct x86_function *p,
 	      struct x86_reg reg )
 {
+   DUMP_R( reg );
    assert(reg.mod == mod_REG);
    emit_1ub(p, 0x58 + reg.idx);
    p->stack_offset -= 4;
@@ -352,6 +479,7 @@ void x86_pop( struct x86_function *p,
 void x86_inc( struct x86_function *p,
 	      struct x86_reg reg )
 {
+   DUMP_R( reg );
    assert(reg.mod == mod_REG);
    emit_1ub(p, 0x40 + reg.idx);
 }
@@ -359,17 +487,20 @@ void x86_inc( struct x86_function *p,
 void x86_dec( struct x86_function *p,
 	      struct x86_reg reg )
 {
+   DUMP_R( reg );
    assert(reg.mod == mod_REG);
    emit_1ub(p, 0x48 + reg.idx);
 }
 
 void x86_ret( struct x86_function *p )
 {
+   DUMP();
    emit_1ub(p, 0xc3);
 }
 
 void x86_sahf( struct x86_function *p )
 {
+   DUMP();
    emit_1ub(p, 0x9e);
 }
 
@@ -377,6 +508,7 @@ void x86_mov( struct x86_function *p,
 	      struct x86_reg dst,
 	      struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_op_modrm( p, 0x8b, 0x89, dst, src );
 }
 
@@ -384,6 +516,7 @@ void x86_xor( struct x86_function *p,
 	      struct x86_reg dst,
 	      struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_op_modrm( p, 0x33, 0x31, dst, src );
 }
 
@@ -391,6 +524,7 @@ void x86_cmp( struct x86_function *p,
 	      struct x86_reg dst,
 	      struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_op_modrm( p, 0x3b, 0x39, dst, src );
 }
 
@@ -398,6 +532,7 @@ void x86_lea( struct x86_function *p,
 	      struct x86_reg dst,
 	      struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_1ub(p, 0x8d);
    emit_modrm( p, dst, src );
 }
@@ -406,6 +541,7 @@ void x86_test( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_1ub(p, 0x85);
    emit_modrm( p, dst, src );
 }
@@ -414,20 +550,36 @@ void x86_add( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_op_modrm(p, 0x03, 0x01, dst, src );
 }
 
+/* Calculate EAX * src, results in EDX:EAX.
+ */
 void x86_mul( struct x86_function *p,
 	       struct x86_reg src )
 {
-   assert (src.file == file_REG32 && src.mod == mod_REG);
-   emit_op_modrm(p, 0xf7, 0, x86_make_reg (file_REG32, reg_SP), src );
+   DUMP_R(  src );
+   emit_1ub(p, 0xf7);
+   emit_modrm_noreg(p, 4, src );
+}
+
+/* Two-operand multiply (opcode 0F AF): dst * src, result in dst. */
+void x86_imul( struct x86_function *p,
+	       struct x86_reg dst,
+	       struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_2ub(p, X86_TWOB, 0xAF);
+   emit_modrm(p, dst, src);
 }
 
+
 void x86_sub( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_op_modrm(p, 0x2b, 0x29, dst, src );
 }
 
@@ -435,6 +587,7 @@ void x86_or( struct x86_function *p,
              struct x86_reg dst,
              struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_op_modrm( p, 0x0b, 0x09, dst, src );
 }
 
@@ -442,6 +595,7 @@ void x86_and( struct x86_function *p,
               struct x86_reg dst,
               struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_op_modrm( p, 0x23, 0x21, dst, src );
 }
 
@@ -456,6 +610,7 @@ void sse_movss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_2ub(p, 0xF3, X86_TWOB);
    emit_op_modrm( p, 0x10, 0x11, dst, src );
 }
@@ -464,6 +619,7 @@ void sse_movaps( struct x86_function *p,
 		 struct x86_reg dst,
 		 struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x28, 0x29, dst, src );
 }
@@ -472,6 +628,7 @@ void sse_movups( struct x86_function *p,
 		 struct x86_reg dst,
 		 struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x10, 0x11, dst, src );
 }
@@ -480,6 +637,7 @@ void sse_movhps( struct x86_function *p,
 		 struct x86_reg dst,
 		 struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    assert(dst.mod != mod_REG || src.mod != mod_REG);
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x16, 0x17, dst, src ); /* cf movlhps */
@@ -489,6 +647,7 @@ void sse_movlps( struct x86_function *p,
 		 struct x86_reg dst,
 		 struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    assert(dst.mod != mod_REG || src.mod != mod_REG);
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x12, 0x13, dst, src ); /* cf movhlps */
@@ -498,6 +657,7 @@ void sse_maxps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x5F);
    emit_modrm( p, dst, src );
 }
@@ -506,6 +666,7 @@ void sse_maxss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x5F);
    emit_modrm( p, dst, src );
 }
@@ -514,6 +675,7 @@ void sse_divss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x5E);
    emit_modrm( p, dst, src );
 }
@@ -522,6 +684,7 @@ void sse_minps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x5D);
    emit_modrm( p, dst, src );
 }
@@ -530,6 +693,7 @@ void sse_subps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x5C);
    emit_modrm( p, dst, src );
 }
@@ -538,6 +702,7 @@ void sse_mulps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x59);
    emit_modrm( p, dst, src );
 }
@@ -546,6 +711,7 @@ void sse_mulss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x59);
    emit_modrm( p, dst, src );
 }
@@ -554,6 +720,7 @@ void sse_addps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x58);
    emit_modrm( p, dst, src );
 }
@@ -562,6 +729,7 @@ void sse_addss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x58);
    emit_modrm( p, dst, src );
 }
@@ -570,6 +738,7 @@ void sse_andnps( struct x86_function *p,
                  struct x86_reg dst,
                  struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x55);
    emit_modrm( p, dst, src );
 }
@@ -578,6 +747,7 @@ void sse_andps( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x54);
    emit_modrm( p, dst, src );
 }
@@ -586,6 +756,7 @@ void sse_rsqrtps( struct x86_function *p,
                   struct x86_reg dst,
                   struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x52);
    emit_modrm( p, dst, src );
 }
@@ -594,6 +765,7 @@ void sse_rsqrtss( struct x86_function *p,
 		  struct x86_reg dst,
 		  struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x52);
    emit_modrm( p, dst, src );
 
@@ -603,6 +775,7 @@ void sse_movhlps( struct x86_function *p,
 		  struct x86_reg dst,
 		  struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    assert(dst.mod == mod_REG && src.mod == mod_REG);
    emit_2ub(p, X86_TWOB, 0x12);
    emit_modrm( p, dst, src );
@@ -612,6 +785,7 @@ void sse_movlhps( struct x86_function *p,
 		  struct x86_reg dst,
 		  struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    assert(dst.mod == mod_REG && src.mod == mod_REG);
    emit_2ub(p, X86_TWOB, 0x16);
    emit_modrm( p, dst, src );
@@ -621,6 +795,7 @@ void sse_orps( struct x86_function *p,
                struct x86_reg dst,
                struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x56);
    emit_modrm( p, dst, src );
 }
@@ -629,6 +804,7 @@ void sse_xorps( struct x86_function *p,
                 struct x86_reg dst,
                 struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x57);
    emit_modrm( p, dst, src );
 }
@@ -637,6 +813,7 @@ void sse_cvtps2pi( struct x86_function *p,
 		   struct x86_reg dst,
 		   struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    assert(dst.file == file_MMX && 
 	  (src.file == file_XMM || src.mod != mod_REG));
 
@@ -646,36 +823,48 @@ void sse_cvtps2pi( struct x86_function *p,
    emit_modrm( p, dst, src );
 }
 
+void sse2_cvtdq2ps( struct x86_function *p,
+		   struct x86_reg dst,
+		   struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_2ub(p, X86_TWOB, 0x5b);
+   emit_modrm( p, dst, src );
+}
+
 
 /* Shufps can also be used to implement a reduced swizzle when dest ==
  * arg0.
  */
 void sse_shufps( struct x86_function *p,
-		 struct x86_reg dest,
-		 struct x86_reg arg0,
+		 struct x86_reg dst,
+		 struct x86_reg src,
 		 unsigned char shuf) 
 {
+   DUMP_RRI( dst, src, shuf );
    emit_2ub(p, X86_TWOB, 0xC6);
-   emit_modrm(p, dest, arg0);
+   emit_modrm(p, dst, src);
    emit_1ub(p, shuf); 
 }
 
 void sse_cmpps( struct x86_function *p,
-		struct x86_reg dest,
-		struct x86_reg arg0,
+		struct x86_reg dst,
+		struct x86_reg src,
 		unsigned char cc) 
 {
+   DUMP_RRI( dst, src, cc );
    emit_2ub(p, X86_TWOB, 0xC2);
-   emit_modrm(p, dest, arg0);
+   emit_modrm(p, dst, src);
    emit_1ub(p, cc); 
 }
 
 void sse_pmovmskb( struct x86_function *p,
-                   struct x86_reg dest,
+                   struct x86_reg dst,
                    struct x86_reg src)
 {
-    emit_3ub(p, 0x66, X86_TWOB, 0xD7);
-    emit_modrm(p, dest, src);
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0x66, X86_TWOB, 0xD7);
+   emit_modrm(p, dst, src);
 }
 
 /***********************************************************************
@@ -686,12 +875,13 @@ void sse_pmovmskb( struct x86_function *p,
  * Perform a reduced swizzle:
  */
 void sse2_pshufd( struct x86_function *p,
-		  struct x86_reg dest,
-		  struct x86_reg arg0,
+		  struct x86_reg dst,
+		  struct x86_reg src,
 		  unsigned char shuf) 
 {
+   DUMP_RRI( dst, src, shuf );
    emit_3ub(p, 0x66, X86_TWOB, 0x70);
-   emit_modrm(p, dest, arg0);
+   emit_modrm(p, dst, src);
    emit_1ub(p, shuf); 
 }
 
@@ -699,6 +889,7 @@ void sse2_cvttps2dq( struct x86_function *p,
                      struct x86_reg dst,
                      struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_3ub( p, 0xF3, X86_TWOB, 0x5B );
    emit_modrm( p, dst, src );
 }
@@ -707,6 +898,7 @@ void sse2_cvtps2dq( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_3ub(p, 0x66, X86_TWOB, 0x5B);
    emit_modrm( p, dst, src );
 }
@@ -715,6 +907,7 @@ void sse2_packssdw( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_3ub(p, 0x66, X86_TWOB, 0x6B);
    emit_modrm( p, dst, src );
 }
@@ -723,6 +916,7 @@ void sse2_packsswb( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_3ub(p, 0x66, X86_TWOB, 0x63);
    emit_modrm( p, dst, src );
 }
@@ -731,14 +925,26 @@ void sse2_packuswb( struct x86_function *p,
 		    struct x86_reg dst,
 		    struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_3ub(p, 0x66, X86_TWOB, 0x67);
    emit_modrm( p, dst, src );
 }
 
+void sse2_punpcklbw( struct x86_function *p,
+		    struct x86_reg dst,
+		    struct x86_reg src )
+{
+   DUMP_RR( dst, src );
+   emit_3ub(p, 0x66, X86_TWOB, 0x60);
+   emit_modrm( p, dst, src );
+}
+
+
 void sse2_rcpps( struct x86_function *p,
                  struct x86_reg dst,
                  struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_2ub(p, X86_TWOB, 0x53);
    emit_modrm( p, dst, src );
 }
@@ -747,6 +953,7 @@ void sse2_rcpss( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_3ub(p, 0xF3, X86_TWOB, 0x53);
    emit_modrm( p, dst, src );
 }
@@ -755,6 +962,7 @@ void sse2_movd( struct x86_function *p,
 		struct x86_reg dst,
 		struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    emit_2ub(p, 0x66, X86_TWOB);
    emit_op_modrm( p, 0x6e, 0x7e, dst, src );
 }
@@ -767,30 +975,35 @@ void sse2_movd( struct x86_function *p,
  */
 void x87_fist( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( dst );
    emit_1ub(p, 0xdb);
    emit_modrm_noreg(p, 2, dst);
 }
 
 void x87_fistp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( dst );
    emit_1ub(p, 0xdb);
    emit_modrm_noreg(p, 3, dst);
 }
 
 void x87_fild( struct x86_function *p, struct x86_reg arg )
 {
+   DUMP_R( arg );
    emit_1ub(p, 0xdf);
    emit_modrm_noreg(p, 0, arg);
 }
 
 void x87_fldz( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xee);
 }
 
 
 void x87_fldcw( struct x86_function *p, struct x86_reg arg )
 {
+   DUMP_R( arg );
    assert(arg.file == file_REG32);
    assert(arg.mod != mod_REG);
    emit_1ub(p, 0xd9);
@@ -799,26 +1012,31 @@ void x87_fldcw( struct x86_function *p, struct x86_reg arg )
 
 void x87_fld1( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xe8);
 }
 
 void x87_fldl2e( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xea);
 }
 
 void x87_fldln2( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xed);
 }
 
 void x87_fwait( struct x86_function *p )
 {
+   DUMP();
    emit_1ub(p, 0x9b);
 }
 
 void x87_fnclex( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xdb, 0xe2);
 }
 
@@ -855,49 +1073,55 @@ static void x87_arith_op( struct x86_function *p, struct x86_reg dst, struct x86
       assert(0);
 }
 
-void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   x87_arith_op(p, dst, arg, 
+   DUMP_RR( dst, src );
+   x87_arith_op(p, dst, src, 
 		0xd8, 0xc8,
 		0xdc, 0xc8,
 		4);
 }
 
-void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   x87_arith_op(p, dst, arg, 
+   DUMP_RR( dst, src );
+   x87_arith_op(p, dst, src, 
 		0xd8, 0xe0,
 		0xdc, 0xe8,
 		4);
 }
 
-void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   x87_arith_op(p, dst, arg, 
+   DUMP_RR( dst, src );
+   x87_arith_op(p, dst, src, 
 		0xd8, 0xe8,
 		0xdc, 0xe0,
 		5);
 }
 
-void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   x87_arith_op(p, dst, arg, 
+   DUMP_RR( dst, src );
+   x87_arith_op(p, dst, src, 
 		0xd8, 0xc0,
 		0xdc, 0xc0,
 		0);
 }
 
-void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   x87_arith_op(p, dst, arg, 
+   DUMP_RR( dst, src );
+   x87_arith_op(p, dst, src, 
 		0xd8, 0xf0,
 		0xdc, 0xf8,
 		6);
 }
 
-void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
+void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg src )
 {
-   x87_arith_op(p, dst, arg, 
+   DUMP_RR( dst, src );
+   x87_arith_op(p, dst, src, 
 		0xd8, 0xf8,
 		0xdc, 0xf0,
 		7);
@@ -905,6 +1129,7 @@ void x87_fdivr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg )
 
 void x87_fmulp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xc8+dst.idx);
@@ -912,6 +1137,7 @@ void x87_fmulp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fsubp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xe8+dst.idx);
@@ -919,6 +1145,7 @@ void x87_fsubp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xe0+dst.idx);
@@ -926,6 +1153,7 @@ void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
 
 void x87_faddp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xc0+dst.idx);
@@ -933,6 +1161,7 @@ void x87_faddp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fdivp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xf8+dst.idx);
@@ -940,6 +1169,7 @@ void x87_fdivp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( dst );
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xf0+dst.idx);
@@ -947,70 +1177,83 @@ void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fucom( struct x86_function *p, struct x86_reg arg )
 {
+   DUMP_R( arg );
    assert(arg.file == file_x87);
    emit_2ub(p, 0xdd, 0xe0+arg.idx);
 }
 
 void x87_fucomp( struct x86_function *p, struct x86_reg arg )
 {
+   DUMP_R( arg );
    assert(arg.file == file_x87);
    emit_2ub(p, 0xdd, 0xe8+arg.idx);
 }
 
 void x87_fucompp( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xda, 0xe9);
 }
 
 void x87_fxch( struct x86_function *p, struct x86_reg arg )
 {
+   DUMP_R( arg );
    assert(arg.file == file_x87);
    emit_2ub(p, 0xd9, 0xc8+arg.idx);
 }
 
 void x87_fabs( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xe1);
 }
 
 void x87_fchs( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xe0);
 }
 
 void x87_fcos( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xff);
 }
 
 
 void x87_fprndint( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xfc);
 }
 
 void x87_fscale( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xfd);
 }
 
 void x87_fsin( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xfe);
 }
 
 void x87_fsincos( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xfb);
 }
 
 void x87_fsqrt( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xfa);
 }
 
 void x87_fxtract( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xf4);
 }
 
@@ -1020,6 +1263,7 @@ void x87_fxtract( struct x86_function *p )
  */
 void x87_f2xm1( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xf0);
 }
 
@@ -1028,6 +1272,7 @@ void x87_f2xm1( struct x86_function *p )
  */
 void x87_fyl2x( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xf1);
 }
 
@@ -1038,12 +1283,14 @@ void x87_fyl2x( struct x86_function *p )
  */
 void x87_fyl2xp1( struct x86_function *p )
 {
+   DUMP();
    emit_2ub(p, 0xd9, 0xf9);
 }
 
 
 void x87_fld( struct x86_function *p, struct x86_reg arg )
 {
+   DUMP_R( arg );
    if (arg.file == file_x87) 
       emit_2ub(p, 0xd9, 0xc0 + arg.idx);
    else {
@@ -1054,6 +1301,7 @@ void x87_fld( struct x86_function *p, struct x86_reg arg )
 
 void x87_fst( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( dst );
    if (dst.file == file_x87) 
       emit_2ub(p, 0xdd, 0xd0 + dst.idx);
    else {
@@ -1064,6 +1312,7 @@ void x87_fst( struct x86_function *p, struct x86_reg dst )
 
 void x87_fstp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( dst );
    if (dst.file == file_x87) 
       emit_2ub(p, 0xdd, 0xd8 + dst.idx);
    else {
@@ -1074,6 +1323,7 @@ void x87_fstp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fcom( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( dst );
    if (dst.file == file_x87) 
       emit_2ub(p, 0xd8, 0xd0 + dst.idx);
    else {
@@ -1084,6 +1334,7 @@ void x87_fcom( struct x86_function *p, struct x86_reg dst )
 
 void x87_fcomp( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( dst );
    if (dst.file == file_x87) 
       emit_2ub(p, 0xd8, 0xd8 + dst.idx);
    else {
@@ -1095,6 +1346,7 @@ void x87_fcomp( struct x86_function *p, struct x86_reg dst )
 
 void x87_fnstsw( struct x86_function *p, struct x86_reg dst )
 {
+   DUMP_R( dst );
    assert(dst.file == file_REG32);
 
    if (dst.idx == reg_AX &&
@@ -1115,6 +1367,7 @@ void x87_fnstsw( struct x86_function *p, struct x86_reg dst )
 
 void mmx_emms( struct x86_function *p )
 {
+   DUMP();
    assert(p->need_emms);
    emit_2ub(p, 0x0f, 0x77);
    p->need_emms = 0;
@@ -1124,6 +1377,7 @@ void mmx_packssdw( struct x86_function *p,
 		   struct x86_reg dst,
 		   struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    assert(dst.file == file_MMX && 
 	  (src.file == file_MMX || src.mod != mod_REG));
 
@@ -1137,6 +1391,7 @@ void mmx_packuswb( struct x86_function *p,
 		   struct x86_reg dst,
 		   struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    assert(dst.file == file_MMX && 
 	  (src.file == file_MMX || src.mod != mod_REG));
 
@@ -1150,6 +1405,7 @@ void mmx_movd( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    p->need_emms = 1;
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x6e, 0x7e, dst, src );
@@ -1159,6 +1415,7 @@ void mmx_movq( struct x86_function *p,
 	       struct x86_reg dst,
 	       struct x86_reg src )
 {
+   DUMP_RR( dst, src );
    p->need_emms = 1;
    emit_1ub(p, X86_TWOB);
    emit_op_modrm( p, 0x6f, 0x7f, dst, src );
@@ -1186,18 +1443,32 @@ void x86_init_func( struct x86_function *p )
    p->size = 0;
    p->store = NULL;
    p->csr = p->store;
+   DUMP_START();
 }
 
+/* Initialise p with a code_size-byte buffer from rtasm_exec_malloc().
+ * If that allocation fails, p points at its error_overflow buffer instead.
+ */
 void x86_init_func_size( struct x86_function *p, unsigned code_size )
 {
    p->size = code_size;
    p->store = rtasm_exec_malloc(code_size);
+   if (p->store == NULL) {
+      /* Allocation failed: fall back to the small in-struct buffer and
+       * clamp size to it, exactly as do_realloc() does on failure.
+       */
+      p->store = p->error_overflow;
+      p->size = sizeof(p->error_overflow);
+   }
    p->csr = p->store;
+   DUMP_START();
 }
 
 void x86_release_func( struct x86_function *p )
 {
-   rtasm_exec_free(p->store);
+   if (p->store && p->store != p->error_overflow)
+      rtasm_exec_free(p->store);
+
    p->store = NULL;
    p->csr = NULL;
    p->size = 0;
@@ -1206,9 +1477,14 @@ void x86_release_func( struct x86_function *p )
 
 void (*x86_get_func( struct x86_function *p ))(void)
 {
+   DUMP_END();
    if (DISASSEM && p->store)
       debug_printf("disassemble %p %p\n", p->store, p->csr);
-   return (void (*)(void)) p->store;
+   /* store aliases error_overflow only after an allocation failure: no code. */
+   if (p->store == p->error_overflow)
+      return (void (*)(void)) NULL;
+   else
+      return (void (*)(void)) p->store;
 }
 
 #else
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 606b41eb35..695a1cef4e 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -43,6 +43,7 @@ struct x86_function {
    unsigned char *csr;
    unsigned stack_offset;
    int need_emms;
+   unsigned char error_overflow[4];
    const char *fn;
 };
 
@@ -165,6 +166,7 @@ void mmx_packuswb( struct x86_function *p, struct x86_reg dst, struct x86_reg sr
 
 void sse2_cvtps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_cvttps2dq( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
+void sse2_cvtdq2ps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_movd( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_packssdw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_packsswb( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
@@ -202,6 +204,7 @@ void sse_rsqrtss( struct x86_function *p, struct x86_reg dst, struct x86_reg src
 void sse_shufps( struct x86_function *p, struct x86_reg dest, struct x86_reg arg0,
                  unsigned char shuf );
 void sse_pmovmskb( struct x86_function *p, struct x86_reg dest, struct x86_reg src );
+void sse2_punpcklbw( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
 void x86_add( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_and( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
@@ -211,6 +214,7 @@ void x86_inc( struct x86_function *p, struct x86_reg reg );
 void x86_lea( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_mov( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_mul( struct x86_function *p, struct x86_reg src );
+void x86_imul( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_pop( struct x86_function *p, struct x86_reg reg );
 void x86_push( struct x86_function *p, struct x86_reg reg );
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
index 78e7dec569..29e104bbd1 100644
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
@@ -287,10 +287,10 @@ micro_abs(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) fabs( (double) src->f[0] );
-   dst->f[1] = (float) fabs( (double) src->f[1] );
-   dst->f[2] = (float) fabs( (double) src->f[2] );
-   dst->f[3] = (float) fabs( (double) src->f[3] );
+   dst->f[0] = fabsf( src->f[0] );
+   dst->f[1] = fabsf( src->f[1] );
+   dst->f[2] = fabsf( src->f[2] );
+   dst->f[3] = fabsf( src->f[3] );
 }
 
 static void
@@ -334,10 +334,10 @@ micro_ceil(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) ceil( (double) src->f[0] );
-   dst->f[1] = (float) ceil( (double) src->f[1] );
-   dst->f[2] = (float) ceil( (double) src->f[2] );
-   dst->f[3] = (float) ceil( (double) src->f[3] );
+   dst->f[0] = ceilf( src->f[0] );
+   dst->f[1] = ceilf( src->f[1] );
+   dst->f[2] = ceilf( src->f[2] );
+   dst->f[3] = ceilf( src->f[3] );
 }
 
 static void
@@ -345,10 +345,10 @@ micro_cos(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) cos( (double) src->f[0] );
-   dst->f[1] = (float) cos( (double) src->f[1] );
-   dst->f[2] = (float) cos( (double) src->f[2] );
-   dst->f[3] = (float) cos( (double) src->f[3] );
+   dst->f[0] = cosf( src->f[0] );
+   dst->f[1] = cosf( src->f[1] );
+   dst->f[2] = cosf( src->f[2] );
+   dst->f[3] = cosf( src->f[3] );
 }
 
 static void
@@ -430,10 +430,10 @@ micro_exp2(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src)
 {
-   dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
-   dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
-   dst->f[2] = (float) pow( 2.0, (double) src->f[2] );
-   dst->f[3] = (float) pow( 2.0, (double) src->f[3] );
+   dst->f[0] = powf( 2.0f, src->f[0] );
+   dst->f[1] = powf( 2.0f, src->f[1] );
+   dst->f[2] = powf( 2.0f, src->f[2] );
+   dst->f[3] = powf( 2.0f, src->f[3] );
 }
 
 static void
@@ -463,10 +463,10 @@ micro_flr(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) floor( (double) src->f[0] );
-   dst->f[1] = (float) floor( (double) src->f[1] );
-   dst->f[2] = (float) floor( (double) src->f[2] );
-   dst->f[3] = (float) floor( (double) src->f[3] );
+   dst->f[0] = floorf( src->f[0] );
+   dst->f[1] = floorf( src->f[1] );
+   dst->f[2] = floorf( src->f[2] );
+   dst->f[3] = floorf( src->f[3] );
 }
 
 static void
@@ -474,10 +474,10 @@ micro_frc(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
-   dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
-   dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] );
-   dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] );
+   dst->f[0] = src->f[0] - floorf( src->f[0] );
+   dst->f[1] = src->f[1] - floorf( src->f[1] );
+   dst->f[2] = src->f[2] - floorf( src->f[2] );
+   dst->f[3] = src->f[3] - floorf( src->f[3] );
 }
 
 static void
@@ -510,10 +510,10 @@ micro_lg2(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
-   dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
-   dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f;
-   dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f;
+   dst->f[0] = logf( src->f[0] ) * 1.442695f;
+   dst->f[1] = logf( src->f[1] ) * 1.442695f;
+   dst->f[2] = logf( src->f[2] ) * 1.442695f;
+   dst->f[3] = logf( src->f[3] ) * 1.442695f;
 }
 
 static void
@@ -764,10 +764,10 @@ micro_pow(
    const union tgsi_exec_channel *src0,
    const union tgsi_exec_channel *src1 )
 {
-   dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] );
-   dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] );
-   dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] );
-   dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] );
+   dst->f[0] = powf( src0->f[0], src1->f[0] );
+   dst->f[1] = powf( src0->f[1], src1->f[1] );
+   dst->f[2] = powf( src0->f[2], src1->f[2] );
+   dst->f[3] = powf( src0->f[3], src1->f[3] );
 }
 
 static void
@@ -775,10 +775,10 @@ micro_rnd(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
-   dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
-   dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) );
-   dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) );
+   dst->f[0] = floorf( src->f[0] + 0.5f );
+   dst->f[1] = floorf( src->f[1] + 0.5f );
+   dst->f[2] = floorf( src->f[2] + 0.5f );
+   dst->f[3] = floorf( src->f[3] + 0.5f );
 }
 
 static void
@@ -833,20 +833,20 @@ micro_sin(
    union tgsi_exec_channel *dst,
    const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) sin( (double) src->f[0] );
-   dst->f[1] = (float) sin( (double) src->f[1] );
-   dst->f[2] = (float) sin( (double) src->f[2] );
-   dst->f[3] = (float) sin( (double) src->f[3] );
+   dst->f[0] = sinf( src->f[0] );
+   dst->f[1] = sinf( src->f[1] );
+   dst->f[2] = sinf( src->f[2] );
+   dst->f[3] = sinf( src->f[3] );
 }
 
 static void
 micro_sqrt( union tgsi_exec_channel *dst,
             const union tgsi_exec_channel *src )
 {
-   dst->f[0] = (float) sqrt( (double) src->f[0] );
-   dst->f[1] = (float) sqrt( (double) src->f[1] );
-   dst->f[2] = (float) sqrt( (double) src->f[2] );
-   dst->f[3] = (float) sqrt( (double) src->f[3] );
+   dst->f[0] = sqrtf( src->f[0] );
+   dst->f[1] = sqrtf( src->f[1] );
+   dst->f[2] = sqrtf( src->f[2] );
+   dst->f[3] = sqrtf( src->f[3] );
 }
 
 static void
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
index 45c49dd007..92e2e5e985 100644
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
@@ -166,7 +166,7 @@ struct tgsi_exec_machine
 
    float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
    unsigned                      ImmLimit;
-   float                         (*Consts)[4];
+   const float                   (*Consts)[4];
    struct tgsi_exec_vector       *Inputs;
    struct tgsi_exec_vector       *Outputs;
    const struct tgsi_token       *Tokens;
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
index c37e201b2b..c3295a27ff 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
@@ -36,113 +36,8 @@
 
 #if defined(__i386__) || defined(__386__)
 
-#define DUMP_SSE  0
+#define HIGH_PRECISION 1  /* for 1/sqrt() */
 
-#if DUMP_SSE
-
-static void
-_print_reg(
-   struct x86_reg reg )
-{
-   if (reg.mod != mod_REG) 
-      debug_printf( "[" );
-      
-   switch( reg.file ) {
-   case file_REG32:
-      switch( reg.idx ) {
-      case reg_AX:
-         debug_printf( "EAX" );
-         break;
-      case reg_CX:
-         debug_printf( "ECX" );
-         break;
-      case reg_DX:
-         debug_printf( "EDX" );
-         break;
-      case reg_BX:
-         debug_printf( "EBX" );
-         break;
-      case reg_SP:
-         debug_printf( "ESP" );
-         break;
-      case reg_BP:
-         debug_printf( "EBP" );
-         break;
-      case reg_SI:
-         debug_printf( "ESI" );
-         break;
-      case reg_DI:
-         debug_printf( "EDI" );
-         break;
-      }
-      break;
-   case file_MMX:
-      assert( 0 );
-      break;
-   case file_XMM:
-      debug_printf( "XMM%u", reg.idx );
-      break;
-   case file_x87:
-      assert( 0 );
-      break;
-   }
-
-   if (reg.mod == mod_DISP8 ||
-       reg.mod == mod_DISP32)
-      debug_printf("+%d", reg.disp);
-
-   if (reg.mod != mod_REG) 
-      debug_printf( "]" );
-}
-
-static void
-_fill(
-   const char  *op )
-{
-   unsigned count = 10 - strlen( op );
-
-   while( count-- ) {
-      debug_printf( " " );
-   }
-}
-
-#define DUMP_START() debug_printf( "\nsse-dump start ----------------" )
-#define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" )
-#define DUMP( OP ) debug_printf( "\n%s", OP )
-#define DUMP_I( OP, I ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   debug_printf( "%u", I ); } while( 0 )
-#define DUMP_R( OP, R0 ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   _print_reg( R0 ); } while( 0 )
-#define DUMP_RR( OP, R0, R1 ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   _print_reg( R0 );\
-   debug_printf( ", " );\
-   _print_reg( R1 ); } while( 0 )
-#define DUMP_RRI( OP, R0, R1, I ) do {\
-   debug_printf( "\n%s", OP );\
-   _fill( OP );\
-   _print_reg( R0 );\
-   debug_printf( ", " );\
-   _print_reg( R1 );\
-   debug_printf( ", " );\
-   debug_printf( "%u", I ); } while( 0 )
-
-#else
-
-#define DUMP_START()
-#define DUMP_END()
-#define DUMP( OP )
-#define DUMP_I( OP, I )
-#define DUMP_R( OP, R0 )
-#define DUMP_RR( OP, R0, R1 )
-#define DUMP_RRI( OP, R0, R1, I )
-
-#endif
 
 #define FOR_EACH_CHANNEL( CHAN )\
    for( CHAN = 0; CHAN < 4; CHAN++ )
@@ -308,200 +203,6 @@ get_coef(
       ((vec * 3 + member) * 4 + chan) * 4 );
 }
 
-/**
- * X86 rtasm wrappers.
- */
-
-static void
-emit_addps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ADDPS", dst, src );
-   sse_addps( func, dst, src );
-}
-
-static void
-emit_andnps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ANDNPS", dst, src );
-   sse_andnps( func, dst, src );
-}
-
-static void
-emit_andps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ANDPS", dst, src );
-   sse_andps( func, dst, src );
-}
-
-static void
-emit_call(
-   struct x86_function  *func,
-   void                 (* addr)() )
-{
-   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-
-   DUMP_I( "CALL", addr );
-   x86_mov_reg_imm( func, ecx, (unsigned long) addr );
-   x86_call( func, ecx );
-}
-
-static void
-emit_cmpps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src,
-   enum sse_cc          cc )
-{
-   DUMP_RRI( "CMPPS", dst, src, cc );
-   sse_cmpps( func, dst, src, cc );
-}
-
-static void
-emit_cvttps2dq(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "CVTTPS2DQ", dst, src );
-   sse2_cvttps2dq( func, dst, src );
-}
-
-static void
-emit_maxps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MAXPS", dst, src );
-   sse_maxps( func, dst, src );
-}
-
-static void
-emit_minps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MINPS", dst, src );
-   sse_minps( func, dst, src );
-}
-
-static void
-emit_mov(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOV", dst, src );
-   x86_mov( func, dst, src );
-}
-
-static void
-emit_movaps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOVAPS", dst, src );
-   sse_movaps( func, dst, src );
-}
-
-static void
-emit_movss(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOVSS", dst, src );
-   sse_movss( func, dst, src );
-}
-
-static void
-emit_movups(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MOVUPS", dst, src );
-   sse_movups( func, dst, src );
-}
-
-static void
-emit_mulps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "MULPS", dst, src );
-   sse_mulps( func, dst, src );
-}
-
-static void
-emit_or(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "OR", dst, src );
-   x86_or( func, dst, src );
-}
-
-static void
-emit_orps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "ORPS", dst, src );
-   sse_orps( func, dst, src );
-}
-
-static void
-emit_pmovmskb(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "PMOVMSKB", dst, src );
-   sse_pmovmskb( func, dst, src );
-}
-
-static void
-emit_pop(
-   struct x86_function  *func,
-   struct x86_reg       dst )
-{
-   DUMP_R( "POP", dst );
-   x86_pop( func, dst );
-}
-
-static void
-emit_push(
-   struct x86_function  *func,
-   struct x86_reg       dst )
-{
-   DUMP_R( "PUSH", dst );
-   x86_push( func, dst );
-}
-
-static void
-emit_rcpps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "RCPPS", dst, src );
-   sse2_rcpps( func, dst, src );
-}
 
 #ifdef WIN32
 static void
@@ -509,7 +210,6 @@ emit_retw(
    struct x86_function  *func,
    unsigned             size )
 {
-   DUMP_I( "RET", size );
    x86_retw( func, size );
 }
 #else
@@ -517,51 +217,10 @@ static void
 emit_ret(
    struct x86_function  *func )
 {
-   DUMP( "RET" );
    x86_ret( func );
 }
 #endif
 
-static void
-emit_rsqrtps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "RSQRTPS", dst, src );
-   sse_rsqrtps( func, dst, src );
-}
-
-static void
-emit_shufps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src,
-   unsigned char        shuf )
-{
-   DUMP_RRI( "SHUFPS", dst, src, shuf );
-   sse_shufps( func, dst, src, shuf );
-}
-
-static void
-emit_subps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "SUBPS", dst, src );
-   sse_subps( func, dst, src );
-}
-
-static void
-emit_xorps(
-   struct x86_function  *func,
-   struct x86_reg       dst,
-   struct x86_reg       src )
-{
-   DUMP_RR( "XORPS", dst, src );
-   sse_xorps( func, dst, src );
-}
 
 /**
  * Data fetch helpers.
@@ -580,11 +239,11 @@ emit_const(
    unsigned vec,
    unsigned chan )
 {
-   emit_movss(
+   sse_movss(
       func,
       make_xmm( xmm ),
       get_const( vec, chan ) );
-   emit_shufps(
+   sse_shufps(
       func,
       make_xmm( xmm ),
       make_xmm( xmm ),
@@ -598,11 +257,11 @@ emit_immediate(
    unsigned vec,
    unsigned chan )
 {
-   emit_movss(
+   sse_movss(
       func,
       make_xmm( xmm ),
       get_immediate( vec, chan ) );
-   emit_shufps(
+   sse_shufps(
       func,
       make_xmm( xmm ),
       make_xmm( xmm ),
@@ -623,7 +282,7 @@ emit_inputf(
    unsigned vec,
    unsigned chan )
 {
-   emit_movups(
+   sse_movups(
       func,
       make_xmm( xmm ),
       get_input( vec, chan ) );
@@ -642,7 +301,7 @@ emit_output(
    unsigned vec,
    unsigned chan )
 {
-   emit_movups(
+   sse_movups(
       func,
       get_output( vec, chan ),
       make_xmm( xmm ) );
@@ -661,7 +320,7 @@ emit_tempf(
    unsigned vec,
    unsigned chan )
 {
-   emit_movaps(
+   sse_movaps(
       func,
       make_xmm( xmm ),
       get_temp( vec, chan ) );
@@ -682,11 +341,11 @@ emit_coef(
    unsigned chan,
    unsigned member )
 {
-   emit_movss(
+   sse_movss(
       func,
       make_xmm( xmm ),
       get_coef( vec, chan, member ) );
-   emit_shufps(
+   sse_shufps(
       func,
       make_xmm( xmm ),
       make_xmm( xmm ),
@@ -704,7 +363,7 @@ emit_inputs(
    unsigned vec,
    unsigned chan )
 {
-   emit_movups(
+   sse_movups(
       func,
       get_input( vec, chan ),
       make_xmm( xmm ) );
@@ -717,7 +376,7 @@ emit_temps(
    unsigned vec,
    unsigned chan )
 {
-   emit_movaps(
+   sse_movaps(
       func,
       get_temp( vec, chan ),
       make_xmm( xmm ) );
@@ -794,39 +453,39 @@ static void
 emit_push_gp(
    struct x86_function *func )
 {
-   emit_push(
+   x86_push(
       func,
       get_const_base() );
-   emit_push(
+   x86_push(
       func,
       get_input_base() );
-   emit_push(
+   x86_push(
       func,
       get_output_base() );
 
    /* It is important on non-win32 platforms that temp base is pushed last.
     */
-   emit_push(
+   x86_push(
       func,
       get_temp_base() );
 }
 
 static void
-emit_pop_gp(
+x86_pop_gp(
    struct x86_function *func )
 {
    /* Restore GP registers in a reverse order.
     */
-   emit_pop(
+   x86_pop(
       func,
       get_temp_base() );
-   emit_pop(
+   x86_pop(
       func,
       get_output_base() );
-   emit_pop(
+   x86_pop(
       func,
       get_input_base() );
-   emit_pop(
+   x86_pop(
       func,
       get_const_base() );
 }
@@ -837,7 +496,7 @@ emit_func_call_dst(
    unsigned xmm_dst,
    void (*code)() )
 {
-   emit_movaps(
+   sse_movaps(
       func,
       get_temp( TEMP_R0, 0 ),
       make_xmm( xmm_dst ) );
@@ -846,19 +505,22 @@ emit_func_call_dst(
       func );
 
 #ifdef WIN32
-   emit_push(
+   x86_push(
       func,
       get_temp( TEMP_R0, 0 ) );
 #endif
 
-   emit_call(
-      func,
-      code );
+   {
+      struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+
+      x86_mov_reg_imm( func, ecx, (unsigned long) code );
+      x86_call( func, ecx );
+   }
 
-   emit_pop_gp(
+   x86_pop_gp(
       func );
 
-   emit_movaps(
+   sse_movaps(
       func,
       make_xmm( xmm_dst ),
       get_temp( TEMP_R0, 0 ) );
@@ -871,7 +533,7 @@ emit_func_call_dst_src(
    unsigned xmm_src,
    void (*code)() )
 {
-   emit_movaps(
+   sse_movaps(
       func,
       get_temp( TEMP_R0, 1 ),
       make_xmm( xmm_src ) );
@@ -891,7 +553,7 @@ emit_abs(
    struct x86_function *func,
    unsigned xmm )
 {
-   emit_andps(
+   sse_andps(
       func,
       make_xmm( xmm ),
       get_temp(
@@ -905,7 +567,7 @@ emit_add(
    unsigned xmm_dst,
    unsigned xmm_src )
 {
-   emit_addps(
+   sse_addps(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
@@ -916,17 +578,15 @@ cos4f(
    float *store )
 {
 #ifdef WIN32
-   store[0] = (float) cos( (double) store[0] );
-   store[1] = (float) cos( (double) store[1] );
-   store[2] = (float) cos( (double) store[2] );
-   store[3] = (float) cos( (double) store[3] );
+   const unsigned X = 0;
 #else
    const unsigned X = TEMP_R0 * 16;
+#endif
+
    store[X + 0] = cosf( store[X + 0] );
    store[X + 1] = cosf( store[X + 1] );
    store[X + 2] = cosf( store[X + 2] );
    store[X + 3] = cosf( store[X + 3] );
-#endif
 }
 
 static void
@@ -945,17 +605,14 @@ ex24f(
    float *store )
 {
 #ifdef WIN32
-   store[0] = (float) pow( 2.0, (double) store[0] );
-   store[1] = (float) pow( 2.0, (double) store[1] );
-   store[2] = (float) pow( 2.0, (double) store[2] );
-   store[3] = (float) pow( 2.0, (double) store[3] );
+   const unsigned X = 0;
 #else
    const unsigned X = TEMP_R0 * 16;
+#endif
    store[X + 0] = powf( 2.0f, store[X + 0] );
    store[X + 1] = powf( 2.0f, store[X + 1] );
    store[X + 2] = powf( 2.0f, store[X + 2] );
    store[X + 3] = powf( 2.0f, store[X + 3] );
-#endif
 }
 
 static void
@@ -974,7 +631,7 @@ emit_f2it(
    struct x86_function *func,
    unsigned xmm )
 {
-   emit_cvttps2dq(
+   sse2_cvttps2dq(
       func,
       make_xmm( xmm ),
       make_xmm( xmm ) );
@@ -989,10 +646,10 @@ flr4f(
 #else
    const unsigned X = TEMP_R0 * 16;
 #endif
-   store[X + 0] = (float) floor( (double) store[X + 0] );
-   store[X + 1] = (float) floor( (double) store[X + 1] );
-   store[X + 2] = (float) floor( (double) store[X + 2] );
-   store[X + 3] = (float) floor( (double) store[X + 3] );
+   store[X + 0] = floorf( store[X + 0] );
+   store[X + 1] = floorf( store[X + 1] );
+   store[X + 2] = floorf( store[X + 2] );
+   store[X + 3] = floorf( store[X + 3] );
 }
 
 static void
@@ -1015,10 +672,10 @@ frc4f(
 #else
    const unsigned X = TEMP_R0 * 16;
 #endif
-   store[X + 0] -= (float) floor( (double) store[X + 0] );
-   store[X + 1] -= (float) floor( (double) store[X + 1] );
-   store[X + 2] -= (float) floor( (double) store[X + 2] );
-   store[X + 3] -= (float) floor( (double) store[X + 3] );
+   store[X + 0] -= floorf( store[X + 0] );
+   store[X + 1] -= floorf( store[X + 1] );
+   store[X + 2] -= floorf( store[X + 2] );
+   store[X + 3] -= floorf( store[X + 3] );
 }
 
 static void
@@ -1064,7 +721,7 @@ emit_MOV(
    unsigned xmm_dst,
    unsigned xmm_src )
 {
-   emit_movups(
+   sse_movups(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
@@ -1075,7 +732,7 @@ emit_mul (struct x86_function *func,
           unsigned xmm_dst,
           unsigned xmm_src)
 {
-   emit_mulps(
+   sse_mulps(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
@@ -1086,7 +743,7 @@ emit_neg(
    struct x86_function *func,
    unsigned xmm )
 {
-   emit_xorps(
+   sse_xorps(
       func,
       make_xmm( xmm ),
       get_temp(
@@ -1099,17 +756,14 @@ pow4f(
    float *store )
 {
 #ifdef WIN32
-   store[0] = (float) pow( (double) store[0], (double) store[4] );
-   store[1] = (float) pow( (double) store[1], (double) store[5] );
-   store[2] = (float) pow( (double) store[2], (double) store[6] );
-   store[3] = (float) pow( (double) store[3], (double) store[7] );
+   const unsigned X = 0;
 #else
    const unsigned X = TEMP_R0 * 16;
+#endif
    store[X + 0] = powf( store[X + 0], store[X + 4] );
    store[X + 1] = powf( store[X + 1], store[X + 5] );
    store[X + 2] = powf( store[X + 2], store[X + 6] );
    store[X + 3] = powf( store[X + 3], store[X + 7] );
-#endif
 }
 
 static void
@@ -1131,22 +785,80 @@ emit_rcp (
    unsigned xmm_dst,
    unsigned xmm_src )
 {
-   emit_rcpps(
+   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+    * good enough.  Need to either emit a proper divide or use the
+    * iterative technique described below in emit_rsqrt().
+    */
+   sse2_rcpps(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
 }
 
+#if HIGH_PRECISION
+static void XSTDCALL
+rsqrt4f(
+   float *store )
+{
+#ifdef WIN32
+   const unsigned X = 0;
+#else
+   const unsigned X = TEMP_R0 * 16;
+#endif
+   store[X + 0] = 1.0F / sqrtf( store[X + 0] );
+   store[X + 1] = 1.0F / sqrtf( store[X + 1] );
+   store[X + 2] = 1.0F / sqrtf( store[X + 2] );
+   store[X + 3] = 1.0F / sqrtf( store[X + 3] );
+}
+#endif
+
 static void
 emit_rsqrt(
    struct x86_function *func,
    unsigned xmm_dst,
    unsigned xmm_src )
 {
+#if HIGH_PRECISION
+#if 1
+   emit_func_call_dst_src(
+      func,
+      xmm_dst,
+      xmm_src,
+      rsqrt4f );
+#else
+   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
+    * implementations, it is possible to improve their precision at
+    * fairly low cost, using a Newton-Raphson step, as below:
+    * 
+    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
+    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
+    *
+    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
+    */
+   /* This is some code that would do the above for a scalar 'a'.  We
+    * obviously are interested in a vector version:
+    *
+    * movss   xmm3, a;
+    * movss   xmm1, half;
+    * movss   xmm2, three;
+    * rsqrtss xmm0, xmm3;
+    * mulss   xmm3, xmm0;
+    * mulss   xmm1, xmm0;
+    * mulss   xmm3, xmm0;
+    * subss   xmm2, xmm3;
+    * mulss   xmm1, xmm2;
+    * movss   x,    xmm1;
+    */
+#endif
+#else
+   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
+    * good enough.
+    */
-   emit_rsqrtps(
+   sse_rsqrtps(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
+#endif
 }
 
 static void
@@ -1154,7 +866,7 @@ emit_setsign(
    struct x86_function *func,
    unsigned xmm )
 {
-   emit_orps(
+   sse_orps(
       func,
       make_xmm( xmm ),
       get_temp(
@@ -1167,17 +879,14 @@ sin4f(
    float *store )
 {
 #ifdef WIN32
-   store[0] = (float) sin( (double) store[0] );
-   store[1] = (float) sin( (double) store[1] );
-   store[2] = (float) sin( (double) store[2] );
-   store[3] = (float) sin( (double) store[3] );
+   const unsigned X = 0;
 #else
    const unsigned X = TEMP_R0 * 16;
+#endif
    store[X + 0] = sinf( store[X + 0] );
    store[X + 1] = sinf( store[X + 1] );
    store[X + 2] = sinf( store[X + 2] );
    store[X + 3] = sinf( store[X + 3] );
-#endif
 }
 
 static void
@@ -1196,7 +905,7 @@ emit_sub(
    unsigned xmm_dst,
    unsigned xmm_src )
 {
-   emit_subps(
+   sse_subps(
       func,
       make_xmm( xmm_dst ),
       make_xmm( xmm_src ) );
@@ -1405,16 +1114,16 @@ emit_kil(
       }
    }
 
-   emit_push(
+   x86_push(
       func,
       x86_make_reg( file_REG32, reg_AX ) );
-   emit_push(
+   x86_push(
       func,
       x86_make_reg( file_REG32, reg_DX ) );
 
    FOR_EACH_CHANNEL( chan_index ) {
       if( uniquemask & (1 << chan_index) ) {
-         emit_cmpps(
+         sse_cmpps(
             func,
             make_xmm( registers[chan_index] ),
             get_temp(
@@ -1423,17 +1132,17 @@ emit_kil(
             cc_LessThan );
 
          if( chan_index == firstchan ) {
-            emit_pmovmskb(
+            sse_pmovmskb(
                func,
                x86_make_reg( file_REG32, reg_AX ),
                make_xmm( registers[chan_index] ) );
          }
          else {
-            emit_pmovmskb(
+            sse_pmovmskb(
                func,
                x86_make_reg( file_REG32, reg_DX ),
                make_xmm( registers[chan_index] ) );
-            emit_or(
+            x86_or(
                func,
                x86_make_reg( file_REG32, reg_AX ),
                x86_make_reg( file_REG32, reg_DX ) );
@@ -1441,17 +1150,17 @@ emit_kil(
       }
    }
 
-   emit_or(
+   x86_or(
       func,
       get_temp(
          TGSI_EXEC_TEMP_KILMASK_I,
          TGSI_EXEC_TEMP_KILMASK_C ),
       x86_make_reg( file_REG32, reg_AX ) );
 
-   emit_pop(
+   x86_pop(
       func,
       x86_make_reg( file_REG32, reg_DX ) );
-   emit_pop(
+   x86_pop(
       func,
       x86_make_reg( file_REG32, reg_AX ) );
 }
@@ -1467,12 +1176,12 @@ emit_setcc(
    FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
       FETCH( func, *inst, 0, 0, chan_index );
       FETCH( func, *inst, 1, 1, chan_index );
-      emit_cmpps(
+      sse_cmpps(
          func,
          make_xmm( 0 ),
          make_xmm( 1 ),
          cc );
-      emit_andps(
+      sse_andps(
          func,
          make_xmm( 0 ),
          get_temp(
@@ -1493,22 +1202,22 @@ emit_cmp(
       FETCH( func, *inst, 0, 0, chan_index );
       FETCH( func, *inst, 1, 1, chan_index );
       FETCH( func, *inst, 2, 2, chan_index );
-      emit_cmpps(
+      sse_cmpps(
          func,
          make_xmm( 0 ),
          get_temp(
             TGSI_EXEC_TEMP_00000000_I,
             TGSI_EXEC_TEMP_00000000_C ),
          cc_LessThan );
-      emit_andps(
+      sse_andps(
          func,
          make_xmm( 1 ),
          make_xmm( 0 ) );
-      emit_andnps(
+      sse_andnps(
          func,
          make_xmm( 0 ),
          make_xmm( 2 ) );
-      emit_orps(
+      sse_orps(
          func,
          make_xmm( 0 ),
          make_xmm( 1 ) );
@@ -1559,7 +1268,7 @@ emit_instruction(
           IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
             FETCH( func, *inst, 0, 0, CHAN_X );
-            emit_maxps(
+            sse_maxps(
                func,
                make_xmm( 0 ),
                get_temp(
@@ -1568,21 +1277,26 @@ emit_instruction(
             STORE( func, *inst, 0, 0, CHAN_Y );
          }
          if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+            /* XMM[1] = SrcReg[0].yyyy */
             FETCH( func, *inst, 1, 0, CHAN_Y );
-            emit_maxps(
+            /* XMM[1] = max(XMM[1], 0) */
+            sse_maxps(
                func,
                make_xmm( 1 ),
                get_temp(
                   TGSI_EXEC_TEMP_00000000_I,
                   TGSI_EXEC_TEMP_00000000_C ) );
+            /* XMM[2] = SrcReg[0].wwww */
             FETCH( func, *inst, 2, 0, CHAN_W );
-            emit_minps(
+            /* XMM[2] = min(XMM[2], 128.0) */
+            sse_minps(
                func,
                make_xmm( 2 ),
                get_temp(
                   TGSI_EXEC_TEMP_128_I,
                   TGSI_EXEC_TEMP_128_C ) );
-            emit_maxps(
+            /* XMM[2] = max(XMM[2], -128.0) */
+            sse_maxps(
                func,
                make_xmm( 2 ),
                get_temp(
@@ -1590,16 +1304,16 @@ emit_instruction(
                   TGSI_EXEC_TEMP_MINUS_128_C ) );
             emit_pow( func, 1, 2 );
             FETCH( func, *inst, 0, 0, CHAN_X );
-            emit_xorps(
+            sse_xorps(
                func,
                make_xmm( 2 ),
                make_xmm( 2 ) );
-            emit_cmpps(
+            sse_cmpps(
                func,
                make_xmm( 2 ),
                make_xmm( 0 ),
                cc_LessThanEqual );
-            emit_andps(
+            sse_andps(
                func,
                make_xmm( 2 ),
                make_xmm( 1 ) );
@@ -1721,7 +1435,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
          FETCH( func, *inst, 1, 1, chan_index );
-         emit_minps(
+         sse_minps(
             func,
             make_xmm( 0 ),
             make_xmm( 1 ) );
@@ -1733,7 +1447,7 @@ emit_instruction(
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
          FETCH( func, *inst, 1, 1, chan_index );
-         emit_maxps(
+         sse_maxps(
             func,
             make_xmm( 0 ),
             make_xmm( 1 ) );
@@ -2332,7 +2046,7 @@ emit_declaration(
  */
 unsigned
 tgsi_emit_sse2(
-   struct tgsi_token *tokens,
+   const struct tgsi_token *tokens,
    struct x86_function *func,
    float (*immediates)[4])
 {
@@ -2341,8 +2055,6 @@ tgsi_emit_sse2(
    unsigned ok = 1;
    uint num_immediates = 0;
 
-   DUMP_START();
-
    func->csr = func->store;
 
    tgsi_parse_init( &parse, tokens );
@@ -2352,24 +2064,24 @@ tgsi_emit_sse2(
     */
    if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
       /* DECLARATION phase, do not load output argument. */
-      emit_mov(
+      x86_mov(
          func,
          get_input_base(),
          get_argument( 0 ) );
       /* skipping outputs argument here */
-      emit_mov(
+      x86_mov(
          func,
          get_const_base(),
          get_argument( 2 ) );
-      emit_mov(
+      x86_mov(
          func,
          get_temp_base(),
          get_argument( 3 ) );
-      emit_mov(
+      x86_mov(
          func,
          get_coef_base(),
          get_argument( 4 ) );
-      emit_mov(
+      x86_mov(
          func,
          get_immediate_base(),
          get_argument( 5 ) );
@@ -2377,23 +2089,23 @@ tgsi_emit_sse2(
    else {
       assert(parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX);
 
-      emit_mov(
+      x86_mov(
          func,
          get_input_base(),
          get_argument( 0 ) );
-      emit_mov(
+      x86_mov(
          func,
          get_output_base(),
          get_argument( 1 ) );
-      emit_mov(
+      x86_mov(
          func,
          get_const_base(),
          get_argument( 2 ) );
-      emit_mov(
+      x86_mov(
          func,
          get_temp_base(),
          get_argument( 3 ) );
-      emit_mov(
+      x86_mov(
          func,
          get_immediate_base(),
          get_argument( 4 ) );
@@ -2416,7 +2128,7 @@ tgsi_emit_sse2(
             if( !instruction_phase ) {
                /* INSTRUCTION phase, overwrite coeff with output. */
                instruction_phase = TRUE;
-               emit_mov(
+               x86_mov(
                   func,
                   get_output_base(),
                   get_argument( 1 ) );
@@ -2428,8 +2140,10 @@ tgsi_emit_sse2(
             &parse.FullToken.FullInstruction );
 
 	 if (!ok) {
-	    debug_printf("failed to translate tgsi opcode %d to SSE\n", 
-			 parse.FullToken.FullInstruction.Instruction.Opcode );
+	    debug_printf("failed to translate tgsi opcode %d to SSE (%s)\n", 
+			 parse.FullToken.FullInstruction.Instruction.Opcode,
+                         parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX ?
+                         "vertex shader" : "fragment shader");
 	 }
          break;
 
@@ -2464,8 +2178,6 @@ tgsi_emit_sse2(
 
    tgsi_parse_free( &parse );
 
-   DUMP_END();
-
    return ok;
 }
 
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
index d56bf7f98a..063287dc5e 100755
--- a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
@@ -10,7 +10,7 @@ struct x86_function;
 
 unsigned
 tgsi_emit_sse2(
-   struct tgsi_token *tokens,
+   const struct tgsi_token *tokens,
    struct x86_function *function,
    float (*immediates)[4]
  );
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_dump.c b/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
index ff6a2c4194..26bfc2051f 100644
--- a/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
@@ -25,8 +25,6 @@
  * 
  **************************************************************************/
 
-#include <stdio.h> 
-
 #include "pipe/p_debug.h"
 #include "pipe/p_util.h"
 #include "pipe/p_shader_tokens.h"
@@ -35,196 +33,28 @@
 #include "tgsi_parse.h"
 #include "tgsi_build.h"
 
-struct gen_dump
-{
-   unsigned tabs;
-   void  (* write)(
-               struct gen_dump   *dump,
-               const void        *data,
-               unsigned          size );
-};
-
-struct text_dump
-{
-   struct gen_dump   base;
-   char              *text;
-   unsigned          length;
-   unsigned          capacity;
-};
-
-static void
-_text_dump_write(
-   struct gen_dump   *dump,
-   const void        *data,
-   unsigned          size )
-{
-   struct text_dump  *td = (struct text_dump *) dump;
-   unsigned          new_length = td->length + size;
-
-   if( new_length >= td->capacity ) {
-      unsigned new_capacity = td->capacity;
-
-      do {
-         if( new_capacity == 0 ) {
-            new_capacity = 256;
-         }
-         else {
-            new_capacity *= 2;
-         }
-      } while( new_length >= new_capacity );
-      td->text = (char *) REALLOC(
-         td->text,
-         td->capacity,
-         new_capacity );
-      td->capacity = new_capacity;
-   }
-   memcpy(
-      &td->text[td->length],
-      data,
-      size );
-   td->length = new_length;
-   td->text[td->length] = '\0';
-}
-
-struct file_dump
-{
-   struct gen_dump   base;
-   FILE              *file;
-};
-
-static void
-_file_dump_write(
-   struct gen_dump   *dump,
-   const void        *data,
-   unsigned          size )
-{
-   struct file_dump  *fd = (struct file_dump *) dump;
-
-#if 0
-   fwrite( data, 1, size, fd->file );
-#else
-   {
-      unsigned i;
-
-      for (i = 0; i < size; i++ ) {
-         fprintf( fd->file, "%c", ((const char *) data)[i] );
-      }
-   }
-#endif
-}
-
-static void
-gen_dump_str(
-   struct gen_dump   *dump,
-   const char        *str )
-{
-   unsigned i;
-   size_t   len = strlen( str );
-
-   for (i = 0; i < len; i++) {
-      dump->write( dump, &str[i], 1 );
-      if (str[i] == '\n') {
-         unsigned i;
-
-         for (i = 0; i < dump->tabs; i++) {
-            dump->write( dump, "    ", 4 );
-         }
-      }
-   }
-}
-
-static void
-gen_dump_chr(
-   struct gen_dump   *dump,
-   const char        chr )
-{
-   dump->write( dump, &chr, 1 );
-}
-
-static void
-gen_dump_uix(
-   struct gen_dump   *dump,
-   const unsigned    ui )
-{
-   char  str[36];
-
-   util_snprintf( str, sizeof(str), "0x%x", ui );
-   gen_dump_str( dump, str );
-}
-
-static void
-gen_dump_uid(
-   struct gen_dump   *dump,
-   const unsigned    ui )
-{
-   char  str[16];
-
-   util_snprintf( str, sizeof(str), "%u", ui );
-   gen_dump_str( dump, str );
-}
-
-static void
-gen_dump_sid(
-   struct gen_dump   *dump,
-   const int         si )
-{
-   char  str[16];
-
-   util_snprintf( str, sizeof(str), "%d", si );
-   gen_dump_str( dump, str );
-}
-
 static void
-gen_dump_flt(
-   struct gen_dump   *dump,
-   const float       flt )
-{
-   char  str[48];
-
-   util_snprintf( str, sizeof(str), "%10.4f", flt );
-   gen_dump_str( dump, str );
-}
-
-static void
-gen_dump_enum(
-   struct gen_dump   *dump,
+dump_enum(
    const unsigned    e,
    const char        **enums,
    const unsigned    enums_count )
 {
    if (e >= enums_count) {
-      gen_dump_uid( dump, e );
+      debug_printf( "%u", e );
    }
    else {
-      gen_dump_str( dump, enums[e] );
+      debug_printf( "%s", enums[e] );
    }
 }
 
-static void
-gen_dump_tab(
-   struct gen_dump   *dump )
-{
-   ++dump->tabs;
-}
-
-static void
-gen_dump_untab(
-   struct gen_dump   *dump )
-{
-   assert( dump->tabs > 0 );
-
-   --dump->tabs;
-}
-
-#define TXT(S)          gen_dump_str( dump, S )
-#define CHR(C)          gen_dump_chr( dump, C )
-#define UIX(I)          gen_dump_uix( dump, I )
-#define UID(I)          gen_dump_uid( dump, I )
-#define SID(I)          gen_dump_sid( dump, I )
-#define FLT(F)          gen_dump_flt( dump, F )
-#define TAB()           gen_dump_tab( dump )
-#define UNT()           gen_dump_untab( dump )
-#define ENM(E,ENUMS)    gen_dump_enum( dump, E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
+#define EOL()           debug_printf( "\n" )
+#define TXT(S)          debug_printf( "%s", S )
+#define CHR(C)          debug_printf( "%c", C )
+#define UIX(I)          debug_printf( "0x%x", I )
+#define UID(I)          debug_printf( "%u", I )
+#define SID(I)          debug_printf( "%d", I )
+#define FLT(F)          debug_printf( "%10.4f", F )
+#define ENM(E,ENUMS)    dump_enum( E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
 
 static const char *TGSI_PROCESSOR_TYPES[] =
 {
@@ -711,7 +541,6 @@ static const char *TGSI_MODULATES[] =
 
 static void
 dump_declaration_short(
-   struct gen_dump               *dump,
    struct tgsi_full_declaration  *decl )
 {
    TXT( "\nDCL " );
@@ -765,7 +594,6 @@ dump_declaration_short(
 
 static void
 dump_declaration_verbose(
-   struct gen_dump               *dump,
    struct tgsi_full_declaration  *decl,
    unsigned                      ignored,
    unsigned                      deflt,
@@ -803,7 +631,7 @@ dump_declaration_verbose(
       UIX( decl->Declaration.Padding );
    }
 
-   CHR( '\n' );
+   EOL();
    switch( decl->Declaration.Declare ) {
    case TGSI_DECLARE_RANGE:
       TXT( "\nFirst: " );
@@ -822,7 +650,7 @@ dump_declaration_verbose(
    }
 
    if( decl->Declaration.Interpolate ) {
-      CHR( '\n' );
+      EOL();
       TXT( "\nInterpolate: " );
       ENM( decl->Interpolation.Interpolate, TGSI_INTERPOLATES );
       if( ignored ) {
@@ -832,7 +660,7 @@ dump_declaration_verbose(
    }
 
    if( decl->Declaration.Semantic ) {
-      CHR( '\n' );
+      EOL();
       TXT( "\nSemanticName : " );
       ENM( decl->Semantic.SemanticName, TGSI_SEMANTICS );
       TXT( "\nSemanticIndex: " );
@@ -846,7 +674,6 @@ dump_declaration_verbose(
 
 static void
 dump_immediate_short(
-   struct gen_dump            *dump,
    struct tgsi_full_immediate *imm )
 {
    unsigned i;
@@ -874,7 +701,6 @@ dump_immediate_short(
 
 static void
 dump_immediate_verbose(
-   struct gen_dump            *dump,
    struct tgsi_full_immediate *imm,
    unsigned                   ignored )
 {
@@ -888,7 +714,7 @@ dump_immediate_verbose(
    }
 
    for( i = 0; i < imm->Immediate.Size - 1; i++ ) {
-      CHR( '\n' );
+      EOL();
       switch( imm->Immediate.DataType ) {
       case TGSI_IMM_FLOAT32:
          TXT( "\nFloat: " );
@@ -903,14 +729,13 @@ dump_immediate_verbose(
 
 static void
 dump_instruction_short(
-   struct gen_dump               *dump,
    struct tgsi_full_instruction  *inst,
    unsigned                      instno )
 {
    unsigned i;
    boolean  first_reg = TRUE;
 
-   CHR( '\n' );
+   EOL();
    UID( instno );
    CHR( ':' );
    ENM( inst->Instruction.Opcode, TGSI_OPCODES_SHORT );
@@ -1042,7 +867,6 @@ dump_instruction_short(
 
 static void
 dump_instruction_verbose(
-   struct gen_dump               *dump,
    struct tgsi_full_instruction  *inst,
    unsigned                      ignored,
    unsigned                      deflt,
@@ -1070,7 +894,7 @@ dump_instruction_verbose(
    }
 
    if( deflt || tgsi_compare_instruction_ext_nv( inst->InstructionExtNv, fi->InstructionExtNv ) ) {
-      CHR( '\n' );
+      EOL();
       TXT( "\nType          : " );
       ENM( inst->InstructionExtNv.Type, TGSI_INSTRUCTION_EXTS );
       if( deflt || fi->InstructionExtNv.Precision != inst->InstructionExtNv.Precision ) {
@@ -1124,7 +948,7 @@ dump_instruction_verbose(
    }
 
    if( deflt || tgsi_compare_instruction_ext_label( inst->InstructionExtLabel, fi->InstructionExtLabel ) ) {
-      CHR( '\n' );
+      EOL();
       TXT( "\nType    : " );
       ENM( inst->InstructionExtLabel.Type, TGSI_INSTRUCTION_EXTS );
       if( deflt || fi->InstructionExtLabel.Label != inst->InstructionExtLabel.Label ) {
@@ -1142,7 +966,7 @@ dump_instruction_verbose(
    }
 
    if( deflt || tgsi_compare_instruction_ext_texture( inst->InstructionExtTexture, fi->InstructionExtTexture ) ) {
-      CHR( '\n' );
+      EOL();
       TXT( "\nType    : " );
       ENM( inst->InstructionExtTexture.Type, TGSI_INSTRUCTION_EXTS );
       if( deflt || fi->InstructionExtTexture.Texture != inst->InstructionExtTexture.Texture ) {
@@ -1163,7 +987,7 @@ dump_instruction_verbose(
       struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
       struct tgsi_full_dst_register *fd = &fi->FullDstRegisters[i];
 
-      CHR( '\n' );
+      EOL();
       TXT( "\nFile     : " );
       ENM( dst->DstRegister.File, TGSI_FILES );
       if( deflt || fd->DstRegister.WriteMask != dst->DstRegister.WriteMask ) {
@@ -1194,7 +1018,7 @@ dump_instruction_verbose(
       }
 
       if( deflt || tgsi_compare_dst_register_ext_concode( dst->DstRegisterExtConcode, fd->DstRegisterExtConcode ) ) {
-         CHR( '\n' );
+         EOL();
          TXT( "\nType        : " );
          ENM( dst->DstRegisterExtConcode.Type, TGSI_DST_REGISTER_EXTS );
          if( deflt || fd->DstRegisterExtConcode.CondMask != dst->DstRegisterExtConcode.CondMask ) {
@@ -1232,7 +1056,7 @@ dump_instruction_verbose(
       }
 
       if( deflt || tgsi_compare_dst_register_ext_modulate( dst->DstRegisterExtModulate, fd->DstRegisterExtModulate ) ) {
-         CHR( '\n' );
+         EOL();
          TXT( "\nType    : " );
          ENM( dst->DstRegisterExtModulate.Type, TGSI_DST_REGISTER_EXTS );
          if( deflt || fd->DstRegisterExtModulate.Modulate != dst->DstRegisterExtModulate.Modulate ) {
@@ -1254,7 +1078,7 @@ dump_instruction_verbose(
       struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
       struct tgsi_full_src_register *fs = &fi->FullSrcRegisters[i];
 
-      CHR( '\n' );
+      EOL();
       TXT( "\nFile     : ");
       ENM( src->SrcRegister.File, TGSI_FILES );
       if( deflt || fs->SrcRegister.SwizzleX != src->SrcRegister.SwizzleX ) {
@@ -1299,7 +1123,7 @@ dump_instruction_verbose(
       }
 
       if( deflt || tgsi_compare_src_register_ext_swz( src->SrcRegisterExtSwz, fs->SrcRegisterExtSwz ) ) {
-         CHR( '\n' );
+         EOL();
          TXT( "\nType       : " );
          ENM( src->SrcRegisterExtSwz.Type, TGSI_SRC_REGISTER_EXTS );
          if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleX != src->SrcRegisterExtSwz.ExtSwizzleX ) {
@@ -1345,7 +1169,7 @@ dump_instruction_verbose(
       }
 
       if( deflt || tgsi_compare_src_register_ext_mod( src->SrcRegisterExtMod, fs->SrcRegisterExtMod ) ) {
-         CHR( '\n' );
+         EOL();
          TXT( "\nType     : " );
          ENM( src->SrcRegisterExtMod.Type, TGSI_SRC_REGISTER_EXTS );
          if( deflt || fs->SrcRegisterExtMod.Complement != src->SrcRegisterExtMod.Complement ) {
@@ -1380,9 +1204,8 @@ dump_instruction_verbose(
    }
 }
 
-static void
-dump_gen(
-   struct gen_dump         *dump,
+void
+tgsi_dump(
    const struct tgsi_token *tokens,
    unsigned                flags )
 {
@@ -1394,16 +1217,16 @@ dump_gen(
    unsigned deflt = !(flags & TGSI_DUMP_NO_DEFAULT);
    unsigned instno = 0;
 
-   dump->tabs = 0;
-
-   /* sanity check */
+   /* sanity checks */
    assert(strcmp(TGSI_OPCODES[TGSI_OPCODE_CONT], "OPCODE_CONT") == 0);
+   assert(strcmp(TGSI_OPCODES[TGSI_OPCODE_END], "OPCODE_END") == 0);
+   assert(strcmp(TGSI_OPCODES_SHORT[TGSI_OPCODE_END], "END") == 0);
 
    tgsi_parse_init( &parse, tokens );
 
    TXT( "tgsi-dump begin -----------------" );
 
-   CHR( '\n' );
+   EOL();
    ENM( parse.FullHeader.Processor.Processor, TGSI_PROCESSOR_TYPES_SHORT );
    UID( parse.FullVersion.Version.MajorVersion );
    CHR( '.' );
@@ -1414,7 +1237,7 @@ dump_gen(
       UID( parse.FullVersion.Version.MajorVersion );
       TXT( "\nMinorVersion: " );
       UID( parse.FullVersion.Version.MinorVersion );
-      CHR( '\n' );
+      EOL();
 
       TXT( "\nHeaderSize: " );
       UID( parse.FullHeader.Header.HeaderSize );
@@ -1422,7 +1245,7 @@ dump_gen(
       UID( parse.FullHeader.Header.BodySize );
       TXT( "\nProcessor : " );
       ENM( parse.FullHeader.Processor.Processor, TGSI_PROCESSOR_TYPES );
-      CHR( '\n' );
+      EOL();
    }
 
    fi = tgsi_default_full_instruction();
@@ -1434,19 +1257,16 @@ dump_gen(
       switch( parse.FullToken.Token.Type ) {
       case TGSI_TOKEN_TYPE_DECLARATION:
          dump_declaration_short(
-            dump,
             &parse.FullToken.FullDeclaration );
          break;
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
          dump_immediate_short(
-            dump,
             &parse.FullToken.FullImmediate );
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
          dump_instruction_short(
-            dump,
             &parse.FullToken.FullInstruction,
             instno );
          instno++;
@@ -1471,7 +1291,6 @@ dump_gen(
          switch( parse.FullToken.Token.Type ) {
          case TGSI_TOKEN_TYPE_DECLARATION:
             dump_declaration_verbose(
-               dump,
                &parse.FullToken.FullDeclaration,
                ignored,
                deflt,
@@ -1480,14 +1299,12 @@ dump_gen(
 
          case TGSI_TOKEN_TYPE_IMMEDIATE:
             dump_immediate_verbose(
-               dump,
                &parse.FullToken.FullImmediate,
                ignored );
             break;
 
          case TGSI_TOKEN_TYPE_INSTRUCTION:
             dump_instruction_verbose(
-               dump,
                &parse.FullToken.FullInstruction,
                ignored,
                deflt,
@@ -1498,7 +1315,7 @@ dump_gen(
             assert( 0 );
          }
 
-         CHR( '\n' );
+         EOL();
       }
    }
 
@@ -1506,86 +1323,3 @@ dump_gen(
 
    tgsi_parse_free( &parse );
 }
-
-
-static void
-sanity_checks(void)
-{
-   assert(strcmp(TGSI_OPCODES[TGSI_OPCODE_END], "OPCODE_END") == 0);
-   assert(strcmp(TGSI_OPCODES_SHORT[TGSI_OPCODE_END], "END") == 0);
-}
-
-
-void
-tgsi_dump(
-   const struct tgsi_token *tokens,
-   unsigned                flags )
-{
-   struct file_dump  dump;
-
-   sanity_checks();
-
-   dump.base.write = _file_dump_write;
-#if 0
-   {
-      static unsigned   counter = 0;
-      char              buffer[64];
-      sprintf( buffer, "tgsi-dump-%.4u.txt", counter++ );
-      dump.file = fopen( buffer, "wt" );
-   }
-#else
-   dump.file = stderr;
-#endif
-
-   dump_gen(
-      &dump.base,
-      tokens,
-      flags );
-
-#if 0
-   fclose( dump.file );
-#endif
-}
-
-void
-tgsi_dump_str(
-   char                    **str,
-   const struct tgsi_token *tokens,
-   unsigned                flags )
-{
-   struct text_dump  dump;
-
-   dump.base.write = _text_dump_write;
-   dump.text = NULL;
-   dump.length = 0;
-   dump.capacity = 0;
-
-   dump_gen(
-      &dump.base,
-      tokens,
-      flags );
-
-   *str = dump.text;
-}
-
-
-void tgsi_debug_dump( struct tgsi_token *tokens )
-{
-   char *str, *p;
-
-   tgsi_dump_str( &str, tokens, 0 );
-
-   p = str;
-   while (p != NULL)
-   {
-      char *end = strchr( p, '\n' );
-      if (end != NULL)
-      {
-         *end++ = '\0';
-      }
-      debug_printf( "%s\n", p );
-      p = end;
-   }
-
-   FREE( str );
-}
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_dump.h b/src/gallium/auxiliary/tgsi/util/tgsi_dump.h
index 51d79a0362..beb0155d56 100644
--- a/src/gallium/auxiliary/tgsi/util/tgsi_dump.h
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_dump.h
@@ -14,16 +14,6 @@ tgsi_dump(
    const struct tgsi_token *tokens,
    unsigned                flags );
 
-void
-tgsi_dump_str(
-   char                    **str,
-   const struct tgsi_token *tokens,
-   unsigned                flags );
-
-/* Dump to debug_printf()
- */
-void tgsi_debug_dump( struct tgsi_token *tokens );
-
 #if defined __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_parse.h b/src/gallium/auxiliary/tgsi/util/tgsi_parse.h
index a98e88e343..da0121c482 100644
--- a/src/gallium/auxiliary/tgsi/util/tgsi_parse.h
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_parse.h
@@ -1,6 +1,8 @@
 #if !defined TGSI_PARSE_H
 #define TGSI_PARSE_H
 
+#include "pipe/p_shader_tokens.h"
+
 #if defined __cplusplus
 extern "C" {
 #endif
diff --git a/src/gallium/auxiliary/translate/Makefile b/src/gallium/auxiliary/translate/Makefile
new file mode 100644
index 0000000000..39dfb0de30
--- /dev/null
+++ b/src/gallium/auxiliary/translate/Makefile
@@ -0,0 +1,14 @@
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = translate
+
+C_SOURCES = \
+	translate_generic.c \
+	translate_sse.c \
+	translate.c
+
+include ../../Makefile.template
+
+symlinks:
+
diff --git a/src/gallium/auxiliary/translate/SConscript b/src/gallium/auxiliary/translate/SConscript
new file mode 100644
index 0000000000..7608908915
--- /dev/null
+++ b/src/gallium/auxiliary/translate/SConscript
@@ -0,0 +1,11 @@
+Import('*')
+
+translate = env.ConvenienceLibrary(
+	target = 'translate',
+	source = [
+		'translate_generic.c',
+		'translate_sse.c',
+		'translate.c',
+	])
+
+auxiliaries.insert(0, translate)
diff --git a/src/gallium/auxiliary/draw/draw_debug.c b/src/gallium/auxiliary/translate/translate.c
index d6220b5f62..b04bc6eefd 100644
--- a/src/gallium/auxiliary/draw/draw_debug.c
+++ b/src/gallium/auxiliary/translate/translate.c
@@ -30,84 +30,19 @@
   *   Keith Whitwell <keith@tungstengraphics.com>
   */
 
-#include "draw_private.h"
-#include "draw_context.h"
+#include "pipe/p_util.h"
+#include "pipe/p_state.h"
+#include "translate.h"
 
-
-
-static void
-draw_prim_info(unsigned prim, unsigned *first, unsigned *incr)
-{
-   assert(prim >= PIPE_PRIM_POINTS);
-   assert(prim <= PIPE_PRIM_POLYGON);
-
-   switch (prim) {
-   case PIPE_PRIM_POINTS:
-      *first = 1;
-      *incr = 1;
-      break;
-   case PIPE_PRIM_LINES:
-      *first = 2;
-      *incr = 2;
-      break;
-   case PIPE_PRIM_LINE_STRIP:
-      *first = 2;
-      *incr = 1;
-      break;
-   case PIPE_PRIM_LINE_LOOP:
-      *first = 2;
-      *incr = 1;
-      break;
-   case PIPE_PRIM_TRIANGLES:
-      *first = 3;
-      *incr = 3;
-      break;
-   case PIPE_PRIM_TRIANGLE_STRIP:
-      *first = 3;
-      *incr = 1;
-      break;
-   case PIPE_PRIM_TRIANGLE_FAN:
-   case PIPE_PRIM_POLYGON:
-      *first = 3;
-      *incr = 1;
-      break;
-   case PIPE_PRIM_QUADS:
-      *first = 4;
-      *incr = 4;
-      break;
-   case PIPE_PRIM_QUAD_STRIP:
-      *first = 4;
-      *incr = 2;
-      break;
-   default:
-      assert(0);
-      *first = 1;
-      *incr = 1;
-      break;
-   }
-}
-
-
-unsigned 
-draw_trim_prim( unsigned mode, unsigned count )
+struct translate *translate_create( const struct translate_key *key )
 {
-   unsigned length, first, incr;
+   struct translate *translate = NULL;
 
-   draw_prim_info( mode, &first, &incr );
+#if defined(__i386__) || defined(__386__) || defined(i386)
+   translate = translate_sse2_create( key );
+   if (translate)
+      return translate;
+#endif
 
-   if (count < first)
-      length = 0;
-   else
-      length = count - (count - first) % incr; 
-
-   return length;
-}
-
-
-boolean
-draw_validate_prim( unsigned mode, unsigned count )
-{
-   return (count > 0 &&
-           count == draw_trim_prim( mode, count ));
+   return translate_generic_create( key );
 }
-
diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h
new file mode 100644
index 0000000000..d95d1ac4f3
--- /dev/null
+++ b/src/gallium/auxiliary/translate/translate.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2008 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/**
+ * Vertex fetch/store/convert code.  This functionality is used in two places:
+ * 1. Vertex fetch/convert - to grab vertex data from incoming vertex
+ *    arrays and convert to format needed by vertex shaders.
+ * 2. Vertex store/emit - to convert simple float[][4] vertex attributes
+ *    (which is the organization used throughout the draw/prim pipeline) to
+ *    hardware-specific formats and emit into hardware vertex buffers.
+ *
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+#ifndef _TRANSLATE_H
+#define _TRANSLATE_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_format.h"
+#include "pipe/p_state.h"
+
+struct translate_element /* describes one attribute: its input source/format and its output format/offset */
+{
+   enum pipe_format input_format;   /* pipe format of the incoming data */
+   unsigned input_buffer;           /* presumably the index `i` given to set_buffer() -- TODO confirm */
+   unsigned input_offset;           /* presumably a byte offset into the input vertex -- TODO confirm */
+
+   enum pipe_format output_format;  /* pipe format to convert the attribute to */
+   unsigned output_offset;          /* presumably a byte offset within each output vertex -- TODO confirm */
+};
+
+
+struct translate_key {   /* description of a translation; this is the argument to translate_create() */
+   unsigned output_stride;   /* presumably bytes between consecutive output vertices -- TODO confirm */
+   unsigned nr_elements;     /* presumably the number of entries used in element[] -- TODO confirm */
+   struct translate_element element[PIPE_MAX_ATTRIBS];
+};
+
+
+struct translate {   /* function-pointer interface returned by the translate_*_create() functions */
+   struct translate_key key;
+
+   void (*release)( struct translate * );   /* presumably destroys this translator -- TODO confirm in backends */
+
+   void (*set_buffer)( struct translate *,   /* supplies the data pointer and stride for buffer number i */
+		       unsigned i,
+		       const void *ptr,
+		       unsigned stride );
+
+   void (*run_elts)( struct translate *,   /* presumably translates the `count` vertices listed in elts[] -- TODO confirm */
+		     const unsigned *elts,
+		     unsigned count,
+		     void *output_buffer);
+
+   void (*run)( struct translate *,   /* presumably translates `count` consecutive vertices from `start` -- TODO confirm */
+		unsigned start,
+		unsigned count,
+		void *output_buffer);
+};
+
+
+
+#if 0
+struct translate_context *translate_context_create( void );
+void translate_context_destroy( struct translate_context * );
+
+struct translate *translate_lookup_or_create( struct translate_context *tctx,
+					      const struct translate_key *key );
+#endif
+
+
+struct translate *translate_create( const struct translate_key *key );   /* in translate.c: tries the SSE2 backend on i386 builds, otherwise returns the generic backend */
+
+
+/*******************************************************************************
+ *  Private:
+ */
+struct translate *translate_sse2_create( const struct translate_key *key );   /* translate_create() checks this result for NULL and falls back to the generic backend */
+
+struct translate *translate_generic_create( const struct translate_key *key );
+
+
+#endif
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
new file mode 100644
index 0000000000..402780ee53
--- /dev/null
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -0,0 +1,676 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "pipe/p_util.h"
+#include "pipe/p_state.h"
+#include "translate.h"
+
+
+#define DRAW_DBG 0
+
+typedef void (*fetch_func)(const void *ptr, float *attrib);
+typedef void (*emit_func)(const float *attrib, void *ptr);
+
+
+/* Plain-C translate: per-attribute fetch/emit routines plus bound sources. */
+struct translate_generic {
+   struct translate translate;   /* must be first: pointers are cast between the two */
+
+   struct {
+      fetch_func fetch;          /* from get_fetch_func(input_format) */
+      unsigned buffer;           /* source buffer index this attribute reads */
+      unsigned input_offset;     /* byte offset within that buffer */
+
+      emit_func emit;            /* from get_emit_func(output_format) */
+      unsigned output_offset;    /* byte offset within the output vertex */
+      
+      char *input_ptr;           /* buffer base + input_offset; set by set_buffer */
+      unsigned input_stride;     /* bytes between source vertices; set by set_buffer */
+
+   } attrib[PIPE_MAX_ATTRIBS];
+
+   unsigned nr_attrib;           /* entries of attrib[] in use */
+};
+
+/* Downcast; valid because 'translate' is struct translate_generic's first member. */
+static struct translate_generic *translate_generic( struct translate *translate )
+{
+   return (struct translate_generic *)translate;
+}
+
+/**
+ * ATTRIB() defines fetch_NAME (memory -> float[4], missing components
+ * default to 0,0,0,1) and emit_NAME (first SZ floats -> TYPE in memory).
+ *
+ * This is probably needed/duplicated elsewhere, eg format
+ * conversion, texture sampling etc.
+ */
+#define ATTRIB( NAME, SZ, TYPE, FROM, TO )		\
+static void						\
+fetch_##NAME(const void *ptr, float *attrib)		\
+{							\
+   const float defaults[4] = { 0.0f,0.0f,0.0f,1.0f };	\
+   unsigned i;						\
+							\
+   for (i = 0; i < SZ; i++) {				\
+      attrib[i] = FROM(i);				\
+   }							\
+							\
+   for (; i < 4; i++) {					\
+      attrib[i] = defaults[i];				\
+   }							\
+}							\
+							\
+static void						\
+emit_##NAME(const float *attrib, void *ptr)		\
+{  \
+   unsigned i;						\
+   TYPE *out = (TYPE *)ptr;				\
+							\
+   for (i = 0; i < SZ; i++) {				\
+      out[i] = TO(attrib[i]);				\
+   }							\
+}
+
+
+#define FROM_64_FLOAT(i)   ((float) ((double *) ptr)[i])
+#define FROM_32_FLOAT(i)   (((float *) ptr)[i])
+
+#define FROM_8_USCALED(i)  ((float) ((unsigned char *) ptr)[i])
+#define FROM_16_USCALED(i) ((float) ((unsigned short *) ptr)[i])
+#define FROM_32_USCALED(i) ((float) ((unsigned int *) ptr)[i])
+/* 8-bit signed data uses 'signed char': plain char may be unsigned. */
+#define FROM_8_SSCALED(i)  ((float) ((signed char *) ptr)[i])
+#define FROM_16_SSCALED(i) ((float) ((short *) ptr)[i])
+#define FROM_32_SSCALED(i) ((float) ((int *) ptr)[i])
+
+#define FROM_8_UNORM(i)    ((float) ((unsigned char *) ptr)[i] / 255.0f)
+#define FROM_16_UNORM(i)   ((float) ((unsigned short *) ptr)[i] / 65535.0f)
+#define FROM_32_UNORM(i)   ((float) ((unsigned int *) ptr)[i] / 4294967295.0f)
+
+#define FROM_8_SNORM(i)    ((float) ((signed char *) ptr)[i] / 127.0f)
+#define FROM_16_SNORM(i)   ((float) ((short *) ptr)[i] / 32767.0f)
+#define FROM_32_SNORM(i)   ((float) ((int *) ptr)[i] / 2147483647.0f)
+/* TO_*: the argument is parenthesized so any expression may be passed. */
+#define TO_64_FLOAT(x)   ((double) (x))
+#define TO_32_FLOAT(x)   (x)
+
+#define TO_8_USCALED(x)  ((unsigned char) (x))
+#define TO_16_USCALED(x) ((unsigned short) (x))
+#define TO_32_USCALED(x) ((unsigned int) (x))
+
+#define TO_8_SSCALED(x)  ((signed char) (x))
+#define TO_16_SSCALED(x) ((short) (x))
+#define TO_32_SSCALED(x) ((int) (x))
+/* 32-bit NORM scaling is done in double: the factors are inexact in float. */
+#define TO_8_UNORM(x)    ((unsigned char) ((x) * 255.0f))
+#define TO_16_UNORM(x)   ((unsigned short) ((x) * 65535.0f))
+#define TO_32_UNORM(x)   ((unsigned int) ((double) (x) * 4294967295.0))
+
+#define TO_8_SNORM(x)    ((signed char) ((x) * 127.0f))
+#define TO_16_SNORM(x)   ((short) ((x) * 32767.0f))
+#define TO_32_SNORM(x)   ((int) ((double) (x) * 2147483647.0))
+
+
+
+ATTRIB( R64G64B64A64_FLOAT,   4, double, FROM_64_FLOAT, TO_64_FLOAT )
+ATTRIB( R64G64B64_FLOAT,      3, double, FROM_64_FLOAT, TO_64_FLOAT )
+ATTRIB( R64G64_FLOAT,         2, double, FROM_64_FLOAT, TO_64_FLOAT )
+ATTRIB( R64_FLOAT,            1, double, FROM_64_FLOAT, TO_64_FLOAT )
+
+ATTRIB( R32G32B32A32_FLOAT,   4, float, FROM_32_FLOAT, TO_32_FLOAT )
+ATTRIB( R32G32B32_FLOAT,      3, float, FROM_32_FLOAT, TO_32_FLOAT )
+ATTRIB( R32G32_FLOAT,         2, float, FROM_32_FLOAT, TO_32_FLOAT )
+ATTRIB( R32_FLOAT,            1, float, FROM_32_FLOAT, TO_32_FLOAT )
+
+ATTRIB( R32G32B32A32_USCALED, 4, unsigned, FROM_32_USCALED, TO_32_USCALED )
+ATTRIB( R32G32B32_USCALED,    3, unsigned, FROM_32_USCALED, TO_32_USCALED )
+ATTRIB( R32G32_USCALED,       2, unsigned, FROM_32_USCALED, TO_32_USCALED )
+ATTRIB( R32_USCALED,          1, unsigned, FROM_32_USCALED, TO_32_USCALED )
+
+ATTRIB( R32G32B32A32_SSCALED, 4, int, FROM_32_SSCALED, TO_32_SSCALED )
+ATTRIB( R32G32B32_SSCALED,    3, int, FROM_32_SSCALED, TO_32_SSCALED )
+ATTRIB( R32G32_SSCALED,       2, int, FROM_32_SSCALED, TO_32_SSCALED )
+ATTRIB( R32_SSCALED,          1, int, FROM_32_SSCALED, TO_32_SSCALED )
+
+ATTRIB( R32G32B32A32_UNORM, 4, unsigned, FROM_32_UNORM, TO_32_UNORM )
+ATTRIB( R32G32B32_UNORM,    3, unsigned, FROM_32_UNORM, TO_32_UNORM )
+ATTRIB( R32G32_UNORM,       2, unsigned, FROM_32_UNORM, TO_32_UNORM )
+ATTRIB( R32_UNORM,          1, unsigned, FROM_32_UNORM, TO_32_UNORM )
+
+ATTRIB( R32G32B32A32_SNORM, 4, int, FROM_32_SNORM, TO_32_SNORM )
+ATTRIB( R32G32B32_SNORM,    3, int, FROM_32_SNORM, TO_32_SNORM )
+ATTRIB( R32G32_SNORM,       2, int, FROM_32_SNORM, TO_32_SNORM )
+ATTRIB( R32_SNORM,          1, int, FROM_32_SNORM, TO_32_SNORM )
+
+ATTRIB( R16G16B16A16_USCALED, 4, ushort, FROM_16_USCALED, TO_16_USCALED )
+ATTRIB( R16G16B16_USCALED,    3, ushort, FROM_16_USCALED, TO_16_USCALED )
+ATTRIB( R16G16_USCALED,       2, ushort, FROM_16_USCALED, TO_16_USCALED )
+ATTRIB( R16_USCALED,          1, ushort, FROM_16_USCALED, TO_16_USCALED )
+
+ATTRIB( R16G16B16A16_SSCALED, 4, short, FROM_16_SSCALED, TO_16_SSCALED )
+ATTRIB( R16G16B16_SSCALED,    3, short, FROM_16_SSCALED, TO_16_SSCALED )
+ATTRIB( R16G16_SSCALED,       2, short, FROM_16_SSCALED, TO_16_SSCALED )
+ATTRIB( R16_SSCALED,          1, short, FROM_16_SSCALED, TO_16_SSCALED )
+
+ATTRIB( R16G16B16A16_UNORM, 4, ushort, FROM_16_UNORM, TO_16_UNORM )
+ATTRIB( R16G16B16_UNORM,    3, ushort, FROM_16_UNORM, TO_16_UNORM )
+ATTRIB( R16G16_UNORM,       2, ushort, FROM_16_UNORM, TO_16_UNORM )
+ATTRIB( R16_UNORM,          1, ushort, FROM_16_UNORM, TO_16_UNORM )
+
+ATTRIB( R16G16B16A16_SNORM, 4, short, FROM_16_SNORM, TO_16_SNORM )
+ATTRIB( R16G16B16_SNORM,    3, short, FROM_16_SNORM, TO_16_SNORM )
+ATTRIB( R16G16_SNORM,       2, short, FROM_16_SNORM, TO_16_SNORM )
+ATTRIB( R16_SNORM,          1, short, FROM_16_SNORM, TO_16_SNORM )
+
+ATTRIB( R8G8B8A8_USCALED,   4, ubyte, FROM_8_USCALED, TO_8_USCALED )
+ATTRIB( R8G8B8_USCALED,     3, ubyte, FROM_8_USCALED, TO_8_USCALED )
+ATTRIB( R8G8_USCALED,       2, ubyte, FROM_8_USCALED, TO_8_USCALED )
+ATTRIB( R8_USCALED,         1, ubyte, FROM_8_USCALED, TO_8_USCALED )
+
+ATTRIB( R8G8B8A8_SSCALED,  4, char, FROM_8_SSCALED, TO_8_SSCALED )
+ATTRIB( R8G8B8_SSCALED,    3, char, FROM_8_SSCALED, TO_8_SSCALED )
+ATTRIB( R8G8_SSCALED,      2, char, FROM_8_SSCALED, TO_8_SSCALED )
+ATTRIB( R8_SSCALED,        1, char, FROM_8_SSCALED, TO_8_SSCALED )
+
+ATTRIB( R8G8B8A8_UNORM,  4, ubyte, FROM_8_UNORM, TO_8_UNORM )
+ATTRIB( R8G8B8_UNORM,    3, ubyte, FROM_8_UNORM, TO_8_UNORM )
+ATTRIB( R8G8_UNORM,      2, ubyte, FROM_8_UNORM, TO_8_UNORM )
+ATTRIB( R8_UNORM,        1, ubyte, FROM_8_UNORM, TO_8_UNORM )
+
+ATTRIB( R8G8B8A8_SNORM,  4, char, FROM_8_SNORM, TO_8_SNORM )
+ATTRIB( R8G8B8_SNORM,    3, char, FROM_8_SNORM, TO_8_SNORM )
+ATTRIB( R8G8_SNORM,      2, char, FROM_8_SNORM, TO_8_SNORM )
+ATTRIB( R8_SNORM,        1, char, FROM_8_SNORM, TO_8_SNORM )
+
+ATTRIB( A8R8G8B8_UNORM,       4, ubyte, FROM_8_UNORM, TO_8_UNORM )
+//ATTRIB( R8G8B8A8_UNORM,       4, ubyte, FROM_8_UNORM, TO_8_UNORM )
+
+
+/* Fetch 4 normalized bytes, swapping bytes 0 and 2 into attrib[2]/attrib[0]. */
+static void
+fetch_B8G8R8A8_UNORM(const void *ptr, float *attrib)
+{
+   attrib[2] = FROM_8_UNORM(0);
+   attrib[1] = FROM_8_UNORM(1);
+   attrib[0] = FROM_8_UNORM(2);
+   attrib[3] = FROM_8_UNORM(3);
+}
+
+static void
+emit_B8G8R8A8_UNORM( const float *attrib, void *ptr)
+{  /* Inverse of fetch_B8G8R8A8_UNORM: attrib[0]/attrib[2] land in bytes 2 and 0. */
+   ubyte *out = (ubyte *)ptr;
+   out[2] = TO_8_UNORM(attrib[0]);
+   out[1] = TO_8_UNORM(attrib[1]);
+   out[0] = TO_8_UNORM(attrib[2]);
+   out[3] = TO_8_UNORM(attrib[3]);
+}
+
+static void 
+fetch_NULL( const void *ptr, float *attrib )
+{  /* Returned by get_fetch_func() for unknown formats: ignores ptr, yields (0,0,0,1). */
+   attrib[0] = 0;
+   attrib[1] = 0;
+   attrib[2] = 0;
+   attrib[3] = 1;
+}
+
+static void 
+emit_NULL( const float *attrib, void *ptr )
+{
+   /* Returned by get_emit_func() for unknown formats: do nothing is the only sensible option */
+}
+
+static fetch_func get_fetch_func( enum pipe_format format )
+{  /* Map a vertex format to the routine that reads it into a float[4]. */
+   switch (format) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return fetch_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return fetch_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return fetch_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return fetch_R64G64B64A64_FLOAT;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      return fetch_R32_FLOAT;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return fetch_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return fetch_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return fetch_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return fetch_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return fetch_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return fetch_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return fetch_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return fetch_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return fetch_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return fetch_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return fetch_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return fetch_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return fetch_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return fetch_R32G32B32A32_SNORM;
+
+   case PIPE_FORMAT_R32_SSCALED:
+      return fetch_R32_SSCALED;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return fetch_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return fetch_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return fetch_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return fetch_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return fetch_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return fetch_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return fetch_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return fetch_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return fetch_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return fetch_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return fetch_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return fetch_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return fetch_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return fetch_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return fetch_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return fetch_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return fetch_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return fetch_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return fetch_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return fetch_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return fetch_R8G8B8_UNORM;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return fetch_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return fetch_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return fetch_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return fetch_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return fetch_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return fetch_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return fetch_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return fetch_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return fetch_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return fetch_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return fetch_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return fetch_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return fetch_R8G8B8A8_SSCALED;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return fetch_A8R8G8B8_UNORM;
+
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      return fetch_B8G8R8A8_UNORM;
+
+   default:
+      assert(0); /* format has no fetch routine; fall back to constant (0,0,0,1) */
+      return fetch_NULL;
+   }
+}
+
+
+
+/* Map a vertex format to the routine that writes a float[4] out in it. */
+static emit_func get_emit_func( enum pipe_format format )
+{
+   switch (format) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return emit_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return emit_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return emit_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return emit_R64G64B64A64_FLOAT;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      return emit_R32_FLOAT;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return emit_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return emit_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return emit_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return emit_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return emit_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return emit_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return emit_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return emit_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return emit_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return emit_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return emit_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return emit_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return emit_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return emit_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return emit_R32G32B32A32_SNORM;
+
+   case PIPE_FORMAT_R32_SSCALED:
+      return emit_R32_SSCALED;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return emit_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return emit_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return emit_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return emit_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return emit_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return emit_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return emit_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return emit_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return emit_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return emit_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return emit_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return emit_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return emit_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return emit_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return emit_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return emit_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return emit_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return emit_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return emit_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return emit_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return emit_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return emit_R8G8B8_UNORM;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return emit_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return emit_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return emit_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return emit_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return emit_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return emit_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return emit_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return emit_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return emit_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return emit_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return emit_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return emit_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return emit_R8G8B8A8_SSCALED;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return emit_A8R8G8B8_UNORM;
+
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      return emit_B8G8R8A8_UNORM;
+
+   default:
+      assert(0); /* format has no emit routine; fall back to writing nothing */
+      return emit_NULL;
+   }
+}
+
+
+
+/**
+ * Translate 'count' vertices, taking each source vertex index from elts[].
+ */
+static void generic_run_elts( struct translate *translate,
+			      const unsigned *elts,
+			      unsigned count,
+			      void *output_buffer )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   char *vert = output_buffer;
+   unsigned nr_attrs = tg->nr_attrib;
+   unsigned attr;
+   unsigned i;
+
+   /* Loop over output vertices; for each one, fetch every attribute of
+    * source vertex 'elt' as float[4] and emit it at its output offset. */
+   for (i = 0; i < count; i++) {
+      unsigned elt = *elts++;
+
+      for (attr = 0; attr < nr_attrs; attr++) {
+	 float data[4];
+
+	 const char *src = (tg->attrib[attr].input_ptr + 
+			    tg->attrib[attr].input_stride * elt);
+
+	 char *dst = (vert + 
+		      tg->attrib[attr].output_offset);
+
+	 tg->attrib[attr].fetch( src, data );
+
+         if (0) debug_printf("vert %d/%d attr %d: %f %f %f %f\n",
+                             i, elt, attr, data[0], data[1], data[2], data[3]);
+
+	 tg->attrib[attr].emit( data, dst );
+      }
+      
+      vert += tg->translate.key.output_stride;
+   }
+}
+
+
+/* Translate source vertices start .. start+count-1 into output_buffer. */
+static void generic_run( struct translate *translate,
+			 unsigned start,
+			 unsigned count,
+			 void *output_buffer )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   char *vert = output_buffer;
+   unsigned nr_attrs = tg->nr_attrib;
+   unsigned attr;
+   unsigned i;
+
+   /* Loop over output vertices; for each one, fetch every attribute of
+    * source vertex 'elt' as float[4] and emit it at its output offset. */
+   for (i = 0; i < count; i++) {
+      unsigned elt = start + i;
+
+      for (attr = 0; attr < nr_attrs; attr++) {
+	 float data[4];
+
+	 const char *src = (tg->attrib[attr].input_ptr + 
+			    tg->attrib[attr].input_stride * elt);
+
+	 char *dst = (vert + 
+		      tg->attrib[attr].output_offset);
+
+	 tg->attrib[attr].fetch( src, data );
+
+         if (0) debug_printf("vert %d attr %d: %f %f %f %f\n",
+                             i, attr, data[0], data[1], data[2], data[3]);
+
+	 tg->attrib[attr].emit( data, dst );
+      }
+      
+      vert += tg->translate.key.output_stride;
+   }
+}
+
+
+/* set_buffer() hook: point every attribute sourced from 'buf' at ptr/stride. */
+static void generic_set_buffer( struct translate *translate,
+				unsigned buf,
+				const void *ptr,
+				unsigned stride )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   unsigned i;
+
+   for (i = 0; i < tg->nr_attrib; i++) {
+      if (tg->attrib[i].buffer == buf) {
+	 tg->attrib[i].input_ptr = ((char *)ptr +
+				    tg->attrib[i].input_offset);
+	 tg->attrib[i].input_stride = stride;
+      }
+   }
+}
+
+/* release() hook: free the object allocated by translate_generic_create(). */
+static void generic_release( struct translate *translate )
+{
+   /* Refcount?
+    */
+   FREE(translate);
+}
+
+struct translate *translate_generic_create( const struct translate_key *key )
+{
+   struct translate_generic *tg;
+   unsigned i;
+
+   /* Create a translate object that converts vertices in plain C.  attrib[]
+    * has PIPE_MAX_ATTRIBS slots, so keys that would overrun it are refused. */
+   if (key->nr_elements > PIPE_MAX_ATTRIBS)
+      return NULL;
+
+   tg = CALLOC_STRUCT(translate_generic);
+   if (tg == NULL)
+      return NULL;
+
+   tg->translate.key = *key;
+   tg->translate.release = generic_release;
+   tg->translate.set_buffer = generic_set_buffer;
+   tg->translate.run_elts = generic_run_elts;
+   tg->translate.run = generic_run;
+   /* Pick conversion routines now; source pointers arrive via set_buffer(). */
+   for (i = 0; i < key->nr_elements; i++) {
+      tg->attrib[i].fetch = get_fetch_func(key->element[i].input_format);
+      tg->attrib[i].buffer = key->element[i].input_buffer;
+      tg->attrib[i].input_offset = key->element[i].input_offset;
+      tg->attrib[i].emit = get_emit_func(key->element[i].output_format);
+      tg->attrib[i].output_offset = key->element[i].output_offset;
+   }
+   tg->nr_attrib = key->nr_elements;
+   return &tg->translate;
+}
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
new file mode 100644
index 0000000000..f590d48b78
--- /dev/null
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -0,0 +1,625 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <keithw@tungstengraphics.com>
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_util.h"
+#include "util/u_simple_list.h"
+
+#include "translate.h"
+
+
+#if defined(__i386__) || defined(__386__) || defined(i386)
+
+#include "rtasm/rtasm_cpu.h"
+#include "rtasm/rtasm_x86sse.h"
+
+
+#define X    0
+#define Y    1
+#define Z    2
+#define W    3
+
+
+#ifdef WIN32
+#define RTASM __cdecl
+#else
+#define RTASM
+#endif
+
+typedef void (RTASM *run_func)( struct translate *translate,
+                                unsigned start,
+                                unsigned count,
+                                void *output_buffer );
+
+typedef void (RTASM *run_elts_func)( struct translate *translate,
+                                     const unsigned *elts,
+                                     unsigned count,
+                                     void *output_buffer );
+
+
+/* SSE translate: run-time generated code plus the data that code reads. */
+struct translate_sse {
+   struct translate translate;   /* first member: generated code addresses fields from this pointer */
+
+   struct x86_function linear_func;   /* NOTE(review): presumably the code behind run() - confirm */
+   struct x86_function elt_func;      /* NOTE(review): presumably the code behind run_elts() - confirm */
+   struct x86_function *func;         /* function currently being built (set in build_vertex_emit) */
+
+   boolean loaded_identity;   /* load of identity[] already emitted into *func? */
+   boolean loaded_255;        /* load of float_255[] already emitted into *func? */
+   boolean loaded_inv_255;    /* load of inv_255[] already emitted into *func? */
+
+   float identity[4];    /* {0,0,0,1}, filled in by get_identity() */
+   float float_255[4];   /* 255.0 x4, filled in by get_255() */
+   float inv_255[4];     /* 1/255 x4, filled in by get_inv_255() */
+
+   struct {
+      char *input_ptr;         /* read by generated code, see get_src_ptr() */
+      unsigned input_stride;   /* read by generated code, see get_src_ptr() */
+   } attrib[PIPE_MAX_ATTRIBS];
+
+   run_func      gen_run;        /* NOTE(review): presumably entry point of linear_func - confirm */
+   run_elts_func gen_run_elts;   /* NOTE(review): presumably entry point of elt_func - confirm */
+
+};
+
+static int get_offset( const void *a, const void *b )
+{
+   return (const char *)b - (const char *)a;   /* byte distance from a to b, narrowed to int */
+}
+
+
+/* Return XMM6, emitting (once per generated function) a load of {0,0,0,1}. */
+static struct x86_reg get_identity( struct translate_sse *p )
+{
+   struct x86_reg reg = x86_make_reg(file_XMM, 6);
+
+   if (!p->loaded_identity) {
+      /* Nasty: hard-codes ESI, the register build_vertex_emit() loads the
+       * translate pointer into. */
+      struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
+
+      p->loaded_identity = TRUE;
+      p->identity[0] = 0;
+      p->identity[1] = 0;
+      p->identity[2] = 0;
+      p->identity[3] = 1;
+
+      sse_movups(p->func, reg, 
+		 x86_make_disp(translateESI, 
+			       get_offset(p, &p->identity[0])));
+   }
+
+   return reg;
+}
+
+static struct x86_reg get_255( struct translate_sse *p )
+{
+   /* Returns the XMM register holding {255,255,255,255}, emitting the load
+    * on first use.  Uses XMM7: XMM6 is get_identity()'s cached constant. */
+   struct x86_reg reg = x86_make_reg(file_XMM, 7);
+
+   if (!p->loaded_255) {
+      struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
+      p->loaded_255 = TRUE;
+      p->float_255[0] =
+	 p->float_255[1] =
+	 p->float_255[2] =
+	 p->float_255[3] = 255.0f;
+
+      sse_movups(p->func, reg, 
+		 x86_make_disp(translateESI, 
+			       get_offset(p, &p->float_255[0])));
+   }
+
+   return reg;
+}
+
+static struct x86_reg get_inv_255( struct translate_sse *p )
+{  /* Return XMM5, emitting on first use a load of 1/255 x4 from p->inv_255. */
+   struct x86_reg reg = x86_make_reg(file_XMM, 5);
+
+   if (!p->loaded_inv_255) {
+      struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
+
+      p->loaded_inv_255 = TRUE;
+      p->inv_255[0] =
+	 p->inv_255[1] =
+	 p->inv_255[2] =
+	 p->inv_255[3] = 1.0f / 255.0f;
+
+      sse_movups(p->func, reg, 
+		 x86_make_disp(translateESI, 
+			       get_offset(p, &p->inv_255[0])));
+   }
+
+   return reg;
+}
+
+/* Emit an unaligned load of four floats from arg0 into 'data'. */
+static void emit_load_R32G32B32A32( struct translate_sse *p, 			   
+				    struct x86_reg data,
+				    struct x86_reg arg0 )
+{
+   sse_movups(p->func, data, arg0);
+}
+
+static void emit_load_R32G32B32( struct translate_sse *p, 			   
+				 struct x86_reg data,
+				 struct x86_reg arg0 )
+{
+   /* Have to jump through some hoops; 'data' after each instruction:
+    *
+    * c 0 0 0   (movss of the third float)
+    * c 0 0 1   (shufps with the identity constant)
+    * 0 0 c 1   (shufps with itself)
+    * a b c 1   (movlps of the first two floats)
+    */
+   sse_movss(p->func, data, x86_make_disp(arg0, 8));
+   sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
+   sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
+   sse_movlps(p->func, data, arg0);
+}
+
+static void emit_load_R32G32( struct translate_sse *p, 
+			   struct x86_reg data,
+			   struct x86_reg arg0 )
+{
+   /* 0 0 0 1   (copy of the identity constant)
+    * a b 0 1   (movlps overwrites the low two lanes)
+    */
+   sse_movups(p->func, data, get_identity(p) );
+   sse_movlps(p->func, data, arg0);
+}
+
+/* Emit code loading the single float at arg0 into 'data' as (a, 0, 0, 1). */
+static void emit_load_R32( struct translate_sse *p, 
+			   struct x86_reg data,
+			   struct x86_reg arg0 )
+{
+   /* a 0 0 0   (movss)
+    * a 0 0 1   (orps with the identity constant)
+    */
+   sse_movss(p->func, data, arg0);
+   sse_orps(p->func, data, get_identity(p) );
+}
+
+/* Emit code loading 4 unsigned bytes at 'src' as floats scaled by 1/255. */
+static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
+				       struct x86_reg data,
+				       struct x86_reg src )
+{
+
+   /* Load and unpack twice: interleaving with the zero low bytes of the
+    * identity constant widens each byte to 16 and then to 32 bits. */
+   sse_movss(p->func, data, src);
+   sse2_punpcklbw(p->func, data, get_identity(p));
+   sse2_punpcklbw(p->func, data, get_identity(p));
+
+   /* Convert to float:
+    */
+   sse2_cvtdq2ps(p->func, data, data);
+
+
+   /* Scale by 1/255.0
+    */
+   sse_mulps(p->func, data, get_inv_255(p));
+}
+
+
+
+/* Emit an unaligned store of all four floats of dataXMM to 'dest'. */
+static void emit_store_R32G32B32A32( struct translate_sse *p, 			   
+				     struct x86_reg dest,
+				     struct x86_reg dataXMM )
+{
+   sse_movups(p->func, dest, dataXMM);
+}
+
+static void emit_store_R32G32B32( struct translate_sse *p, 
+				  struct x86_reg dest,
+				  struct x86_reg dataXMM )
+{
+   /* Emit two (x,y at dest), shuffle z into lane 0, emit one (z at dest+8).
+    */
+   sse_movlps(p->func, dest, dataXMM);
+   sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
+   sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
+}
+
+static void emit_store_R32G32( struct translate_sse *p, 
+			       struct x86_reg dest,
+			       struct x86_reg dataXMM )
+{  /* Emit a store of the low two floats of dataXMM to 'dest'. */
+   sse_movlps(p->func, dest, dataXMM);
+}
+
+static void emit_store_R32( struct translate_sse *p, 
+			    struct x86_reg dest,
+			    struct x86_reg dataXMM )
+{  /* Emit a store of the lowest float of dataXMM to 'dest'. */
+   sse_movss(p->func, dest, dataXMM);
+}
+
+
+/* Emit code storing dataXMM as 4 unsigned bytes at 'dest'; clobbers dataXMM. */
+static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
+				       struct x86_reg dest,
+				       struct x86_reg dataXMM )
+{
+   /* Scale by 255.0
+    */
+   sse_mulps(p->func, dataXMM, get_255(p));
+
+   /* Pack and emit: float -> int32 -> int16 -> uint8, then one 32-bit
+    * store of the four resulting bytes. */
+   sse2_cvtps2dq(p->func, dataXMM, dataXMM);
+   sse2_packssdw(p->func, dataXMM, dataXMM);
+   sse2_packuswb(p->func, dataXMM, dataXMM);
+   sse_movss(p->func, dest, dataXMM);
+}
+
+
+
+
+/* Emit code computing srcEAX = attrib[a].input_ptr + attrib[a].input_stride * eltREG. */
+static void get_src_ptr( struct translate_sse *p,
+			 struct x86_reg srcEAX,
+			 struct x86_reg translateREG,
+			 struct x86_reg eltREG,	
+			 unsigned a )
+{
+   struct x86_reg input_ptr = 
+      x86_make_disp(translateREG, 
+		    get_offset(p, &p->attrib[a].input_ptr));
+
+   struct x86_reg input_stride = 
+      x86_make_disp(translateREG, 
+		    get_offset(p, &p->attrib[a].input_stride));
+
+   /* Calculate pointer to current attrib; both fields are read at run
+    * time through translateREG, so set_buffer() may change them later. */
+   x86_mov(p->func, srcEAX, input_stride);
+   x86_imul(p->func, srcEAX, eltREG);	
+   x86_add(p->func, srcEAX, input_ptr);
+}
+
+
+/* Emit a shufps of 'src' into 'dest'.  Extended swizzles?  Maybe later.
+ */  
+static void emit_swizzle( struct translate_sse *p,
+			  struct x86_reg dest,
+			  struct x86_reg src,
+			  unsigned shuffle )
+{
+   sse_shufps(p->func, dest, src, shuffle);
+}
+
+/* Emit load+store code for one attribute via XMM0; FALSE if a format has no SSE path. */
+static boolean translate_attr( struct translate_sse *p,
+			       const struct translate_element *a,
+			       struct x86_reg srcECX,
+			       struct x86_reg dstEAX)
+{
+   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+
+   switch (a->input_format) {
+   case PIPE_FORMAT_R32_FLOAT:
+      emit_load_R32(p, dataXMM, srcECX);
+      break;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      emit_load_R32G32(p, dataXMM, srcECX);
+      break;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      emit_load_R32G32B32(p, dataXMM, srcECX);
+      break;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      emit_load_R32G32B32A32(p, dataXMM, srcECX);
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
+      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));   /* swap X and Z */
+      break;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
+      break;
+   default:
+      return FALSE;
+   }
+
+   switch (a->output_format) {
+   case PIPE_FORMAT_R32_FLOAT:
+      emit_store_R32(p, dstEAX, dataXMM);
+      break;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      emit_store_R32G32(p, dstEAX, dataXMM);
+      break;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      emit_store_R32G32B32(p, dstEAX, dataXMM);
+      break;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      emit_store_R32G32B32A32(p, dstEAX, dataXMM);
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));   /* swap X and Z */
+      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
+      break;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
+      break;
+   default:
+      return FALSE;   /* note: the load code above has already been emitted */
+   }
+
+   return TRUE;
+}
+
+/* Generate run( translate, unsigned start, unsigned count,
+ *               void *output_buffer )              when 'linear',
+ * or   run_elts( translate, const unsigned *elts, unsigned count,
+ *               void *output_buffer )              otherwise.
+ *
+ * Returns FALSE if translate_attr() rejects any element of the
+ * key, TRUE once the whole function body has been emitted.
+ *
+ * Lots of hardcoding.  Register use in the generated code:
+ *
+ * EAX -- pointer to current output vertex
+ * EBX -- current vertex number (linear) or pointer into elts
+ * ECX -- pointer to current attribute
+ * EBP -- number of vertices still to emit
+ * ESI -- pointer to the translate object (argument 1)
+ */
+static boolean build_vertex_emit( struct translate_sse *p,
+				  struct x86_function *func,
+				  boolean linear )
+{
+   struct x86_reg vertexEAX    = x86_make_reg(file_REG32, reg_AX);
+   struct x86_reg idxEBX       = x86_make_reg(file_REG32, reg_BX);
+   struct x86_reg srcECX       = x86_make_reg(file_REG32, reg_CX);
+   struct x86_reg countEBP     = x86_make_reg(file_REG32, reg_BP);
+   struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
+   uint8_t *fixup, *label;
+   unsigned j;
+
+   p->func = func;
+   p->loaded_inv_255 = FALSE;
+   p->loaded_255 = FALSE;
+   p->loaded_identity = FALSE;
+
+   x86_init_func(p->func);
+
+   /* Preserve the registers used below (popped again before the ret).
+    */
+   x86_push(p->func, countEBP);
+   x86_push(p->func, translateESI);
+   x86_push(p->func, idxEBX);
+
+   /* Get vertex count, compare to zero
+    */
+   x86_xor(p->func, idxEBX, idxEBX);
+   x86_mov(p->func, countEBP, x86_fn_arg(p->func, 3));
+   x86_cmp(p->func, countEBP, idxEBX);
+   fixup = x86_jcc_forward(p->func, cc_E);
+
+   /* If linear, idx is the current element, otherwise it is a pointer
+    * to the current element.
+    */
+   x86_mov(p->func, idxEBX, x86_fn_arg(p->func, 2));
+
+   /* Initialize destination register.
+    */
+   x86_mov(p->func, vertexEAX, x86_fn_arg(p->func, 4));
+
+   /* Move argument 1 (translate_sse pointer) into a reg:
+    */
+   x86_mov(p->func, translateESI, x86_fn_arg(p->func, 1));
+
+   /* Note address for loop jump */
+   label = x86_get_label(p->func);
+
+   /* Emit code for each of the attributes.  Currently routes
+    * everything through SSE registers, even when it might be more
+    * efficient to stick with regular old x86.  No optimization or
+    * other tricks - enough new ground to cover just getting it working.
+    */
+   for (j = 0; j < p->translate.key.nr_elements; j++) {
+      const struct translate_element *a = &p->translate.key.element[j];
+
+      struct x86_reg destEAX = x86_make_disp(vertexEAX, 
+					     a->output_offset);
+
+      /* Figure out source pointer address:
+       */
+      if (linear) {
+	 get_src_ptr(p, srcECX, translateESI, idxEBX, j);
+      }
+      else {
+	 get_src_ptr(p, srcECX, translateESI, x86_deref(idxEBX), j);
+      }
+
+      if (!translate_attr( p, a, x86_deref(srcECX), destEAX ))
+	 return FALSE;
+   }
+
+   /* Next vertex:
+    */
+   x86_lea(p->func, vertexEAX, x86_make_disp(vertexEAX, p->translate.key.output_stride));
+
+   /* Incr index.
+    *
+    * Linear: idxEBX is the vertex number itself, so add one.
+    *
+    * Indexed: idxEBX points into the elts array, so step over
+    * the 4-byte entry just consumed.
+    */
+
+   if (linear) {
+      x86_inc(p->func, idxEBX);
+   } 
+   else {
+      x86_lea(p->func, idxEBX, x86_make_disp(idxEBX, 4));
+   }
+
+   /* decr count, loop if not zero
+    */
+   x86_dec(p->func, countEBP);
+   x86_test(p->func, countEBP, countEBP); 
+   x86_jcc(p->func, cc_NZ, label);
+
+   /* Exit mmx state?
+    */
+   if (p->func->need_emms)
+      mmx_emms(p->func);
+
+   /* Land forward jump here (taken when the vertex count is zero):
+    */
+   x86_fixup_fwd_jump(p->func, fixup);
+
+   /* Pop regs (reverse of the push order above) and return
+    */
+   
+   x86_pop(p->func, idxEBX);
+   x86_pop(p->func, translateESI);
+   x86_pop(p->func, countEBP);
+   x86_ret(p->func);
+
+   return TRUE;
+}
+
+
+/* Point every element that reads from vertex buffer 'buf' at 'ptr'
+ * (offset by the element's input_offset) and record 'stride' as its
+ * input stride.  Elements bound to other buffers are left untouched.
+ */
+static void translate_sse_set_buffer( struct translate *translate,
+				unsigned buf,
+				const void *ptr,
+				unsigned stride )
+{
+   struct translate_sse *sse = (struct translate_sse *)translate;
+   unsigned n;
+
+   for (n = 0; n < sse->translate.key.nr_elements; n++) {
+      const struct translate_element *el = &sse->translate.key.element[n];
+
+      if (el->input_buffer != buf)
+         continue;
+      sse->attrib[n].input_ptr = (char *)ptr + el->input_offset;
+      sse->attrib[n].input_stride = stride;
+   }
+}
+
+/* Release both generated functions and then the object itself. */
+static void translate_sse_release( struct translate *translate )
+{
+   struct translate_sse *sse = (struct translate_sse *)translate;
+
+   x86_release_func( &sse->linear_func );
+   x86_release_func( &sse->elt_func );
+
+   FREE(sse);
+}
+
+/* translate::run_elts entry point: hand the call straight to the
+ * generated function stored in gen_run_elts.
+ */
+static void translate_sse_run_elts( struct translate *translate,
+			      const unsigned *elts,
+			      unsigned count,
+			      void *output_buffer )
+{
+   struct translate_sse *sse = (struct translate_sse *)translate;
+
+   sse->gen_run_elts( translate, elts, count, output_buffer );
+}
+
+/* translate::run entry point: hand the call straight to the
+ * generated function stored in gen_run.
+ */
+static void translate_sse_run( struct translate *translate,
+			 unsigned start,
+			 unsigned count,
+			 void *output_buffer )
+{
+   struct translate_sse *sse = (struct translate_sse *)translate;
+
+   sse->gen_run( translate, start, count, output_buffer );
+}
+
+/* Build an SSE2 translate object for 'key'.  Returns NULL when the CPU
+ * lacks SSE/SSE2, or when allocation or code generation fails.
+ */
+struct translate *translate_sse2_create( const struct translate_key *key )
+{
+   struct translate_sse *sse;
+
+   if (!(rtasm_cpu_has_sse() && rtasm_cpu_has_sse2()))
+      return NULL;
+
+   sse = CALLOC_STRUCT( translate_sse );
+   if (sse == NULL)
+      return NULL;
+
+   sse->translate.key = *key;
+   sse->translate.release = translate_sse_release;
+   sse->translate.set_buffer = translate_sse_set_buffer;
+   sse->translate.run_elts = translate_sse_run_elts;
+   sse->translate.run = translate_sse_run;
+
+   /* Emit the linear variant first, then the indexed one. */
+   if (!build_vertex_emit(sse, &sse->linear_func, TRUE) ||
+       !build_vertex_emit(sse, &sse->elt_func, FALSE))
+      goto fail;
+
+   sse->gen_run = (run_func)x86_get_func(&sse->linear_func);
+   if (sse->gen_run == NULL)
+      goto fail;
+
+   sse->gen_run_elts = (run_elts_func)x86_get_func(&sse->elt_func);
+   if (sse->gen_run_elts == NULL)
+      goto fail;
+
+   return &sse->translate;
+
+ fail:
+   /* 'sse' is always valid here; tear it down via the release hook. */
+   translate_sse_release( &sse->translate );
+   return NULL;
+}
+
+
+
+#else
+/* Fallback when the code above is compiled out: always returns NULL. */
+struct translate *translate_sse2_create( const struct translate_key *key )
+{
+   return NULL;
+}
+
+#endif
diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c
index f9366467cd..25b132b40c 100644
--- a/src/gallium/auxiliary/util/p_debug.c
+++ b/src/gallium/auxiliary/util/p_debug.c
@@ -59,10 +59,15 @@ void _debug_vprintf(const char *format, va_list ap)
 #ifdef WIN32
 #ifndef WINCE
    /* EngDebugPrint does not handle float point arguments, so we need to use
-    * our own vsnprintf implementation */
-   char buf[512 + 1];
-   util_vsnprintf(buf, sizeof(buf), format, ap);
-   _EngDebugPrint("%s", buf);
+    * our own vsnprintf implementation. It is also very slow, so buffer until
+    * we find a newline. */
+   static char buf[512 + 1] = {'\0'};
+   size_t len = strlen(buf);
+   int ret = util_vsnprintf(buf + len, sizeof(buf) - len, format, ap);
+   if(ret > (int)(sizeof(buf) - len - 1) || strchr(buf + len, '\n')) {
+      _EngDebugPrint("%s", buf);
+      buf[0] = '\0';
+   }
 #else
    /* TODO: Implement debug print for WINCE */
 #endif
@@ -195,6 +200,8 @@ debug_get_bool_option(const char *name, boolean dfault)
    
    if(str == NULL)
       result = dfault;
+   else if(!strcmp(str, "n"))
+      result = FALSE;
    else if(!strcmp(str, "no"))
       result = FALSE;
    else if(!strcmp(str, "0"))
@@ -246,57 +253,16 @@ debug_get_flags_option(const char *name,
 }
 
 
-#if defined(WIN32)
-ULONG_PTR debug_config_file = 0;
-void *mapped_config_file = 0;
-
-enum {
-	eAssertAbortEn = 0x1,
-};
-
-/* Check for aborts enabled. */
-static unsigned abort_en(void)
-{
-   if (!mapped_config_file)
-   {
-      /* Open an 8 byte file for configuration data. */
-      mapped_config_file = EngMapFile(L"\\??\\c:\\gaDebug.cfg", 8, &debug_config_file);
-   }
-
-   /* A value of "0" (ascii) in the configuration file will clear the
-    * first 8 bits in the test byte. 
-    *
-    * A value of "1" (ascii) in the configuration file will set the
-    * first bit in the test byte. 
-    *
-    * A value of "2" (ascii) in the configuration file will set the
-    * second bit in the test byte. 
-    *
-    * Currently the only interesting values are 0 and 1, which clear
-    * and set abort-on-assert behaviour respectively.
-    */
-   return ((((char *)mapped_config_file)[0]) - 0x30) & eAssertAbortEn;
-}
-#else /* WIN32 */
-static unsigned abort_en(void)
-{
-   return !GETENV("GALLIUM_ABORT_ON_ASSERT");
-}
-#endif
-
 void _debug_assert_fail(const char *expr, 
                         const char *file, 
                         unsigned line, 
                         const char *function) 
 {
    _debug_printf("%s:%u:%s: Assertion `%s' failed.\n", file, line, function, expr);
-   if (abort_en())
-   {
+   if (debug_get_bool_option("GALLIUM_ABORT_ON_ASSERT", TRUE))
       debug_break();
-   } else
-   {
+   else
       _debug_printf("continuing...\n");
-   }
 }
 
 
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index eec5e600c9..1105066cb8 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -57,6 +57,7 @@ struct blit_state
    struct pipe_depth_stencil_alpha_state depthstencil;
    struct pipe_rasterizer_state rasterizer;
    struct pipe_sampler_state sampler;
+   struct pipe_viewport_state viewport;
 
    struct pipe_shader_state vert_shader;
    struct pipe_shader_state frag_shader;
@@ -100,7 +101,7 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
    memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer));
    ctx->rasterizer.front_winding = PIPE_WINDING_CW;
    ctx->rasterizer.cull_mode = PIPE_WINDING_NONE;
-   ctx->rasterizer.bypass_clipping = 1;  /* bypasses viewport too */
+   ctx->rasterizer.bypass_clipping = 1;
    /*ctx->rasterizer.bypass_vs = 1;*/
 
    /* samplers */
@@ -113,8 +114,7 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
    ctx->sampler.mag_img_filter = 0; /* set later */
    ctx->sampler.normalized_coords = 1;
 
-#if 0
-   /* viewport */
+   /* viewport (identity, we setup vertices in wincoords) */
    ctx->viewport.scale[0] = 1.0;
    ctx->viewport.scale[1] = 1.0;
    ctx->viewport.scale[2] = 1.0;
@@ -123,7 +123,6 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
    ctx->viewport.translate[1] = 0.0;
    ctx->viewport.translate[2] = 0.0;
    ctx->viewport.translate[3] = 0.0;
-#endif
 
    /* vertex shader */
    {
@@ -300,11 +299,15 @@ util_blit_pixels(struct blit_state *ctx,
    cso_save_samplers(ctx->cso);
    cso_save_sampler_textures(ctx->cso);
    cso_save_framebuffer(ctx->cso);
+   cso_save_fragment_shader(ctx->cso);
+   cso_save_vertex_shader(ctx->cso);
+   cso_save_viewport(ctx->cso);
 
    /* set misc state we care about */
    cso_set_blend(ctx->cso, &ctx->blend);
    cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil);
    cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
+   cso_set_viewport(ctx->cso, &ctx->viewport);
 
    /* sampler */
    ctx->sampler.min_img_filter = filter;
@@ -313,11 +316,11 @@ util_blit_pixels(struct blit_state *ctx,
    cso_single_sampler_done(ctx->cso);
 
    /* texture */
-   pipe->set_sampler_textures(pipe, 1, &tex);
+   cso_set_sampler_textures(ctx->cso, 1, &tex);
 
    /* shaders */
-   pipe->bind_fs_state(pipe, ctx->fs);
-   pipe->bind_vs_state(pipe, ctx->vs);
+   cso_set_fragment_shader_handle(ctx->cso, ctx->fs);
+   cso_set_vertex_shader_handle(ctx->cso, ctx->vs);
 
    /* drawing dest */
    memset(&fb, 0, sizeof(fb));
@@ -344,6 +347,9 @@ util_blit_pixels(struct blit_state *ctx,
    cso_restore_samplers(ctx->cso);
    cso_restore_sampler_textures(ctx->cso);
    cso_restore_framebuffer(ctx->cso);
+   cso_restore_fragment_shader(ctx->cso);
+   cso_restore_vertex_shader(ctx->cso);
+   cso_restore_viewport(ctx->cso);
 
    /* free the texture */
    pipe_surface_reference(&texSurf, NULL);
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index 2fd214d22e..dfdb5f16fe 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -61,6 +61,7 @@ struct gen_mipmap_state
    struct pipe_depth_stencil_alpha_state depthstencil;
    struct pipe_rasterizer_state rasterizer;
    struct pipe_sampler_state sampler;
+   struct pipe_viewport_state viewport;
 
    struct pipe_shader_state vert_shader;
    struct pipe_shader_state frag_shader;
@@ -712,7 +713,7 @@ util_create_gen_mipmap(struct pipe_context *pipe,
    memset(&ctx->rasterizer, 0, sizeof(ctx->rasterizer));
    ctx->rasterizer.front_winding = PIPE_WINDING_CW;
    ctx->rasterizer.cull_mode = PIPE_WINDING_NONE;
-   ctx->rasterizer.bypass_clipping = 1;  /* bypasses viewport too */
+   ctx->rasterizer.bypass_clipping = 1;
    /*ctx->rasterizer.bypass_vs = 1;*/
 
    /* sampler state */
@@ -723,9 +724,7 @@ util_create_gen_mipmap(struct pipe_context *pipe,
    ctx->sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NEAREST;
    ctx->sampler.normalized_coords = 1;
 
-
-#if 0
-   /* viewport */
+   /* viewport state (identity, verts are in wincoords) */
    ctx->viewport.scale[0] = 1.0;
    ctx->viewport.scale[1] = 1.0;
    ctx->viewport.scale[2] = 1.0;
@@ -734,7 +733,6 @@ util_create_gen_mipmap(struct pipe_context *pipe,
    ctx->viewport.translate[1] = 0.0;
    ctx->viewport.translate[2] = 0.0;
    ctx->viewport.translate[3] = 0.0;
-#endif
 
    /* vertex shader */
    {
@@ -825,26 +823,6 @@ util_destroy_gen_mipmap(struct gen_mipmap_state *ctx)
 }
 
 
-#if 0
-static void
-simple_viewport(struct pipe_context *pipe, uint width, uint height)
-{
-   struct pipe_viewport_state vp;
-
-   vp.scale[0] =  0.5 * width;
-   vp.scale[1] = -0.5 * height;
-   vp.scale[2] = 1.0;
-   vp.scale[3] = 1.0;
-   vp.translate[0] = 0.5 * width;
-   vp.translate[1] = 0.5 * height;
-   vp.translate[2] = 0.0;
-   vp.translate[3] = 0.0;
-
-   pipe->set_viewport_state(pipe, &vp);
-}
-#endif
-
-
 /**
  * Generate mipmap images.  It's assumed all needed texture memory is
  * already allocated.
@@ -880,17 +858,18 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
    cso_save_samplers(ctx->cso);
    cso_save_sampler_textures(ctx->cso);
    cso_save_framebuffer(ctx->cso);
+   cso_save_fragment_shader(ctx->cso);
+   cso_save_vertex_shader(ctx->cso);
+   cso_save_viewport(ctx->cso);
 
    /* bind our state */
    cso_set_blend(ctx->cso, &ctx->blend);
    cso_set_depth_stencil_alpha(ctx->cso, &ctx->depthstencil);
    cso_set_rasterizer(ctx->cso, &ctx->rasterizer);
+   cso_set_viewport(ctx->cso, &ctx->viewport);
 
-   pipe->bind_vs_state(pipe, ctx->vs);
-   pipe->bind_fs_state(pipe, ctx->fs);
-#if 0
-   pipe->set_viewport_state(pipe, &ctx->viewport);
-#endif
+   cso_set_fragment_shader_handle(ctx->cso, ctx->fs);
+   cso_set_vertex_shader_handle(ctx->cso, ctx->vs);
 
    /* init framebuffer state */
    memset(&fb, 0, sizeof(fb));
@@ -926,11 +905,8 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
       ctx->sampler.lod_bias = (float) srcLevel;
       cso_single_sampler(ctx->cso, 0, &ctx->sampler);
       cso_single_sampler_done(ctx->cso);
-#if 0
-      simple_viewport(pipe, pt->width[dstLevel], pt->height[dstLevel]);
-#endif
 
-      pipe->set_sampler_textures(pipe, 1, &pt);
+      cso_set_sampler_textures(ctx->cso, 1, &pt);
 
       /* quad coords in window coords (bypassing clipping, viewport mapping) */
       set_vertex_data(ctx,
@@ -954,4 +930,7 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
    cso_restore_samplers(ctx->cso);
    cso_restore_sampler_textures(ctx->cso);
    cso_restore_framebuffer(ctx->cso);
+   cso_restore_fragment_shader(ctx->cso);
+   cso_restore_vertex_shader(ctx->cso);
+   cso_restore_viewport(ctx->cso);
 }
diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h
index cd13823985..1f6604c554 100644
--- a/src/gallium/auxiliary/util/u_pack_color.h
+++ b/src/gallium/auxiliary/util/u_pack_color.h
@@ -40,6 +40,45 @@
 
 
 /**
+ * Pack ubyte R,G,B,A into dest pixel.
+ */
+static INLINE void
+util_pack_color_ub(ubyte r, ubyte g, ubyte b, ubyte a,
+                   enum pipe_format format, void *dest)
+{
+   /* Widen the channels before shifting: a ubyte promotes to signed
+    * int, so shifting a value >= 0x80 left by 24 would overflow it.
+    */
+   const uint cr = r, cg = g, cb = b, ca = a;
+
+   switch (format) {
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      *(uint *) dest = (cr << 24) | (cg << 16) | (cb << 8) | ca;
+      return;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      *(uint *) dest = (ca << 24) | (cr << 16) | (cg << 8) | cb;
+      return;
+
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      *(uint *) dest = (cb << 24) | (cg << 16) | (cr << 8) | ca;
+      return;
+
+   case PIPE_FORMAT_R5G6B5_UNORM:
+      /* keep only the top 5/6/5 bits of r/g/b; alpha is dropped */
+      *(ushort *) dest = (ushort) (((cr & 0xf8) << 8) | ((cg & 0xfc) << 3) | (cb >> 3));
+      return;
+
+   /* XXX lots more cases to add */
+   default:
+      /* end the line so newline-buffered debug output gets flushed */
+      debug_printf("gallium: unhandled format in util_pack_color_ub()\n");
+      return;
+   }
+}
+ 
+
+/**
  * Note rgba outside [0,1] will be clamped for int pixel formats.
  */
 static INLINE void
diff --git a/src/gallium/auxiliary/util/u_time.c b/src/gallium/auxiliary/util/u_time.c
index e6c0b19ff6..01112ebe5a 100644
--- a/src/gallium/auxiliary/util/u_time.c
+++ b/src/gallium/auxiliary/util/u_time.c
@@ -120,20 +120,20 @@ util_time_compare(const struct util_time *t1,
 }
 
 
-int 
+boolean 
 util_time_timeout(const struct util_time *start, 
                   const struct util_time *end,
                   const struct util_time *curr) 
 {
    if(util_time_compare(start, end) <= 0)
-      return util_time_compare(start, curr) <= 0 && util_time_compare(curr, end) < 0;
+      return !(util_time_compare(start, curr) <= 0 && util_time_compare(curr, end) < 0);
    else
-      return util_time_compare(start, curr) <= 0 || util_time_compare(curr, end) < 0;
+      return !(util_time_compare(start, curr) <= 0 || util_time_compare(curr, end) < 0);
 }
 
 
 #ifdef WIN32
-void util_time_usleep(unsigned usecs)
+void util_time_sleep(unsigned usecs)
 {
    LONGLONG start, curr, end;
    
diff --git a/src/gallium/auxiliary/util/u_time.h b/src/gallium/auxiliary/util/u_time.h
index 32035cceb5..c8836c137f 100644
--- a/src/gallium/auxiliary/util/u_time.h
+++ b/src/gallium/auxiliary/util/u_time.h
@@ -77,9 +77,9 @@ util_time_diff(const struct util_time *t1,
                const struct util_time *t2);
 
 /**
- * Returns zero when the timeout expires, non zero otherwise.
+ * Returns non-zero when the timeout expires.
  */
-int 
+boolean 
 util_time_timeout(const struct util_time *start, 
                   const struct util_time *end,
                   const struct util_time *curr);
@@ -87,7 +87,7 @@ util_time_timeout(const struct util_time *start,
 #ifndef WIN32
 #define util_time_sleep usleep
 #else
-int 
+void
 util_time_sleep(unsigned usecs);
 #endif
 
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index 36af5be5f0..6e08cf6fe8 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -101,17 +101,6 @@ cell_draw_elements(struct pipe_context *pipe,
    struct draw_context *draw = sp->draw;
    unsigned i;
 
-   /* first, check that the primitive is not malformed.  It is the
-    * state tracker's responsibility to do send only correctly formed
-    * primitives down.  It currently isn't doing that though...
-    */
-#if 1
-   count = draw_trim_prim( mode, count );
-#else
-   if (!draw_validate_prim( mode, count ))
-      assert(0);
-#endif
-
    if (sp->dirty)
       cell_update_derived( sp );
 
diff --git a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915simple/i915_context.c
index 58a5854f0d..4bef21619c 100644
--- a/src/gallium/drivers/i915simple/i915_context.c
+++ b/src/gallium/drivers/i915simple/i915_context.c
@@ -142,7 +142,7 @@ struct pipe_context *i915_create_context( struct pipe_screen *screen,
     */
    i915->draw = draw_create();
    assert(i915->draw);
-   if (GETENV("I915_VBUF")) {
+   if (!GETENV("I915_NO_VBUF")) {
       draw_set_rasterize_stage(i915->draw, i915_draw_vbuf_stage(i915));
    }
    else {
diff --git a/src/gallium/drivers/i915simple/i915_prim_emit.c b/src/gallium/drivers/i915simple/i915_prim_emit.c
index b6fb0a6d88..9ffa460138 100644
--- a/src/gallium/drivers/i915simple/i915_prim_emit.c
+++ b/src/gallium/drivers/i915simple/i915_prim_emit.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 
-#include "draw/draw_private.h"
+#include "draw/draw_pipe.h"
 #include "pipe/p_util.h"
 
 #include "i915_context.h"
@@ -78,9 +78,6 @@ emit_hw_vertex( struct i915_context *i915,
       const uint j = vinfo->src_index[i];
       const float *attrib = vertex->data[j];
       switch (vinfo->emit[i]) {
-      case EMIT_OMIT:
-         /* no-op */
-         break;
       case EMIT_1F:
          OUT_BATCH( fui(attrib[0]) );
          count++;
diff --git a/src/gallium/drivers/softpipe/sp_clear.c b/src/gallium/drivers/softpipe/sp_clear.c
index 39aed151c7..1236706891 100644
--- a/src/gallium/drivers/softpipe/sp_clear.c
+++ b/src/gallium/drivers/softpipe/sp_clear.c
@@ -49,6 +49,9 @@ softpipe_clear(struct pipe_context *pipe, struct pipe_surface *ps,
    struct softpipe_context *softpipe = softpipe_context(pipe);
    uint i;
 
+   if (softpipe->no_rast)
+      return;
+
 #if 0
    softpipe_update_derived(softpipe); /* not needed?? */
 #endif
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index 8c84ddbe19..200fb415ac 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -219,6 +219,9 @@ softpipe_create( struct pipe_screen *screen,
    assert(softpipe->draw);
    softpipe->setup = sp_draw_render_stage(softpipe);
 
+   if (GETENV( "SP_NO_RAST" ) != NULL)
+      softpipe->no_rast = TRUE;
+
    if (GETENV( "SP_VBUF" ) != NULL) {
       sp_init_vbuf(softpipe);
    }
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 0e1d5e561d..b3e2b2e435 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -86,6 +86,8 @@ struct softpipe_context {
    unsigned num_vertex_elements;
    unsigned num_vertex_buffers;
 
+   boolean no_rast;
+
    /* Counter for occlusion queries.  Note this supports overlapping
     * queries.
     */
diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c
index 421509495a..778291dded 100644
--- a/src/gallium/drivers/softpipe/sp_draw_arrays.c
+++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c
@@ -118,17 +118,6 @@ softpipe_draw_elements(struct pipe_context *pipe,
    struct draw_context *draw = sp->draw;
    unsigned i;
 
-   /* first, check that the primitive is not malformed.  It is the
-    * state tracker's responsibility to do send only correctly formed
-    * primitives down.  It currently isn't doing that though...
-    */
-#if 1
-   count = draw_trim_prim( mode, count );
-#else
-   if (!draw_validate_prim( mode, count ))
-      assert(0);
-#endif
-
    sp->reduced_api_prim = reduced_prim[mode];
 
    if (sp->dirty)
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 5ef02a7142..f857d26143 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -49,7 +49,7 @@
 typedef void (XSTDCALL *codegen_function)(
    const struct tgsi_exec_vector *input,
    struct tgsi_exec_vector *output,
-   float (*constant)[4],
+   const float (*constant)[4],
    struct tgsi_exec_vector *temporary,
    const struct tgsi_interp_coef *coef,
    float (*immediates)[4]
@@ -67,9 +67,9 @@ struct sp_sse_fragment_shader {
 
 
 static void
-fs_sse_prepare( struct sp_fragment_shader *base,
-	      struct tgsi_exec_machine *machine,
-	      struct tgsi_sampler *samplers )
+fs_sse_prepare( const struct sp_fragment_shader *base,
+		struct tgsi_exec_machine *machine,
+		struct tgsi_sampler *samplers )
 {
 }
 
@@ -80,9 +80,9 @@ fs_sse_prepare( struct sp_fragment_shader *base,
  * TODO: process >1 quad at a time
  */
 static unsigned 
-fs_sse_run( struct sp_fragment_shader *base,
-	 struct tgsi_exec_machine *machine,
-	 struct quad_header *quad )
+fs_sse_run( const struct sp_fragment_shader *base,
+	    struct tgsi_exec_machine *machine,
+	    struct quad_header *quad )
 {
    struct sp_sse_fragment_shader *shader = (struct sp_sse_fragment_shader *) base;
 
diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.c b/src/gallium/drivers/softpipe/sp_prim_setup.c
index 0ddb06764a..feb35d492a 100644
--- a/src/gallium/drivers/softpipe/sp_prim_setup.c
+++ b/src/gallium/drivers/softpipe/sp_prim_setup.c
@@ -39,7 +39,7 @@
 #include "sp_setup.h"
 #include "sp_state.h"
 #include "sp_prim_setup.h"
-#include "draw/draw_private.h"
+#include "draw/draw_pipe.h"
 #include "draw/draw_vertex.h"
 #include "pipe/p_util.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index 4fed19ecb6..e063fe82ef 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -106,6 +106,16 @@ static boolean
 sp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
+
+   /* XXX: break this dependency - make setup_context live under
+    * softpipe, rename the old "setup" draw stage to something else.
+    */
+   struct setup_context *setup_ctx = sp_draw_setup_context(cvbr->softpipe->setup);
+   
+   setup_prepare( setup_ctx );
+
+
+
    if (prim == PIPE_PRIM_TRIANGLES ||
        prim == PIPE_PRIM_LINES ||
        prim == PIPE_PRIM_POINTS) {
@@ -136,10 +146,6 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr_indices)
     */
    struct draw_stage *setup = softpipe->setup;
    struct setup_context *setup_ctx = sp_draw_setup_context(softpipe->setup);
-   
-   /* XXX: call this from allocate_vertices: 
-    */
-   setup_prepare( setup_ctx );
 
 
    switch (cvbr->prim) {
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 8dbdbe5764..625d0f9b48 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -77,7 +77,8 @@ shade_quad(
    struct quad_shade_stage *qss = quad_shade_stage( qs );
    struct softpipe_context *softpipe = qs->softpipe;
    struct tgsi_exec_machine *machine = &qss->machine;
-
+   boolean z_written;
+   
    /* Consts do not require 16 byte alignment. */
    machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
 
@@ -89,7 +90,7 @@ shade_quad(
 				    quad );
 
    /* store outputs */
-   boolean z_written = FALSE;
+   z_written = FALSE;
    {
       const ubyte *sem_name = softpipe->fs->info.output_semantic_name;
       const ubyte *sem_index = softpipe->fs->info.output_semantic_index;
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index 813d703108..7df8fc5f67 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -299,13 +299,6 @@ static boolean setup_sort_vertices( struct setup_context *setup,
                                     const float (*v1)[4],
                                     const float (*v2)[4] )
 {
-#if DEBUG_VERTS
-   debug_printf("Triangle:\n");
-   print_vertex(setup, v0);
-   print_vertex(setup, v1);
-   print_vertex(setup, v2);
-#endif
-
    setup->vprovoke = v2;
 
    /* determine bottom to top order of vertices */
@@ -726,6 +719,9 @@ void setup_tri( struct setup_context *setup,
 {
    float det = calc_det(v0, v1, v2);
 
+   if (setup->softpipe->no_rast)
+      return;
+
    /*
    debug_printf("%s\n", __FUNCTION__ );
    */
@@ -735,7 +731,12 @@ void setup_tri( struct setup_context *setup,
    setup->numFragsWritten = 0;
 #endif
 
-
+#if DEBUG_VERTS
+   debug_printf("Triangle:\n");
+   print_vertex(setup, v0);
+   print_vertex(setup, v1);
+   print_vertex(setup, v2);
+#endif
 
    if (cull_tri( setup, det ))
       return;
@@ -934,6 +935,9 @@ setup_line(struct setup_context *setup,
    int dy = y1 - y0;
    int xstep, ystep;
 
+   if (setup->softpipe->no_rast)
+      return;
+
    if (dx == 0 && dy == 0)
       return;
 
@@ -1052,6 +1056,10 @@ setup_point( struct setup_context *setup,
    const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
    uint fragSlot;
 
+
+   if (softpipe->no_rast)
+      return;
+
    /* For points, all interpolants are constant-valued.
     * However, for point sprites, we'll need to setup texcoords appropriately.
     * XXX: which coefficients are the texcoords???
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
index 4eefd1d61f..2921066ce3 100644
--- a/src/gallium/drivers/softpipe/sp_state_fs.c
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -82,7 +82,10 @@ softpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
 void
 softpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
 {
+   struct softpipe_context *softpipe = softpipe_context(pipe);
    struct sp_fragment_shader *state = fs;
+
+   assert(fs != softpipe->fs);
    
    state->delete( state );
 }
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 34da6356d7..5b63f97997 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -651,7 +651,9 @@ shadow_compare(uint compare_func,
       k = 0;
       break;
    default:
+      k = 0;
       assert(0);
+      break;
    }
 
    rgba[0][j] = rgba[1][j] = rgba[2][j] = (float) k;
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
index 19f71887e7..edafd93d8b 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -39,7 +39,7 @@
 #include "sp_surface.h"
 #include "sp_tile_cache.h"
 
-#define NUM_ENTRIES 30
+#define NUM_ENTRIES 32
 
 
 /** XXX move these */
diff --git a/src/gallium/include/pipe/p_util.h b/src/gallium/include/pipe/p_util.h
index dbca080a4b..43d94ec4ba 100644
--- a/src/gallium/include/pipe/p_util.h
+++ b/src/gallium/include/pipe/p_util.h
@@ -402,6 +402,54 @@ extern void pipe_copy_rect(ubyte * dst, unsigned cpp, unsigned dst_pitch,
                            int src_pitch, unsigned src_x, int src_y);
 
 
+
+#ifdef WIN32
+
+#if !defined(_INC_MATH) || !defined(__cplusplus)
+/* Float wrappers: each one widens to double, calls the double routine, narrows back. */
+static INLINE float cosf( float f ) 
+{
+   return (float) cos( (double) f );
+}
+
+static INLINE float sinf( float f ) 
+{
+   return (float) sin( (double) f );
+}
+
+static INLINE float ceilf( float f ) 
+{
+   return (float) ceil( (double) f );
+}
+
+static INLINE float floorf( float f ) 
+{
+   return (float) floor( (double) f );
+}
+
+static INLINE float powf( float f, float g ) 
+{
+   return (float) pow( (double) f, (double) g );
+}
+
+static INLINE float sqrtf( float f ) 
+{
+   return (float) sqrt( (double) f );
+}
+
+static INLINE float fabsf( float f ) 
+{
+   return (float) fabs( (double) f );
+}
+
+static INLINE float logf( float f ) 
+{
+   return (float) log( (double) f );   /* natural log via the double-precision log() */
+}
+#endif  /* _INC_MATH */
+#endif
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/winsys/xlib/fakeglx.c b/src/gallium/winsys/xlib/fakeglx.c
index 902a755075..6e04cb4117 100644
--- a/src/gallium/winsys/xlib/fakeglx.c
+++ b/src/gallium/winsys/xlib/fakeglx.c
@@ -1504,6 +1504,13 @@ Fake_glXMakeContextCurrent( Display *dpy, GLXDrawable draw,
 #endif
       }
 
+      if (MakeCurrent_PrevContext == ctx &&
+          MakeCurrent_PrevDrawable == draw &&
+          MakeCurrent_PrevReadable == read &&
+          MakeCurrent_PrevDrawBuffer == drawBuffer &&
+          MakeCurrent_PrevReadBuffer == readBuffer)
+         return True;
+          
       MakeCurrent_PrevContext = ctx;
       MakeCurrent_PrevDrawable = draw;
       MakeCurrent_PrevReadable = read;
diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c
index 9a20bdfb69..5a424d0ac7 100644
--- a/src/gallium/winsys/xlib/xm_winsys.c
+++ b/src/gallium/winsys/xlib/xm_winsys.c
@@ -84,6 +84,7 @@ struct xmesa_surface
    struct pipe_surface surface;
 
    int tileSize;
+   boolean no_swap;
 };
 
 
@@ -252,6 +253,9 @@ xmesa_display_surface(XMesaBuffer b, const struct pipe_surface *surf)
    const struct xmesa_surface *xm_surf
       = xmesa_surface((struct pipe_surface *) surf);
 
+   if (xm_surf->no_swap)
+      return;
+
    if (xm_surf->tileSize) {
       xmesa_display_surface_tiled(b, surf);
       return;
@@ -529,6 +533,13 @@ static struct pipe_surface *
 xm_surface_alloc(struct pipe_winsys *ws)
 {
    struct xmesa_surface *xms = CALLOC_STRUCT(xmesa_surface);
+   static boolean no_swap = 0;
+   static boolean firsttime = 1;
+
+   if (firsttime) {
+      no_swap = getenv("SP_NO_RAST") != NULL;
+      firsttime = 0;
+   }
 
    assert(ws);
 
@@ -540,7 +551,9 @@ xm_surface_alloc(struct pipe_winsys *ws)
       xms->tileSize = 32; /** probably temporary */
    }
 #endif
-
+   
+   xms->no_swap = no_swap;
+   
    return &xms->surface;
 }
author	Ben Skeggs <skeggsb@gmail.com>	2008-04-23 12:39:38 +1000
committer	Ben Skeggs <skeggsb@gmail.com>	2008-04-23 12:39:38 +1000
commit	104ff59585ad1888c8cef5ad9de0e2fdb3f48c21 (patch)
tree	9128984eef4a90cc6177d336759ce795b835d71f /src/gallium
parent	b20acef90695d6e5975f538b6e9cb812b05f0cf6 (diff)
parent	6fc530ccda2971a5d99a955ad90ae9762238040f (diff)