40 files changed, 872 insertions, 325 deletions
diff --git a/src/gallium/drivers/svga/svga_cmd.c b/src/gallium/drivers/svga/svga_cmd.c
index 05eab8a517..1ed1d5d25b 100644
--- a/src/gallium/drivers/svga/svga_cmd.c
+++ b/src/gallium/drivers/svga/svga_cmd.c
@@ -422,7 +422,8 @@ SVGA3D_SurfaceDMA(struct svga_winsys_context *swc,
                   struct svga_transfer *st,         // IN
                   SVGA3dTransferType transfer,      // IN
                   const SVGA3dCopyBox *boxes,       // IN
-                  uint32 numBoxes)                  // IN
+                  uint32 numBoxes,                  // IN
+                  SVGA3dSurfaceDMAFlags flags)      // IN
 {
    struct svga_texture *texture = svga_texture(st->base.resource); 
    SVGA3dCmdSurfaceDMA *cmd;
@@ -465,7 +466,7 @@ SVGA3D_SurfaceDMA(struct svga_winsys_context *swc,
    pSuffix = (SVGA3dCmdSurfaceDMASuffix *)((uint8_t*)cmd + sizeof *cmd + boxesSize);
    pSuffix->suffixSize = sizeof *pSuffix;
    pSuffix->maximumOffset = st->hw_nblocksy*st->base.stride;
-   memset(&pSuffix->flags, 0, sizeof pSuffix->flags);
+   pSuffix->flags = flags;
 
    swc->commit(swc);
 
diff --git a/src/gallium/drivers/svga/svga_cmd.h b/src/gallium/drivers/svga/svga_cmd.h
index 0e568d78e6..223ab17df8 100644
--- a/src/gallium/drivers/svga/svga_cmd.h
+++ b/src/gallium/drivers/svga/svga_cmd.h
@@ -102,7 +102,8 @@ SVGA3D_SurfaceDMA(struct svga_winsys_context *swc,
                   struct svga_transfer *st,
                   SVGA3dTransferType transfer,
                   const SVGA3dCopyBox *boxes,
-                  uint32 numBoxes);
+                  uint32 numBoxes,
+                  SVGA3dSurfaceDMAFlags flags);
 
 enum pipe_error
 SVGA3D_BufferDMA(struct svga_winsys_context *swc,
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
index 1e513f1039..4782b4bf70 100644
--- a/src/gallium/drivers/svga/svga_context.c
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -34,6 +34,7 @@
 
 #include "svga_context.h"
 #include "svga_screen.h"
+#include "svga_surface.h"
 #include "svga_resource_texture.h"
 #include "svga_resource_buffer.h"
 #include "svga_resource.h"
@@ -43,6 +44,12 @@
 #include "svga_debug.h"
 #include "svga_state.h"
 
+DEBUG_GET_ONCE_BOOL_OPTION(no_swtnl, "SVGA_NO_SWTNL", FALSE)
+DEBUG_GET_ONCE_BOOL_OPTION(force_swtnl, "SVGA_FORCE_SWTNL", FALSE)
+DEBUG_GET_ONCE_BOOL_OPTION(use_min_mipmap, "SVGA_USE_MIN_MIPMAP", FALSE)
+DEBUG_GET_ONCE_NUM_OPTION(disable_shader, "SVGA_DISABLE_SHADER", ~0)
+DEBUG_GET_ONCE_BOOL_OPTION(no_line_width, "SVGA_NO_LINE_WIDTH", FALSE)
+DEBUG_GET_ONCE_BOOL_OPTION(force_hw_line_stipple, "SVGA_FORCE_HW_LINE_STIPPLE", FALSE)
 
 static void svga_destroy( struct pipe_context *pipe )
 {
@@ -113,13 +120,12 @@ struct pipe_context *svga_context_create( struct pipe_screen *screen,
 
 
    /* debug */
-   svga->debug.no_swtnl = debug_get_bool_option("SVGA_NO_SWTNL", FALSE);
-   svga->debug.force_swtnl = debug_get_bool_option("SVGA_FORCE_SWTNL", FALSE);
-   svga->debug.use_min_mipmap = debug_get_bool_option("SVGA_USE_MIN_MIPMAP", FALSE);
-   svga->debug.disable_shader = debug_get_num_option("SVGA_DISABLE_SHADER", ~0);
-
-   if (!svga_init_swtnl(svga))
-      goto no_swtnl;
+   svga->debug.no_swtnl = debug_get_option_no_swtnl();
+   svga->debug.force_swtnl = debug_get_option_force_swtnl();
+   svga->debug.use_min_mipmap = debug_get_option_use_min_mipmap();
+   svga->debug.disable_shader = debug_get_option_disable_shader();
+   svga->debug.no_line_width = debug_get_option_no_line_width();
+   svga->debug.force_hw_line_stipple = debug_get_option_force_hw_line_stipple();
 
    svga->fs_bm = util_bitmask_create();
    if (svga->fs_bm == NULL)
@@ -149,6 +155,8 @@ struct pipe_context *svga_context_create( struct pipe_screen *screen,
    if (svga->hwtnl == NULL)
       goto no_hwtnl;
 
+   if (!svga_init_swtnl(svga))
+      goto no_swtnl;
 
    ret = svga_emit_initial_state( svga );
    if (ret)
@@ -171,6 +179,8 @@ struct pipe_context *svga_context_create( struct pipe_screen *screen,
    return &svga->pipe;
 
 no_state:
+   svga_destroy_swtnl(svga);
+no_swtnl:
    svga_hwtnl_destroy( svga->hwtnl );
 no_hwtnl:
    u_upload_destroy( svga->upload_vb );
@@ -181,8 +191,6 @@ no_upload_ib:
 no_vs_bm:
    util_bitmask_destroy( svga->fs_bm );
 no_fs_bm:
-   svga_destroy_swtnl(svga);
-no_swtnl:
    svga->swc->destroy(svga->swc);
 no_swc:
    FREE(svga);
@@ -196,14 +204,10 @@ void svga_context_flush( struct svga_context *svga,
 {
    struct svga_screen *svgascreen = svga_screen(svga->pipe.screen);
    struct pipe_fence_handle *fence = NULL;
+   enum pipe_error ret;
 
    svga->curr.nr_fbs = 0;
 
-   /* Unmap upload manager buffers: 
-    */
-   u_upload_flush(svga->upload_vb);
-   u_upload_flush(svga->upload_ib);
-
    /* Ensure that texture dma uploads are processed
     * before submitting commands.
     */
@@ -220,9 +224,25 @@ void svga_context_flush( struct svga_context *svga,
     */
    svga->dirty |= SVGA_NEW_COMMAND_BUFFER;
 
+   /*
+    * We must reemit the surface bindings here, because svga_update_state
+    * will always flush the primitives before processing the
+    * SVGA_NEW_COMMAND_BUFFER state change.
+    *
+    * TODO: Refactor this.
+    */
+   ret = svga_reemit_framebuffer_bindings(svga);
+   assert(ret == PIPE_OK);
+
+   ret = svga_reemit_tss_bindings(svga);
+   assert(ret == PIPE_OK);
+
+   svga->dirty &= ~SVGA_NEW_COMMAND_BUFFER;
+
    if (SVGA_DEBUG & DEBUG_SYNC) {
       if (fence)
-         svga->pipe.screen->fence_finish( svga->pipe.screen, fence, 0);
+         svga->pipe.screen->fence_finish( svga->pipe.screen, fence,
+                                          PIPE_TIMEOUT_INFINITE);
    }
 
    if(pfence)
@@ -245,6 +265,30 @@ void svga_hwtnl_flush_retry( struct svga_context *svga )
    assert(ret == 0);
 }
 
+
+/* Emit all operations pending on host surfaces: flush the buffered draws,
+ * then propagate every bound color buffer and the depth/stencil buffer. */
+void svga_surfaces_flush(struct svga_context *svga)
+{
+   unsigned i;
+
+   /* Emit buffered drawing commands.
+    */
+   svga_hwtnl_flush_retry( svga );
+
+   /* Emit back-copy from render target view to texture.
+    */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      if (svga->curr.framebuffer.cbufs[i])
+         svga_propagate_surface(svga, svga->curr.framebuffer.cbufs[i]);
+   }
+   /* Likewise for the depth/stencil buffer, when one is bound. */
+   if (svga->curr.framebuffer.zsbuf)
+      svga_propagate_surface(svga, svga->curr.framebuffer.zsbuf);
+
+}
+
+
 struct svga_winsys_context *
 svga_winsys_context( struct pipe_context *pipe )
 {
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index 04e281a506..7b36a3606e 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -35,6 +35,8 @@
 
 #include "tgsi/tgsi_scan.h"
 
+#include "svga_state.h"
+
 
 #define SVGA_TEX_UNITS 8
 #define SVGA_MAX_POINTSIZE 80.0
@@ -147,7 +149,14 @@ struct svga_rasterizer_state {
    float pointsize;
    
    unsigned hw_unfilled:16;         /* PIPE_POLYGON_MODE_x */
-   unsigned need_pipeline:16;    /* which prims do we need help for? */
+
+   /** Which prims do we need help for?  Bitmask of (1 << PIPE_PRIM_x) flags */
+   unsigned need_pipeline:16;
+
+   /** For debugging: */
+   const char* need_pipeline_tris_str;
+   const char* need_pipeline_lines_str;
+   const char* need_pipeline_points_str;
 };
 
 struct svga_sampler_state {
@@ -237,7 +246,7 @@ struct svga_prescale {
 };
 
 
-/* Updated by calling svga_update_state( SVGA_STATE_HW_VIEWPORT )
+/* Updated by calling svga_update_state( SVGA_STATE_HW_CLEAR )
  */
 struct svga_hw_clear_state
 {
@@ -288,6 +297,11 @@ struct svga_sw_state
    boolean need_swvfetch;
    boolean need_pipeline;
    boolean need_swtnl;
+
+   /* Set for the duration of a swtnl draw call so that any state update
+    * made inside that call keeps the need-software flags turned on.
+    */
+   boolean in_swtnl_draw;
 };
 
 
@@ -312,6 +326,9 @@ struct svga_context
       unsigned shader_id;
 
       unsigned disable_shader;
+
+      boolean no_line_width;
+      boolean force_hw_line_stipple;
    } debug;
 
    struct {
@@ -327,7 +344,7 @@ struct svga_context
    struct util_bitmask *vs_bm;
 
    struct {
-      unsigned dirty[4];
+      unsigned dirty[SVGA_STATE_MAX];
 
       unsigned texture_timestamp;
 
@@ -350,6 +367,9 @@ struct svga_context
 
    /** List of buffers with queued transfers */
    struct list_head dirty_buffers;
+
+   /** Was the previous draw done with the SW path? */
+   boolean prev_draw_swtnl;
 };
 
 /* A flag for each state_tracker state object:
@@ -433,6 +453,8 @@ void svga_context_flush( struct svga_context *svga,
 
 void svga_hwtnl_flush_retry( struct svga_context *svga );
 
+void svga_surfaces_flush(struct svga_context *svga);
+
 struct pipe_context *
 svga_context_create(struct pipe_screen *screen,
 		    void *priv);
diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
index 81dd4778d0..2c873a0f7a 100644
--- a/src/gallium/drivers/svga/svga_draw.c
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -28,6 +28,7 @@
 #include "pipe/p_defines.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
+#include "util/u_upload_mgr.h"
 
 #include "svga_context.h"
 #include "svga_draw.h"
@@ -143,6 +144,9 @@ svga_hwtnl_flush( struct svga_hwtnl *hwtnl )
       SVGA3dPrimitiveRange *prim;
       unsigned i;
 
+      /* Unmap upload manager vertex buffers */
+      u_upload_flush(svga->upload_vb);
+
       for (i = 0; i < hwtnl->cmd.vdecl_count; i++) {
          handle = svga_buffer_handle(svga, hwtnl->cmd.vdecl_vb[i]);
          if (handle == NULL)
@@ -151,6 +155,9 @@ svga_hwtnl_flush( struct svga_hwtnl *hwtnl )
          vb_handle[i] = handle;
       }
 
+      /* Unmap upload manager index buffers */
+      u_upload_flush(svga->upload_ib);
+
       for (i = 0; i < hwtnl->cmd.prim_count; i++) {
          if (hwtnl->cmd.prim_ib[i]) {
             handle = svga_buffer_handle(svga, hwtnl->cmd.prim_ib[i]);
@@ -315,7 +322,6 @@ enum pipe_error svga_hwtnl_prim( struct svga_hwtnl *hwtnl,
             break;
          }
 
-         assert(!stride || width <= stride);
          if (max_index != ~0) {
             assert(offset + (index_bias + max_index) * stride + width <= size);
          }
diff --git a/src/gallium/drivers/svga/svga_draw_arrays.c b/src/gallium/drivers/svga/svga_draw_arrays.c
index da33fae62f..a6518042eb 100644
--- a/src/gallium/drivers/svga/svga_draw_arrays.c
+++ b/src/gallium/drivers/svga/svga_draw_arrays.c
@@ -53,6 +53,7 @@ static enum pipe_error generate_indices( struct svga_hwtnl *hwtnl,
 
    dst = pipe_buffer_create( pipe->screen, 
 			     PIPE_BIND_INDEX_BUFFER, 
+			     PIPE_USAGE_STATIC,
 			     size );
    if (dst == NULL)
       goto fail;
@@ -65,14 +66,14 @@ static enum pipe_error generate_indices( struct svga_hwtnl *hwtnl,
    generate( nr,
              dst_map );
 
-   pipe_buffer_unmap( pipe, dst, transfer );
+   pipe_buffer_unmap( pipe, transfer );
 
    *out_buf = dst;
    return PIPE_OK;
 
 fail:
    if (dst_map)
-      pipe_buffer_unmap( pipe, dst, transfer );
+      pipe_buffer_unmap( pipe, transfer );
 
    if (dst)
       pipe->screen->resource_destroy( pipe->screen, dst );
diff --git a/src/gallium/drivers/svga/svga_draw_elements.c b/src/gallium/drivers/svga/svga_draw_elements.c
index c4579177b7..7d420c6b29 100644
--- a/src/gallium/drivers/svga/svga_draw_elements.c
+++ b/src/gallium/drivers/svga/svga_draw_elements.c
@@ -56,6 +56,7 @@ translate_indices( struct svga_hwtnl *hwtnl,
 
    dst = pipe_buffer_create( pipe->screen, 
 			     PIPE_BIND_INDEX_BUFFER, 
+			     PIPE_USAGE_STATIC,
 			     size );
    if (dst == NULL)
       goto fail;
@@ -72,18 +73,18 @@ translate_indices( struct svga_hwtnl *hwtnl,
               nr,
               dst_map );
 
-   pipe_buffer_unmap( pipe, src, src_transfer );
-   pipe_buffer_unmap( pipe, dst, dst_transfer );
+   pipe_buffer_unmap( pipe, src_transfer );
+   pipe_buffer_unmap( pipe, dst_transfer );
 
    *out_buf = dst;
    return PIPE_OK;
 
 fail:
    if (src_map)
-      pipe_buffer_unmap( pipe, src, src_transfer );
+      pipe_buffer_unmap( pipe, src_transfer );
 
    if (dst_map)
-      pipe_buffer_unmap( pipe, dst, dst_transfer );
+      pipe_buffer_unmap( pipe, dst_transfer );
 
    if (dst)
       pipe->screen->resource_destroy( pipe->screen, dst );
@@ -120,14 +121,17 @@ svga_hwtnl_simple_draw_range_elements( struct svga_hwtnl *hwtnl,
    if (index_buffer && 
        svga_buffer_is_user_buffer(index_buffer)) 
    {
+      boolean flushed;
       assert( index_buffer->width0 >= index_offset + count * index_size );
 
       ret = u_upload_buffer( hwtnl->upload_ib,
+                             0,
                              index_offset,
                              count * index_size,
                              index_buffer,
                              &index_offset,
-                             &upload_buffer );
+                             &upload_buffer,
+                             &flushed );
       if (ret)
          goto done;
 
diff --git a/src/gallium/drivers/svga/svga_pipe_blit.c b/src/gallium/drivers/svga/svga_pipe_blit.c
index 426698806c..c87afb6946 100644
--- a/src/gallium/drivers/svga/svga_pipe_blit.c
+++ b/src/gallium/drivers/svga/svga_pipe_blit.c
@@ -50,7 +50,9 @@ static void svga_surface_copy(struct pipe_context *pipe,
    struct pipe_surface *srcsurf, *dstsurf;*/
    unsigned dst_face, dst_z, src_face, src_z;
 
-   svga_hwtnl_flush_retry( svga );
+   /* Emit buffered drawing commands, and any back copies.
+    */
+   svga_surfaces_flush( svga );
 
 #if 0
    srcsurf = screen->get_tex_surface(screen, src_tex,
diff --git a/src/gallium/drivers/svga/svga_pipe_draw.c b/src/gallium/drivers/svga/svga_pipe_draw.c
index 001ec3616c..fda5c28433 100644
--- a/src/gallium/drivers/svga/svga_pipe_draw.c
+++ b/src/gallium/drivers/svga/svga_pipe_draw.c
@@ -157,6 +157,14 @@ svga_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    if (!u_trim_pipe_prim( info->mode, &count ))
       return;
 
+   if (svga->state.sw.need_swtnl != svga->prev_draw_swtnl) {
+      /* We're switching between SW and HW drawing.  Do a flush to avoid
+       * mixing HW and SW rendering with the same vertex buffer.
+       */
+      pipe->flush(pipe, NULL);
+      svga->prev_draw_swtnl = svga->state.sw.need_swtnl;
+   }
+
    /*
     * Mark currently bound target surfaces as dirty
     * doesn't really matter if it is done before drawing.
diff --git a/src/gallium/drivers/svga/svga_pipe_flush.c b/src/gallium/drivers/svga/svga_pipe_flush.c
index ab243aa6ec..4578c136cb 100644
--- a/src/gallium/drivers/svga/svga_pipe_flush.c
+++ b/src/gallium/drivers/svga/svga_pipe_flush.c
@@ -24,6 +24,7 @@
  **********************************************************/
 
 #include "pipe/p_defines.h"
+#include "util/u_string.h"
 #include "svga_screen.h"
 #include "svga_surface.h"
 #include "svga_context.h"
@@ -31,31 +32,40 @@
 
 
 static void svga_flush( struct pipe_context *pipe,
-                        unsigned flags,
                         struct pipe_fence_handle **fence )
 {
    struct svga_context *svga = svga_context(pipe);
-   int i;
 
-   /* Emit buffered drawing commands.
+   /* Emit buffered drawing commands, and any back copies.
     */
-   svga_hwtnl_flush_retry( svga );
-
-   /* Emit back-copy from render target view to texture.
-    */
-   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
-      if (svga->curr.framebuffer.cbufs[i])
-         svga_propagate_surface(pipe, svga->curr.framebuffer.cbufs[i]);
-   }
-   if (svga->curr.framebuffer.zsbuf)
-      svga_propagate_surface(pipe, svga->curr.framebuffer.zsbuf);
+   svga_surfaces_flush( svga );
 
    /* Flush command queue.
     */
    svga_context_flush(svga, fence);
 
-   SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "%s flags %x fence_ptr %p\n",
-            __FUNCTION__, flags, fence ? *fence : 0x0);
+   SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "%s fence_ptr %p\n",
+            __FUNCTION__, fence ? *fence : 0x0);
+
+   /* Enable to dump BMPs of the color/depth buffers each frame */
+   if (0) {
+      struct pipe_framebuffer_state *fb = &svga->curr.framebuffer;
+      static unsigned frame_no = 1;
+      char filename[256];
+      unsigned i;
+
+      for (i = 0; i < fb->nr_cbufs; i++) {
+         util_snprintf(filename, sizeof(filename), "cbuf%u_%04u", i, frame_no);
+         debug_dump_surface_bmp(&svga->pipe, filename, fb->cbufs[i]);
+      }
+
+      if (0 && fb->zsbuf) {
+         util_snprintf(filename, sizeof(filename), "zsbuf_%04u", frame_no);
+         debug_dump_surface_bmp(&svga->pipe, filename, fb->zsbuf);
+      }
+
+      ++frame_no;
+   }
 }
 
 
diff --git a/src/gallium/drivers/svga/svga_pipe_misc.c b/src/gallium/drivers/svga/svga_pipe_misc.c
index 8c24fb302f..440919c626 100644
--- a/src/gallium/drivers/svga/svga_pipe_misc.c
+++ b/src/gallium/drivers/svga/svga_pipe_misc.c
@@ -94,7 +94,7 @@ static void svga_set_framebuffer_state(struct pipe_context *pipe,
    
       for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
          if (dst->cbufs[i] && dst->cbufs[i] != fb->cbufs[i])
-            svga_propagate_surface(pipe, dst->cbufs[i]);
+            svga_propagate_surface(svga, dst->cbufs[i]);
    }
 
    /* XXX: Actually the virtual hardware may support rendertargets with
diff --git a/src/gallium/drivers/svga/svga_pipe_rasterizer.c b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
index 660eb0757a..4a1a37f176 100644
--- a/src/gallium/drivers/svga/svga_pipe_rasterizer.c
+++ b/src/gallium/drivers/svga/svga_pipe_rasterizer.c
@@ -64,18 +64,19 @@ static void *
 svga_create_rasterizer_state(struct pipe_context *pipe,
                              const struct pipe_rasterizer_state *templ)
 {
+   struct svga_context *svga = svga_context(pipe);
    struct svga_rasterizer_state *rast = CALLOC_STRUCT( svga_rasterizer_state );
+
    /* need this for draw module. */
    rast->templ = *templ;
 
-   /* light_twoside          - XXX: need fragment shader varient */
+   /* light_twoside          - XXX: need fragment shader variant */
    /* poly_smooth            - XXX: no fallback available */
    /* poly_stipple_enable    - draw module */
    /* sprite_coord_enable    - ? */
    /* point_quad_rasterization - ? */
    /* point_size_per_vertex  - ? */
    /* sprite_coord_mode      - ??? */
-   /* bypass_vs_viewport_and_clip        - handled by viewport setup */
    /* flatshade_first        - handled by index translation */
    /* gl_rasterization_rules - XXX - viewport code */
    /* line_width             - draw module */
@@ -93,17 +94,22 @@ svga_create_rasterizer_state(struct pipe_context *pipe,
 
    /* Use swtnl + decomposition implement these:
     */
-   if (templ->poly_stipple_enable)
+   if (templ->poly_stipple_enable) {
       rast->need_pipeline |= SVGA_PIPELINE_FLAG_TRIS;
+      rast->need_pipeline_tris_str = "poly stipple";
+   }
 
-   if (templ->line_width != 1.0 &&
-       templ->line_width != 0.0)
+   if (templ->line_width >= 1.5f &&
+       !svga->debug.no_line_width) {
       rast->need_pipeline |= SVGA_PIPELINE_FLAG_LINES;
+      rast->need_pipeline_lines_str = "line width";
+   }
 
    if (templ->line_stipple_enable) {
-      /* LinePattern not implemented on all backends. 
+      /* XXX: LinePattern not implemented on all backends, and there is no
+       * mechanism to query it, so use it only when explicitly forced.
        */
-      if (0) {
+      if (svga->debug.force_hw_line_stipple) {
          SVGA3dLinePattern lp;
          lp.repeat = templ->line_stipple_factor + 1;
          lp.pattern = templ->line_stipple_pattern;
@@ -111,11 +117,19 @@ svga_create_rasterizer_state(struct pipe_context *pipe,
       }
       else {
          rast->need_pipeline |= SVGA_PIPELINE_FLAG_LINES;
+         rast->need_pipeline_lines_str = "line stipple";
       }
    } 
 
-   if (templ->point_smooth)
+   if (templ->point_smooth) {
       rast->need_pipeline |= SVGA_PIPELINE_FLAG_POINTS;
+      rast->need_pipeline_points_str = "smooth points";
+   }
+
+   if (templ->line_smooth) {
+      rast->need_pipeline |= SVGA_PIPELINE_FLAG_LINES;
+      rast->need_pipeline_lines_str = "smooth lines";
+   }
 
    {
       int fill_front = templ->fill_front;
@@ -148,6 +162,7 @@ svga_create_rasterizer_state(struct pipe_context *pipe,
              * front/back fill modes:
              */
             rast->need_pipeline |= SVGA_PIPELINE_FLAG_TRIS;
+            rast->need_pipeline_tris_str = "different front/back fillmodes";
          }
          else {
             offset = offset_front;
@@ -172,6 +187,7 @@ svga_create_rasterizer_state(struct pipe_context *pipe,
       {
          fill = PIPE_POLYGON_MODE_FILL;
          rast->need_pipeline |= SVGA_PIPELINE_FLAG_TRIS;
+         rast->need_pipeline_tris_str = "unfilled primitives with no index manipulation";
       }
 
       /* If we are decomposing to lines, and lines need the pipeline,
@@ -182,6 +198,7 @@ svga_create_rasterizer_state(struct pipe_context *pipe,
       {
          fill = PIPE_POLYGON_MODE_FILL;
          rast->need_pipeline |= SVGA_PIPELINE_FLAG_TRIS;
+         rast->need_pipeline_tris_str = "decomposing lines";
       }
 
       /* Similarly for points:
@@ -191,6 +208,7 @@ svga_create_rasterizer_state(struct pipe_context *pipe,
       {
          fill = PIPE_POLYGON_MODE_FILL;
          rast->need_pipeline |= SVGA_PIPELINE_FLAG_TRIS;
+         rast->need_pipeline_tris_str = "decomposing points";
       }
 
       if (offset) {
@@ -201,9 +219,6 @@ svga_create_rasterizer_state(struct pipe_context *pipe,
       rast->hw_unfilled = fill;
    }
 
-
-
-
    if (rast->need_pipeline & SVGA_PIPELINE_FLAG_TRIS) {
       /* Turn off stuff which will get done in the draw module:
        */
diff --git a/src/gallium/drivers/svga/svga_pipe_sampler.c b/src/gallium/drivers/svga/svga_pipe_sampler.c
index f44a0e1325..446fcc4407 100644
--- a/src/gallium/drivers/svga/svga_pipe_sampler.c
+++ b/src/gallium/drivers/svga/svga_pipe_sampler.c
@@ -144,8 +144,9 @@ svga_create_sampler_state(struct pipe_context *pipe,
    return cso;
 }
 
-static void svga_bind_sampler_states(struct pipe_context *pipe,
-                                     unsigned num, void **sampler)
+static void
+svga_bind_fragment_sampler_states(struct pipe_context *pipe,
+                                  unsigned num, void **sampler)
 {
    struct svga_context *svga = svga_context(pipe);
    unsigned i;
@@ -203,9 +204,10 @@ svga_sampler_view_destroy(struct pipe_context *pipe,
    FREE(view);
 }
 
-static void svga_set_sampler_views(struct pipe_context *pipe,
-                                   unsigned num,
-                                   struct pipe_sampler_view **views)
+static void
+svga_set_fragment_sampler_views(struct pipe_context *pipe,
+                                unsigned num,
+                                struct pipe_sampler_view **views)
 {
    struct svga_context *svga = svga_context(pipe);
    unsigned flag_1d = 0;
@@ -256,9 +258,9 @@ static void svga_set_sampler_views(struct pipe_context *pipe,
 void svga_init_sampler_functions( struct svga_context *svga )
 {
    svga->pipe.create_sampler_state = svga_create_sampler_state;
-   svga->pipe.bind_fragment_sampler_states = svga_bind_sampler_states;
+   svga->pipe.bind_fragment_sampler_states = svga_bind_fragment_sampler_states;
    svga->pipe.delete_sampler_state = svga_delete_sampler_state;
-   svga->pipe.set_fragment_sampler_views = svga_set_sampler_views;
+   svga->pipe.set_fragment_sampler_views = svga_set_fragment_sampler_views;
    svga->pipe.create_sampler_view = svga_create_sampler_view;
    svga->pipe.sampler_view_destroy = svga_sampler_view_destroy;
 }
diff --git a/src/gallium/drivers/svga/svga_pipe_vertex.c b/src/gallium/drivers/svga/svga_pipe_vertex.c
index 86c79459f3..5846991073 100644
--- a/src/gallium/drivers/svga/svga_pipe_vertex.c
+++ b/src/gallium/drivers/svga/svga_pipe_vertex.c
@@ -27,6 +27,7 @@
 #include "pipe/p_defines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_transfer.h"
 #include "tgsi/tgsi_parse.h"
 
 #include "svga_screen.h"
diff --git a/src/gallium/drivers/svga/svga_resource.c b/src/gallium/drivers/svga/svga_resource.c
index ef2a0c40f0..6e0622a312 100644
--- a/src/gallium/drivers/svga/svga_resource.c
+++ b/src/gallium/drivers/svga/svga_resource.c
@@ -33,13 +33,13 @@ svga_resource_from_handle(struct pipe_screen * screen,
 void
 svga_init_resource_functions(struct svga_context *svga)
 {
-   svga->pipe.is_resource_referenced = u_is_resource_referenced_vtbl;
    svga->pipe.get_transfer = u_get_transfer_vtbl;
    svga->pipe.transfer_map = u_transfer_map_vtbl;
    svga->pipe.transfer_flush_region = u_transfer_flush_region_vtbl;
    svga->pipe.transfer_unmap = u_transfer_unmap_vtbl;
    svga->pipe.transfer_destroy = u_transfer_destroy_vtbl;
    svga->pipe.transfer_inline_write = u_transfer_inline_write_vtbl;
+   svga->pipe.redefine_user_buffer = svga_redefine_user_buffer;
 }
 
 void
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c
index f12e2b6862..2d7c524d86 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer.c
@@ -51,53 +51,104 @@ svga_buffer_needs_hw_storage(unsigned usage)
 }
 
 
-static unsigned int
-svga_buffer_is_referenced( struct pipe_context *pipe,
-                           struct pipe_resource *buf,
-                           unsigned level, int layer)
+/**
+ * Map a range of a buffer.
+ *
+ * Unlike texture DMAs (which are written immediately to the command buffer and
+ * therefore inherently serialized with other context operations), for buffers
+ * we try to coalesce multiple range mappings (i.e., multiple calls to this
+ * function) into a single DMA command, for better efficiency in command
+ * processing.  This means we need to exercise extra care here to ensure that
+ * the end result is exactly the same as if one DMA was used for every mapped
+ * range.
+ */
+static void *
+svga_buffer_map_range( struct pipe_context *pipe,
+                       struct pipe_resource *buf,
+                       unsigned offset,
+		       unsigned length,
+                       unsigned usage )
 {
+   struct svga_context *svga = svga_context(pipe);
    struct svga_screen *ss = svga_screen(pipe->screen);
-   struct svga_buffer *sbuf = svga_buffer(buf);
-
-   /**
-    * XXX: Check this.
-    * The screen may cache buffer writes, but when we map, we map out
-    * of those cached writes, so we don't need to set a
-    * PIPE_REFERENCED_FOR_WRITE flag for cached buffers.
-    */
-
-   if (!sbuf->handle || ss->sws->surface_is_flushed(ss->sws, sbuf->handle))
-     return PIPE_UNREFERENCED;
-
-   /**
-    * sws->surface_is_flushed() does not distinguish between read references
-    * and write references. So assume a reference is both,
-    * however, we make an exception for index- and vertex buffers, to avoid
-    * a flush in st_bufferobj_get_subdata, during display list replay.
-    */
+   struct svga_winsys_screen *sws = ss->sws;
+   struct svga_buffer *sbuf = svga_buffer( buf );
+   void *map;
 
-   if (sbuf->b.b.bind & (PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER))
-      return PIPE_REFERENCED_FOR_READ;
+   if (usage & PIPE_TRANSFER_WRITE) {
+      if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) {
+         /*
+          * Finish writing any pending DMA commands, and tell the host to discard
+          * the buffer contents on the next DMA operation.
+          */
 
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
+         if (sbuf->dma.pending) {
+            svga_buffer_upload_flush(svga, sbuf);
 
+            /*
+             * Instead of flushing the context command buffer, simply discard
+             * the current hwbuf, and start a new one.
+             */
 
+            svga_buffer_destroy_hw_storage(ss, sbuf);
+         }
 
+         sbuf->map.num_ranges = 0;
+         sbuf->dma.flags.discard = TRUE;
+      }
 
+      if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) {
+         if (!sbuf->map.num_ranges) {
+            /*
+             * No pending ranges to upload so far, so we can tell the host to
+             * not synchronize on the next DMA command.
+             */
 
+            sbuf->dma.flags.unsynchronized = TRUE;
+         }
+      } else {
+         /*
+          * Synchronizing, so finish writing any pending DMA command, and
+          * ensure the next DMA will be done in order.
+          */
 
-static void *
-svga_buffer_map_range( struct pipe_screen *screen,
-                       struct pipe_resource *buf,
-                       unsigned offset,
-		       unsigned length,
-                       unsigned usage )
-{
-   struct svga_screen *ss = svga_screen(screen); 
-   struct svga_winsys_screen *sws = ss->sws;
-   struct svga_buffer *sbuf = svga_buffer( buf );
-   void *map;
+         if (sbuf->dma.pending) {
+            svga_buffer_upload_flush(svga, sbuf);
+
+            if (sbuf->hwbuf) {
+               /*
+                * We have a pending DMA upload from a hardware buffer, therefore
+                * we need to ensure that the host finishes processing that DMA
+                * command before the state tracker can start overwriting the
+                * hardware buffer.
+                *
+                * XXX: This could be avoided by tying the hardware buffer to
+                * the transfer (just as done with textures), which would allow
+                * overlapping DMA commands to be queued on the same context
+                * buffer. However, due to the likelihood of software vertex
+                * processing, it is more convenient to hold on to the hardware
+                * buffer, giving the CPU quick access to its contents
+                * without having to do a DMA download from the host.
+                */
+
+               if (usage & PIPE_TRANSFER_DONTBLOCK) {
+                  /*
+                   * Flushing the command buffer here will most likely cause
+                   * the map of the hwbuf below to block, so preemptively
+                   * return NULL here if DONTBLOCK is set to prevent unnecessary
+                   * command buffer flushes.
+                   */
+
+                  return NULL;
+               }
+
+               svga_context_flush(svga, NULL);
+            }
+         }
+
+         sbuf->dma.flags.unsynchronized = FALSE;
+      }
+   }
 
    if (!sbuf->swbuf && !sbuf->hwbuf) {
       if (svga_buffer_create_hw_storage(ss, sbuf) != PIPE_OK) {
@@ -105,9 +156,12 @@ svga_buffer_map_range( struct pipe_screen *screen,
           * We can't create a hardware buffer big enough, so create a malloc
           * buffer instead.
           */
-         debug_printf("%s: failed to allocate %u KB of DMA, splitting DMA transfers\n",
-                      __FUNCTION__,
-                      (sbuf->b.b.width0 + 1023)/1024);
+         if (0) {
+            debug_printf("%s: failed to allocate %u KB of DMA, "
+                         "splitting DMA transfers\n",
+                         __FUNCTION__,
+                         (sbuf->b.b.width0 + 1023)/1024);
+         }
 
          sbuf->swbuf = align_malloc(sbuf->b.b.width0, 16);
       }
@@ -141,12 +195,12 @@ svga_buffer_map_range( struct pipe_screen *screen,
 
 
 static void 
-svga_buffer_flush_mapped_range( struct pipe_screen *screen,
+svga_buffer_flush_mapped_range( struct pipe_context *pipe,
                                 struct pipe_resource *buf,
                                 unsigned offset, unsigned length)
 {
    struct svga_buffer *sbuf = svga_buffer( buf );
-   struct svga_screen *ss = svga_screen(screen);
+   struct svga_screen *ss = svga_screen(pipe->screen);
    
    pipe_mutex_lock(ss->swc_mutex);
    assert(sbuf->map.writing);
@@ -158,10 +212,10 @@ svga_buffer_flush_mapped_range( struct pipe_screen *screen,
 }
 
 static void 
-svga_buffer_unmap( struct pipe_screen *screen,
+svga_buffer_unmap( struct pipe_context *pipe,
                    struct pipe_resource *buf)
 {
-   struct svga_screen *ss = svga_screen(screen); 
+   struct svga_screen *ss = svga_screen(pipe->screen);
    struct svga_winsys_screen *sws = ss->sws;
    struct svga_buffer *sbuf = svga_buffer( buf );
    
@@ -174,11 +228,18 @@ svga_buffer_unmap( struct pipe_screen *screen,
    if(sbuf->hwbuf)
       sws->buffer_unmap(sws, sbuf->hwbuf);
 
-   if(sbuf->map.writing) {
-      if(!sbuf->map.flush_explicit) {
-         /* No mapped range was flushed -- flush the whole buffer */
+   if (sbuf->map.writing) {
+      if (!sbuf->map.flush_explicit) {
+         /*
+          * Mapped range not flushed explicitly, so flush the whole buffer,
+          * and tell the host to discard the contents when processing the DMA
+          * command.
+          */
+
          SVGA_DBG(DEBUG_DMA, "flushing the whole buffer\n");
    
+         sbuf->dma.flags.discard = TRUE;
+
          svga_buffer_add_range(sbuf, 0, sbuf->b.b.width0);
       }
       
@@ -225,7 +286,7 @@ static void *
 svga_buffer_transfer_map( struct pipe_context *pipe,
 			  struct pipe_transfer *transfer )
 {
-   uint8_t *map = svga_buffer_map_range( pipe->screen,
+   uint8_t *map = svga_buffer_map_range( pipe,
 					 transfer->resource,
 					 transfer->box.x,
 					 transfer->box.width,
@@ -248,7 +309,7 @@ static void svga_buffer_transfer_flush_region( struct pipe_context *pipe,
 {
    assert(box->x + box->width <= transfer->box.width);
 
-   svga_buffer_flush_mapped_range(pipe->screen,
+   svga_buffer_flush_mapped_range(pipe,
 				  transfer->resource,
 				  transfer->box.x + box->x,
 				  box->width);
@@ -257,7 +318,7 @@ static void svga_buffer_transfer_flush_region( struct pipe_context *pipe,
 static void svga_buffer_transfer_unmap( struct pipe_context *pipe,
 			    struct pipe_transfer *transfer )
 {
-   svga_buffer_unmap(pipe->screen,
+   svga_buffer_unmap(pipe,
 		     transfer->resource);
 }
 
@@ -271,7 +332,6 @@ struct u_resource_vtbl svga_buffer_vtbl =
 {
    u_default_resource_get_handle,      /* get_handle */
    svga_buffer_destroy,		     /* resource_destroy */
-   svga_buffer_is_referenced,	     /* is_resource_referenced */
    u_default_get_transfer,	     /* get_transfer */
    u_default_transfer_destroy,	     /* transfer_destroy */
    svga_buffer_transfer_map,	     /* transfer_map */
@@ -308,6 +368,9 @@ svga_buffer_create(struct pipe_screen *screen,
          goto error2;
    }
       
+   debug_reference(&sbuf->b.b.reference,
+                   (debug_reference_descriptor)debug_describe_resource, 0);
+
    return &sbuf->b.b; 
 
 error2:
@@ -341,6 +404,9 @@ svga_user_buffer_create(struct pipe_screen *screen,
 
    sbuf->swbuf = ptr;
    sbuf->user = TRUE;
+
+   debug_reference(&sbuf->b.b.reference,
+                   (debug_reference_descriptor)debug_describe_resource, 0);
    
    return &sbuf->b.b; 
 
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.h b/src/gallium/drivers/svga/svga_resource_buffer.h
index d3ec11bfd5..c559f70ec1 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer.h
+++ b/src/gallium/drivers/svga/svga_resource_buffer.h
@@ -243,4 +243,10 @@ svga_winsys_buffer_create(struct svga_context *svga,
                           unsigned usage,
                           unsigned size);
 
+void
+svga_redefine_user_buffer(struct pipe_context *ctx,
+                          struct pipe_resource *resource,
+                          unsigned offset,
+                          unsigned size);
+
 #endif /* SVGA_BUFFER_H */
diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.c b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
index 3de5216a94..0bfa8a14a6 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer_upload.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
@@ -40,6 +40,9 @@
 #include "svga_debug.h"
 
 
+#define MAX_DMA_SIZE (4 * 1024 * 1024)
+
+
 /**
  * Allocate a winsys_buffer (ie. DMA, aka GMR memory).
  *
@@ -57,6 +60,13 @@ svga_winsys_buffer_create( struct svga_context *svga,
    struct svga_winsys_screen *sws = svgascreen->sws;
    struct svga_winsys_buffer *buf;
    
+   /* XXX this shouldn't be a hard-coded number; it should be queried
+    * somehow.
+    */
+   if (size > MAX_DMA_SIZE) {
+      return NULL;
+   }
+
    /* Just try */
    buf = sws->buffer_create(sws, alignment, usage, size);
    if(!buf) {
@@ -242,12 +252,17 @@ svga_buffer_upload_command(struct svga_context *svga,
  * Patch up the upload DMA command reserved by svga_buffer_upload_command
  * with the final ranges.
  */
-static void
+void
 svga_buffer_upload_flush(struct svga_context *svga,
                          struct svga_buffer *sbuf)
 {
    SVGA3dCopyBox *boxes;
    unsigned i;
+   struct pipe_resource *dummy;
+
+   if (!sbuf->dma.pending) {
+      return;
+   }
 
    assert(sbuf->handle); 
    assert(sbuf->hwbuf);
@@ -285,17 +300,18 @@ svga_buffer_upload_flush(struct svga_context *svga,
    sbuf->head.next = sbuf->head.prev = NULL; 
 #endif
    sbuf->dma.pending = FALSE;
+   sbuf->dma.flags.discard = FALSE;
+   sbuf->dma.flags.unsynchronized = FALSE;
 
    sbuf->dma.svga = NULL;
    sbuf->dma.boxes = NULL;
 
-   /* Decrement reference count */
-   pipe_reference(&(sbuf->b.b.reference), NULL);
-   sbuf = NULL;
+   /* Decrement reference count (and potentially destroy) */
+   dummy = &sbuf->b.b;
+   pipe_resource_reference(&dummy, NULL);
 }
 
 
-
 /**
  * Note a dirty range.
  *
@@ -326,12 +342,6 @@ svga_buffer_add_range(struct svga_buffer *sbuf,
 
    /*
     * Try to grow one of the ranges.
-    *
-    * Note that it is not this function task to care about overlapping ranges,
-    * as the GMR was already given so it is too late to do anything. Situations
-    * where overlapping ranges may pose a problem should be detected via
-    * pipe_context::is_resource_referenced and the context that refers to the
-    * buffer should be flushed.
     */
 
    for(i = 0; i < sbuf->map.num_ranges; ++i) {
@@ -346,6 +356,11 @@ svga_buffer_add_range(struct svga_buffer *sbuf,
       if (dist <= 0) {
          /*
           * Ranges are contiguous or overlapping -- extend this one and return.
+          *
+          * Note that it is not this function's task to prevent overlapping
+          * ranges, as the GMR was already given so it is too late to do
+          * anything.  If the ranges overlap here it must surely be because
+          * PIPE_TRANSFER_UNSYNCHRONIZED was set.
           */
 
          sbuf->map.ranges[i].start = MIN2(sbuf->map.ranges[i].start, start);
@@ -369,8 +384,7 @@ svga_buffer_add_range(struct svga_buffer *sbuf,
     * pending DMA upload and start clean.
     */
 
-   if(sbuf->dma.pending)
-      svga_buffer_upload_flush(sbuf->dma.svga, sbuf);
+   svga_buffer_upload_flush(sbuf->dma.svga, sbuf);
 
    assert(!sbuf->dma.pending);
    assert(!sbuf->dma.svga);
@@ -638,3 +652,54 @@ svga_context_flush_buffers(struct svga_context *svga)
       next = curr->next;
    }
 }
+
+
+void
+svga_redefine_user_buffer(struct pipe_context *pipe,
+                          struct pipe_resource *resource,
+                          unsigned offset,
+                          unsigned size)
+{
+   struct svga_screen *ss = svga_screen(pipe->screen);
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_buffer *sbuf = svga_buffer(resource);
+
+   assert(sbuf->user);
+
+   /*
+    * Release any uploaded user buffer.
+    *
+    * TODO: As an optimization, we could try to update the uploaded buffer
+    * instead.
+    */
+
+   pipe_resource_reference(&sbuf->uploaded.buffer, NULL);
+
+   pipe_mutex_lock(ss->swc_mutex);
+
+   if (offset + size > resource->width0) {
+      /*
+       * User buffers shouldn't have DMA directly, unless
+       * SVGA_COMBINE_USERBUFFERS is not set.
+       */
+
+      if (sbuf->dma.pending) {
+         svga_buffer_upload_flush(svga, sbuf);
+      }
+
+      if (sbuf->handle) {
+         svga_buffer_destroy_host_surface(ss, sbuf);
+      }
+
+      if (sbuf->hwbuf) {
+         svga_buffer_destroy_hw_storage(ss, sbuf);
+      }
+
+      sbuf->key.size.width = sbuf->b.b.width0 = offset + size;
+   }
+
+   pipe_mutex_unlock(ss->swc_mutex);
+
+   svga->curr.any_user_vertex_buffers = TRUE;
+   svga->dirty |= SVGA_NEW_VBUFFER | SVGA_NEW_VELEMENT;
+}
diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.h b/src/gallium/drivers/svga/svga_resource_buffer_upload.h
index 11df306526..13d8f3e299 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer_upload.h
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.h
@@ -28,6 +28,10 @@
 
 
 void
+svga_buffer_upload_flush(struct svga_context *svga,
+                         struct svga_buffer *sbuf);
+
+void
 svga_buffer_add_range(struct svga_buffer *sbuf,
                       unsigned start,
                       unsigned end);
diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c
index 7c9e600b9f..b61f85955a 100644
--- a/src/gallium/drivers/svga/svga_resource_texture.c
+++ b/src/gallium/drivers/svga/svga_resource_texture.c
@@ -48,31 +48,6 @@
 #define SVGA3D_SURFACE_HINT_SCANOUT (1 << 9)
 
 
-static unsigned int
-svga_texture_is_referenced( struct pipe_context *pipe,
-                            struct pipe_resource *texture,
-                            unsigned level, int layer)
-{
-   struct svga_texture *tex = svga_texture(texture);
-   struct svga_screen *ss = svga_screen(pipe->screen);
-
-   /**
-    * The screen does not cache texture writes.
-    */
-
-   if (!tex->handle || ss->sws->surface_is_flushed(ss->sws, tex->handle))
-      return PIPE_UNREFERENCED;
-
-   /**
-    * sws->surface_is_flushed() does not distinguish between read references
-    * and write references. So assume a reference is both.
-    */
-
-   return PIPE_REFERENCED_FOR_READ | PIPE_REFERENCED_FOR_WRITE;
-}
-
-
-
 /*
  * Helper function and arrays
  */
@@ -146,16 +121,6 @@ svga_translate_format_render(enum pipe_format format)
    case PIPE_FORMAT_L8_UNORM:
       return svga_translate_format(format);
 
-#if 1
-   /* For on host conversion */
-   case PIPE_FORMAT_DXT1_RGB:
-      return SVGA3D_X8R8G8B8;
-   case PIPE_FORMAT_DXT1_RGBA:
-   case PIPE_FORMAT_DXT3_RGBA:
-   case PIPE_FORMAT_DXT5_RGBA:
-      return SVGA3D_A8R8G8B8;
-#endif
-
    default:
       return SVGA3D_FORMAT_INVALID;
    }
@@ -166,7 +131,8 @@ static INLINE void
 svga_transfer_dma_band(struct svga_context *svga,
                        struct svga_transfer *st,
                        SVGA3dTransferType transfer,
-                       unsigned y, unsigned h, unsigned srcy)
+                       unsigned y, unsigned h, unsigned srcy,
+                       SVGA3dSurfaceDMAFlags flags)
 {
    struct svga_texture *texture = svga_texture(st->base.resource); 
    SVGA3dCopyBox box;
@@ -202,10 +168,10 @@ svga_transfer_dma_band(struct svga_context *svga,
                 util_format_get_blocksize(texture->b.b.format) * 8 /
                 (util_format_get_blockwidth(texture->b.b.format)*util_format_get_blockheight(texture->b.b.format)));
 
-   ret = SVGA3D_SurfaceDMA(svga->swc, st, transfer, &box, 1);
+   ret = SVGA3D_SurfaceDMA(svga->swc, st, transfer, &box, 1, flags);
    if(ret != PIPE_OK) {
-      svga->swc->flush(svga->swc, NULL);
-      ret = SVGA3D_SurfaceDMA(svga->swc, st, transfer, &box, 1);
+      svga_context_flush(svga, NULL);
+      ret = SVGA3D_SurfaceDMA(svga->swc, st, transfer, &box, 1, flags);
       assert(ret == PIPE_OK);
    }
 }
@@ -214,7 +180,8 @@ svga_transfer_dma_band(struct svga_context *svga,
 static INLINE void
 svga_transfer_dma(struct svga_context *svga,
                   struct svga_transfer *st,
-                  SVGA3dTransferType transfer)
+                  SVGA3dTransferType transfer,
+                  SVGA3dSurfaceDMAFlags flags)
 {
    struct svga_texture *texture = svga_texture(st->base.resource); 
    struct svga_screen *screen = svga_screen(texture->b.b.screen);
@@ -225,11 +192,17 @@ svga_transfer_dma(struct svga_context *svga,
       SVGA_DBG(DEBUG_PERF, "%s: readback transfer\n", __FUNCTION__);
    }
 
+   /* Ensure any pending operations on host surfaces are queued on the command
+    * buffer first.
+    */
+   svga_surfaces_flush( svga );
 
    if(!st->swbuf) {
       /* Do the DMA transfer in a single go */
 
-      svga_transfer_dma_band(svga, st, transfer, st->base.box.y, st->base.box.height, 0);
+      svga_transfer_dma_band(svga, st, transfer,
+                             st->base.box.y, st->base.box.height, 0,
+                             flags);
 
       if(transfer == SVGA3D_READ_HOST_VRAM) {
          svga_context_flush(svga, &fence);
@@ -275,7 +248,14 @@ svga_transfer_dma(struct svga_context *svga,
             }
          }
 
-         svga_transfer_dma_band(svga, st, transfer, y, h, srcy);
+         svga_transfer_dma_band(svga, st, transfer, y, h, srcy, flags);
+
+         /*
+          * Prevent the texture contents from being discarded on the next
+          * band upload.
+          */
+
+         flags.discard = FALSE;
 
          if(transfer == SVGA3D_READ_HOST_VRAM) {
             svga_context_flush(svga, &fence);
@@ -390,18 +370,25 @@ svga_texture_get_transfer(struct pipe_context *pipe,
    if(st->hw_nblocksy < nblocksy) {
       /* We couldn't allocate a hardware buffer big enough for the transfer, 
        * so allocate regular malloc memory instead */
-      debug_printf("%s: failed to allocate %u KB of DMA, splitting into %u x %u KB DMA transfers\n",
-                   __FUNCTION__,
-                   (nblocksy*st->base.stride + 1023)/1024,
-                   (nblocksy + st->hw_nblocksy - 1)/st->hw_nblocksy,
-                   (st->hw_nblocksy*st->base.stride + 1023)/1024);
+      if (0) {
+         debug_printf("%s: failed to allocate %u KB of DMA, "
+                      "splitting into %u x %u KB DMA transfers\n",
+                      __FUNCTION__,
+                      (nblocksy*st->base.stride + 1023)/1024,
+                      (nblocksy + st->hw_nblocksy - 1)/st->hw_nblocksy,
+                      (st->hw_nblocksy*st->base.stride + 1023)/1024);
+      }
+
       st->swbuf = MALLOC(nblocksy*st->base.stride);
       if(!st->swbuf)
          goto no_swbuf;
    }
 
-   if (usage & PIPE_TRANSFER_READ)
-      svga_transfer_dma(svga, st, SVGA3D_READ_HOST_VRAM);
+   if (usage & PIPE_TRANSFER_READ) {
+      SVGA3dSurfaceDMAFlags flags;
+      memset(&flags, 0, sizeof flags);
+      svga_transfer_dma(svga, st, SVGA3D_READ_HOST_VRAM, flags);
+   }
 
    return &st->base;
 
@@ -460,7 +447,17 @@ svga_texture_transfer_destroy(struct pipe_context *pipe,
    struct svga_transfer *st = svga_transfer(transfer);
 
    if (st->base.usage & PIPE_TRANSFER_WRITE) {
-      svga_transfer_dma(svga, st, SVGA3D_WRITE_HOST_VRAM);
+      SVGA3dSurfaceDMAFlags flags;
+
+      memset(&flags, 0, sizeof flags);
+      if (transfer->usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) {
+         flags.discard = TRUE;
+      }
+      if (transfer->usage & PIPE_TRANSFER_UNSYNCHRONIZED) {
+         flags.unsynchronized = TRUE;
+      }
+
+      svga_transfer_dma(svga, st, SVGA3D_WRITE_HOST_VRAM, flags);
       ss->texture_timestamp++;
       tex->view_age[transfer->level] = ++(tex->age);
       if (transfer->resource->target == PIPE_TEXTURE_CUBE)
@@ -483,7 +480,6 @@ struct u_resource_vtbl svga_texture_vtbl =
 {
    svga_texture_get_handle,	      /* get_handle */
    svga_texture_destroy,	      /* resource_destroy */
-   svga_texture_is_referenced,	      /* is_resource_referenced */
    svga_texture_get_transfer,	      /* get_transfer */
    svga_texture_transfer_destroy,     /* transfer_destroy */
    svga_texture_transfer_map,	      /* transfer_map */
@@ -527,7 +523,8 @@ svga_texture_create(struct pipe_screen *screen,
       tex->key.numFaces = 1;
    }
 
-   tex->key.cachable = 1;
+   /* XXX: Disabled for now */
+   tex->key.cachable = 0;
 
    if (template->bind & PIPE_BIND_SAMPLER_VIEW)
       tex->key.flags |= SVGA3D_SURFACE_HINT_TEXTURE;
@@ -571,6 +568,9 @@ svga_texture_create(struct pipe_screen *screen,
    if (tex->handle)
       SVGA_DBG(DEBUG_DMA, "  --> got sid %p (texture)\n", tex->handle);
 
+   debug_reference(&tex->b.b.reference,
+                   (debug_reference_descriptor)debug_describe_resource, 0);
+
    return &tex->b.b;
 
 error2:
diff --git a/src/gallium/drivers/svga/svga_sampler_view.c b/src/gallium/drivers/svga/svga_sampler_view.c
index 6911f13f77..4f1f4b597e 100644
--- a/src/gallium/drivers/svga/svga_sampler_view.c
+++ b/src/gallium/drivers/svga/svga_sampler_view.c
@@ -32,6 +32,7 @@
 #include "util/u_format.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_string.h"
 
 #include "svga_screen.h"
 #include "svga_context.h"
@@ -41,14 +42,24 @@
 #include "svga_surface.h"
 
 
+void
+svga_debug_describe_sampler_view(char *buf, const struct svga_sampler_view *sv)
+{
+   char res[128];
+   debug_describe_resource(res, sv->texture);
+   util_sprintf(buf, "svga_sampler_view<%s,[%u,%u]>", res, sv->min_lod, sv->max_lod);
+}
+
 struct svga_sampler_view *
 svga_get_tex_sampler_view(struct pipe_context *pipe,
 			  struct pipe_resource *pt,
                           unsigned min_lod, unsigned max_lod)
 {
-   struct svga_screen *ss = svga_screen(pt->screen);
+   struct svga_context *svga = svga_context(pipe);
+   struct svga_screen *ss = svga_screen(pipe->screen);
    struct svga_texture *tex = svga_texture(pt); 
    struct svga_sampler_view *sv = NULL;
+   SVGA3dSurfaceFlags flags = SVGA3D_SURFACE_HINT_TEXTURE;
    SVGA3dSurfaceFormat format = svga_translate_format(pt->format);
    boolean view = TRUE;
 
@@ -68,10 +79,6 @@ svga_get_tex_sampler_view(struct pipe_context *pipe,
       if (min_lod == 0 && max_lod >= pt->last_level)
          view = FALSE;
 
-      if (util_format_is_s3tc(pt->format) && view) {
-         format = svga_translate_format_render(pt->format);
-      }
-
       if (ss->debug.no_sampler_view)
          view = FALSE;
 
@@ -113,6 +120,8 @@ svga_get_tex_sampler_view(struct pipe_context *pipe,
                pt->last_level);
       sv->key.cachable = 0;
       sv->handle = tex->handle;
+      debug_reference(&sv->reference,
+                      (debug_reference_descriptor)svga_debug_describe_sampler_view, 0);
       return sv;
    }
 
@@ -126,7 +135,7 @@ svga_get_tex_sampler_view(struct pipe_context *pipe,
             pt->last_level);
 
    sv->age = tex->age;
-   sv->handle = svga_texture_view_surface(pipe, tex, format,
+   sv->handle = svga_texture_view_surface(svga, tex, flags, format,
                                           min_lod,
                                           max_lod - min_lod + 1,
                                           -1, -1,
@@ -136,6 +145,8 @@ svga_get_tex_sampler_view(struct pipe_context *pipe,
       assert(0);
       sv->key.cachable = 0;
       sv->handle = tex->handle;
+      debug_reference(&sv->reference,
+                      (debug_reference_descriptor)svga_debug_describe_sampler_view, 0);
       return sv;
    }
 
@@ -143,6 +154,9 @@ svga_get_tex_sampler_view(struct pipe_context *pipe,
    svga_sampler_view_reference(&tex->cached_view, sv);
    pipe_mutex_unlock(ss->tex_mutex);
 
+   debug_reference(&sv->reference,
+                   (debug_reference_descriptor)svga_debug_describe_sampler_view, 0);
+
    return sv;
 }
 
diff --git a/src/gallium/drivers/svga/svga_sampler_view.h b/src/gallium/drivers/svga/svga_sampler_view.h
index e64665f2e5..2087c1be85 100644
--- a/src/gallium/drivers/svga/svga_sampler_view.h
+++ b/src/gallium/drivers/svga/svga_sampler_view.h
@@ -83,12 +83,16 @@ svga_validate_sampler_view(struct svga_context *svga, struct svga_sampler_view *
 void
 svga_destroy_sampler_view_priv(struct svga_sampler_view *v);
 
+void
+svga_debug_describe_sampler_view(char *buf, const struct svga_sampler_view *sv);
+
 static INLINE void
 svga_sampler_view_reference(struct svga_sampler_view **ptr, struct svga_sampler_view *v)
 {
    struct svga_sampler_view *old = *ptr;
 
-   if (pipe_reference(&(*ptr)->reference, &v->reference))
+   if (pipe_reference_described(&(*ptr)->reference, &v->reference, 
+                                (debug_reference_descriptor)svga_debug_describe_sampler_view))
       svga_destroy_sampler_view_priv(old);
    *ptr = v;
 }
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 078190342a..6c987abe05 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -35,7 +35,6 @@
 #include "svga_resource_texture.h"
 #include "svga_resource.h"
 #include "svga_debug.h"
-#include "svga_surface.h"
 
 #include "svga3d_shaderdefs.h"
 
@@ -226,13 +225,18 @@ static int svga_get_shader_param(struct pipe_screen *screen, unsigned shader, en
             return svgascreen->use_ps30 ? 32 : 12;
          return result.u;
       case PIPE_SHADER_CAP_MAX_ADDRS:
-         return svgascreen->use_ps30 ? 1 : 0;
+      case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
+	 /* 
+	  * Although PS 3.0 has some addressing abilities it can only represent
+	  * loops that can be statically determined and unrolled. Given we can
+	  * only handle a subset of the cases that the state tracker already
+	  * does it is better to defer loop unrolling to the state tracker.
+	  */
+         return 0;
       case PIPE_SHADER_CAP_MAX_PREDS:
          return svgascreen->use_ps30 ? 1 : 0;
       case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
          return 1;
-      case PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR:
-         return svgascreen->use_ps30 ? 1 : 0;
       case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
       case PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR:
       case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
@@ -338,8 +342,7 @@ svga_is_format_supported( struct pipe_screen *screen,
                           enum pipe_format format,
                           enum pipe_texture_target target,
                           unsigned sample_count,
-                          unsigned tex_usage,
-                          unsigned geom_flags )
+                          unsigned tex_usage)
 {
    struct svga_winsys_screen *sws = svga_screen(screen)->sws;
    SVGA3dDevCapIndex index;
@@ -361,13 +364,6 @@ svga_is_format_supported( struct pipe_screen *screen,
       case PIPE_FORMAT_B5G5R5A1_UNORM:
          return FALSE;
          
-      /* Simulate ability to render into compressed textures */
-      case PIPE_FORMAT_DXT1_RGB:
-      case PIPE_FORMAT_DXT1_RGBA:
-      case PIPE_FORMAT_DXT3_RGBA:
-      case PIPE_FORMAT_DXT5_RGBA:
-         return TRUE;
-
       default:
          break;
       }
@@ -415,27 +411,26 @@ svga_fence_reference(struct pipe_screen *screen,
 }
 
 
-static int
+static boolean
 svga_fence_signalled(struct pipe_screen *screen,
-                     struct pipe_fence_handle *fence,
-                     unsigned flag)
+                     struct pipe_fence_handle *fence)
 {
    struct svga_winsys_screen *sws = svga_screen(screen)->sws;
-   return sws->fence_signalled(sws, fence, flag);
+   return sws->fence_signalled(sws, fence, 0) == 0;
 }
 
 
-static int
+static boolean
 svga_fence_finish(struct pipe_screen *screen,
                   struct pipe_fence_handle *fence,
-                  unsigned flag)
+                  uint64_t timeout)
 {
    struct svga_winsys_screen *sws = svga_screen(screen)->sws;
 
    SVGA_DBG(DEBUG_DMA|DEBUG_PERF, "%s fence_ptr %p\n",
             __FUNCTION__, fence);
 
-   return sws->fence_finish(sws, fence, flag);
+   return sws->fence_finish(sws, fence, 0) == 0;
 }
 
 
@@ -501,6 +496,12 @@ svga_screen_create(struct svga_winsys_screen *sws)
 
    svga_init_screen_resource_functions(svgascreen);
 
+   if (sws->get_hw_version) {
+      svgascreen->hw_version = sws->get_hw_version(sws);
+   } else {
+      svgascreen->hw_version = SVGA3D_HWVERSION_WS65_B1;
+   }
+
    svgascreen->use_ps30 =
       sws->get_cap(sws, SVGA3D_DEVCAP_FRAGMENT_SHADER_VERSION, &result) &&
       result.u >= SVGA3DPSVERSION_30 ? TRUE : FALSE;
diff --git a/src/gallium/drivers/svga/svga_screen.h b/src/gallium/drivers/svga/svga_screen.h
index 86ec89d88c..7ef627f928 100644
--- a/src/gallium/drivers/svga/svga_screen.h
+++ b/src/gallium/drivers/svga/svga_screen.h
@@ -49,6 +49,8 @@ struct svga_screen
    struct pipe_screen screen;
    struct svga_winsys_screen *sws;
 
+   SVGA3dHardwareVersion hw_version;
+
    unsigned use_ps30;
    unsigned use_vs30;
    
diff --git a/src/gallium/drivers/svga/svga_state.h b/src/gallium/drivers/svga/svga_state.h
index 22d5a6d552..7f239e7a32 100644
--- a/src/gallium/drivers/svga/svga_state.h
+++ b/src/gallium/drivers/svga/svga_state.h
@@ -92,4 +92,8 @@ void svga_update_state_retry( struct svga_context *svga,
 
 enum pipe_error svga_emit_initial_state( struct svga_context *svga );
 
+enum pipe_error svga_reemit_framebuffer_bindings( struct svga_context *svga );
+
+enum pipe_error svga_reemit_tss_bindings( struct svga_context *svga );
+
 #endif
diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c
index 97c818cd37..6c3275e74c 100644
--- a/src/gallium/drivers/svga/svga_state_constants.c
+++ b/src/gallium/drivers/svga/svga_state_constants.c
@@ -40,9 +40,12 @@
 
 /* Convert from PIPE_SHADER_* to SVGA3D_SHADERTYPE_*
  */
-static int svga_shader_type( int unit )
+static int svga_shader_type( int shader )
 {
-   return unit + 1;
+   assert(PIPE_SHADER_VERTEX + 1 == SVGA3D_SHADERTYPE_VS);
+   assert(PIPE_SHADER_FRAGMENT + 1 == SVGA3D_SHADERTYPE_PS);
+   assert(shader <= PIPE_SHADER_FRAGMENT);
+   return shader + 1;
 }
 
 
@@ -110,7 +113,7 @@ static int emit_consts( struct svga_context *svga,
 
 done:
    if (data)
-      pipe_buffer_unmap(&svga->pipe, svga->curr.cb[unit], transfer);
+      pipe_buffer_unmap(&svga->pipe, transfer);
 
    return ret;
 }
diff --git a/src/gallium/drivers/svga/svga_state_framebuffer.c b/src/gallium/drivers/svga/svga_state_framebuffer.c
index fcbb35e797..cdadb20c17 100644
--- a/src/gallium/drivers/svga/svga_state_framebuffer.c
+++ b/src/gallium/drivers/svga/svga_state_framebuffer.c
@@ -93,6 +93,55 @@ static int emit_framebuffer( struct svga_context *svga,
 }
 
 
+/*
+ * Rebind rendertargets.
+ *
+ * Similar to emit_framebuffer, but without any state checking/update.
+ *
+ * Called at the beginning of every new command buffer to ensure that
+ * non-dirty rendertargets are properly paged-in.
+ */
+enum pipe_error
+svga_reemit_framebuffer_bindings(struct svga_context *svga)
+{
+   struct pipe_framebuffer_state *hw = &svga->state.hw_clear.framebuffer;
+   unsigned i;
+   enum pipe_error ret;
+
+   for (i = 0; i < MIN2(PIPE_MAX_COLOR_BUFS, 8); ++i) {
+      if (hw->cbufs[i]) {
+         ret = SVGA3D_SetRenderTarget(svga->swc, SVGA3D_RT_COLOR0 + i, hw->cbufs[i]);
+         if (ret != PIPE_OK) {
+            return ret;
+         }
+      }
+   }
+
+   if (hw->zsbuf) {
+      ret = SVGA3D_SetRenderTarget(svga->swc, SVGA3D_RT_DEPTH, hw->zsbuf);
+      if (ret != PIPE_OK) {
+         return ret;
+      }
+
+      if (hw->zsbuf &&
+          hw->zsbuf->format == PIPE_FORMAT_S8_USCALED_Z24_UNORM) {
+         ret = SVGA3D_SetRenderTarget(svga->swc, SVGA3D_RT_STENCIL, hw->zsbuf);
+         if (ret != PIPE_OK) {
+            return ret;
+         }
+      }
+      else {
+         ret = SVGA3D_SetRenderTarget(svga->swc, SVGA3D_RT_STENCIL, NULL);
+         if (ret != PIPE_OK) {
+            return ret;
+         }
+      }
+   }
+
+   return PIPE_OK;
+}
+
+
 struct svga_tracked_state svga_hw_framebuffer = 
 {
    "hw framebuffer state",
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
index ad6f294713..9c04adec8e 100644
--- a/src/gallium/drivers/svga/svga_state_fs.c
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -136,7 +136,7 @@ static int make_fs_key( const struct svga_context *svga,
 
    /* The blend workaround for simulating logicop xor behaviour
     * requires that the incoming fragment color be white.  This change
-    * achieves that by creating a varient of the current fragment
+    * achieves that by creating a variant of the current fragment
     * shader that overrides all output colors with 1,1,1,1
     *   
     * This will work for most shaders, including those containing
diff --git a/src/gallium/drivers/svga/svga_state_need_swtnl.c b/src/gallium/drivers/svga/svga_state_need_swtnl.c
index 66fea02a4b..68c0257878 100644
--- a/src/gallium/drivers/svga/svga_state_need_swtnl.c
+++ b/src/gallium/drivers/svga/svga_state_need_swtnl.c
@@ -35,6 +35,11 @@
 /***********************************************************************
  */
 
+
+/**
+ * Given a gallium vertex element format, return the corresponding SVGA3D
+ * format.  Return SVGA3D_DECLTYPE_MAX for unsupported gallium formats.
+ */
 static INLINE SVGA3dDeclType 
 svga_translate_vertex_format(enum pipe_format format)
 {
@@ -80,6 +85,7 @@ static int update_need_swvfetch( struct svga_context *svga,
    for (i = 0; i < svga->curr.velems->count; i++) {
       svga->state.sw.ve_format[i] = svga_translate_vertex_format(svga->curr.velems->velem[i].src_format);
       if (svga->state.sw.ve_format[i] == SVGA3D_DECLTYPE_MAX) {
+         /* Unsupported format - use software fetch */
          need_swvfetch = TRUE;
          break;
       }
@@ -118,6 +124,11 @@ static int update_need_pipeline( struct svga_context *svga,
                  __FUNCTION__,
                  svga->curr.rast->need_pipeline,
                  (1 << svga->curr.reduced_prim) );
+      SVGA_DBG(DEBUG_SWTNL, "%s: rast need_pipeline tris (%s), lines (%s), points (%s)\n",
+                 __FUNCTION__,
+                 svga->curr.rast->need_pipeline_tris_str,
+                 svga->curr.rast->need_pipeline_lines_str,
+                 svga->curr.rast->need_pipeline_points_str);
       need_pipeline = TRUE;
    }
 
@@ -140,6 +151,10 @@ static int update_need_pipeline( struct svga_context *svga,
       svga->dirty |= SVGA_NEW_NEED_PIPELINE;
    }
 
+   /* DEBUG */
+   if (0 && svga->state.sw.need_pipeline)
+      debug_printf("sw.need_pipeline = %d\n", svga->state.sw.need_pipeline);
+
    return 0;
 }
 
@@ -164,20 +179,28 @@ static int update_need_swtnl( struct svga_context *svga,
    boolean need_swtnl;
 
    if (svga->debug.no_swtnl) {
-      svga->state.sw.need_swvfetch = 0;
-      svga->state.sw.need_pipeline = 0;
+      svga->state.sw.need_swvfetch = FALSE;
+      svga->state.sw.need_pipeline = FALSE;
    }
 
    need_swtnl = (svga->state.sw.need_swvfetch ||
                  svga->state.sw.need_pipeline);
 
    if (svga->debug.force_swtnl) {
-      need_swtnl = 1;
+      need_swtnl = TRUE;
    }
 
+   /*
+    * Some state changes the draw module does make us believe we
+    * don't need swtnl. This causes the vdecl code to pick up
+    * the wrong buffers and vertex formats. Try trivial/line-wide.
+    */
+   if (svga->state.sw.in_swtnl_draw)
+      need_swtnl = TRUE;
+
    if (need_swtnl != svga->state.sw.need_swtnl) {
       SVGA_DBG(DEBUG_SWTNL|DEBUG_PERF,
-               "%s need_swvfetch: %s, need_pipeline %s\n",
+               "%s: need_swvfetch %s, need_pipeline %s\n",
                __FUNCTION__,
                svga->state.sw.need_swvfetch ? "true" : "false",
                svga->state.sw.need_pipeline ? "true" : "false");
diff --git a/src/gallium/drivers/svga/svga_state_tss.c b/src/gallium/drivers/svga/svga_state_tss.c
index f8b269a101..c502506b93 100644
--- a/src/gallium/drivers/svga/svga_state_tss.c
+++ b/src/gallium/drivers/svga/svga_state_tss.c
@@ -52,6 +52,16 @@ void svga_cleanup_tss_binding(struct svga_context *svga)
 }
 
 
+struct bind_queue {
+   struct {
+      unsigned unit;
+      struct svga_hw_view_state *view;
+   } bind[PIPE_MAX_SAMPLERS];
+
+   unsigned bind_count;
+};
+
+
 static int
 update_tss_binding(struct svga_context *svga, 
                    unsigned dirty )
@@ -63,15 +73,7 @@ update_tss_binding(struct svga_context *svga,
    unsigned min_lod;
    unsigned max_lod;
 
-
-   struct {
-      struct {
-         unsigned unit;
-         struct svga_hw_view_state *view;
-      } bind[PIPE_MAX_SAMPLERS];
-
-      unsigned bind_count;
-   } queue;
+   struct bind_queue queue;
 
    queue.bind_count = 0;
    
@@ -164,6 +166,64 @@ fail:
 }
 
 
+/*
+ * Rebind textures: re-emit SVGA3D_TS_BIND_TEXTURE for every hw view whose
+ * v pointer is set.  Similar to update_tss_binding, but without any state
+ * checking/update.  Returns PIPE_OK, or the error from
+ * SVGA3D_BeginSetTextureState if that call fails.
+ * Called at the beginning of every new command buffer to ensure that
+ * non-dirty textures are properly paged-in.
+ */
+enum pipe_error
+svga_reemit_tss_bindings(struct svga_context *svga)
+{
+   unsigned i;
+   enum pipe_error ret;
+   struct bind_queue queue;
+
+   queue.bind_count = 0;
+
+   for (i = 0; i < svga->state.hw_draw.num_views; i++) {   /* collect the views in use */
+      struct svga_hw_view_state *view = &svga->state.hw_draw.views[i];
+
+      if (view->v) {
+         queue.bind[queue.bind_count].unit = i;
+         queue.bind[queue.bind_count].view = view;
+         queue.bind_count++;
+      }
+   }
+
+   if (queue.bind_count) {   /* nothing is emitted when no view is in use */
+      SVGA3dTextureState *ts;
+
+      ret = SVGA3D_BeginSetTextureState(svga->swc,
+                                        &ts,
+                                        queue.bind_count);
+      if (ret != PIPE_OK) {
+         return ret;
+      }
+
+      for (i = 0; i < queue.bind_count; i++) {   /* one texture state entry per collected view */
+         struct svga_winsys_surface *handle;
+
+         ts[i].stage = queue.bind[i].unit;
+         ts[i].name = SVGA3D_TS_BIND_TEXTURE;
+
+         assert(queue.bind[i].view->v);
+         handle = queue.bind[i].view->v->handle;
+         svga->swc->surface_relocation(svga->swc,
+                                       &ts[i].value,
+                                       handle,
+                                       SVGA_RELOC_READ);
+      }
+
+      SVGA_FIFOCommitAll(svga->swc);
+   }
+
+   return PIPE_OK;
+}
+
+
 struct svga_tracked_state svga_hw_tss_binding = {
    "texture binding emit",
    SVGA_NEW_TEXTURE_BINDING |
diff --git a/src/gallium/drivers/svga/svga_state_vdecl.c b/src/gallium/drivers/svga/svga_state_vdecl.c
index 3af7bf2b35..2f85f9488f 100644
--- a/src/gallium/drivers/svga/svga_state_vdecl.c
+++ b/src/gallium/drivers/svga/svga_state_vdecl.c
@@ -57,12 +57,14 @@ upload_user_buffers( struct svga_context *svga )
          struct svga_buffer *buffer = svga_buffer(svga->curr.vb[i].buffer);
 
          if (!buffer->uploaded.buffer) {
+            boolean flushed;
             ret = u_upload_buffer( svga->upload_vb,
-                                   0,
+                                   0, 0,
                                    buffer->b.b.width0,
                                    &buffer->b.b,
                                    &buffer->uploaded.offset,
-                                   &buffer->uploaded.buffer );
+                                   &buffer->uploaded.buffer,
+                                   &flushed);
             if (ret)
                return ret;
 
@@ -76,7 +78,6 @@ upload_user_buffers( struct svga_context *svga )
                             buffer->b.b.width0);
          }
 
-         pipe_resource_reference( &svga->curr.vb[i].buffer, buffer->uploaded.buffer );
          svga->curr.vb[i].buffer_offset = buffer->uploaded.offset;
       }
    }
@@ -108,6 +109,7 @@ static int emit_hw_vs_vdecl( struct svga_context *svga,
    for (i = 0; i < svga->curr.velems->count; i++) {
       const struct pipe_vertex_buffer *vb = &svga->curr.vb[ve[i].vertex_buffer_index];
       unsigned usage, index;
+      struct svga_buffer *buffer = svga_buffer(vb->buffer);
 
 
       svga_generate_vdecl_semantics( i, &usage, &index );
@@ -125,6 +127,7 @@ static int emit_hw_vs_vdecl( struct svga_context *svga,
       svga_hwtnl_vdecl( svga->hwtnl,
                         i,
                         &decl,
+                        buffer->uploaded.buffer ? buffer->uploaded.buffer :
                         vb->buffer );
    }
 
diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
index 5133c70593..ae9a20ebb8 100644
--- a/src/gallium/drivers/svga/svga_state_vs.c
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -229,13 +229,11 @@ static int update_zero_stride( struct svga_context *svga,
 
          translate->set_buffer(translate, vel->vertex_buffer_index,
                                mapped_buffer,
-                               vbuffer->stride, vbuffer->max_index);
+                               vbuffer->stride, ~0);
          translate->run(translate, 0, 1, 0,
                         svga->curr.zero_stride_constants);
 
-         pipe_buffer_unmap(&svga->pipe,
-                           vbuffer->buffer,
-			   transfer);
+         pipe_buffer_unmap(&svga->pipe, transfer);
 
          translate->release(translate);
       }
diff --git a/src/gallium/drivers/svga/svga_surface.c b/src/gallium/drivers/svga/svga_surface.c
index 3e4bed76c0..3e8fb5f027 100644
--- a/src/gallium/drivers/svga/svga_surface.c
+++ b/src/gallium/drivers/svga/svga_surface.c
@@ -100,8 +100,9 @@ svga_texture_copy_handle(struct svga_context *svga,
 
 
 struct svga_winsys_surface *
-svga_texture_view_surface(struct pipe_context *pipe,
+svga_texture_view_surface(struct svga_context *svga,
                           struct svga_texture *tex,
+                          SVGA3dSurfaceFlags flags,
                           SVGA3dSurfaceFormat format,
                           unsigned start_mip,
                           unsigned num_mip,
@@ -109,7 +110,7 @@ svga_texture_view_surface(struct pipe_context *pipe,
                           int zslice_pick,
                           struct svga_host_surface_cache_key *key) /* OUT */
 {
-   struct svga_screen *ss = svga_screen(pipe->screen);
+   struct svga_screen *ss = svga_screen(svga->pipe.screen);
    struct svga_winsys_surface *handle;
    uint32_t i, j;
    unsigned z_offset = 0;
@@ -118,7 +119,7 @@ svga_texture_view_surface(struct pipe_context *pipe,
             "svga: Create surface view: face %d zslice %d mips %d..%d\n",
             face_pick, zslice_pick, start_mip, start_mip+num_mip-1);
 
-   key->flags = 0;
+   key->flags = flags;
    key->format = format;
    key->numMipLevels = num_mip;
    key->size.width = u_minify(tex->b.b.width0, start_mip);
@@ -161,7 +162,7 @@ svga_texture_view_surface(struct pipe_context *pipe,
                               u_minify(tex->b.b.depth0, i + start_mip) :
                               1);
 
-            svga_texture_copy_handle(svga_context(pipe),
+            svga_texture_copy_handle(svga,
                                      tex->handle, 
                                      0, 0, z_offset, 
                                      i + start_mip, 
@@ -183,6 +184,7 @@ svga_create_surface(struct pipe_context *pipe,
                     struct pipe_resource *pt,
                     const struct pipe_surface *surf_tmpl)
 {
+   struct svga_context *svga = svga_context(pipe);
    struct svga_texture *tex = svga_texture(pt);
    struct pipe_screen *screen = pipe->screen;
    struct svga_surface *s;
@@ -191,6 +193,7 @@ svga_create_surface(struct pipe_context *pipe,
    boolean render = (surf_tmpl->usage & (PIPE_BIND_RENDER_TARGET |
                                          PIPE_BIND_DEPTH_STENCIL)) ? TRUE : FALSE;
    boolean view = FALSE;
+   SVGA3dSurfaceFlags flags;
    SVGA3dSurfaceFormat format;
 
    assert(surf_tmpl->u.tex.first_layer == surf_tmpl->u.tex.last_layer);
@@ -219,10 +222,18 @@ svga_create_surface(struct pipe_context *pipe,
    s->base.u.tex.first_layer = surf_tmpl->u.tex.first_layer;
    s->base.u.tex.last_layer = surf_tmpl->u.tex.last_layer;
 
-   if (!render)
+   if (!render) {
+      flags = SVGA3D_SURFACE_HINT_TEXTURE;
       format = svga_translate_format(surf_tmpl->format);
-   else
+   } else {
+      if (surf_tmpl->usage & PIPE_BIND_RENDER_TARGET) {
+         flags = SVGA3D_SURFACE_HINT_RENDERTARGET;
+      }
+      if (surf_tmpl->usage & PIPE_BIND_DEPTH_STENCIL) {
+         flags = SVGA3D_SURFACE_HINT_DEPTHSTENCIL;
+      }
       format = svga_translate_format_render(surf_tmpl->format);
+   }
 
    assert(format != SVGA3D_FORMAT_INVALID);
 
@@ -249,7 +260,8 @@ svga_create_surface(struct pipe_context *pipe,
       SVGA_DBG(DEBUG_VIEWS, "svga: Surface view: yes %p, level %u face %u z %u, %p\n",
                pt, surf_tmpl->u.tex.level, face, zslice, s);
 
-      s->handle = svga_texture_view_surface(NULL, tex, format, surf_tmpl->u.tex.level,
+      s->handle = svga_texture_view_surface(svga, tex, flags, format,
+                                            surf_tmpl->u.tex.level,
 	                                    1, face, zslice, &s->key);
       s->real_face = 0;
       s->real_level = 0;
@@ -329,7 +341,7 @@ void svga_mark_surfaces_dirty(struct svga_context *svga)
  * pipe is optional context to inline the blit command in.
  */
 void
-svga_propagate_surface(struct pipe_context *pipe, struct pipe_surface *surf)
+svga_propagate_surface(struct svga_context *svga, struct pipe_surface *surf)
 {
    struct svga_surface *s = svga_surface(surf);
    struct svga_texture *tex = svga_texture(surf->texture);
@@ -354,7 +366,7 @@ svga_propagate_surface(struct pipe_context *pipe, struct pipe_surface *surf)
 
    if (s->handle != tex->handle) {
       SVGA_DBG(DEBUG_VIEWS, "svga: Surface propagate: tex %p, level %u, from %p\n", tex, surf->u.tex.level, surf);
-      svga_texture_copy_handle(svga_context(pipe),
+      svga_texture_copy_handle(svga,
                                s->handle, 0, 0, 0, s->real_level, s->real_face,
                                tex->handle, 0, 0, zslice, surf->u.tex.level, face,
                                u_minify(tex->b.b.width0, surf->u.tex.level),
diff --git a/src/gallium/drivers/svga/svga_surface.h b/src/gallium/drivers/svga/svga_surface.h
index afb8326e1f..bffc8c22c6 100644
--- a/src/gallium/drivers/svga/svga_surface.h
+++ b/src/gallium/drivers/svga/svga_surface.h
@@ -56,14 +56,15 @@ struct svga_surface
 
 
 extern void
-svga_propagate_surface(struct pipe_context *pipe, struct pipe_surface *surf);
+svga_propagate_surface(struct svga_context *svga, struct pipe_surface *surf);
 
 extern boolean
 svga_surface_needs_propagation(struct pipe_surface *surf);
 
 struct svga_winsys_surface *
-svga_texture_view_surface(struct pipe_context *pipe,
+svga_texture_view_surface(struct svga_context *svga,
                           struct svga_texture *tex,
+                          SVGA3dSurfaceFlags flags,
                           SVGA3dSurfaceFormat format,
                           unsigned start_mip,
                           unsigned num_mip,
diff --git a/src/gallium/drivers/svga/svga_swtnl_backend.c b/src/gallium/drivers/svga/svga_swtnl_backend.c
index ff3da84272..ac9d637f8c 100644
--- a/src/gallium/drivers/svga/svga_swtnl_backend.c
+++ b/src/gallium/drivers/svga/svga_swtnl_backend.c
@@ -87,11 +87,14 @@ svga_vbuf_render_allocate_vertices( struct vbuf_render *render,
       svga_render->vbuf_size = MAX2(size, svga_render->vbuf_alloc_size);
       svga_render->vbuf = pipe_buffer_create(screen,
                                              PIPE_BIND_VERTEX_BUFFER,
+                                             PIPE_USAGE_STREAM,
                                              svga_render->vbuf_size);
       if(!svga_render->vbuf) {
          svga_context_flush(svga, NULL);
+         assert(!svga_render->vbuf);
          svga_render->vbuf = pipe_buffer_create(screen,
                                                 PIPE_BIND_VERTEX_BUFFER,
+                                                PIPE_USAGE_STREAM,
                                                 svga_render->vbuf_size);
          assert(svga_render->vbuf);
       }
@@ -141,7 +144,7 @@ svga_vbuf_render_unmap_vertices( struct vbuf_render *render,
    pipe_buffer_flush_mapped_range(&svga->pipe,
 				  svga_render->vbuf_transfer,
 				  offset, length);
-   pipe_buffer_unmap(&svga->pipe, svga_render->vbuf, svga_render->vbuf_transfer);
+   pipe_buffer_unmap(&svga->pipe, svga_render->vbuf_transfer);
    svga_render->min_index = min_index;
    svga_render->max_index = max_index;
    svga_render->vbuf_used = MAX2(svga_render->vbuf_used, used);
@@ -158,7 +161,7 @@ svga_vbuf_render_set_primitive( struct vbuf_render *render,
 }
 
 static void
-svga_vbuf_sumbit_state( struct svga_vbuf_render *svga_render )
+svga_vbuf_submit_state( struct svga_vbuf_render *svga_render )
 {
    struct svga_context *svga = svga_render->svga;
    SVGA3dVertexDecl vdecl[PIPE_MAX_ATTRIBS];
@@ -221,7 +224,8 @@ svga_vbuf_render_draw_arrays( struct vbuf_render *render,
    unsigned bias = (svga_render->vbuf_offset - svga_render->vdecl_offset) / svga_render->vertex_size;
    enum pipe_error ret = 0;
 
-   svga_vbuf_sumbit_state(svga_render);
+   /* off to hardware */
+   svga_vbuf_submit_state(svga_render);
 
    /* Need to call update_state() again as the draw module may have
     * altered some of our state behind our backs.  Testcase:
@@ -260,6 +264,7 @@ svga_vbuf_render_draw_elements( struct vbuf_render *render,
       svga_render->ibuf_size = MAX2(size, svga_render->ibuf_alloc_size);
       svga_render->ibuf = pipe_buffer_create(screen,
                                              PIPE_BIND_INDEX_BUFFER,
+                                             PIPE_USAGE_STREAM,
                                              svga_render->ibuf_size);
       svga_render->ibuf_offset = 0;
    }
@@ -267,9 +272,8 @@ svga_vbuf_render_draw_elements( struct vbuf_render *render,
    pipe_buffer_write_nooverlap(&svga->pipe, svga_render->ibuf,
 			       svga_render->ibuf_offset, 2 * nr_indices, indices);
 
-
    /* off to hardware */
-   svga_vbuf_sumbit_state(svga_render);
+   svga_vbuf_submit_state(svga_render);
 
    /* Need to call update_state() again as the draw module may have
     * altered some of our state behind our backs.  Testcase:
diff --git a/src/gallium/drivers/svga/svga_swtnl_draw.c b/src/gallium/drivers/svga/svga_swtnl_draw.c
index 814e8edd70..ad29c1b642 100644
--- a/src/gallium/drivers/svga/svga_swtnl_draw.c
+++ b/src/gallium/drivers/svga/svga_swtnl_draw.c
@@ -51,6 +51,9 @@ svga_swtnl_draw_vbo(struct svga_context *svga,
    assert(svga->state.sw.need_swtnl);
    assert(draw);
 
+   /* Make sure that the need_swtnl flag does not go away */
+   svga->state.sw.in_swtnl_draw = TRUE;
+
    ret = svga_update_state(svga, SVGA_STATE_SWTNL_DRAW);
    if (ret) {
       svga_context_flush(svga, NULL);
@@ -106,22 +109,23 @@ svga_swtnl_draw_vbo(struct svga_context *svga,
     * unmap vertex/index buffers
     */
    for (i = 0; i < svga->curr.num_vertex_buffers; i++) {
-      pipe_buffer_unmap(&svga->pipe, svga->curr.vb[i].buffer, 
-			vb_transfer[i]);
+      pipe_buffer_unmap(&svga->pipe, vb_transfer[i]);
       draw_set_mapped_vertex_buffer(draw, i, NULL);
    }
 
    if (ib_transfer) {
-      pipe_buffer_unmap(&svga->pipe, svga->curr.ib.buffer, ib_transfer);
+      pipe_buffer_unmap(&svga->pipe, ib_transfer);
       draw_set_mapped_index_buffer(draw, NULL);
    }
 
    if (svga->curr.cb[PIPE_SHADER_VERTEX]) {
-      pipe_buffer_unmap(&svga->pipe,
-                        svga->curr.cb[PIPE_SHADER_VERTEX],
-			cb_transfer);
+      pipe_buffer_unmap(&svga->pipe, cb_transfer);
    }
 
+   /* Now safe to remove the need_swtnl flag in any update_state call */
+   svga->state.sw.in_swtnl_draw = FALSE;
+   svga->dirty |= SVGA_NEW_NEED_PIPELINE | SVGA_NEW_NEED_SWVFETCH;
+
    return ret;
 }
 
diff --git a/src/gallium/drivers/svga/svga_swtnl_state.c b/src/gallium/drivers/svga/svga_swtnl_state.c
index a759238293..efda2f605b 100644
--- a/src/gallium/drivers/svga/svga_swtnl_state.c
+++ b/src/gallium/drivers/svga/svga_swtnl_state.c
@@ -61,7 +61,7 @@ static void set_draw_viewport( struct svga_context *svga )
        * going to be drawn with triangles, but we're not catching all
        * cases where that will happen.
        */
-      if (svga->curr.rast->templ.line_width > 1.0) 
+      if (svga->curr.rast->need_pipeline & SVGA_PIPELINE_FLAG_LINES)
       {
          adjx = SVGA_LINE_ADJ_X + 0.175;
          adjy = SVGA_LINE_ADJ_Y - 0.175;
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index f2591c5721..99600cf5c0 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -57,7 +57,6 @@ translate_opcode(
    case TGSI_OPCODE_MUL:        return SVGA3DOP_MUL;
    case TGSI_OPCODE_NOP:        return SVGA3DOP_NOP;
    case TGSI_OPCODE_NRM4:       return SVGA3DOP_NRM;
-   case TGSI_OPCODE_SSG:        return SVGA3DOP_SGN;
    default:
       debug_printf("Unkown opcode %u\n", opcode);
       assert( 0 );
@@ -285,6 +284,41 @@ static void reset_temp_regs( struct svga_shader_emitter *emit )
 }
    
 
+/* MOV *src0 into the temporary dst and make *src0 refer to that temporary.
+ * Only the channels named by the original swizzle are written, and that
+ * swizzle is kept on the new source (important since several opcodes
+ * constrain the allowed swizzles).  Returns FALSE if emitting the MOV fails.
+ */
+static boolean emit_repl( struct svga_shader_emitter *emit,
+                          SVGA3dShaderDestToken dst,
+                          struct src_register *src0)
+{
+   unsigned src0_swizzle;
+   unsigned chan;
+
+   assert(SVGA3dShaderGetRegType(dst.value) == SVGA3DREG_TEMP);
+
+   src0_swizzle = src0->base.swizzle;
+
+   dst.mask = 0;
+   for (chan = 0; chan < 4; ++chan) {
+      unsigned swizzle = (src0_swizzle >> (chan *2)) & 0x3;   /* 2 swizzle bits per channel */
+      dst.mask |= 1 << swizzle;
+   }
+   assert(dst.mask);
+
+   src0->base.swizzle = SVGA3DSWIZZLE_NONE;   /* copy unswizzled, assuming SVGA3DSWIZZLE_NONE is the identity swizzle */
+
+   if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), dst, *src0 ))
+      return FALSE;   /* note: *src0 keeps the reset swizzle on this path */
+
+   *src0 = src( dst );
+   src0->base.swizzle = src0_swizzle;   /* put the caller's swizzle back on the new source */
+
+   return TRUE;
+}
+
+
 static boolean submit_op0( struct svga_shader_emitter *emit,
                            SVGA3dShaderInstToken inst,
                            SVGA3dShaderDestToken dest )
@@ -333,14 +367,11 @@ static boolean submit_op2( struct svga_shader_emitter *emit,
        src0.base.num != src1.base.num)
       need_temp = TRUE;
 
-   if (need_temp)
-   {
+   if (need_temp) {
       temp = get_temp( emit );
 
-      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), temp, src0 ))
+      if (!emit_repl( emit, temp, &src0 ))
          return FALSE;
-
-      src0 = src( temp );
    }
 
    if (!emit_op2( emit, inst, dest, src0, src1 ))
@@ -396,24 +427,18 @@ static boolean submit_op3( struct svga_shader_emitter *emit,
        (type2 == SVGA3DREG_INPUT && src1.base.num != src2.base.num))
       need_temp1 = TRUE;
 
-   if (need_temp0)
-   {
+   if (need_temp0) {
       temp0 = get_temp( emit );
  
-      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), temp0, src0 ))
+      if (!emit_repl( emit, temp0, &src0 ))
          return FALSE;
-         
-      src0 = src( temp0 );
    }
 
-   if (need_temp1)
-   {
+   if (need_temp1) {
       temp1 = get_temp( emit );
 
-      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), temp1, src1 ))
+      if (!emit_repl( emit, temp1, &src1 ))
          return FALSE;
-
-      src1 = src( temp1 );
    }
 
    if (!emit_op3( emit, inst, dest, src0, src1, src2 ))
@@ -478,24 +503,18 @@ static boolean submit_op4( struct svga_shader_emitter *emit,
        (type2 == SVGA3DREG_INPUT && src3.base.num != src2.base.num))
       need_temp3 = TRUE;
 
-   if (need_temp0)
-   {
+   if (need_temp0) {
       temp0 = get_temp( emit );
  
-      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), temp0, src0 ))
+      if (!emit_repl( emit, temp0, &src0 ))
          return FALSE;
-         
-      src0 = src( temp0 );
    }
 
-   if (need_temp3)
-   {
+   if (need_temp3) {
       temp3 = get_temp( emit );
 
-      if (!emit_op1( emit, inst_token( SVGA3DOP_MOV ), temp3, src3 ))
+      if (!emit_repl( emit, temp3, &src3 ))
          return FALSE;
-
-      src3 = src( temp3 );
    }
 
    if (!emit_op4( emit, inst, dest, src0, src1, src2, src3 ))
@@ -509,6 +528,55 @@ static boolean submit_op4( struct svga_shader_emitter *emit,
 }
 
 
+static boolean alias_src_dst( struct src_register src,
+                              SVGA3dShaderDestToken dst )
+{
+   if (src.base.num != dst.num)   /* different register number: no alias */
+      return FALSE;
+
+   if (SVGA3dShaderGetRegType(dst.value) !=
+       SVGA3dShaderGetRegType(src.base.value))   /* different register type: no alias */
+      return FALSE;
+
+   return TRUE;   /* src and dst have the same register number and type */
+}
+
+
+static boolean submit_lrp(struct svga_shader_emitter *emit,
+                          SVGA3dShaderDestToken dst,
+                          struct src_register src0,
+                          struct src_register src1,
+                          struct src_register src2)
+{   /* Emits LRP into dst, via a temporary plus MOV when dst cannot be written directly; FALSE on emit failure. */
+   SVGA3dShaderDestToken tmp;
+   boolean need_dst_tmp = FALSE;
+
+   /* The dst reg must be a temporary, and not be the same as src0 or src2 */
+   if (SVGA3dShaderGetRegType(dst.value) != SVGA3DREG_TEMP ||
+       alias_src_dst(src0, dst) ||
+       alias_src_dst(src2, dst))
+      need_dst_tmp = TRUE;
+
+   if (need_dst_tmp) {
+      tmp = get_temp( emit );
+      tmp.mask = dst.mask;   /* the temp uses the same write mask as dst */
+   }
+   else {
+      tmp = dst;   /* dst is usable as-is, write it directly */
+   }
+
+   if (!submit_op3(emit, inst_token( SVGA3DOP_LRP ), tmp, src0, src1, src2))
+      return FALSE;
+
+   if (need_dst_tmp) {   /* copy the LRP result from the temp into the real dst */
+      if (!submit_op1(emit, inst_token( SVGA3DOP_MOV ), dst, src( tmp )))
+         return FALSE;
+   }
+
+   return TRUE;
+}
+
+
 static boolean emit_def_const( struct svga_shader_emitter *emit,
                                SVGA3dShaderConstType type,
                                unsigned idx,
@@ -747,7 +815,7 @@ static boolean emit_fake_arl(struct svga_shader_emitter *emit,
 static boolean emit_if(struct svga_shader_emitter *emit,
                        const struct tgsi_full_instruction *insn)
 {
-   const struct src_register src = translate_src_register(
+   struct src_register src0 = translate_src_register(
       emit, &insn->Src[0] );
    struct src_register zero = get_zero_immediate( emit );
    SVGA3dShaderInstToken if_token = inst_token( SVGA3DOP_IFC );
@@ -755,10 +823,23 @@ static boolean emit_if(struct svga_shader_emitter *emit,
    if_token.control = SVGA3DOPCOMPC_NE;
    zero = scalar(zero, TGSI_SWIZZLE_X);
 
+   if (SVGA3dShaderGetRegType(src0.base.value) == SVGA3DREG_CONST) {
+      /*
+       * Max different constant registers readable per IFC instruction is 1.
+       */
+
+      SVGA3dShaderDestToken tmp = get_temp( emit );
+
+      if (!submit_op1(emit, inst_token( SVGA3DOP_MOV ), tmp, src0))
+         return FALSE;
+
+      src0 = scalar(src( tmp ), TGSI_SWIZZLE_X);
+   }
+
    emit->dynamic_branching_level++;
 
    return (emit_instruction( emit, if_token ) &&
-           emit_src( emit, src ) &&
+           emit_src( emit, src0 ) &&
            emit_src( emit, zero ) );
 }
 
@@ -832,7 +913,7 @@ static boolean emit_cmp(struct svga_shader_emitter *emit,
        */
       if (!submit_op2(emit, inst_token(SVGA3DOP_SLT), temp, src0, zero))
          return FALSE;
-      return submit_op3(emit, inst_token(SVGA3DOP_LRP), dst, src(temp), src1, src2);
+      return submit_lrp(emit, dst, src(temp), src1, src2);
    }
 
    /* CMP  DST, SRC0, SRC2, SRC1 */
@@ -1066,6 +1147,41 @@ static boolean emit_cos(struct svga_shader_emitter *emit,
    return TRUE;
 }
 
+static boolean emit_ssg(struct svga_shader_emitter *emit,
+                        const struct tgsi_full_instruction *insn )
+{   /* Translates TGSI_OPCODE_SSG; returns FALSE if an instruction fails to emit. */
+   SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
+   struct src_register src0 = translate_src_register(
+      emit, &insn->Src[0] );
+   SVGA3dShaderDestToken temp0 = get_temp( emit );
+   SVGA3dShaderDestToken temp1 = get_temp( emit );
+   struct src_register zero, one;
+
+   if (emit->unit == PIPE_SHADER_VERTEX) {   /* vertex shaders use SGN directly with two scratch temps */
+      /* SGN  DST, SRC0, TMP0, TMP1 */
+      return submit_op3( emit, inst_token( SVGA3DOP_SGN ), dst, src0,
+                         src( temp0 ), src( temp1 ) );
+   }
+
+   zero = get_zero_immediate( emit );   /* other shader units: build the result from two CMPs and an ADD */
+   one = scalar( zero, TGSI_SWIZZLE_W );   /* presumably the zero immediate holds 1 in .w -- TODO confirm in get_zero_immediate */
+   zero = scalar( zero, TGSI_SWIZZLE_X );
+
+   /* CMP  TMP0, SRC0, one, zero */
+   if (!submit_op3( emit, inst_token( SVGA3DOP_CMP ),   /* NOTE(review): assuming CMP picks src1 when src0 >= 0, tmp0 is 1 or 0 */
+                    writemask( temp0, dst.mask ), src0, one, zero ))
+      return FALSE;
+
+   /* CMP  TMP1, negate(SRC0), negate(one), zero */
+   if (!submit_op3( emit, inst_token( SVGA3DOP_CMP ),   /* NOTE(review): under the same assumption, tmp1 is -1 or 0 */
+                    writemask( temp1, dst.mask ), negate( src0 ), negate( one ),
+                    zero ))
+      return FALSE;
+
+   /* ADD  DST, TMP0, TMP1 */
+   return submit_op2( emit, inst_token( SVGA3DOP_ADD ), dst, src( temp0 ),
+                      src( temp1 ) );
+}
 
 /*
  * ADD DST SRC0, negate(SRC0)
@@ -1588,6 +1704,10 @@ static boolean emit_deriv(struct svga_shader_emitter *emit,
    }
    else {
       unsigned opcode;
+      const struct tgsi_full_src_register *reg = &insn->Src[0];
+      SVGA3dShaderInstToken inst;
+      SVGA3dShaderDestToken dst;
+      struct src_register src0;
 
       switch (insn->Instruction.Opcode) {
       case TGSI_OPCODE_DDX:
@@ -1600,7 +1720,21 @@ static boolean emit_deriv(struct svga_shader_emitter *emit,
          return FALSE;
       }
 
-      return emit_simple_instruction( emit, opcode, insn );
+      inst = inst_token( opcode );
+      dst = translate_dst_register( emit, insn, 0 );
+      src0 = translate_src_register( emit, reg );
+
+      /* We cannot use negate or abs on source to dsx/dsy instruction.
+       */
+      if (reg->Register.Absolute ||
+          reg->Register.Negate) {
+         SVGA3dShaderDestToken temp = get_temp( emit );
+
+         if (!emit_repl( emit, temp, &src0 ))
+            return FALSE;
+      }
+
+      return submit_op1( emit, inst, dst, src0 );
    }
 }
 
@@ -1624,19 +1758,6 @@ static boolean emit_arl(struct svga_shader_emitter *emit,
    }
 }
 
-static boolean alias_src_dst( struct src_register src,
-                              SVGA3dShaderDestToken dst )
-{
-   if (src.base.num != dst.num)
-      return FALSE;
-
-   if (SVGA3dShaderGetRegType(dst.value) != 
-       SVGA3dShaderGetRegType(src.base.value))
-      return FALSE;
-
-   return TRUE;
-}
-
 static boolean emit_pow(struct svga_shader_emitter *emit,
                         const struct tgsi_full_instruction *insn)
 {
@@ -1729,37 +1850,14 @@ static boolean emit_lrp(struct svga_shader_emitter *emit,
                         const struct tgsi_full_instruction *insn)
 {
    SVGA3dShaderDestToken dst = translate_dst_register( emit, insn, 0 );
-   SVGA3dShaderDestToken tmp;
    const struct src_register src0 = translate_src_register(
       emit, &insn->Src[0] );
    const struct src_register src1 = translate_src_register(
       emit, &insn->Src[1] );
    const struct src_register src2 = translate_src_register(
       emit, &insn->Src[2] );
-   boolean need_dst_tmp = FALSE;
-
-   /* The dst reg must not be the same as src0 or src2 */
-   if (alias_src_dst(src0, dst) ||
-       alias_src_dst(src2, dst))
-      need_dst_tmp = TRUE;
 
-   if (need_dst_tmp) {
-      tmp = get_temp( emit );
-      tmp.mask = dst.mask;
-   }
-   else {
-      tmp = dst;
-   }
-
-   if (!submit_op3(emit, inst_token( SVGA3DOP_LRP ), tmp, src0, src1, src2))
-      return FALSE;
-
-   if (need_dst_tmp) {
-      if (!submit_op1(emit, inst_token( SVGA3DOP_MOV ), dst, src( tmp )))
-         return FALSE;      
-   } 
-
-   return TRUE;
+   return submit_lrp(emit, dst, src0, src1, src2);
 }
 
 
@@ -2366,6 +2464,9 @@ static boolean svga_emit_instruction( struct svga_shader_emitter *emit,
    case TGSI_OPCODE_LRP:
       return emit_lrp( emit, insn );
 
+   case TGSI_OPCODE_SSG:
+      return emit_ssg( emit, insn );
+
    default: {
       unsigned opcode = translate_opcode(insn->Instruction.Opcode);
 
@@ -2715,6 +2816,7 @@ needs_to_create_zero( struct svga_shader_emitter *emit )
          return TRUE;
 
       if (emit->info.opcode_count[TGSI_OPCODE_DST] >= 1 ||
+          emit->info.opcode_count[TGSI_OPCODE_SSG] >= 1 ||
           emit->info.opcode_count[TGSI_OPCODE_LIT] >= 1)
          return TRUE;
    }
diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h
index 5e4bdeff2e..ae61cea083 100644
--- a/src/gallium/drivers/svga/svga_winsys.h
+++ b/src/gallium/drivers/svga/svga_winsys.h
@@ -136,6 +136,9 @@ struct svga_winsys_screen
    void
    (*destroy)(struct svga_winsys_screen *sws);
    
+   SVGA3dHardwareVersion
+   (*get_hw_version)(struct svga_winsys_screen *sws);
+
    boolean
    (*get_cap)(struct svga_winsys_screen *sws,
               SVGA3dDevCapIndex index,
@@ -243,12 +246,12 @@ struct svga_winsys_screen
 
    /** 
     * Map the entire data store of a buffer object into the client's address.
-    * flags is a bitmaks of PIPE_TRANSFER_*
+    * usage is a bitmask of PIPE_TRANSFER_*
     */
    void *
    (*buffer_map)( struct svga_winsys_screen *sws, 
 	          struct svga_winsys_buffer *buf,
-		  unsigned flags );
+		  unsigned usage );
    
    void 
    (*buffer_unmap)( struct svga_winsys_screen *sws, 
diff --git a/src/gallium/drivers/svga/svgadump/svga_shader_op.c b/src/gallium/drivers/svga/svgadump/svga_shader_op.c
index 95612a8006..ad1549d9f8 100644
--- a/src/gallium/drivers/svga/svgadump/svga_shader_op.c
+++ b/src/gallium/drivers/svga/svgadump/svga_shader_op.c
@@ -136,7 +136,7 @@ static struct sh_opcode_info opcode_info[] =
    { "dsy",          1, 1, 0, 0, SVGA3DOP_INVALID,     },
    { "texldd",       1, 4, 0, 0, SVGA3DOP_INVALID,     },
    { "setp",         1, 2, 0, 0, SVGA3DOP_SETP,        },
-   { "texldl",       1, 2, 0, 0, SVGA3DOP_INVALID,     },
+   { "texldl",       1, 2, 0, 0, SVGA3DOP_TEXLDL,      },
    { "breakp",       0, 1, 0, 0, SVGA3DOP_INVALID,     },
 };
 
@@ -156,6 +156,8 @@ const struct sh_opcode_info *svga_opcode_info( uint op )
    if (info->svga_opcode == SVGA3DOP_INVALID) {
       /* No valid information. Please provide number of dst/src registers.
        */
+      _debug_printf("Missing information for opcode %u, '%s'\n", op,
+                    opcode_info[op].mnemonic);
       assert( 0 );
       return NULL;
    }