Merge branch 'master' into instanced-arrays

Conflicts: src/gallium/auxiliary/tgsi/tgsi_dump.c src/gallium/include/pipe/p_shader_tokens.h
author: Michal Krol <michal@vmware.com> 2010-01-05 11:04:50 +0100
committer: Michal Krol <michal@vmware.com> 2010-01-05 11:04:50 +0100
commit: 9b21b3c52a8a7d58d08151d1a6bf25c472dec213 (patch)
tree: d9083b6af4e2e9b70a7fa6cd31bac45a36e0f6b6 /src/gallium/drivers
parent: 543b9566bdaa48fea2df1866fa1310c1cdbcde27 (diff)
parent: 1f9aa38f4e2be47229d92be2c1189c2b8d9c7133 (diff)
117 files changed, 1654 insertions, 871 deletions
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index 5cc1d4ddf8..01bea0f8cc 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -59,7 +59,7 @@ cell_map_constant_buffers(struct cell_context *sp)
       }
    }
 
-   draw_set_mapped_constant_buffer(sp->draw,
+   draw_set_mapped_constant_buffer(sp->draw, PIPE_SHADER_VERTEX,
                                    sp->mapped_constants[PIPE_SHADER_VERTEX],
                                    sp->constants[PIPE_SHADER_VERTEX].buffer->size);
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_state_derived.c b/src/gallium/drivers/cell/ppu/cell_state_derived.c
index efc4f78364..b723e794e7 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_derived.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_derived.c
@@ -66,7 +66,7 @@ calculate_vertex_layout( struct cell_context *cell )
    vinfo->num_attribs = 0;
 
    /* we always want to emit vertex pos */
-   src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_POSITION, 0);
+   src = draw_find_shader_output(cell->draw, TGSI_SEMANTIC_POSITION, 0);
    assert(src >= 0);
    draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_POS, src);
 
@@ -82,14 +82,14 @@ calculate_vertex_layout( struct cell_context *cell )
          break;
 
       case TGSI_SEMANTIC_COLOR:
-         src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_COLOR, 
+         src = draw_find_shader_output(cell->draw, TGSI_SEMANTIC_COLOR, 
                                    fs->info.input_semantic_index[i]);
          assert(src >= 0);
          draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
          break;
 
       case TGSI_SEMANTIC_FOG:
-         src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_FOG, 0);
+         src = draw_find_shader_output(cell->draw, TGSI_SEMANTIC_FOG, 0);
 #if 1
          if (src < 0) /* XXX temp hack, try demos/fogcoord.c with this */
             src = 0;
@@ -100,7 +100,7 @@ calculate_vertex_layout( struct cell_context *cell )
 
       case TGSI_SEMANTIC_GENERIC:
          /* this includes texcoords and varying vars */
-         src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_GENERIC,
+         src = draw_find_shader_output(cell->draw, TGSI_SEMANTIC_GENERIC,
                               fs->info.input_semantic_index[i]);
          assert(src >= 0);
          draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index ac5fafec1a..5b87286d4c 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -331,7 +331,7 @@ cell_emit_state(struct cell_context *cell)
       const struct draw_context *const draw = cell->draw;
       struct cell_shader_info info;
 
-      info.num_outputs = draw_num_vs_outputs(draw);
+      info.num_outputs = draw_num_shader_outputs(draw);
       info.declarations = (uintptr_t) draw->vs.machine.Declarations;
       info.num_declarations = draw->vs.machine.NumDeclarations;
       info.instructions = (uintptr_t) draw->vs.machine.Instructions;
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 5ed330aa6e..d86d8e09a5 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -1681,7 +1681,7 @@ exec_instruction(
       }
       break;
 
-   case TGSI_OPCODE_SHR:
+   case TGSI_OPCODE_ISHR:
       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( &r[0], 0, chan_index );
          FETCH( &r[1], 1, chan_index );
diff --git a/src/gallium/drivers/i915/i915_buffer.c b/src/gallium/drivers/i915/i915_buffer.c
index effeba1297..669964770d 100644
--- a/src/gallium/drivers/i915/i915_buffer.c
+++ b/src/gallium/drivers/i915/i915_buffer.c
@@ -111,6 +111,7 @@ i915_buffer_unmap(struct pipe_screen *screen,
 {
    struct i915_buffer *buf = i915_buffer(buffer);
    assert(!buf->ibuf);
+   (void) buf;
 }
 
 static void
diff --git a/src/gallium/drivers/i915/i915_context.c b/src/gallium/drivers/i915/i915_context.c
index 94c8aee30f..949f046350 100644
--- a/src/gallium/drivers/i915/i915_context.c
+++ b/src/gallium/drivers/i915/i915_context.c
@@ -84,7 +84,7 @@ i915_draw_range_elements(struct pipe_context *pipe,
    }
 
 
-   draw_set_mapped_constant_buffer(draw,
+   draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX,
                                    i915->current.constants[PIPE_SHADER_VERTEX],
                                    (i915->current.num_user_constants[PIPE_SHADER_VERTEX] * 
                                       4 * sizeof(float)));
diff --git a/src/gallium/drivers/i915/i915_state.c b/src/gallium/drivers/i915/i915_state.c
index e580b6c0f7..1528afc859 100644
--- a/src/gallium/drivers/i915/i915_state.c
+++ b/src/gallium/drivers/i915/i915_state.c
@@ -58,10 +58,10 @@ translate_wrap_mode(unsigned wrap)
       return TEXCOORDMODE_CLAMP_EDGE;
    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
       return TEXCOORDMODE_CLAMP_BORDER;
-/*
+   /*         
    case PIPE_TEX_WRAP_MIRRORED_REPEAT:
       return TEXCOORDMODE_MIRROR;
-*/
+    */
    default:
       return TEXCOORDMODE_WRAP;
    }
diff --git a/src/gallium/drivers/i915/i915_state_derived.c b/src/gallium/drivers/i915/i915_state_derived.c
index 178d4e8781..03dd5091a6 100644
--- a/src/gallium/drivers/i915/i915_state_derived.c
+++ b/src/gallium/drivers/i915/i915_state_derived.c
@@ -84,7 +84,7 @@ static void calculate_vertex_layout( struct i915_context *i915 )
 
    
    /* pos */
-   src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_POSITION, 0);
+   src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_POSITION, 0);
    if (needW) {
       draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src);
       vinfo.hwfmt[0] |= S4_VFMT_XYZW;
@@ -101,21 +101,21 @@ static void calculate_vertex_layout( struct i915_context *i915 )
 
    /* primary color */
    if (colors[0]) {
-      src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_COLOR, 0);
+      src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 0);
       draw_emit_vertex_attr(&vinfo, EMIT_4UB, colorInterp, src);
       vinfo.hwfmt[0] |= S4_VFMT_COLOR;
    }
 
    /* secondary color */
    if (colors[1]) {
-      src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_COLOR, 1);
+      src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_COLOR, 1);
       draw_emit_vertex_attr(&vinfo, EMIT_4UB, colorInterp, src);
       vinfo.hwfmt[0] |= S4_VFMT_SPEC_FOG;
    }
 
    /* fog coord, not fog blend factor */
    if (fog) {
-      src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_FOG, 0);
+      src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_FOG, 0);
       draw_emit_vertex_attr(&vinfo, EMIT_1F, INTERP_PERSPECTIVE, src);
       vinfo.hwfmt[0] |= S4_VFMT_FOG_PARAM;
    }
@@ -125,7 +125,7 @@ static void calculate_vertex_layout( struct i915_context *i915 )
       uint hwtc;
       if (texCoords[i]) {
          hwtc = TEXCOORDFMT_4D;
-         src = draw_find_vs_output(i915->draw, TGSI_SEMANTIC_GENERIC, i);
+         src = draw_find_shader_output(i915->draw, TGSI_SEMANTIC_GENERIC, i);
          draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
       }
       else {
diff --git a/src/gallium/drivers/i965/brw_clip.c b/src/gallium/drivers/i965/brw_clip.c
index 58d9e56df2..d67a1a6263 100644
--- a/src/gallium/drivers/i965/brw_clip.c
+++ b/src/gallium/drivers/i965/brw_clip.c
@@ -83,19 +83,19 @@ compile_clip_prog( struct brw_context *brw,
 
    c.offset_hpos = delta + c.key.output_hpos * ATTR_SIZE;
 
-   if (c.key.output_color0)
+   if (c.key.output_color0 != BRW_OUTPUT_NOT_PRESENT)
       c.offset_color0 = delta + c.key.output_color0 * ATTR_SIZE;
 
-   if (c.key.output_color1)
+   if (c.key.output_color1 != BRW_OUTPUT_NOT_PRESENT)
       c.offset_color1 = delta + c.key.output_color1 * ATTR_SIZE;
 
-   if (c.key.output_bfc0)
+   if (c.key.output_bfc0 != BRW_OUTPUT_NOT_PRESENT)
       c.offset_bfc0 = delta + c.key.output_bfc0 * ATTR_SIZE;
 
-   if (c.key.output_bfc1)
+   if (c.key.output_bfc1 != BRW_OUTPUT_NOT_PRESENT)
       c.offset_bfc1 = delta + c.key.output_bfc1 * ATTR_SIZE;
 
-   if (c.key.output_edgeflag)
+   if (c.key.output_edgeflag != BRW_OUTPUT_NOT_PRESENT)
       c.offset_edgeflag = delta + c.key.output_edgeflag * ATTR_SIZE;
    
    if (BRW_IS_IGDNG(brw))
@@ -182,7 +182,6 @@ upload_clip_prog(struct brw_context *brw)
     */
    /* CACHE_NEW_VS_PROG */
    key.nr_attrs        = brw->vs.prog_data->nr_outputs;
-   key.output_edgeflag = brw->vs.prog_data->output_edgeflag;
 
    /* PIPE_NEW_VS */
    key.output_hpos     = vs->output_hpos;
@@ -190,6 +189,7 @@ upload_clip_prog(struct brw_context *brw)
    key.output_color1   = vs->output_color1;
    key.output_bfc0     = vs->output_bfc0;
    key.output_bfc1     = vs->output_bfc1;
+   key.output_edgeflag = vs->output_edgeflag;
 
    /* PIPE_NEW_CLIP */
    key.nr_userclip = brw->curr.ucp.nr;
diff --git a/src/gallium/drivers/i965/brw_context.h b/src/gallium/drivers/i965/brw_context.h
index 56e7807400..8c006bb95b 100644
--- a/src/gallium/drivers/i965/brw_context.h
+++ b/src/gallium/drivers/i965/brw_context.h
@@ -120,6 +120,13 @@
 
 #define BRW_MAX_CURBE                    (32*16)
 
+
+/* Need a value to say a particular vertex shader output isn't
+ * present.  Limits us to 63 outputs currently.
+ */
+#define BRW_OUTPUT_NOT_PRESENT           ((1<<6)-1)
+
+
 struct brw_context;
 
 struct brw_depth_stencil_state {
@@ -335,8 +342,6 @@ struct brw_vs_prog_data {
 
    GLuint nr_params;       /**< number of TGSI_FILE_CONSTANT's */
 
-   GLuint output_edgeflag;
-
    GLboolean writes_psiz;
 
    /* Used for calculating urb partitions:
diff --git a/src/gallium/drivers/i965/brw_disasm.h b/src/gallium/drivers/i965/brw_disasm.h
index 77d402d35e..ba5b109c48 100644
--- a/src/gallium/drivers/i965/brw_disasm.h
+++ b/src/gallium/drivers/i965/brw_disasm.h
@@ -23,6 +23,8 @@
 #ifndef BRW_DISASM_H
 #define BRW_DISASM_H
 
+#include <stdio.h>
+
 struct brw_instruction;
 
 int brw_disasm_insn (FILE *file, const struct brw_instruction *inst);
diff --git a/src/gallium/drivers/i965/brw_eu_emit.c b/src/gallium/drivers/i965/brw_eu_emit.c
index 4fe7b6acc1..00d8eaccbc 100644
--- a/src/gallium/drivers/i965/brw_eu_emit.c
+++ b/src/gallium/drivers/i965/brw_eu_emit.c
@@ -860,7 +860,7 @@ void brw_land_fwd_jump(struct brw_compile *p,
        jmpi = 2;
 
    assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
-   assert(jmp_insn->bits1.da1.src1_reg_file = BRW_IMMEDIATE_VALUE);
+   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
 
    jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
 }
diff --git a/src/gallium/drivers/i965/brw_pipe_clear.c b/src/gallium/drivers/i965/brw_pipe_clear.c
index 211be88178..452e1e89f9 100644
--- a/src/gallium/drivers/i965/brw_pipe_clear.c
+++ b/src/gallium/drivers/i965/brw_pipe_clear.c
@@ -114,18 +114,18 @@ static void color_clear(struct brw_context *brw,
                         const float *rgba )
 {
    enum pipe_error ret;
-   unsigned value;
+   union util_color value;
 
    util_pack_color( rgba, bsurface->base.format, &value );
 
    if (bsurface->cpp == 2)
-      value |= value << 16;
+      value.ui |= value.ui << 16;
 
-   ret = try_clear( brw, bsurface, value );
+   ret = try_clear( brw, bsurface, value.ui );
 
    if (ret != 0) {
       brw_context_flush( brw );
-      ret = try_clear( brw, bsurface, value );
+      ret = try_clear( brw, bsurface, value.ui );
       assert( ret == 0 );
    }
 }
diff --git a/src/gallium/drivers/i965/brw_pipe_fb.c b/src/gallium/drivers/i965/brw_pipe_fb.c
index 6b03094f50..5d4e5025f9 100644
--- a/src/gallium/drivers/i965/brw_pipe_fb.c
+++ b/src/gallium/drivers/i965/brw_pipe_fb.c
@@ -3,6 +3,7 @@
 #include "pipe/p_state.h"
 
 #include "brw_context.h"
+#include "brw_debug.h"
 
 /**
  * called from intelDrawBuffer()
@@ -51,8 +52,14 @@ static void brw_set_viewport_state( struct pipe_context *pipe,
    struct brw_context *brw = brw_context(pipe);
 
    brw->curr.viewport = *viewport;
-   brw->curr.ccv.min_depth = 0.0;         /* XXX: near */
-   brw->curr.ccv.max_depth = 1.0;         /* XXX: far */
+   brw->curr.ccv.min_depth = viewport->scale[2] * -1.0 + viewport->translate[2];
+   brw->curr.ccv.max_depth = viewport->scale[2] *  1.0 + viewport->translate[2];
+
+   if (0)
+      debug_printf("%s depth range %f .. %f\n",
+                   __FUNCTION__,
+                   brw->curr.ccv.min_depth,
+                   brw->curr.ccv.max_depth);
 
    brw->state.dirty.mesa |= PIPE_NEW_VIEWPORT;
 }
diff --git a/src/gallium/drivers/i965/brw_pipe_shader.c b/src/gallium/drivers/i965/brw_pipe_shader.c
index 20f20571f6..bb32d90e33 100644
--- a/src/gallium/drivers/i965/brw_pipe_shader.c
+++ b/src/gallium/drivers/i965/brw_pipe_shader.c
@@ -197,6 +197,13 @@ static void *brw_create_vs_state( struct pipe_context *pipe,
    vs->id = brw->program_id++;
    vs->has_flow_control = has_flow_control(&vs->info);
 
+   vs->output_hpos = BRW_OUTPUT_NOT_PRESENT;
+   vs->output_color0 = BRW_OUTPUT_NOT_PRESENT;
+   vs->output_color1 = BRW_OUTPUT_NOT_PRESENT;
+   vs->output_bfc0 = BRW_OUTPUT_NOT_PRESENT;
+   vs->output_bfc1 = BRW_OUTPUT_NOT_PRESENT;
+   vs->output_edgeflag = BRW_OUTPUT_NOT_PRESENT;
+
    for (i = 0; i < vs->info.num_outputs; i++) {
       int index = vs->info.output_semantic_index[i];
       switch (vs->info.output_semantic_name[i]) {
diff --git a/src/gallium/drivers/i965/brw_vs_emit.c b/src/gallium/drivers/i965/brw_vs_emit.c
index 714def5046..8a16205d2f 100644
--- a/src/gallium/drivers/i965/brw_vs_emit.c
+++ b/src/gallium/drivers/i965/brw_vs_emit.c
@@ -79,18 +79,12 @@ static void release_tmps( struct brw_vs_compile *c )
 static boolean is_position_output( struct brw_vs_compile *c,
                                    unsigned vs_output )
 {
-   struct brw_vertex_shader *vs = c->vp;
-
-   if (vs_output == c->prog_data.output_edgeflag) {
-      return FALSE;
-   }
-   else {
-      unsigned semantic = vs->info.output_semantic_name[vs_output];
-      unsigned index = vs->info.output_semantic_index[vs_output];
+   const struct brw_vertex_shader *vs = c->vp;
+   unsigned semantic = vs->info.output_semantic_name[vs_output];
+   unsigned index = vs->info.output_semantic_index[vs_output];
       
-      return (semantic == TGSI_SEMANTIC_POSITION &&
-              index == 0);
-   }
+   return (semantic == TGSI_SEMANTIC_POSITION &&
+           index == 0);
 }
 
 
@@ -98,23 +92,16 @@ static boolean find_output_slot( struct brw_vs_compile *c,
                                   unsigned vs_output,
                                   unsigned *fs_input_slot )
 {
-   struct brw_vertex_shader *vs = c->vp;
+   const struct brw_vertex_shader *vs = c->vp;
+   unsigned semantic = vs->info.output_semantic_name[vs_output];
+   unsigned index = vs->info.output_semantic_index[vs_output];
+   unsigned i;
 
-   if (vs_output == c->prog_data.output_edgeflag) {
-      *fs_input_slot = c->key.fs_signature.nr_inputs;
-      return TRUE;
-   }
-   else {
-      unsigned semantic = vs->info.output_semantic_name[vs_output];
-      unsigned index = vs->info.output_semantic_index[vs_output];
-      unsigned i;
-
-      for (i = 0; i < c->key.fs_signature.nr_inputs; i++) {
-         if (c->key.fs_signature.input[i].semantic == semantic &&
+   for (i = 0; i < c->key.fs_signature.nr_inputs; i++) {
+      if (c->key.fs_signature.input[i].semantic == semantic &&
           c->key.fs_signature.input[i].semantic_index == index) {
-            *fs_input_slot = i;
-            return TRUE;
-         }
+         *fs_input_slot = i;
+         return TRUE;
       }
    }
 
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index de6156795d..3ca676647c 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -9,6 +9,8 @@ if not env.has_key('LLVM_VERSION'):
 
 env.Tool('udis86')
 
+env.Append(CPPPATH = ['.'])
+
 env.CodeGenerate(
 	target = 'lp_tile_soa.c',
 	script = 'lp_tile_soa.py',
@@ -74,7 +76,7 @@ llvmpipe = env.ConvenienceLibrary(
 
 env = env.Clone()
 
-env.Prepend(LIBS = [llvmpipe] + auxiliaries)
+env.Prepend(LIBS = [llvmpipe] + gallium)
 
 tests = [
     'format',
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
index d14f468ba9..ced7b9c11d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_blend_aos.c
@@ -142,7 +142,7 @@ lp_build_blend_factor_unswizzled(struct lp_build_blend_aos_context *bld,
 
 enum lp_build_blend_swizzle {
    LP_BUILD_BLEND_SWIZZLE_RGBA = 0,
-   LP_BUILD_BLEND_SWIZZLE_AAAA = 1,
+   LP_BUILD_BLEND_SWIZZLE_AAAA = 1
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_flow.c b/src/gallium/drivers/llvmpipe/lp_bld_flow.c
index dcc25fbff8..25c10af29f 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_flow.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_flow.c
@@ -47,7 +47,7 @@
  */
 enum lp_build_flow_construct_kind {
    lP_BUILD_FLOW_SCOPE,
-   LP_BUILD_FLOW_SKIP,
+   LP_BUILD_FLOW_SKIP
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c b/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
index 5836e0173f..10e82f120b 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_format_aos.c
@@ -130,7 +130,7 @@ lp_build_unpack_rgba_aos(LLVMBuilderRef builder,
 
    shifted = LLVMBuildLShr(builder, packed, LLVMConstVector(shifts, 4), "");
    masked = LLVMBuildAnd(builder, shifted, LLVMConstVector(masks, 4), "");
-   // UIToFP can't be expressed in SSE2
+   /* UIToFP can't be expressed in SSE2 */
    casted = LLVMBuildSIToFP(builder, masked, LLVMVectorType(LLVMFloatType(), 4), "");
 
    if (normalized)
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
index a67c70ff25..61b033c9fc 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_tgsi_soa.c
@@ -321,7 +321,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
 {
    const uint unit = inst->Src[1].Register.Index;
    LLVMValueRef lodbias;
-   LLVMValueRef oow;
+   LLVMValueRef oow = NULL;
    LLVMValueRef coords[3];
    unsigned num_coords;
    unsigned i;
@@ -446,7 +446,12 @@ emit_instruction(
 {
    unsigned chan_index;
    LLVMValueRef src0, src1, src2;
-   LLVMValueRef tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+   LLVMValueRef tmp0, tmp1, tmp2;
+   LLVMValueRef tmp3 = NULL;
+   LLVMValueRef tmp4 = NULL;
+   LLVMValueRef tmp5 = NULL;
+   LLVMValueRef tmp6 = NULL;
+   LLVMValueRef tmp7 = NULL;
    LLVMValueRef res;
    LLVMValueRef dst0[NUM_CHANNELS];
 
@@ -1310,7 +1315,7 @@ emit_instruction(
       return 0;
       break;
 
-   case TGSI_OPCODE_SHR:
+   case TGSI_OPCODE_ISHR:
       /* deprecated? */
       assert(0);
       return 0;
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 001311e703..37587d4f79 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -140,6 +140,7 @@ llvmpipe_is_texture_referenced( struct pipe_context *pipe,
    struct llvmpipe_context *llvmpipe = llvmpipe_context( pipe );
    unsigned i;
 
+   /* check if any of the bound drawing surfaces are this texture */
    if(llvmpipe->dirty_render_cache) {
       for (i = 0; i < llvmpipe->framebuffer.nr_cbufs; i++) {
          if(llvmpipe->framebuffer.cbufs[i] && 
@@ -150,6 +151,13 @@ llvmpipe_is_texture_referenced( struct pipe_context *pipe,
          llvmpipe->framebuffer.zsbuf->texture == texture)
          return PIPE_REFERENCED_FOR_WRITE;
    }
+
+   /* check if any of the tex_cache textures are this texture */
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      if (llvmpipe->tex_cache[i] &&
+            llvmpipe->tex_cache[i]->texture == texture)
+         return PIPE_REFERENCED_FOR_READ;
+   }
    for (i = 0; i < PIPE_MAX_VERTEX_SAMPLERS; i++) {
       if (llvmpipe->vertex_tex_cache[i] &&
           llvmpipe->vertex_tex_cache[i]->texture == texture)
diff --git a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
index 2299566c66..a96c2cad9d 100644
--- a/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
+++ b/src/gallium/drivers/llvmpipe/lp_draw_arrays.c
@@ -103,7 +103,7 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe,
    draw_arrays(draw, mode, start, count);
 
    /*
-    * unmap vertex/index buffers - will cause draw module to flush
+    * unmap vertex/index buffers
     */
    for (i = 0; i < lp->num_vertex_buffers; i++) {
       draw_set_mapped_vertex_buffer(draw, i, NULL);
@@ -112,6 +112,12 @@ llvmpipe_draw_range_elements(struct pipe_context *pipe,
       draw_set_mapped_element_buffer(draw, 0, NULL);
    }
 
+   /*
+    * TODO: Flush only when a user vertex/index buffer is present
+    * (or even better, modify draw module to do this
+    * internally when this condition is seen?)
+    */
+   draw_flush(draw);
 
    /* Note: leave drawing surfaces mapped */
 
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
index 4abff4eccc..e8e2e2524a 100644
--- a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
+++ b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
@@ -128,6 +128,7 @@ lp_vbuf_unmap_vertices(struct vbuf_render *vbr,
 {
    struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
    assert( cvbr->vertex_buffer_size >= (max_index+1) * cvbr->vertex_size );
+   (void) cvbr;
    /* do nothing */
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_blend.c b/src/gallium/drivers/llvmpipe/lp_state_blend.c
index b2e75d3b14..a94cd05ef2 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_blend.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_blend.c
@@ -34,6 +34,7 @@
 #include "util/u_memory.h"
 #include "util/u_math.h"
 #include "util/u_debug_dump.h"
+#include "draw/draw_context.h"
 #include "lp_screen.h"
 #include "lp_context.h"
 #include "lp_state.h"
@@ -51,6 +52,11 @@ void llvmpipe_bind_blend_state( struct pipe_context *pipe,
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
 
+   if (llvmpipe->blend == blend)
+      return;
+
+   draw_flush(llvmpipe->draw);
+
    llvmpipe->blend = blend;
 
    llvmpipe->dirty |= LP_NEW_BLEND;
@@ -69,6 +75,11 @@ void llvmpipe_set_blend_color( struct pipe_context *pipe,
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
    unsigned i, j;
 
+   if(memcmp(&llvmpipe->blend_color, blend_color, sizeof *blend_color) == 0)
+      return;
+
+   draw_flush(llvmpipe->draw);
+
    memcpy(&llvmpipe->blend_color, blend_color, sizeof *blend_color);
 
    if(!llvmpipe->jit_context.blend_color)
@@ -99,7 +110,12 @@ llvmpipe_bind_depth_stencil_state(struct pipe_context *pipe,
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
 
-   llvmpipe->depth_stencil = (const struct pipe_depth_stencil_alpha_state *)depth_stencil;
+   if (llvmpipe->depth_stencil == depth_stencil)
+      return;
+
+   draw_flush(llvmpipe->draw);
+
+   llvmpipe->depth_stencil = depth_stencil;
 
    if(llvmpipe->depth_stencil)
       llvmpipe->jit_context.alpha_ref_value = llvmpipe->depth_stencil->alpha.ref_value;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index e703964aaa..acfd7be5f7 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -66,7 +66,7 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
       /* compute vertex layout now */
       const struct lp_fragment_shader *lpfs = llvmpipe->fs;
       struct vertex_info *vinfo_vbuf = &llvmpipe->vertex_info_vbuf;
-      const uint num = draw_num_vs_outputs(llvmpipe->draw);
+      const uint num = draw_current_shader_outputs(llvmpipe->draw);
       uint i;
 
       /* Tell draw_vbuf to simply emit the whole post-xform vertex
@@ -116,13 +116,13 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
          }
 
          /* this includes texcoords and varying vars */
-         src = draw_find_vs_output(llvmpipe->draw,
+         src = draw_find_shader_output(llvmpipe->draw,
                                    lpfs->info.input_semantic_name[i],
                                    lpfs->info.input_semantic_index[i]);
          draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
       }
 
-      llvmpipe->psize_slot = draw_find_vs_output(llvmpipe->draw,
+      llvmpipe->psize_slot = draw_find_shader_output(llvmpipe->draw,
                                                  TGSI_SEMANTIC_PSIZE, 0);
       if (llvmpipe->psize_slot > 0) {
          draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT,
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 22683ff8b4..f2b8c36264 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -673,7 +673,12 @@ llvmpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
 
-   llvmpipe->fs = (struct lp_fragment_shader *) fs;
+   if (llvmpipe->fs == fs)
+      return;
+
+   draw_flush(llvmpipe->draw);
+
+   llvmpipe->fs = fs;
 
    llvmpipe->dirty |= LP_NEW_FS;
 }
@@ -688,6 +693,7 @@ llvmpipe_delete_fs_state(struct pipe_context *pipe, void *fs)
    struct lp_fragment_shader_variant *variant;
 
    assert(fs != llvmpipe->fs);
+   (void) llvmpipe;
 
    variant = shader->variants;
    while(variant) {
@@ -723,8 +729,7 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
    assert(shader < PIPE_SHADER_TYPES);
    assert(index == 0);
 
-   if(shader == PIPE_SHADER_VERTEX)
-      draw_flush(llvmpipe->draw);
+   draw_flush(llvmpipe->draw);
 
    /* note: reference counting */
    pipe_buffer_reference(&llvmpipe->constants[shader].buffer, buffer);
@@ -734,7 +739,8 @@ llvmpipe_set_constant_buffer(struct pipe_context *pipe,
    }
 
    if(shader == PIPE_SHADER_VERTEX) {
-      draw_set_mapped_constant_buffer(llvmpipe->draw, data, size);
+      draw_set_mapped_constant_buffer(llvmpipe->draw, PIPE_SHADER_VERTEX,
+                                      data, size);
    }
 
    llvmpipe->dirty |= LP_NEW_CONSTANTS;
diff --git a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
index 4561c6b845..aa3b5a3f91 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_rasterizer.c
@@ -41,14 +41,17 @@ llvmpipe_create_rasterizer_state(struct pipe_context *pipe,
 }
 
 void llvmpipe_bind_rasterizer_state(struct pipe_context *pipe,
-                                    void *setup)
+                                    void *rasterizer)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
 
+   if (llvmpipe->rasterizer == rasterizer)
+      return;
+
    /* pass-through to draw module */
-   draw_set_rasterizer_state(llvmpipe->draw, setup);
+   draw_set_rasterizer_state(llvmpipe->draw, rasterizer);
 
-   llvmpipe->rasterizer = (struct pipe_rasterizer_state *)setup;
+   llvmpipe->rasterizer = rasterizer;
 
    llvmpipe->dirty |= LP_NEW_RASTERIZER;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_state_surface.c b/src/gallium/drivers/llvmpipe/lp_state_surface.c
index ba970cac98..e37ff04f3d 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_surface.c
@@ -51,6 +51,8 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
    struct llvmpipe_context *lp = llvmpipe_context(pipe);
    uint i;
 
+   draw_flush(lp->draw);
+
    for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
       /* check if changing cbuf */
       if (lp->framebuffer.cbufs[i] != fb->cbufs[i]) {
diff --git a/src/gallium/drivers/llvmpipe/lp_state_vs.c b/src/gallium/drivers/llvmpipe/lp_state_vs.c
index 8a761648e7..884e3878e6 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_vs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_vs.c
@@ -70,14 +70,18 @@ fail:
 
 
 void
-llvmpipe_bind_vs_state(struct pipe_context *pipe, void *vs)
+llvmpipe_bind_vs_state(struct pipe_context *pipe, void *_vs)
 {
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
+   const struct lp_vertex_shader *vs = (const struct lp_vertex_shader *)_vs;
 
-   llvmpipe->vs = (const struct lp_vertex_shader *)vs;
+   if (llvmpipe->vs == vs)
+      return;
 
-   draw_bind_vertex_shader(llvmpipe->draw,
-                           (llvmpipe->vs ? llvmpipe->vs->draw_data : NULL));
+   draw_bind_vertex_shader(llvmpipe->draw, 
+                           vs ? vs->draw_data : NULL);
+
+   llvmpipe->vs = vs;
 
    llvmpipe->dirty |= LP_NEW_VS;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_test_conv.c b/src/gallium/drivers/llvmpipe/lp_test_conv.c
index 968c7a2d4a..faddfb9677 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_conv.c
@@ -330,7 +330,7 @@ test_one(unsigned verbose,
          fprintf(stderr, "conv.bc written\n");
          fprintf(stderr, "Invoke as \"llc -o - conv.bc\"\n");
          firsttime = FALSE;
-         //abort();
+         /* abort(); */
       }
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_cache.h b/src/gallium/drivers/llvmpipe/lp_tex_cache.h
index 9fa6c36812..05fded78e1 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_cache.h
+++ b/src/gallium/drivers/llvmpipe/lp_tex_cache.h
@@ -115,7 +115,7 @@ extern const struct llvmpipe_cached_tex_tile *
 lp_find_cached_tex_tile(struct llvmpipe_tex_tile_cache *tc,
                         union tex_tile_address addr );
 
-static INLINE const union tex_tile_address
+static INLINE union tex_tile_address
 tex_tile_address( unsigned x,
                   unsigned y,
                   unsigned z,
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c b/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
index 0d01c07fb5..68520fa4f0 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample_c.c
@@ -1085,7 +1085,7 @@ lp_get_samples_2d_common(struct tgsi_sampler *tgsi_sampler,
    const struct pipe_sampler_state *sampler = samp->sampler;
    unsigned level0, level1, j, imgFilter;
    int width, height;
-   float levelBlend;
+   float levelBlend = 0.0f;
 
    choose_mipmap_levels(tgsi_sampler, s, t, p, 
                         lodbias,
@@ -1241,7 +1241,7 @@ lp_get_samples_3d(struct tgsi_sampler *tgsi_sampler,
    /* get/map pipe_surfaces corresponding to 3D tex slices */
    unsigned level0, level1, j, imgFilter;
    int width, height, depth;
-   float levelBlend;
+   float levelBlend = 0.0f;
    const uint face = 0;
 
    choose_mipmap_levels(tgsi_sampler, s, t, p, 
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.h b/src/gallium/drivers/llvmpipe/lp_tile_soa.h
index 040b01865d..19d00b58d3 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_soa.h
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.h
@@ -29,7 +29,7 @@
 #define LP_TILE_SOA_H
 
 #include "pipe/p_compiler.h"
-#include "tgsi/tgsi_exec.h" // for NUM_CHANNELS
+#include "tgsi/tgsi_exec.h" /* for NUM_CHANNELS */
 
 
 #ifdef __cplusplus
diff --git a/src/gallium/drivers/llvmpipe/lp_winsys.h b/src/gallium/drivers/llvmpipe/lp_winsys.h
index 595481c2cb..74b472b653 100644
--- a/src/gallium/drivers/llvmpipe/lp_winsys.h
+++ b/src/gallium/drivers/llvmpipe/lp_winsys.h
@@ -35,7 +35,7 @@
 #define LP_WINSYS_H
 
 
-#include "pipe/p_compiler.h" // for boolean
+#include "pipe/p_compiler.h" /* for boolean */
 #include "pipe/p_format.h"
 
 
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index e4cf91c005..0437af3725 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -31,7 +31,7 @@ nouveau_screen_bo_skel(struct pipe_screen *pscreen, struct nouveau_bo *bo,
 		       unsigned alignment, unsigned usage, unsigned size)
 {
 	struct pipe_buffer *pb;
-	
+
 	pb = CALLOC(1, sizeof(struct pipe_buffer)+sizeof(struct nouveau_bo *));
 	if (!pb) {
 		nouveau_bo_ref(NULL, &bo);
@@ -239,5 +239,6 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
 void
 nouveau_screen_fini(struct nouveau_screen *screen)
 {
+	nouveau_channel_free(&screen->channel); /* invoked from the nv04/nv10/nv20 screen destroy paths before the screen is freed */
 }
 
diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h
index 42c77e5e77..4c3e08a43f 100644
--- a/src/gallium/drivers/nouveau/nouveau_winsys.h
+++ b/src/gallium/drivers/nouveau/nouveau_winsys.h
@@ -23,6 +23,9 @@
 #define NOUVEAU_BUFFER_USAGE_ZETA     (1 << 17)
 #define NOUVEAU_BUFFER_USAGE_TRANSFER (1 << 18)
 
+/* use along with GPU_WRITE for 2D-only writes */
+#define NOUVEAU_BUFFER_USAGE_NO_RENDER (1 << 19)
+
 extern struct pipe_screen *
 nv04_screen_create(struct pipe_winsys *ws, struct nouveau_device *);
 
diff --git a/src/gallium/drivers/nv04/nv04_context.c b/src/gallium/drivers/nv04/nv04_context.c
index 4b33636b2e..770733a4a1 100644
--- a/src/gallium/drivers/nv04/nv04_context.c
+++ b/src/gallium/drivers/nv04/nv04_context.c
@@ -31,26 +31,26 @@ static boolean
 nv04_init_hwctx(struct nv04_context *nv04)
 {
 	// requires a valid handle
-//	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_NOTIFY, 1);
+//	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_NOTIFY, 1);
 //	OUT_RING(0);
-	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_NOP, 1);
+	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_NOP, 1);
 	OUT_RING(0);
 
-	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1);
+	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
 	OUT_RING(0x40182800);
 //	OUT_RING(1<<20/*no cull*/);
-	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_BLEND, 1);
+	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_BLEND, 1);
 //	OUT_RING(0x24|(1<<6)|(1<<8));
 	OUT_RING(0x120001a4);
-	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FORMAT, 1);
+	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_FORMAT, 1);
 	OUT_RING(0x332213a1);
-	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FILTER, 1);
+	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_FILTER, 1);
 	OUT_RING(0x11001010);
-	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_COLORKEY, 1);
+	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_COLORKEY, 1);
 	OUT_RING(0x0);
-//	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 1);
+//	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 1);
 //	OUT_RING(SCREEN_OFFSET);
-	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_FOGCOLOR, 1);
+	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_FOGCOLOR, 1);
 	OUT_RING(0xff000000);
 
 
diff --git a/src/gallium/drivers/nv04/nv04_fragtex.c b/src/gallium/drivers/nv04/nv04_fragtex.c
index 0cce71ad1d..c152b52119 100644
--- a/src/gallium/drivers/nv04/nv04_fragtex.c
+++ b/src/gallium/drivers/nv04/nv04_fragtex.c
@@ -4,7 +4,7 @@
 #define _(m,tf)                                                                \
 {                                                                              \
   PIPE_FORMAT_##m,                                                             \
-  NV04_DX5_TEXTURED_TRIANGLE_FORMAT_COLOR_##tf,                                               \
+  NV04_TEXTURED_TRIANGLE_FORMAT_COLOR_##tf,                                               \
 }
 
 struct nv04_texture_format {
@@ -53,14 +53,14 @@ nv04_fragtex_build(struct nv04_context *nv04, int unit)
 		return;
 	}
 
-	nv04->fragtex.format = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_CORNER 
-		| NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_CORNER
+	nv04->fragtex.format = NV04_TEXTURED_TRIANGLE_FORMAT_ORIGIN_ZOH_CORNER
+		| NV04_TEXTURED_TRIANGLE_FORMAT_ORIGIN_FOH_CORNER
 		| nv04_fragtex_format(pt->format)
-		| ( (pt->last_level + 1) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_MIPMAP_LEVELS_SHIFT )
-		| ( log2i(pt->width0) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_U_SHIFT )
-		| ( log2i(pt->height0) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_V_SHIFT )
-		| NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE
-		| NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP_TO_EDGE
+		| ( (pt->last_level + 1) << NV04_TEXTURED_TRIANGLE_FORMAT_MIPMAP_LEVELS_SHIFT )
+		| ( log2i(pt->width0) << NV04_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_U_SHIFT )
+		| ( log2i(pt->height0) << NV04_TEXTURED_TRIANGLE_FORMAT_BASE_SIZE_V_SHIFT )
+		| NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE
+		| NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_CLAMP_TO_EDGE
 		;
 }
 
diff --git a/src/gallium/drivers/nv04/nv04_prim_vbuf.c b/src/gallium/drivers/nv04/nv04_prim_vbuf.c
index f6458232ae..25395edfd7 100644
--- a/src/gallium/drivers/nv04/nv04_prim_vbuf.c
+++ b/src/gallium/drivers/nv04/nv04_prim_vbuf.c
@@ -93,7 +93,7 @@ nv04_vbuf_render_set_primitive( struct vbuf_render *render,
 
 static INLINE void nv04_2triangles(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5)
 {
-	BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xA),49);
+	BEGIN_RING(fahrenheit,NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xA),49);
 	OUT_RINGp(buffer + VERTEX_SIZE * v0,8);
 	OUT_RINGp(buffer + VERTEX_SIZE * v1,8);
 	OUT_RINGp(buffer + VERTEX_SIZE * v2,8);
@@ -105,7 +105,7 @@ static INLINE void nv04_2triangles(struct nv04_context* nv04, unsigned char* buf
 
 static INLINE void nv04_1triangle(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2)
 {
-	BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xD),25);
+	BEGIN_RING(fahrenheit,NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xD),25);
 	OUT_RINGp(buffer + VERTEX_SIZE * v0,8);
 	OUT_RINGp(buffer + VERTEX_SIZE * v1,8);
 	OUT_RINGp(buffer + VERTEX_SIZE * v2,8);
@@ -114,7 +114,7 @@ static INLINE void nv04_1triangle(struct nv04_context* nv04, unsigned char* buff
 
 static INLINE void nv04_1quad(struct nv04_context* nv04, unsigned char* buffer, ushort v0, ushort v1, ushort v2, ushort v3)
 {
-	BEGIN_RING(fahrenheit,NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0xC),33);
+	BEGIN_RING(fahrenheit,NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0xC),33);
 	OUT_RINGp(buffer + VERTEX_SIZE * v0,8);
 	OUT_RINGp(buffer + VERTEX_SIZE * v1,8);
 	OUT_RINGp(buffer + VERTEX_SIZE * v2,8);
@@ -166,11 +166,11 @@ static void nv04_vbuf_render_tri_strip_elts(struct nv04_vbuf_render* render, con
 		if (numvert<3)
 			break;
 
-		BEGIN_RING( fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), numvert*8 );
+		BEGIN_RING( fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), numvert*8 );
 		for(j = 0; j<numvert; j++)
 			OUT_RINGp( buffer + VERTEX_SIZE * indices [i+j], 8 );
 
-		BEGIN_RING_NI( fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE(0), (numtri+1)/2 );
+		BEGIN_RING_NI( fahrenheit, NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE(0), (numtri+1)/2 );
 		for(j = 0; j<numtri/2; j++ )
 			OUT_RING(striptbl[j]);
 		if (numtri%2)
@@ -185,7 +185,7 @@ static void nv04_vbuf_render_tri_fan_elts(struct nv04_vbuf_render* render, const
 	struct nv04_context* nv04 = render->nv04;
 	int i,j;
 
-	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), 8);
+	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x0), 8);
 	OUT_RINGp(buffer + VERTEX_SIZE * indices[0], 8);
 
 	for(i = 1; i<nr_indices; i+=14)
@@ -195,12 +195,12 @@ static void nv04_vbuf_render_tri_fan_elts(struct nv04_vbuf_render* render, const
 		if (numvert < 3)
 			break;
 
-		BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_SX(0x1), numvert*8);
+		BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_TLVERTEX_SX(0x1), numvert*8);
 
 		for(j=0;j<numvert;j++)
 			OUT_RINGp( buffer + VERTEX_SIZE * indices[ i+j ], 8 );
 
-		BEGIN_RING_NI(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_TLVERTEX_DRAWPRIMITIVE(0), (numtri+1)/2);
+		BEGIN_RING_NI(fahrenheit, NV04_TEXTURED_TRIANGLE_DRAWPRIMITIVE(0), (numtri+1)/2);
 		for(j = 0; j<numtri/2; j++)
 			OUT_RING(fantbl[j]);
 		if (numtri%2)
diff --git a/src/gallium/drivers/nv04/nv04_screen.c b/src/gallium/drivers/nv04/nv04_screen.c
index 170ce3eb7e..7c5b6e8229 100644
--- a/src/gallium/drivers/nv04/nv04_screen.c
+++ b/src/gallium/drivers/nv04/nv04_screen.c
@@ -119,6 +119,8 @@ nv04_screen_destroy(struct pipe_screen *pscreen)
 	nouveau_grobj_free(&screen->fahrenheit);
 	nv04_surface_2d_takedown(&screen->eng2d);
 
+	nouveau_screen_fini(&screen->base);
+
 	FREE(pscreen);
 }
 
@@ -163,10 +165,10 @@ nv04_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		fahrenheit_class = 0;
 		sub3d_class = 0;
 	} else if (dev->chipset >= 0x10) {
-		fahrenheit_class = NV10_DX5_TEXTURED_TRIANGLE;
+		fahrenheit_class = NV10_TEXTURED_TRIANGLE;
 		sub3d_class = NV10_CONTEXT_SURFACES_3D;
 	} else {
-		fahrenheit_class=NV04_DX5_TEXTURED_TRIANGLE;
+		fahrenheit_class=NV04_TEXTURED_TRIANGLE;
 		sub3d_class = NV04_CONTEXT_SURFACES_3D;
 	}
 
diff --git a/src/gallium/drivers/nv04/nv04_state.c b/src/gallium/drivers/nv04/nv04_state.c
index ef3005db5f..e3dc4c5bf4 100644
--- a/src/gallium/drivers/nv04/nv04_state.c
+++ b/src/gallium/drivers/nv04/nv04_state.c
@@ -50,28 +50,28 @@ wrap_mode(unsigned wrap) {
 
 	switch (wrap) {
 	case PIPE_TEX_WRAP_REPEAT:
-		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_REPEAT;
+		ret = NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_REPEAT;
 		break;
 	case PIPE_TEX_WRAP_MIRROR_REPEAT:
-		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_MIRRORED_REPEAT;
+		ret = NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_MIRRORED_REPEAT;
 		break;
 	case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE;
+		ret = NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_EDGE;
 		break;
 	case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_BORDER;
+		ret = NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP_TO_BORDER;
 		break;
 	case PIPE_TEX_WRAP_CLAMP:
-		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP;
+		ret = NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP;
 		break;
 	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
 	case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
 	case PIPE_TEX_WRAP_MIRROR_CLAMP:
 	default:
 		NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
-		ret = NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP;
+		ret = NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_CLAMP;
 	}
-	return ret >> NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT;
+	return ret >> NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT;
 }
 
 static void *
@@ -84,20 +84,20 @@ nv04_sampler_state_create(struct pipe_context *pipe,
 
 	ss = MALLOC(sizeof(struct nv04_sampler_state));
 
-	ss->format = ((wrap_mode(cso->wrap_s) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT) |
-		    (wrap_mode(cso->wrap_t) << NV04_DX5_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_SHIFT));
+	ss->format = ((wrap_mode(cso->wrap_s) << NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSU_SHIFT) |
+		    (wrap_mode(cso->wrap_t) << NV04_TEXTURED_TRIANGLE_FORMAT_ADDRESSV_SHIFT));
 
 	if (cso->max_anisotropy > 1.0) {
-		filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MINIFY_ENABLE | NV04_DX5_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MAGNIFY_ENABLE;
+		filter |= NV04_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MINIFY_ENABLE | NV04_TEXTURED_TRIANGLE_FILTER_ANISOTROPIC_MAGNIFY_ENABLE;
 	}
 
 	switch (cso->mag_img_filter) {
 	case PIPE_TEX_FILTER_LINEAR:
-		filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_LINEAR;
+		filter |= NV04_TEXTURED_TRIANGLE_FILTER_MAGNIFY_LINEAR;
 		break;
 	case PIPE_TEX_FILTER_NEAREST:
 	default:
-		filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MAGNIFY_NEAREST;
+		filter |= NV04_TEXTURED_TRIANGLE_FILTER_MAGNIFY_NEAREST;
 		break;
 	}
 
@@ -105,14 +105,14 @@ nv04_sampler_state_create(struct pipe_context *pipe,
 	case PIPE_TEX_FILTER_LINEAR:
 		switch (cso->min_mip_filter) {
 		case PIPE_TEX_MIPFILTER_NEAREST:
-			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST;
+			filter |= NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_NEAREST;
 			break;
 		case PIPE_TEX_MIPFILTER_LINEAR:
-			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR;
+			filter |= NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR_MIPMAP_LINEAR;
 			break;
 		case PIPE_TEX_MIPFILTER_NONE:
 		default:
-			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR;
+			filter |= NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_LINEAR;
 			break;
 		}
 		break;
@@ -120,14 +120,14 @@ nv04_sampler_state_create(struct pipe_context *pipe,
 	default:
 		switch (cso->min_mip_filter) {
 		case PIPE_TEX_MIPFILTER_NEAREST:
-			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST;
+			filter |= NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_NEAREST;
 		break;
 		case PIPE_TEX_MIPFILTER_LINEAR:
-			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR;
+			filter |= NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST_MIPMAP_LINEAR;
 			break;
 		case PIPE_TEX_MIPFILTER_NONE:
 		default:
-			filter |= NV04_DX5_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST;
+			filter |= NV04_TEXTURED_TRIANGLE_FILTER_MINIFY_NEAREST;
 			break;
 		}
 		break;
@@ -181,7 +181,7 @@ nv04_rasterizer_state_create(struct pipe_context *pipe,
 	 */
 	rs = MALLOC(sizeof(struct nv04_rasterizer_state));
 
-	rs->blend = cso->flatshade ? NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_FLAT : NV04_DX5_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_GOURAUD;
+	rs->blend = cso->flatshade ? NV04_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_FLAT : NV04_TEXTURED_TRIANGLE_BLEND_SHADE_MODE_GOURAUD;
 
 	return (void *)rs;
 }
@@ -229,16 +229,16 @@ nv04_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 	hw = MALLOC(sizeof(struct nv04_depth_stencil_alpha_state));
 
 	hw->control = float_to_ubyte(cso->alpha.ref_value);
-	hw->control |= ( nv04_compare_func(cso->alpha.func) << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_FUNC_SHIFT );
-	hw->control |= cso->alpha.enabled ? NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ALPHA_TEST_ENABLE : 0;
-	hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_ORIGIN;
-	hw->control |= cso->depth.enabled ? (1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_ENABLE_SHIFT) : 0;
-	hw->control |= ( nv04_compare_func(cso->depth.func)<< NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FUNC_SHIFT );
-	hw->control |= 1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_SHIFT; // no culling, handled by the draw module
-	hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_DITHER_ENABLE;
-	hw->control |= NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_PERSPECTIVE_ENABLE;
-	hw->control |= cso->depth.writemask ? (1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_WRITE_ENABLE_SHIFT) : 0;
-	hw->control |= 1 << NV04_DX5_TEXTURED_TRIANGLE_CONTROL_Z_FORMAT_SHIFT; // integer zbuffer format
+	hw->control |= ( nv04_compare_func(cso->alpha.func) << NV04_TEXTURED_TRIANGLE_CONTROL_ALPHA_FUNC_SHIFT );
+	hw->control |= cso->alpha.enabled ? NV04_TEXTURED_TRIANGLE_CONTROL_ALPHA_ENABLE : 0;
+	hw->control |= NV04_TEXTURED_TRIANGLE_CONTROL_ORIGIN;
+	hw->control |= cso->depth.enabled ? NV04_TEXTURED_TRIANGLE_CONTROL_Z_ENABLE : 0;
+	hw->control |= ( nv04_compare_func(cso->depth.func)<< NV04_TEXTURED_TRIANGLE_CONTROL_Z_FUNC_SHIFT );
+	hw->control |= 1 << NV04_TEXTURED_TRIANGLE_CONTROL_CULL_MODE_SHIFT; // no culling, handled by the draw module
+	hw->control |= NV04_TEXTURED_TRIANGLE_CONTROL_DITHER_ENABLE;
+	hw->control |= NV04_TEXTURED_TRIANGLE_CONTROL_Z_PERSPECTIVE_ENABLE;
+	hw->control |= cso->depth.writemask ? NV04_TEXTURED_TRIANGLE_CONTROL_Z_WRITE : 0;
+	hw->control |= 1 << NV04_TEXTURED_TRIANGLE_CONTROL_Z_FORMAT_SHIFT; // integer zbuffer format
 
 	return (void *)hw;
 }
@@ -377,7 +377,7 @@ nv04_set_scissor_state(struct pipe_context *pipe,
 /*	struct nv04_context *nv04 = nv04_context(pipe);
 
 	// XXX
-	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_SCISSOR_HORIZ, 2);
+	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_SCISSOR_HORIZ, 2);
 	OUT_RING  (((s->maxx - s->minx) << 16) | s->minx);
 	OUT_RING  (((s->maxy - s->miny) << 16) | s->miny);*/
 }
diff --git a/src/gallium/drivers/nv04/nv04_state_emit.c b/src/gallium/drivers/nv04/nv04_state_emit.c
index eb2c1c57c6..bd98ae091f 100644
--- a/src/gallium/drivers/nv04/nv04_state_emit.c
+++ b/src/gallium/drivers/nv04/nv04_state_emit.c
@@ -58,7 +58,7 @@ static void nv04_emit_control(struct nv04_context* nv04)
 {
 	uint32_t control = nv04->dsa->control;
 
-	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1);
+	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
 	OUT_RING(control);
 }
 
@@ -75,7 +75,7 @@ static void nv04_emit_blend(struct nv04_context* nv04)
 	blend|=(nv04_blend_func(nv04->blend->b_src)<<24);
 	blend|=(nv04_blend_func(nv04->blend->b_dst)<<28);
 
-	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_BLEND, 1);
+	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_BLEND, 1);
 	OUT_RING(blend);
 }
 
@@ -84,7 +84,7 @@ static void nv04_emit_sampler(struct nv04_context *nv04, int unit)
 	struct nv04_miptree *nv04mt = nv04->tex_miptree[unit];
 	struct pipe_texture *pt = &nv04mt->base;
 
-	BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 3);
+	BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 3);
 	OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
 	OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[unit]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
 	OUT_RING(nv04->sampler[unit]->filter);
@@ -163,7 +163,7 @@ nv04_emit_hw_state(struct nv04_context *nv04)
 	if (nv04->dirty & NV04_NEW_CONTROL) {
 		nv04->dirty &= ~NV04_NEW_CONTROL;
 
-		BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_CONTROL, 1);
+		BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_CONTROL, 1);
 		OUT_RING(nv04->dsa->control);
 	}
 
@@ -218,7 +218,7 @@ nv04_emit_hw_state(struct nv04_context *nv04)
 		if (!(nv04->fp_samplers & (1 << i)))
 			continue;
 		struct nv04_miptree *nv04mt = nv04->tex_miptree[i];
-		BEGIN_RING(fahrenheit, NV04_DX5_TEXTURED_TRIANGLE_OFFSET, 2);
+		BEGIN_RING(fahrenheit, NV04_TEXTURED_TRIANGLE_OFFSET, 2);
 		OUT_RELOCl(nv04mt->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD);
 		OUT_RELOCd(nv04mt->buffer, (nv04->fragtex.format | nv04->sampler[i]->format), NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_OR | NOUVEAU_BO_RD, 1/*VRAM*/,2/*TT*/);
 	}
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.c b/src/gallium/drivers/nv04/nv04_surface_2d.c
index 12df7fd199..b24a9cee5a 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.c
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.c
@@ -77,7 +77,7 @@ nv04_scaled_image_format(enum pipe_format format)
 }
 
 static INLINE unsigned
-nv04_swizzle_bits(unsigned x, unsigned y)
+nv04_swizzle_bits_square(unsigned x, unsigned y)
 {
 	unsigned u = (x & 0x001) << 0 |
 	             (x & 0x002) << 1 |
@@ -107,6 +107,15 @@ nv04_swizzle_bits(unsigned x, unsigned y)
 	return v | u;
 }
 
+/* Rectangular swizzled textures are linear concatenations of swizzled square tiles; returns the swizzled index of (x, y) in a w x h surface. */
+static INLINE unsigned
+nv04_swizzle_bits(unsigned x, unsigned y, unsigned w, unsigned h)
+{
+	unsigned s = MIN2(w, h); /* side of one square tile */
+	unsigned m = s - 1; /* in-tile coordinate mask; presumably s is a power of two -- TODO confirm */
+	return (((x | y) & ~m) * s) | nv04_swizzle_bits_square(x & m, y & m); /* bits above the mask, scaled by s, OR'ed with the square swizzle of the in-tile coordinates */
+}
+
 static int
 nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 			  struct pipe_surface *dst, int dx, int dy,
@@ -158,20 +167,19 @@ nv04_surface_copy_swizzle(struct nv04_surface_2d *ctx,
 	  for (x = 0; x < w; x += sub_w) {
 	    sub_w = MIN2(sub_w, w - x);
 
-	    /* Must be 64-byte aligned */
-	    assert(!((dst->offset + nv04_swizzle_bits(dx+x, dy+y) * util_format_get_blocksize(dst->texture->format)) & 63));
+	    assert(!(dst->offset & 63));
 
 	    BEGIN_RING(chan, swzsurf, NV04_SWIZZLED_SURFACE_OFFSET, 1);
-	    OUT_RELOCl(chan, dst_bo, dst->offset + nv04_swizzle_bits(dx+x, dy+y) * util_format_get_blocksize(dst->texture->format),
+	    OUT_RELOCl(chan, dst_bo, dst->offset,
                              NOUVEAU_BO_GART | NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
 
 	    BEGIN_RING(chan, sifm, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION, 9);
 	    OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_COLOR_CONVERSION_TRUNCATE);
 	    OUT_RING  (chan, nv04_scaled_image_format(src->format));
 	    OUT_RING  (chan, NV04_SCALED_IMAGE_FROM_MEMORY_OPERATION_SRCCOPY);
-	    OUT_RING  (chan, 0);
+	    OUT_RING  (chan, (x + dx) | ((y + dy) << NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_POINT_Y_SHIFT));
 	    OUT_RING  (chan, sub_h << NV04_SCALED_IMAGE_FROM_MEMORY_CLIP_SIZE_H_SHIFT | sub_w);
-	    OUT_RING  (chan, 0);
+	    OUT_RING  (chan, (x + dx) | ((y + dy) << NV04_SCALED_IMAGE_FROM_MEMORY_OUT_POINT_Y_SHIFT));
 	    OUT_RING  (chan, sub_h << NV04_SCALED_IMAGE_FROM_MEMORY_OUT_SIZE_H_SHIFT | sub_w);
 	    OUT_RING  (chan, 1 << 20);
 	    OUT_RING  (chan, 1 << 20);
@@ -491,3 +499,49 @@ nv04_surface_2d_init(struct nouveau_screen *screen)
 	ctx->fill = nv04_surface_fill;
 	return ctx;
 }
+
+/* Wrap ns in a temporary render-target surface (same format and size) whose
+ * ->backing points at ns; returns the temporary. */
+struct nv04_surface*
+nv04_surface_wrap_for_render(struct pipe_screen *pscreen, struct nv04_surface_2d* eng2d, struct nv04_surface* ns)
+{
+	struct pipe_texture templ;
+	struct pipe_texture *temp_tex;
+	struct nv04_surface *temp_ns;
+	int temp_flags;
+
+	/* GPU_READ is always added to the temporary's flags; GPU_WRITE is added
+	 * as well unless the caller set PIPE_BUFFER_USAGE_DISCARD. */
+	temp_flags = ns->base.usage | PIPE_BUFFER_USAGE_GPU_READ;
+	if (!(ns->base.usage & PIPE_BUFFER_USAGE_DISCARD))
+		temp_flags |= PIPE_BUFFER_USAGE_GPU_WRITE;
+
+	/* NOTE(review): DISCARD/NO_RENDER variants used to be stored into
+	 * ns->base.usage first, but this unconditional assignment overwrote them
+	 * before any read, so they were dropped; confirm the intended value. */
+	ns->base.usage = PIPE_BUFFER_USAGE_GPU_READ | PIPE_BUFFER_USAGE_GPU_WRITE;
+
+	/* describe a single-level 2D texture matching the wrapped surface */
+	memset(&templ, 0, sizeof(templ));
+	templ.format = ns->base.texture->format;
+	templ.target = PIPE_TEXTURE_2D;
+	templ.width0 = ns->base.width;
+	templ.height0 = ns->base.height;
+	templ.depth0 = 1;
+	templ.last_level = 0;
+	/* TODO: this is probably wrong and we should specifically handle multisampling somehow once it is implemented */
+	templ.nr_samples = ns->base.texture->nr_samples;
+	templ.tex_usage = ns->base.texture->tex_usage | PIPE_TEXTURE_USAGE_RENDER_TARGET;
+
+	temp_tex = pscreen->texture_create(pscreen, &templ);
+	temp_ns = (struct nv04_surface*)pscreen->get_tex_surface(pscreen, temp_tex, 0, 0, 0, temp_flags);
+	temp_ns->backing = ns;
+
+	/* NOTE(review): temp_ns->backing is ns, so source and destination of this
+	 * copy are the same surface; confirm whether &temp_ns->base was meant. */
+	if (ns->base.usage & PIPE_BUFFER_USAGE_GPU_READ)
+		eng2d->copy(eng2d, &temp_ns->backing->base, 0, 0, &ns->base, 0, 0, ns->base.width, ns->base.height);
+
+	return temp_ns;
+}
+
diff --git a/src/gallium/drivers/nv04/nv04_surface_2d.h b/src/gallium/drivers/nv04/nv04_surface_2d.h
index 02b3f56ba8..ce696a11a3 100644
--- a/src/gallium/drivers/nv04/nv04_surface_2d.h
+++ b/src/gallium/drivers/nv04/nv04_surface_2d.h
@@ -4,6 +4,7 @@
 struct nv04_surface {
 	struct pipe_surface base;
 	unsigned pitch;
+	struct nv04_surface* backing; /* surface this one wraps; set by nv04_surface_wrap_for_render() */
 };
 
 struct nv04_surface_2d {
@@ -30,4 +31,7 @@ nv04_surface_2d_init(struct nouveau_screen *screen);
 void
 nv04_surface_2d_takedown(struct nv04_surface_2d **);
 
+struct nv04_surface*
+nv04_surface_wrap_for_render(struct pipe_screen *pscreen, struct nv04_surface_2d* eng2d, struct nv04_surface* ns);
+
 #endif
diff --git a/src/gallium/drivers/nv04/nv04_transfer.c b/src/gallium/drivers/nv04/nv04_transfer.c
index 8446073ae8..2dd2e146a8 100644
--- a/src/gallium/drivers/nv04/nv04_transfer.c
+++ b/src/gallium/drivers/nv04/nv04_transfer.c
@@ -16,14 +16,14 @@ struct nv04_transfer {
 };
 
 static void
-nv04_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
+nv04_compatible_transfer_tex(struct pipe_texture *pt, unsigned width, unsigned height,
                              struct pipe_texture *template)
 {
 	memset(template, 0, sizeof(struct pipe_texture));
 	template->target = pt->target;
 	template->format = pt->format;
-	template->width0 = u_minify(pt->width0, level);
-	template->height0 = u_minify(pt->height0, level);
+	template->width0 = width;
+	template->height0 = height;
 	template->depth0 = 1;
 	template->last_level = 0;
 	template->nr_samples = pt->nr_samples;
@@ -71,7 +71,7 @@ nv04_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx->direct = false;
 
-	nv04_compatible_transfer_tex(pt, level, &tx_tex_template);
+	nv04_compatible_transfer_tex(pt, w, h, &tx_tex_template);
 
 	tx_tex = pscreen->texture_create(pscreen, &tx_tex_template);
 	if (!tx_tex)
@@ -80,6 +80,8 @@ nv04_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
+	tx->base.stride = ((struct nv04_miptree*)tx_tex)->level[0].pitch;
+
 	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
 	                                       face, level, zslice,
 	                                       pipe_transfer_buffer_flags(&tx->base));
@@ -105,8 +107,8 @@ nv04_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		/* TODO: Check if SIFM can un-swizzle */
 		nvscreen->eng2d->copy(nvscreen->eng2d,
 		                      tx->surface, 0, 0,
-		                      src, 0, 0,
-		                      src->width, src->height);
+		                      src, x, y,
+		                      w, h);
 
 		pipe_surface_reference(&src, NULL);
 	}
@@ -130,9 +132,9 @@ nv04_transfer_del(struct pipe_transfer *ptx)
 
 		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
 		nvscreen->eng2d->copy(nvscreen->eng2d,
-		                      dst, 0, 0,
+		                      dst, tx->base.x, tx->base.y,
 		                      tx->surface, 0, 0,
-		                      dst->width, dst->height);
+		                      tx->base.width, tx->base.height);
 
 		pipe_surface_reference(&dst, NULL);
 	}
@@ -151,8 +153,10 @@ nv04_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	void *map = pipe_buffer_map(pscreen, mt->buffer,
 	                            pipe_transfer_buffer_flags(ptx));
 
-	return map + ns->base.offset +
-	       ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
+	if(!tx->direct)
+		return map + ns->base.offset;
+	else
+		return map + ns->base.offset + ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
 }
 
 static void
diff --git a/src/gallium/drivers/nv04/nv04_vbo.c b/src/gallium/drivers/nv04/nv04_vbo.c
index e3167814f2..099ab10043 100644
--- a/src/gallium/drivers/nv04/nv04_vbo.c
+++ b/src/gallium/drivers/nv04/nv04_vbo.c
@@ -45,7 +45,7 @@ boolean nv04_draw_elements( struct pipe_context *pipe,
 		draw_set_mapped_element_buffer(draw, 0, NULL);
 	}
 
-	draw_set_mapped_constant_buffer(draw,
+	draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX,
 					nv04->constbuf[PIPE_SHADER_VERTEX],
 					nv04->constbuf_nr[PIPE_SHADER_VERTEX]);
 
diff --git a/src/gallium/drivers/nv10/nv10_screen.c b/src/gallium/drivers/nv10/nv10_screen.c
index ee5901e743..6a39ddeaac 100644
--- a/src/gallium/drivers/nv10/nv10_screen.c
+++ b/src/gallium/drivers/nv10/nv10_screen.c
@@ -115,6 +115,9 @@ nv10_screen_destroy(struct pipe_screen *pscreen)
 
 	nouveau_notifier_free(&screen->sync);
 	nouveau_grobj_free(&screen->celsius);
+	nv04_surface_2d_takedown(&screen->eng2d);
+
+	nouveau_screen_fini(&screen->base);
 
 	FREE(pscreen);
 }
diff --git a/src/gallium/drivers/nv10/nv10_transfer.c b/src/gallium/drivers/nv10/nv10_transfer.c
index c664973e90..eb04af9782 100644
--- a/src/gallium/drivers/nv10/nv10_transfer.c
+++ b/src/gallium/drivers/nv10/nv10_transfer.c
@@ -16,14 +16,14 @@ struct nv10_transfer {
 };
 
 static void
-nv10_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
+nv10_compatible_transfer_tex(struct pipe_texture *pt, unsigned width, unsigned height,
                              struct pipe_texture *template)
 {
 	memset(template, 0, sizeof(struct pipe_texture));
 	template->target = pt->target;
 	template->format = pt->format;
-	template->width0 = u_minify(pt->width0, level);
-	template->height0 = u_minify(pt->height0, level);
+	template->width0 = width;
+	template->height0 = height;
 	template->depth0 = 1;
 	template->last_level = 0;
 	template->nr_samples = pt->nr_samples;
@@ -71,7 +71,7 @@ nv10_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx->direct = false;
 
-	nv10_compatible_transfer_tex(pt, level, &tx_tex_template);
+	nv10_compatible_transfer_tex(pt, w, h, &tx_tex_template);
 
 	tx_tex = pscreen->texture_create(pscreen, &tx_tex_template);
 	if (!tx_tex)
@@ -80,6 +80,8 @@ nv10_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
+	tx->base.stride = ((struct nv10_miptree*)tx_tex)->level[0].pitch;
+
 	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
 	                                       face, level, zslice,
 	                                       pipe_transfer_buffer_flags(&tx->base));
@@ -105,8 +107,8 @@ nv10_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		/* TODO: Check if SIFM can un-swizzle */
 		nvscreen->eng2d->copy(nvscreen->eng2d,
 		                      tx->surface, 0, 0,
-		                      src, 0, 0,
-		                      src->width, src->height);
+		                      src, x, y,
+		                      w, h);
 
 		pipe_surface_reference(&src, NULL);
 	}
@@ -130,9 +132,9 @@ nv10_transfer_del(struct pipe_transfer *ptx)
 
 		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
 		nvscreen->eng2d->copy(nvscreen->eng2d,
-		                      dst, 0, 0,
+		                      dst, tx->base.x, tx->base.y,
 		                      tx->surface, 0, 0,
-		                      dst->width, dst->height);
+		                      tx->base.width, tx->base.height);
 
 		pipe_surface_reference(&dst, NULL);
 	}
@@ -151,8 +153,10 @@ nv10_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	void *map = pipe_buffer_map(pscreen, mt->buffer,
 	                            pipe_transfer_buffer_flags(ptx));
 
-	return map + ns->base.offset +
-	       ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
+	if(!tx->direct)
+		return map + ns->base.offset;
+	else
+		return map + ns->base.offset + ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
 }
 
 static void
diff --git a/src/gallium/drivers/nv10/nv10_vbo.c b/src/gallium/drivers/nv10/nv10_vbo.c
index 441a4f75f3..0d26141248 100644
--- a/src/gallium/drivers/nv10/nv10_vbo.c
+++ b/src/gallium/drivers/nv10/nv10_vbo.c
@@ -45,6 +45,7 @@ boolean nv10_draw_elements( struct pipe_context *pipe,
 	}
 
 	draw_set_mapped_constant_buffer(draw,
+                                        PIPE_SHADER_VERTEX,
 					nv10->constbuf[PIPE_SHADER_VERTEX],
 					nv10->constbuf_nr[PIPE_SHADER_VERTEX]);
 
diff --git a/src/gallium/drivers/nv20/nv20_miptree.c b/src/gallium/drivers/nv20/nv20_miptree.c
index d1291a92e0..8f7538e7f5 100644
--- a/src/gallium/drivers/nv20/nv20_miptree.c
+++ b/src/gallium/drivers/nv20/nv20_miptree.c
@@ -6,6 +6,7 @@
 
 #include "nv20_context.h"
 #include "nv20_screen.h"
+#include "../nv04/nv04_surface_2d.h"
 
 static void
 nv20_miptree_layout(struct nv20_miptree *nv20mt)
@@ -127,6 +128,12 @@ nv20_miptree_create(struct pipe_screen *screen, const struct pipe_texture *pt)
 	if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC)
 		buf_usage |= PIPE_BUFFER_USAGE_CPU_READ_WRITE;
 
+	/* apparently we can't render to swizzled surfaces smaller than 64 bytes, so make them linear.
+	 * If the user did not ask for a render target, they can still render to it, but it will cost them an extra copy.
+	 * This also happens for small mipmaps of large textures. */
+	if (pt->tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET && util_format_get_stride(pt->format, pt->width0) < 64)
+		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+
 	nv20_miptree_layout(mt);
 
 	mt->buffer = screen->buffer_create(screen, 256, buf_usage, mt->total_size);
@@ -183,12 +190,27 @@ nv20_miptree_surface_get(struct pipe_screen *screen, struct pipe_texture *pt,
 		ns->base.offset = nv20mt->level[level].image_offset[0];
 	}
 
+	/* Create a linear temporary that we can render into if necessary.
+	 * Note that ns->pitch is always a multiple of 64 for linear surfaces, and swizzled surfaces are
+	 * power-of-two (POT), so (ns->pitch & 63) is equivalent to (ns->pitch < 64 && swizzled). */
+	if((ns->pitch & 63) && (ns->base.usage & (PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER)) == PIPE_BUFFER_USAGE_GPU_WRITE)
+		return &nv04_surface_wrap_for_render(screen, ((struct nv20_screen*)screen)->eng2d, ns)->base;
+
 	return &ns->base;
 }
 
 static void
 nv20_miptree_surface_destroy(struct pipe_surface *ps)
 {
+	struct nv04_surface* ns = (struct nv04_surface*)ps;
+	if(ns->backing)
+	{
+		struct nv20_screen* screen = (struct nv20_screen*)ps->texture->screen;
+		if(ns->backing->base.usage & PIPE_BUFFER_USAGE_GPU_WRITE)
+			screen->eng2d->copy(screen->eng2d, &ns->backing->base, 0, 0, ps, 0, 0, ns->base.width, ns->base.height);
+		nv20_miptree_surface_destroy(&ns->backing->base);
+	}
+	
 	pipe_texture_reference(&ps->texture, NULL);
 	FREE(ps);
 }
diff --git a/src/gallium/drivers/nv20/nv20_screen.c b/src/gallium/drivers/nv20/nv20_screen.c
index 4eeacd1afd..a0973f1ebd 100644
--- a/src/gallium/drivers/nv20/nv20_screen.c
+++ b/src/gallium/drivers/nv20/nv20_screen.c
@@ -115,6 +115,9 @@ nv20_screen_destroy(struct pipe_screen *pscreen)
 
 	nouveau_notifier_free(&screen->sync);
 	nouveau_grobj_free(&screen->kelvin);
+	nv04_surface_2d_takedown(&screen->eng2d);
+
+	nouveau_screen_fini(&screen->base);
 
 	FREE(pscreen);
 }
diff --git a/src/gallium/drivers/nv20/nv20_state_emit.c b/src/gallium/drivers/nv20/nv20_state_emit.c
index 0122b1c2cd..63cba1f412 100644
--- a/src/gallium/drivers/nv20/nv20_state_emit.c
+++ b/src/gallium/drivers/nv20/nv20_state_emit.c
@@ -228,7 +228,7 @@ static void nv20_vertex_layout(struct nv20_context *nv20)
 	}
 
 	/* always do position */ {
-		src = draw_find_vs_output(dc, TGSI_SEMANTIC_POSITION, 0);
+		src = draw_find_shader_output(dc, TGSI_SEMANTIC_POSITION, 0);
 		draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src);
 		vinfo->hwfmt[0] |= (1 << 0);
 	}
@@ -237,19 +237,19 @@ static void nv20_vertex_layout(struct nv20_context *nv20)
 	for (i = 4; i < 6; i++) {
 		if (!generics[i])
 			continue;
-		src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i);
+		src = draw_find_shader_output(dc, TGSI_SEMANTIC_GENERIC, i);
 		draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
 		vinfo->hwfmt[0] |= (1 << (i - 3));
 	}
 
 	if (colors[0]) {
-		src = draw_find_vs_output(dc, TGSI_SEMANTIC_COLOR, 0);
+		src = draw_find_shader_output(dc, TGSI_SEMANTIC_COLOR, 0);
 		draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
 		vinfo->hwfmt[0] |= (1 << 3);
 	}
 
 	if (colors[1]) {
-		src = draw_find_vs_output(dc, TGSI_SEMANTIC_COLOR, 1);
+		src = draw_find_shader_output(dc, TGSI_SEMANTIC_COLOR, 1);
 		draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
 		vinfo->hwfmt[0] |= (1 << 4);
 	}
@@ -258,7 +258,7 @@ static void nv20_vertex_layout(struct nv20_context *nv20)
 	for (i = 6; i < 10; i++) {
 		if (!generics[i])
 			continue;
-		src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i);
+		src = draw_find_shader_output(dc, TGSI_SEMANTIC_GENERIC, i);
 		draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
 		vinfo->hwfmt[0] |= (1 << (i - 1));
 	}
@@ -267,7 +267,7 @@ static void nv20_vertex_layout(struct nv20_context *nv20)
 	for (i = 0; i < 4; i++) {
 		if (!generics[i])
 			continue;
-		src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i);
+		src = draw_find_shader_output(dc, TGSI_SEMANTIC_GENERIC, i);
 		draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
 		vinfo->hwfmt[0] |= (1 << (i + 9));
 	}
@@ -276,13 +276,13 @@ static void nv20_vertex_layout(struct nv20_context *nv20)
 	for (i = 10; i < 12; i++) {
 		if (!generics[i])
 			continue;
-		src = draw_find_vs_output(dc, TGSI_SEMANTIC_GENERIC, i);
+		src = draw_find_shader_output(dc, TGSI_SEMANTIC_GENERIC, i);
 		draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
 		vinfo->hwfmt[0] |= (1 << (i + 3));
 	}
 
 	if (fog) {
-		src = draw_find_vs_output(dc, TGSI_SEMANTIC_FOG, 0);
+		src = draw_find_shader_output(dc, TGSI_SEMANTIC_FOG, 0);
 		draw_emit_vertex_attr(vinfo, EMIT_1F, INTERP_PERSPECTIVE, src);
 		vinfo->hwfmt[0] |= (1 << 15);
 	}
diff --git a/src/gallium/drivers/nv20/nv20_transfer.c b/src/gallium/drivers/nv20/nv20_transfer.c
index 69b79c809f..699773e8e6 100644
--- a/src/gallium/drivers/nv20/nv20_transfer.c
+++ b/src/gallium/drivers/nv20/nv20_transfer.c
@@ -16,14 +16,14 @@ struct nv20_transfer {
 };
 
 static void
-nv20_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
+nv20_compatible_transfer_tex(struct pipe_texture *pt, unsigned width, unsigned height,
                              struct pipe_texture *template)
 {
 	memset(template, 0, sizeof(struct pipe_texture));
 	template->target = pt->target;
 	template->format = pt->format;
-	template->width0 = u_minify(pt->width0, level);
-	template->height0 = u_minify(pt->height0, level);
+	template->width0 = width;
+	template->height0 = height;
 	template->depth0 = 1;
 	template->last_level = 0;
 	template->nr_samples = pt->nr_samples;
@@ -71,7 +71,7 @@ nv20_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx->direct = false;
 
-	nv20_compatible_transfer_tex(pt, level, &tx_tex_template);
+	nv20_compatible_transfer_tex(pt, w, h, &tx_tex_template);
 
 	tx_tex = pscreen->texture_create(pscreen, &tx_tex_template);
 	if (!tx_tex)
@@ -80,6 +80,8 @@ nv20_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
+	tx->base.stride = ((struct nv20_miptree*)tx_tex)->level[0].pitch;
+
 	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
 	                                       face, level, zslice,
 	                                       pipe_transfer_buffer_flags(&tx->base));
@@ -105,8 +107,8 @@ nv20_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		/* TODO: Check if SIFM can un-swizzle */
 		nvscreen->eng2d->copy(nvscreen->eng2d,
 		                      tx->surface, 0, 0,
-		                      src, 0, 0,
-		                      src->width, src->height);
+		                      src, x, y,
+		                      w, h);
 
 		pipe_surface_reference(&src, NULL);
 	}
@@ -126,13 +128,13 @@ nv20_transfer_del(struct pipe_transfer *ptx)
 
 		dst = pscreen->get_tex_surface(pscreen, ptx->texture,
 	                                       ptx->face, ptx->level, ptx->zslice,
-	                                       PIPE_BUFFER_USAGE_GPU_WRITE);
+	                                       PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER);
 
 		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
 		nvscreen->eng2d->copy(nvscreen->eng2d,
-		                      dst, 0, 0,
+		                      dst, tx->base.x, tx->base.y,
 		                      tx->surface, 0, 0,
-		                      dst->width, dst->height);
+		                      tx->base.width, tx->base.height);
 
 		pipe_surface_reference(&dst, NULL);
 	}
@@ -151,8 +153,10 @@ nv20_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	void *map = pipe_buffer_map(pscreen, mt->buffer,
 	                            pipe_transfer_buffer_flags(ptx));
 
-	return map + ns->base.offset +
-	       ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
+	if(!tx->direct)
+		return map + ns->base.offset;
+	else
+		return map + ns->base.offset + ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
 }
 
 static void
diff --git a/src/gallium/drivers/nv20/nv20_vbo.c b/src/gallium/drivers/nv20/nv20_vbo.c
index 84d7db6c5e..4bf461eba9 100644
--- a/src/gallium/drivers/nv20/nv20_vbo.c
+++ b/src/gallium/drivers/nv20/nv20_vbo.c
@@ -45,7 +45,7 @@ boolean nv20_draw_elements( struct pipe_context *pipe,
 		draw_set_mapped_element_buffer(draw, 0, NULL);
 	}
 
-	draw_set_mapped_constant_buffer(draw,
+	draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX,
 					nv20->constbuf[PIPE_SHADER_VERTEX],
 					nv20->constbuf_nr[PIPE_SHADER_VERTEX]);
 
diff --git a/src/gallium/drivers/nv30/nv30_context.c b/src/gallium/drivers/nv30/nv30_context.c
index 46a821a48b..38b39159f1 100644
--- a/src/gallium/drivers/nv30/nv30_context.c
+++ b/src/gallium/drivers/nv30/nv30_context.c
@@ -25,6 +25,12 @@ static void
 nv30_destroy(struct pipe_context *pipe)
 {
 	struct nv30_context *nv30 = nv30_context(pipe);
+	unsigned i;
+
+	for (i = 0; i < NV30_STATE_MAX; i++) {
+		if (nv30->state.hw[i])
+			so_ref(NULL, &nv30->state.hw[i]);
+	}
 
 	if (nv30->draw)
 		draw_destroy(nv30->draw);
diff --git a/src/gallium/drivers/nv30/nv30_fragprog.c b/src/gallium/drivers/nv30/nv30_fragprog.c
index 40965a9772..d1ff18e2df 100644
--- a/src/gallium/drivers/nv30/nv30_fragprog.c
+++ b/src/gallium/drivers/nv30/nv30_fragprog.c
@@ -435,10 +435,11 @@ nv30_fragprog_parse_instruction(struct nv30_fpc *fpc,
 		arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
 		break;
 	case TGSI_OPCODE_CMP:
-		tmp = temp(fpc);
-		arith(fpc, sat, MOV, dst, mask, src[2], none, none);
+		tmp = nv30_sr(NV30SR_NONE, 0);
 		tmp.cc_update = 1;
 		arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
+		dst.cc_test = NV30_VP_INST_COND_GE;
+		arith(fpc, sat, MOV, dst, mask, src[2], none, none);
 		dst.cc_test = NV30_VP_INST_COND_LT;
 		arith(fpc, sat, MOV, dst, mask, src[1], none, none);
 		break;
@@ -517,13 +518,28 @@ nv30_fragprog_parse_instruction(struct nv30_fpc *fpc,
 		arith(fpc, sat, RSQ, dst, mask, abs(swz(src[0], X, X, X, X)), none, none);
 		break;
 	case TGSI_OPCODE_SCS:
-		if (mask & MASK_X) {
-			arith(fpc, sat, COS, dst, MASK_X,
-			      swz(src[0], X, X, X, X), none, none);
+		/* avoid overwriting the source: when it reads component x, emit SIN (writes y) before COS (writes x) */
+		if(src[0].swz[SWZ_X] != SWZ_X)
+		{
+			if (mask & MASK_X) {
+				arith(fpc, sat, COS, dst, MASK_X,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+			if (mask & MASK_Y) {
+				arith(fpc, sat, SIN, dst, MASK_Y,
+				      swz(src[0], X, X, X, X), none, none);
+			}
 		}
-		if (mask & MASK_Y) {
-			arith(fpc, sat, SIN, dst, MASK_Y,
-			      swz(src[0], X, X, X, X), none, none);
+		else
+		{
+			if (mask & MASK_Y) {
+				arith(fpc, sat, SIN, dst, MASK_Y,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+			if (mask & MASK_X) {
+				arith(fpc, sat, COS, dst, MASK_X,
+				      swz(src[0], X, X, X, X), none, none);
+			}
 		}
 		break;
 	case TGSI_OPCODE_SIN:
@@ -870,6 +886,12 @@ void
 nv30_fragprog_destroy(struct nv30_context *nv30,
 		      struct nv30_fragment_program *fp)
 {
+	if (fp->buffer)
+		pipe_buffer_reference(&fp->buffer, NULL);
+
+	if (fp->so)
+		so_ref(NULL, &fp->so);
+
 	if (fp->insn_len)
 		FREE(fp->insn);
 }
diff --git a/src/gallium/drivers/nv30/nv30_miptree.c b/src/gallium/drivers/nv30/nv30_miptree.c
index ce95d9700f..8fbba38e78 100644
--- a/src/gallium/drivers/nv30/nv30_miptree.c
+++ b/src/gallium/drivers/nv30/nv30_miptree.c
@@ -5,6 +5,7 @@
 #include "util/u_math.h"
 
 #include "nv30_context.h"
+#include "../nv04/nv04_surface_2d.h"
 
 static void
 nv30_miptree_layout(struct nv30_miptree *nv30mt)
@@ -108,6 +109,12 @@ nv30_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
 	if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC)
 		buf_usage |= PIPE_BUFFER_USAGE_CPU_READ_WRITE;
 
+	/* Apparently we can't render to swizzled surfaces whose pitch is smaller than 64 bytes, so make them linear.
+	 * If the user did not ask for a render target, they can still render to it, but it will cost them an extra copy.
+	 * This also happens for small mipmaps of large textures. */
+	if (pt->tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET && util_format_get_stride(pt->format, pt->width0) < 64)
+		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+
 	nv30_miptree_layout(mt);
 
 	mt->buffer = pscreen->buffer_create(pscreen, 256, buf_usage,
@@ -196,12 +203,27 @@ nv30_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		ns->base.offset = nv30mt->level[level].image_offset[0];
 	}
 
+	/* Create a linear temporary that we can render into if necessary.
+	 * Note that ns->pitch is always a multiple of 64 for linear surfaces, and swizzled surfaces are
+	 * power-of-two (POT), so (ns->pitch & 63) is equivalent to (ns->pitch < 64 && swizzled). */
+	if((ns->pitch & 63) && (ns->base.usage & (PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER)) == PIPE_BUFFER_USAGE_GPU_WRITE)
+		return &nv04_surface_wrap_for_render(pscreen, ((struct nv30_screen*)pscreen)->eng2d, ns)->base;
+
 	return &ns->base;
 }
 
 static void
 nv30_miptree_surface_del(struct pipe_surface *ps)
 {
+	struct nv04_surface* ns = (struct nv04_surface*)ps;
+	if(ns->backing)
+	{
+		struct nv30_screen* screen = (struct nv30_screen*)ps->texture->screen;
+		if(ns->backing->base.usage & PIPE_BUFFER_USAGE_GPU_WRITE)
+			screen->eng2d->copy(screen->eng2d, &ns->backing->base, 0, 0, ps, 0, 0, ns->base.width, ns->base.height);
+		nv30_miptree_surface_del(&ns->backing->base);
+	}
+
 	pipe_texture_reference(&ps->texture, NULL);
 	FREE(ps);
 }
diff --git a/src/gallium/drivers/nv30/nv30_screen.c b/src/gallium/drivers/nv30/nv30_screen.c
index 7cd36902eb..760467f736 100644
--- a/src/gallium/drivers/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nv30/nv30_screen.c
@@ -156,6 +156,12 @@ static void
 nv30_screen_destroy(struct pipe_screen *pscreen)
 {
 	struct nv30_screen *screen = nv30_screen(pscreen);
+	unsigned i;
+
+	for (i = 0; i < NV30_STATE_MAX; i++) {
+		if (screen->state[i])
+			so_ref(NULL, &screen->state[i]);
+	}
 
 	nouveau_resource_free(&screen->vp_exec_heap);
 	nouveau_resource_free(&screen->vp_data_heap);
@@ -163,6 +169,9 @@ nv30_screen_destroy(struct pipe_screen *pscreen)
 	nouveau_notifier_free(&screen->query);
 	nouveau_notifier_free(&screen->sync);
 	nouveau_grobj_free(&screen->rankine);
+	nv04_surface_2d_takedown(&screen->eng2d);
+
+	nouveau_screen_fini(&screen->base);
 
 	FREE(pscreen);
 }
diff --git a/src/gallium/drivers/nv30/nv30_transfer.c b/src/gallium/drivers/nv30/nv30_transfer.c
index 2255a02cae..65598991c6 100644
--- a/src/gallium/drivers/nv30/nv30_transfer.c
+++ b/src/gallium/drivers/nv30/nv30_transfer.c
@@ -16,14 +16,14 @@ struct nv30_transfer {
 };
 
 static void
-nv30_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
+nv30_compatible_transfer_tex(struct pipe_texture *pt, unsigned width, unsigned height,
                              struct pipe_texture *template)
 {
 	memset(template, 0, sizeof(struct pipe_texture));
 	template->target = pt->target;
 	template->format = pt->format;
-	template->width0 = u_minify(pt->width0, level);
-	template->height0 = u_minify(pt->height0, level);
+	template->width0 = width;
+	template->height0 = height;
 	template->depth0 = 1;
 	template->last_level = 0;
 	template->nr_samples = pt->nr_samples;
@@ -71,7 +71,7 @@ nv30_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx->direct = false;
 
-	nv30_compatible_transfer_tex(pt, level, &tx_tex_template);
+	nv30_compatible_transfer_tex(pt, w, h, &tx_tex_template);
 
 	tx_tex = pscreen->texture_create(pscreen, &tx_tex_template);
 	if (!tx_tex)
@@ -80,6 +80,8 @@ nv30_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
+	tx->base.stride = ((struct nv30_miptree*)tx_tex)->level[0].pitch;
+
 	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
 	                                       0, 0, 0,
 	                                       pipe_transfer_buffer_flags(&tx->base));
@@ -105,8 +107,8 @@ nv30_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		/* TODO: Check if SIFM can un-swizzle */
 		nvscreen->eng2d->copy(nvscreen->eng2d,
 		                      tx->surface, 0, 0,
-		                      src, 0, 0,
-		                      src->width, src->height);
+		                      src, x, y,
+		                      w, h);
 
 		pipe_surface_reference(&src, NULL);
 	}
@@ -126,13 +128,13 @@ nv30_transfer_del(struct pipe_transfer *ptx)
 
 		dst = pscreen->get_tex_surface(pscreen, ptx->texture,
 	                                       ptx->face, ptx->level, ptx->zslice,
-	                                       PIPE_BUFFER_USAGE_GPU_WRITE);
+	                                       PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER);
 
 		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
 		nvscreen->eng2d->copy(nvscreen->eng2d,
-		                      dst, 0, 0,
+		                      dst, tx->base.x, tx->base.y,
 		                      tx->surface, 0, 0,
-		                      dst->width, dst->height);
+		                      tx->base.width, tx->base.height);
 
 		pipe_surface_reference(&dst, NULL);
 	}
@@ -151,8 +153,10 @@ nv30_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	void *map = pipe_buffer_map(pscreen, mt->buffer,
 	                            pipe_transfer_buffer_flags(ptx));
 
-	return map + ns->base.offset +
-	       ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
+	if(!tx->direct)
+		return map + ns->base.offset;
+	else
+		return map + ns->base.offset + ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
 }
 
 static void
diff --git a/src/gallium/drivers/nv40/nv40_context.c b/src/gallium/drivers/nv40/nv40_context.c
index eb9cce4c78..d56c7a6b49 100644
--- a/src/gallium/drivers/nv40/nv40_context.c
+++ b/src/gallium/drivers/nv40/nv40_context.c
@@ -25,6 +25,12 @@ static void
 nv40_destroy(struct pipe_context *pipe)
 {
 	struct nv40_context *nv40 = nv40_context(pipe);
+	unsigned i;
+
+	for (i = 0; i < NV40_STATE_MAX; i++) {
+		if (nv40->state.hw[i])
+			so_ref(NULL, &nv40->state.hw[i]);
+	}
 
 	if (nv40->draw)
 		draw_destroy(nv40->draw);
diff --git a/src/gallium/drivers/nv40/nv40_draw.c b/src/gallium/drivers/nv40/nv40_draw.c
index b2f19ecb69..3875bc3545 100644
--- a/src/gallium/drivers/nv40/nv40_draw.c
+++ b/src/gallium/drivers/nv40/nv40_draw.c
@@ -261,7 +261,8 @@ nv40_draw_elements_swtnl(struct pipe_context *pipe,
 		map = pipe_buffer_map(pscreen,
 				      nv40->constbuf[PIPE_SHADER_VERTEX],
 				      PIPE_BUFFER_USAGE_CPU_READ);
-		draw_set_mapped_constant_buffer(nv40->draw, map, nr);
+		draw_set_mapped_constant_buffer(nv40->draw, PIPE_SHADER_VERTEX,
+                                                map, nr);
 	}
 
 	draw_arrays(nv40->draw, mode, start, count);
@@ -285,7 +286,7 @@ static INLINE void
 emit_attrib(struct nv40_context *nv40, unsigned hw, unsigned emit,
 	    unsigned semantic, unsigned index)
 {
-	unsigned draw_out = draw_find_vs_output(nv40->draw, semantic, index);
+	unsigned draw_out = draw_find_shader_output(nv40->draw, semantic, index);
 	unsigned a = nv40->swtnl.nr_attribs++;
 
 	nv40->swtnl.hw[a] = hw;
diff --git a/src/gallium/drivers/nv40/nv40_fragprog.c b/src/gallium/drivers/nv40/nv40_fragprog.c
index 1bf16726d1..bb9c85cc43 100644
--- a/src/gallium/drivers/nv40/nv40_fragprog.c
+++ b/src/gallium/drivers/nv40/nv40_fragprog.c
@@ -149,7 +149,7 @@ emit_src(struct nv40_fpc *fpc, int pos, struct nv40_sreg src)
 				sizeof(uint32_t) * 4);
 		}
 
-		sr |= (NV40_FP_REG_TYPE_CONST << NV40_FP_REG_TYPE_SHIFT);	
+		sr |= (NV40_FP_REG_TYPE_CONST << NV40_FP_REG_TYPE_SHIFT);
 		break;
 	case NV40SR_NONE:
 		sr |= (NV40_FP_REG_TYPE_INPUT << NV40_FP_REG_TYPE_SHIFT);
@@ -445,10 +445,11 @@ nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
 		arith(fpc, sat, ADD, dst, mask, src[0], src[1], none);
 		break;
 	case TGSI_OPCODE_CMP:
-		tmp = temp(fpc);
-		arith(fpc, sat, MOV, dst, mask, src[2], none, none);
+		tmp = nv40_sr(NV40SR_NONE, 0);
 		tmp.cc_update = 1;
 		arith(fpc, 0, MOV, tmp, 0xf, src[0], none, none);
+		dst.cc_test = NV40_VP_INST_COND_GE;
+		arith(fpc, sat, MOV, dst, mask, src[2], none, none);
 		dst.cc_test = NV40_VP_INST_COND_LT;
 		arith(fpc, sat, MOV, dst, mask, src[1], none, none);
 		break;
@@ -573,13 +574,28 @@ nv40_fragprog_parse_instruction(struct nv40_fpc *fpc,
 		      neg(swz(tmp, X, X, X, X)), none, none);
 		break;
 	case TGSI_OPCODE_SCS:
-		if (mask & MASK_X) {
-			arith(fpc, sat, COS, dst, MASK_X,
-			      swz(src[0], X, X, X, X), none, none);
+		/* avoid overwriting the source: when it reads component x, emit SIN (writes y) before COS (writes x) */
+		if(src[0].swz[SWZ_X] != SWZ_X)
+		{
+			if (mask & MASK_X) {
+				arith(fpc, sat, COS, dst, MASK_X,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+			if (mask & MASK_Y) {
+				arith(fpc, sat, SIN, dst, MASK_Y,
+				      swz(src[0], X, X, X, X), none, none);
+			}
 		}
-		if (mask & MASK_Y) {
-			arith(fpc, sat, SIN, dst, MASK_Y,
-			      swz(src[0], X, X, X, X), none, none);
+		else
+		{
+			if (mask & MASK_Y) {
+				arith(fpc, sat, SIN, dst, MASK_Y,
+				      swz(src[0], X, X, X, X), none, none);
+			}
+			if (mask & MASK_X) {
+				arith(fpc, sat, COS, dst, MASK_X,
+				      swz(src[0], X, X, X, X), none, none);
+			}
 		}
 		break;
 	case TGSI_OPCODE_SEQ:
@@ -752,7 +768,7 @@ nv40_fragprog_prepare(struct nv40_fpc *fpc)
 		{
 			struct tgsi_full_immediate *imm;
 			float vals[4];
-			
+
 			imm = &p.FullToken.FullImmediate;
 			assert(imm->Immediate.DataType == TGSI_IMM_FLOAT32);
 			assert(fpc->nr_imm < MAX_IMM);
@@ -836,7 +852,7 @@ nv40_fragprog_translate(struct nv40_context *nv40,
 	fp->insn[fpc->inst_offset + 1] = 0x00000000;
 	fp->insn[fpc->inst_offset + 2] = 0x00000000;
 	fp->insn[fpc->inst_offset + 3] = 0x00000000;
-	
+
 	fp->translated = TRUE;
 out_err:
 	tgsi_parse_free(&parse);
@@ -917,7 +933,7 @@ nv40_fragprog_validate(struct nv40_context *nv40)
 update_constants:
 	if (fp->nr_consts) {
 		float *map;
-		
+
 		map = pipe_buffer_map(pscreen, constbuf,
 				      PIPE_BUFFER_USAGE_CPU_READ);
 		for (i = 0; i < fp->nr_consts; i++) {
@@ -948,6 +964,12 @@ void
 nv40_fragprog_destroy(struct nv40_context *nv40,
 		      struct nv40_fragment_program *fp)
 {
+	if (fp->buffer)
+		pipe_buffer_reference(&fp->buffer, NULL);
+
+	if (fp->so)
+		so_ref(NULL, &fp->so);
+
 	if (fp->insn_len)
 		FREE(fp->insn);
 }
diff --git a/src/gallium/drivers/nv40/nv40_miptree.c b/src/gallium/drivers/nv40/nv40_miptree.c
index b974e68a07..89bd155ff4 100644
--- a/src/gallium/drivers/nv40/nv40_miptree.c
+++ b/src/gallium/drivers/nv40/nv40_miptree.c
@@ -5,6 +5,7 @@
 #include "util/u_math.h"
 
 #include "nv40_context.h"
+#include "../nv04/nv04_surface_2d.h"
 
 
 
@@ -105,6 +106,12 @@ nv40_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
 	if (pt->tex_usage & PIPE_TEXTURE_USAGE_DYNAMIC)
 		buf_usage |= PIPE_BUFFER_USAGE_CPU_READ_WRITE;
 
+	/* Apparently we can't render to swizzled surfaces whose pitch is smaller than 64 bytes, so make them linear.
+	 * If the user did not ask for a render target, they can still render to it, but it will cost them an extra copy.
+	 * This also happens for small mipmaps of large textures. */
+	if (pt->tex_usage & PIPE_TEXTURE_USAGE_RENDER_TARGET && util_format_get_stride(pt->format, pt->width0) < 64)
+		mt->base.tex_usage |= NOUVEAU_TEXTURE_USAGE_LINEAR;
+
 	nv40_miptree_layout(mt);
 
 	mt->buffer = pscreen->buffer_create(pscreen, 256, buf_usage, mt->total_size);
@@ -191,12 +198,27 @@ nv40_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		ns->base.offset = mt->level[level].image_offset[0];
 	}
 
+	/* Create a linear temporary that we can render into if necessary.
+	 * Note that ns->pitch is always a multiple of 64 for linear surfaces, and swizzled surfaces are
+	 * power-of-two (POT), so (ns->pitch & 63) is equivalent to (ns->pitch < 64 && swizzled). */
+	if((ns->pitch & 63) && (ns->base.usage & (PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER)) == PIPE_BUFFER_USAGE_GPU_WRITE)
+		return &nv04_surface_wrap_for_render(pscreen, ((struct nv40_screen*)pscreen)->eng2d, ns)->base;
+
 	return &ns->base;
 }
 
 static void
 nv40_miptree_surface_del(struct pipe_surface *ps)
 {
+	struct nv04_surface* ns = (struct nv04_surface*)ps;
+	if(ns->backing)
+	{
+		struct nv40_screen* screen = (struct nv40_screen*)ps->texture->screen;
+		if(ns->backing->base.usage & PIPE_BUFFER_USAGE_GPU_WRITE)
+			screen->eng2d->copy(screen->eng2d, &ns->backing->base, 0, 0, ps, 0, 0, ns->base.width, ns->base.height);
+		nv40_miptree_surface_del(&ns->backing->base);
+	}
+
 	pipe_texture_reference(&ps->texture, NULL);
 	FREE(ps);
 }
diff --git a/src/gallium/drivers/nv40/nv40_screen.c b/src/gallium/drivers/nv40/nv40_screen.c
index bd13dfddd1..d01e712805 100644
--- a/src/gallium/drivers/nv40/nv40_screen.c
+++ b/src/gallium/drivers/nv40/nv40_screen.c
@@ -140,6 +140,12 @@ static void
 nv40_screen_destroy(struct pipe_screen *pscreen)
 {
 	struct nv40_screen *screen = nv40_screen(pscreen);
+	unsigned i;
+
+	for (i = 0; i < NV40_STATE_MAX; i++) {
+		if (screen->state[i])
+			so_ref(NULL, &screen->state[i]);
+	}
 
 	nouveau_resource_free(&screen->vp_exec_heap);
 	nouveau_resource_free(&screen->vp_data_heap);
@@ -147,6 +153,7 @@ nv40_screen_destroy(struct pipe_screen *pscreen)
 	nouveau_notifier_free(&screen->query);
 	nouveau_notifier_free(&screen->sync);
 	nouveau_grobj_free(&screen->curie);
+	nv04_surface_2d_takedown(&screen->eng2d);
 
 	nouveau_screen_fini(&screen->base);
 
diff --git a/src/gallium/drivers/nv40/nv40_transfer.c b/src/gallium/drivers/nv40/nv40_transfer.c
index b084a38b48..791ee6823d 100644
--- a/src/gallium/drivers/nv40/nv40_transfer.c
+++ b/src/gallium/drivers/nv40/nv40_transfer.c
@@ -16,14 +16,14 @@ struct nv40_transfer {
 };
 
 static void
-nv40_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
+nv40_compatible_transfer_tex(struct pipe_texture *pt, unsigned width, unsigned height,
                              struct pipe_texture *template)
 {
 	memset(template, 0, sizeof(struct pipe_texture));
 	template->target = pt->target;
 	template->format = pt->format;
-	template->width0 = u_minify(pt->width0, level);
-	template->height0 = u_minify(pt->height0, level);
+	template->width0 = width;
+	template->height0 = height;
 	template->depth0 = 1;
 	template->last_level = 0;
 	template->nr_samples = pt->nr_samples;
@@ -71,7 +71,7 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx->direct = false;
 
-	nv40_compatible_transfer_tex(pt, level, &tx_tex_template);
+	nv40_compatible_transfer_tex(pt, w, h, &tx_tex_template);
 
 	tx_tex = pscreen->texture_create(pscreen, &tx_tex_template);
 	if (!tx_tex)
@@ -80,6 +80,8 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
+	tx->base.stride = ((struct nv40_miptree*)tx_tex)->level[0].pitch;
+
 	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
 	                                       0, 0, 0,
 	                                       pipe_transfer_buffer_flags(&tx->base));
@@ -105,8 +107,8 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		/* TODO: Check if SIFM can un-swizzle */
 		nvscreen->eng2d->copy(nvscreen->eng2d,
 		                      tx->surface, 0, 0,
-		                      src, 0, 0,
-		                      src->width, src->height);
+		                      src, x, y,
+		                      w, h);
 
 		pipe_surface_reference(&src, NULL);
 	}
@@ -126,13 +128,13 @@ nv40_transfer_del(struct pipe_transfer *ptx)
 
 		dst = pscreen->get_tex_surface(pscreen, ptx->texture,
 	                                       ptx->face, ptx->level, ptx->zslice,
-	                                       PIPE_BUFFER_USAGE_GPU_WRITE);
+	                                       PIPE_BUFFER_USAGE_GPU_WRITE | NOUVEAU_BUFFER_USAGE_NO_RENDER);
 
 		/* TODO: Check if SIFM can deal with x,y,w,h when swizzling */
 		nvscreen->eng2d->copy(nvscreen->eng2d,
-		                      dst, 0, 0,
+		                      dst, tx->base.x, tx->base.y,
 		                      tx->surface, 0, 0,
-		                      dst->width, dst->height);
+		                      tx->base.width, tx->base.height);
 
 		pipe_surface_reference(&dst, NULL);
 	}
@@ -151,8 +153,10 @@ nv40_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	void *map = pipe_buffer_map(pscreen, mt->buffer,
 	                            pipe_transfer_buffer_flags(ptx));
 
-	return map + ns->base.offset +
-	       ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
+	if(!tx->direct)
+		return map + ns->base.offset;
+	else
+		return map + ns->base.offset + ptx->y * ns->pitch + ptx->x * util_format_get_blocksize(ptx->texture->format);
 }
 
 static void
diff --git a/src/gallium/drivers/nv50/nv50_context.c b/src/gallium/drivers/nv50/nv50_context.c
index d21b80eab8..5997456e4c 100644
--- a/src/gallium/drivers/nv50/nv50_context.c
+++ b/src/gallium/drivers/nv50/nv50_context.c
@@ -43,6 +43,39 @@ nv50_destroy(struct pipe_context *pipe)
 {
 	struct nv50_context *nv50 = nv50_context(pipe);
 
+        if (nv50->state.fb)
+		so_ref(NULL, &nv50->state.fb);
+	if (nv50->state.blend)
+		so_ref(NULL, &nv50->state.blend);
+	if (nv50->state.blend_colour)
+		so_ref(NULL, &nv50->state.blend_colour);
+	if (nv50->state.zsa)
+		so_ref(NULL, &nv50->state.zsa);
+	if (nv50->state.rast)
+		so_ref(NULL, &nv50->state.rast);
+	if (nv50->state.stipple)
+		so_ref(NULL, &nv50->state.stipple);
+	if (nv50->state.scissor)
+		so_ref(NULL, &nv50->state.scissor);
+	if (nv50->state.viewport)
+		so_ref(NULL, &nv50->state.viewport);
+	if (nv50->state.tsc_upload)
+		so_ref(NULL, &nv50->state.tsc_upload);
+	if (nv50->state.tic_upload)
+		so_ref(NULL, &nv50->state.tic_upload);
+	if (nv50->state.vertprog)
+		so_ref(NULL, &nv50->state.vertprog);
+	if (nv50->state.fragprog)
+		so_ref(NULL, &nv50->state.fragprog);
+	if (nv50->state.programs)
+		so_ref(NULL, &nv50->state.programs);
+	if (nv50->state.vtxfmt)
+		so_ref(NULL, &nv50->state.vtxfmt);
+	if (nv50->state.vtxbuf)
+		so_ref(NULL, &nv50->state.vtxbuf);
+	if (nv50->state.vtxattr)
+		so_ref(NULL, &nv50->state.vtxattr);
+
 	draw_destroy(nv50->draw);
 	FREE(nv50);
 }
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index 679c28ce4b..2d0b1818ef 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -154,26 +154,17 @@ struct nv50_pc {
 	int if_lvl, loop_lvl;
 	unsigned loop_pos[NV50_MAX_LOOP_NESTING];
 
+	unsigned *insn_pos; /* actual program offset of each TGSI insn */
+	boolean in_subroutine;
+
 	/* current instruction and total number of insns */
 	unsigned insn_cur;
 	unsigned insn_nr;
 
 	boolean allow32;
-};
-
-static INLINE struct nv50_reg *
-reg_instance(struct nv50_pc *pc, struct nv50_reg *reg)
-{
-	struct nv50_reg *ri;
 
-	assert(pc->reg_instance_nr < 16);
-	ri = &pc->reg_instances[pc->reg_instance_nr++];
-	if (reg) {
-		*ri = *reg;
-		reg->mod = 0;
-	}
-	return ri;
-}
+	uint8_t edgeflag_out;
+};
 
 static INLINE void
 ctor_reg(struct nv50_reg *reg, unsigned type, int index, int hw)
@@ -253,6 +244,21 @@ alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
 	assert(0);
 }
 
+static INLINE struct nv50_reg *
+reg_instance(struct nv50_pc *pc, struct nv50_reg *reg)
+{
+	struct nv50_reg *ri;
+
+	assert(pc->reg_instance_nr < 16);
+	ri = &pc->reg_instances[pc->reg_instance_nr++];
+	if (reg) {
+		alloc_reg(pc, reg);
+		*ri = *reg;
+		reg->mod = 0;
+	}
+	return ri;
+}
+
 /* XXX: For shaders that aren't executed linearly (e.g. shaders that
  * contain loops), we need to assign all hw regs to TGSI TEMPs early,
  * lest we risk temp_temps overwriting regs alloc'd "later".
@@ -279,22 +285,6 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
 	return NULL;
 }
 
-/* Assign the hw of the discarded temporary register src
- * to the tgsi register dst and free src.
- */
-static void
-assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
-{
-	assert(src->index == -1 && src->hw != -1);
-
-	if (dst->hw != -1)
-		pc->r_temp[dst->hw] = NULL;
-	pc->r_temp[src->hw] = dst;
-	dst->hw = src->hw;
-
-	FREE(src);
-}
-
 /* release the hardware resource held by r */
 static void
 release_hw(struct nv50_pc *pc, struct nv50_reg *r)
@@ -451,10 +441,19 @@ is_immd(struct nv50_program_exec *e)
 	return FALSE;
 }
 
+/* TRUE iff e is a long insn whose low two bits of inst[1] equal 2. */
+static boolean
+is_join(struct nv50_program_exec *e)
+{
+	const boolean long_form = is_long(e);
+	return (long_form && (e->inst[1] & 3) == 2) ? TRUE : FALSE;
+}
+
 static INLINE void
 set_pred(struct nv50_pc *pc, unsigned pred, unsigned idx,
 	 struct nv50_program_exec *e)
 {
+	assert(!is_immd(e));
 	set_long(pc, e);
 	e->inst[1] &= ~((0x1f << 7) | (0x3 << 12));
 	e->inst[1] |= (pred << 7) | (idx << 12);
@@ -497,15 +496,6 @@ set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)
 static INLINE void
 set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
 {
-	union {
-		float f;
-		uint32_t ui;
-	} u;
-	u.ui = pc->immd_buf[imm->hw];
-
-	u.f = (imm->mod & NV50_MOD_ABS) ? fabsf(u.f) : u.f;
-	u.f = (imm->mod & NV50_MOD_NEG) ? -u.f : u.f;
-
 	set_long(pc, e);
 	/* XXX: can't be predicated - bits overlap; cases where both
 	 * are required should be avoided by using pc->allow32 */
@@ -513,8 +503,8 @@ set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)
 	set_pred_wr(pc, 0, 0, e);
 
 	e->inst[1] |= 0x00000002 | 0x00000001;
-	e->inst[0] |= (u.ui & 0x3f) << 16;
-	e->inst[1] |= (u.ui >> 6) << 2;
+	e->inst[0] |= (pc->immd_buf[imm->hw] & 0x3f) << 16;
+	e->inst[1] |= (pc->immd_buf[imm->hw] >> 6) << 2;
 }
 
 static INLINE void
@@ -663,6 +653,7 @@ set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,
 	e->inst[1] |= (((src->type == P_IMMD) ? 0 : 1) << 22);
 }
 
+/* emit_mov must not apply nv50_reg::mod; audit all callers before changing that. */
 static void
 emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
@@ -715,6 +706,34 @@ emit_mov_immdval(struct nv50_pc *pc, struct nv50_reg *dst, float f)
 	FREE(imm);
 }
 
+/* Move the value of the discarded temporary src into the tgsi register dst:
+ * either dst takes over src's hw slot and src is FREE'd, or the value is
+ * copied with emit_mov and src is released through free_temp.
+ */
+static void
+assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
+{
+	boolean copy;
+
+	assert(src->index == -1 && src->hw != -1);
+	/* copy unless dst is a TEMP outside IF/LOOP in a program without CAL/BRA */
+	copy = pc->if_lvl || pc->loop_lvl || dst->type != P_TEMP ||
+	       pc->p->info.opcode_count[TGSI_OPCODE_CAL] ||
+	       pc->p->info.opcode_count[TGSI_OPCODE_BRA];
+	if (pc->p->type == PIPE_SHADER_FRAGMENT && src->hw < pc->result_nr * 4)
+		copy = TRUE;
+	if (!copy) {
+		if (dst->hw != -1)
+			pc->r_temp[dst->hw] = NULL;
+		pc->r_temp[src->hw] = dst;
+		dst->hw = src->hw;
+		FREE(src);
+		return;
+	}
+	emit_mov(pc, dst, src);
+	free_temp(pc, src);
+}
+
 static void
 emit_nop(struct nv50_pc *pc)
 {
@@ -886,7 +905,7 @@ emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	set_dst(pc, dst, e);
 	set_src_0(pc, src0, e);
 	if (src1->type == P_IMMD && !is_long(e)) {
-		if (src0->mod & NV50_MOD_NEG)
+		if (src0->mod ^ src1->mod)
 			e->inst[0] |= 0x00008000;
 		set_immd(pc, src1, e);
 	} else {
@@ -997,6 +1016,8 @@ emit_bitop2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	    op != TGSI_OPCODE_XOR)
 		assert(!"invalid bit op");
 
+	assert(!(src0->mod | src1->mod));
+
 	if (src1->type == P_IMMD && src0->type == P_TEMP && pc->allow32) {
 		set_immd(pc, src1, e);
 		if (op == TGSI_OPCODE_OR)
@@ -1048,6 +1069,14 @@ emit_msb(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,
 	src2->mod ^= NV50_MOD_NEG;
 }
 
+#define NV50_FLOP_RCP 0
+#define NV50_FLOP_RSQ 2
+#define NV50_FLOP_LG2 3
+#define NV50_FLOP_SIN 4
+#define NV50_FLOP_COS 5
+#define NV50_FLOP_EX2 6
+
+/* rcp, rsqrt, lg2 support neg and abs */
 static void
 emit_flop(struct nv50_pc *pc, unsigned sub,
 	  struct nv50_reg *dst, struct nv50_reg *src)
@@ -1055,17 +1084,20 @@ emit_flop(struct nv50_pc *pc, unsigned sub,
 	struct nv50_program_exec *e = exec(pc);
 
 	e->inst[0] |= 0x90000000;
-	if (sub) {
+	if (sub || src->mod) {
 		set_long(pc, e);
 		e->inst[1] |= (sub << 29);
 	}
 
 	set_dst(pc, dst, e);
+	set_src_0_restricted(pc, src, e);
 
-	if (sub == 0 || sub == 2)
-		set_src_0_restricted(pc, src, e);
-	else
-		set_src_0(pc, src, e);
+	assert(!src->mod || sub < 4);
+
+	if (src->mod & NV50_MOD_NEG)
+		e->inst[1] |= 0x04000000;
+	if (src->mod & NV50_MOD_ABS)
+		e->inst[1] |= 0x00100000;
 
 	emit(pc, e);
 }
@@ -1082,6 +1114,11 @@ emit_preex2(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 	set_long(pc, e);
 	e->inst[1] |= (6 << 29) | 0x00004000;
 
+	if (src->mod & NV50_MOD_NEG)
+		e->inst[1] |= 0x04000000;
+	if (src->mod & NV50_MOD_ABS)
+		e->inst[1] |= 0x00100000;
+
 	emit(pc, e);
 }
 
@@ -1097,6 +1134,11 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 	set_long(pc, e);
 	e->inst[1] |= (6 << 29);
 
+	if (src->mod & NV50_MOD_NEG)
+		e->inst[1] |= 0x04000000;
+	if (src->mod & NV50_MOD_ABS)
+		e->inst[1] |= 0x00100000;
+
 	emit(pc, e);
 }
 
@@ -1231,10 +1273,10 @@ emit_pow(struct nv50_pc *pc, struct nv50_reg *dst,
 {
 	struct nv50_reg *temp = alloc_temp(pc, NULL);
 
-	emit_flop(pc, 3, temp, v);
+	emit_flop(pc, NV50_FLOP_LG2, temp, v);
 	emit_mul(pc, temp, temp, e);
 	emit_preex2(pc, temp, temp);
-	emit_flop(pc, 6, dst, temp);
+	emit_flop(pc, NV50_FLOP_EX2, dst, temp);
 
 	free_temp(pc, temp);
 }
@@ -1336,66 +1378,53 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
 }
 
 static struct nv50_program_exec *
-emit_breakaddr(struct nv50_pc *pc)
+emit_control_flow(struct nv50_pc *pc, unsigned op, int pred, unsigned cc)
 {
 	struct nv50_program_exec *e = exec(pc);
 
-	e->inst[0] = 0x40000002;
+	e->inst[0] = (op << 28) | 2; /* op lands in bits 28..31, low bits fixed to 2 */
 	set_long(pc, e);
+	if (pred >= 0) /* a negative pred leaves the insn unpredicated */
+		set_pred(pc, cc, pred, e);
 
 	emit(pc, e);
 	return e;
 }
 
-static void
-emit_break(struct nv50_pc *pc, int pred, unsigned cc)
+static INLINE struct nv50_program_exec *
+emit_breakaddr(struct nv50_pc *pc)
 {
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0x50000002;
-	set_long(pc, e);
-	if (pred >= 0)
-		set_pred(pc, cc, pred, e);
+	return emit_control_flow(pc, 0x4, -1, 0);
+}
 
-	emit(pc, e);
+static INLINE void
+emit_break(struct nv50_pc *pc, int pred, unsigned cc)
+{
+	emit_control_flow(pc, 0x5, pred, cc);
 }
 
-static struct nv50_program_exec *
+static INLINE struct nv50_program_exec *
 emit_joinat(struct nv50_pc *pc)
 {
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0xa0000002;
-	set_long(pc, e);
-
-	emit(pc, e);
-	return e;
+	return emit_control_flow(pc, 0xa, -1, 0);
 }
 
-static struct nv50_program_exec *
+static INLINE struct nv50_program_exec *
 emit_branch(struct nv50_pc *pc, int pred, unsigned cc)
 {
-	struct nv50_program_exec *e = exec(pc);
+	return emit_control_flow(pc, 0x1, pred, cc);
+}
 
-	e->inst[0] = 0x10000002;
-	set_long(pc, e);
-	if (pred >= 0)
-		set_pred(pc, cc, pred, e);
-	emit(pc, e);
-	return pc->p->exec_tail;
+static INLINE struct nv50_program_exec *
+emit_call(struct nv50_pc *pc, int pred, unsigned cc)
+{
+	return emit_control_flow(pc, 0x2, pred, cc);
 }
 
-static void
+static INLINE void
 emit_ret(struct nv50_pc *pc, int pred, unsigned cc)
 {
-	struct nv50_program_exec *e = exec(pc);
-
-	e->inst[0] = 0x30000002;
-	set_long(pc, e);
-	if (pred >= 0)
-		set_pred(pc, cc, pred, e);
-
-	emit(pc, e);
+	emit_control_flow(pc, 0x3, pred, cc);
 }
 
 #define QOP_ADD 0
@@ -1458,7 +1487,7 @@ load_cube_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
 	if (arg == 4) /* there is no textureProj(samplerCubeShadow) */
 		emit_mov(pc, t[3], src[3]);
 
-	emit_flop(pc, 0, t[2], t[2]);
+	emit_flop(pc, NV50_FLOP_RCP, t[2], t[2]);
 
 	emit_mul(pc, t[0], src[0], t[2]);
 	emit_mul(pc, t[1], src[1], t[2]);
@@ -1476,7 +1505,7 @@ load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
 
 		t[3]->rhw = src[3]->rhw;
 		emit_interp(pc, t[3], NULL, (mode & INTERP_CENTROID));
-		emit_flop(pc, 0, t[3], t[3]);
+		emit_flop(pc, NV50_FLOP_RCP, t[3], t[3]);
 
 		for (c = 0; c < dim; ++c) {
 			t[c]->rhw = src[c]->rhw;
@@ -1490,7 +1519,7 @@ load_proj_tex_coords(struct nv50_pc *pc, struct nv50_reg *t[4],
 		/* XXX: for some reason the blob sometimes uses MAD
 		 * (mad f32 $rX $rY $rZ neg $r63)
 		 */
-		emit_flop(pc, 0, t[3], src[3]);
+		emit_flop(pc, NV50_FLOP_RCP, t[3], src[3]);
 		for (c = 0; c < dim; ++c)
 			emit_mul(pc, t[c], src[c], t[3]);
 		if (arg != dim) /* depth reference value */
@@ -1537,7 +1566,13 @@ emit_texlod_sequence(struct nv50_pc *pc, struct nv50_reg *tlod,
 		     struct nv50_reg *src, struct nv50_program_exec *tex)
 {
 	struct nv50_program_exec *join_at;
-	unsigned i, target = pc->p->exec_size + 7 * 2;
+	unsigned i, target = pc->p->exec_size + 9 * 2;
+
+	if (pc->p->type != PIPE_SHADER_FRAGMENT) {
+		emit(pc, tex);
+		return;
+	}
+	pc->allow32 = FALSE;
 
 	/* Subtract lod of each pixel from lod of top left pixel, jump
 	 * texlod insn if result is 0, then repeat for 2 other pixels.
@@ -1663,6 +1698,7 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 		emit(pc, e);
 	} else
 	if (bias_lod < 0) {
+		assert(pc->p->type == PIPE_SHADER_FRAGMENT);
 		e->inst[0] |= arg << 22;
 		e->inst[1] |= 0x20000000; /* texbias */
 		emit_mov(pc, t[arg], src[3]);
@@ -1782,20 +1818,24 @@ static boolean
 negate_supported(const struct tgsi_full_instruction *insn, int i)
 {
 	switch (insn->Instruction.Opcode) {
+	case TGSI_OPCODE_ADD:
+	case TGSI_OPCODE_COS:
 	case TGSI_OPCODE_DDX:
 	case TGSI_OPCODE_DDY:
 	case TGSI_OPCODE_DP3:
 	case TGSI_OPCODE_DP4:
-	case TGSI_OPCODE_MUL:
+	case TGSI_OPCODE_EX2:
 	case TGSI_OPCODE_KIL:
-	case TGSI_OPCODE_ADD:
-	case TGSI_OPCODE_SUB:
+	case TGSI_OPCODE_LG2:
 	case TGSI_OPCODE_MAD:
-		return TRUE;
+	case TGSI_OPCODE_MUL:
 	case TGSI_OPCODE_POW:
-		if (i == 1)
-			return TRUE;
-		return FALSE;
+	case TGSI_OPCODE_RCP:
+	case TGSI_OPCODE_RSQ: /* ignored, RSQ = rsqrt(abs(src.x)) */
+	case TGSI_OPCODE_SCS:
+	case TGSI_OPCODE_SIN:
+	case TGSI_OPCODE_SUB:
+		return TRUE;
 	default:
 		return FALSE;
 	}
@@ -1820,7 +1860,9 @@ nv50_tgsi_src_mask(const struct tgsi_full_instruction *insn, int c)
 	case TGSI_OPCODE_DST:
 		return mask & (c ? 0xa : 0x6);
 	case TGSI_OPCODE_EX2:
+	case TGSI_OPCODE_EXP:
 	case TGSI_OPCODE_LG2:
+	case TGSI_OPCODE_LOG:
 	case TGSI_OPCODE_POW:
 	case TGSI_OPCODE_RCP:
 	case TGSI_OPCODE_RSQ:
@@ -2042,6 +2084,8 @@ nv50_tgsi_dst_revdep(unsigned op, int s, int c)
 			assert(0);
 			return 0x0;
 		}
+	case TGSI_OPCODE_EXP:
+	case TGSI_OPCODE_LOG:
 	case TGSI_OPCODE_LIT:
 	case TGSI_OPCODE_SCS:
 	case TGSI_OPCODE_TEX:
@@ -2082,6 +2126,8 @@ nv50_kill_branch(struct nv50_pc *pc)
 
 	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
 		return FALSE;
+	if (is_immd(pc->p->exec_tail))
+		return FALSE;
 
 	/* if ccode == 'true', the BRA is from an ELSE and the predicate
 	 * reg may no longer be valid, since we currently always use $p0
@@ -2215,10 +2261,22 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
 		terminate_mbb(pc);
 		break;
+	case TGSI_OPCODE_BGNSUB:
+		assert(!pc->in_subroutine);
+		pc->in_subroutine = TRUE;
+		/* probably not necessary, but align to 8 byte boundary */
+		if (!is_long(pc->p->exec_tail))
+			convert_to_long(pc, pc->p->exec_tail);
+		break;
 	case TGSI_OPCODE_BRK:
 		assert(pc->loop_lvl > 0);
 		emit_break(pc, -1, 0);
 		break;
+	case TGSI_OPCODE_CAL:
+		assert(inst->Label.Label < pc->insn_nr);
+		emit_call(pc, -1, 0)->param.index = inst->Label.Label;
+		/* replaced by actual offset in nv50_program_fixup_insns */
+		break;
 	case TGSI_OPCODE_CEIL:
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
@@ -2239,17 +2297,22 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			set_pred(pc, 0x6, 1, pc->p->exec_tail); /* @NSF */
 		}
 		break;
+	case TGSI_OPCODE_CONT:
+		assert(pc->loop_lvl > 0);
+		emit_branch(pc, -1, 0)->param.index =
+			pc->loop_pos[pc->loop_lvl - 1];
+		break;
 	case TGSI_OPCODE_COS:
 		if (mask & 8) {
 			emit_precossin(pc, temp, src[0][3]);
-			emit_flop(pc, 5, dst[3], temp);
+			emit_flop(pc, NV50_FLOP_COS, dst[3], temp);
 			if (!(mask &= 7))
 				break;
 			if (temp == dst[3])
 				temp = brdc = temp_temp(pc);
 		}
 		emit_precossin(pc, temp, src[0][0]);
-		emit_flop(pc, 5, brdc, temp);
+		emit_flop(pc, NV50_FLOP_COS, brdc, temp);
 		break;
 	case TGSI_OPCODE_DDX:
 		for (c = 0; c < 4; c++) {
@@ -2321,9 +2384,40 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		pc->loop_brka[pc->loop_lvl]->param.index = pc->p->exec_size;
 		terminate_mbb(pc);
 		break;
+	case TGSI_OPCODE_ENDSUB:
+		assert(pc->in_subroutine);
+		pc->in_subroutine = FALSE;
+		break;
 	case TGSI_OPCODE_EX2:
 		emit_preex2(pc, temp, src[0][0]);
-		emit_flop(pc, 6, brdc, temp);
+		emit_flop(pc, NV50_FLOP_EX2, brdc, temp);
+		break;
+	case TGSI_OPCODE_EXP:
+	{
+		struct nv50_reg *t[2];
+
+		assert(!temp);
+		t[0] = temp_temp(pc);
+		t[1] = temp_temp(pc);
+
+		if (mask & 0x6)
+			emit_mov(pc, t[0], src[0][0]);
+		if (mask & 0x3)
+			emit_flr(pc, t[1], src[0][0]);
+
+		if (mask & (1 << 1))
+			emit_sub(pc, dst[1], t[0], t[1]);
+		if (mask & (1 << 0)) {
+			emit_preex2(pc, t[1], t[1]);
+			emit_flop(pc, NV50_FLOP_EX2, dst[0], t[1]);
+		}
+		if (mask & (1 << 2)) {
+			emit_preex2(pc, t[0], t[0]);
+			emit_flop(pc, NV50_FLOP_EX2, dst[2], t[0]);
+		}
+		if (mask & (1 << 3))
+			emit_mov_immdval(pc, dst[3], 1.0f);
+	}
 		break;
 	case TGSI_OPCODE_FLR:
 		for (c = 0; c < 4; c++) {
@@ -2363,7 +2457,35 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		emit_lit(pc, &dst[0], mask, &src[0][0]);
 		break;
 	case TGSI_OPCODE_LG2:
-		emit_flop(pc, 3, brdc, src[0][0]);
+		emit_flop(pc, NV50_FLOP_LG2, brdc, src[0][0]);
+		break;
+	case TGSI_OPCODE_LOG:
+	{
+		struct nv50_reg *t[2];
+
+		t[0] = temp_temp(pc);
+		if (mask & (1 << 1))
+			t[1] = temp_temp(pc);
+		else
+			t[1] = t[0];
+
+		emit_abs(pc, t[0], src[0][0]);
+		emit_flop(pc, NV50_FLOP_LG2, t[1], t[0]);
+		if (mask & (1 << 2))
+			emit_mov(pc, dst[2], t[1]);
+		emit_flr(pc, t[1], t[1]);
+		if (mask & (1 << 0))
+			emit_mov(pc, dst[0], t[1]);
+		if (mask & (1 << 1)) {
+			t[1]->mod = NV50_MOD_NEG;
+			emit_preex2(pc, t[1], t[1]);
+			t[1]->mod = 0;
+			emit_flop(pc, NV50_FLOP_EX2, t[1], t[1]);
+			emit_mul(pc, dst[1], t[0], t[1]);
+		}
+		if (mask & (1 << 3))
+			emit_mov_immdval(pc, dst[3], 1.0f);
+	}
 		break;
 	case TGSI_OPCODE_LRP:
 		temp = temp_temp(pc);
@@ -2413,24 +2535,25 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		emit_pow(pc, brdc, src[0][0], src[1][0]);
 		break;
 	case TGSI_OPCODE_RCP:
-		emit_flop(pc, 0, brdc, src[0][0]);
+		emit_flop(pc, NV50_FLOP_RCP, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_RET:
-		if (pc->p->type == PIPE_SHADER_FRAGMENT)
+		if (pc->p->type == PIPE_SHADER_FRAGMENT && !pc->in_subroutine)
 			nv50_fp_move_results(pc);
 		emit_ret(pc, -1, 0);
 		break;
 	case TGSI_OPCODE_RSQ:
-		emit_flop(pc, 2, brdc, src[0][0]);
+		src[0][0]->mod |= NV50_MOD_ABS;
+		emit_flop(pc, NV50_FLOP_RSQ, brdc, src[0][0]);
 		break;
 	case TGSI_OPCODE_SCS:
 		temp = temp_temp(pc);
 		if (mask & 3)
 			emit_precossin(pc, temp, src[0][0]);
 		if (mask & (1 << 0))
-			emit_flop(pc, 5, dst[0], temp);
+			emit_flop(pc, NV50_FLOP_COS, dst[0], temp);
 		if (mask & (1 << 1))
-			emit_flop(pc, 4, dst[1], temp);
+			emit_flop(pc, NV50_FLOP_SIN, dst[1], temp);
 		if (mask & (1 << 2))
 			emit_mov_immdval(pc, dst[2], 0.0);
 		if (mask & (1 << 3))
@@ -2439,14 +2562,14 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 	case TGSI_OPCODE_SIN:
 		if (mask & 8) {
 			emit_precossin(pc, temp, src[0][3]);
-			emit_flop(pc, 4, dst[3], temp);
+			emit_flop(pc, NV50_FLOP_SIN, dst[3], temp);
 			if (!(mask &= 7))
 				break;
 			if (temp == dst[3])
 				temp = brdc = temp_temp(pc);
 		}
 		emit_precossin(pc, temp, src[0][0]);
-		emit_flop(pc, 4, brdc, temp);
+		emit_flop(pc, NV50_FLOP_SIN, brdc, temp);
 		break;
 	case TGSI_OPCODE_SLT:
 	case TGSI_OPCODE_SGE:
@@ -2510,6 +2633,17 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			emit_mov_immdval(pc, dst[3], 1.0);
 		break;
 	case TGSI_OPCODE_END:
+		if (pc->p->type == PIPE_SHADER_FRAGMENT)
+			nv50_fp_move_results(pc);
+
+		/* last insn must be long so it can have the exit bit set */
+		if (!is_long(pc->p->exec_tail))
+			convert_to_long(pc, pc->p->exec_tail);
+		else
+		if (is_immd(pc->p->exec_tail) || is_join(pc->p->exec_tail))
+			emit_nop(pc);
+
+		pc->p->exec_tail->inst[1] |= 1; /* set exit bit */
 		break;
 	default:
 		NOUVEAU_ERR("invalid opcode %d\n", inst->Instruction.Opcode);
@@ -2554,10 +2688,16 @@ prep_inspect_insn(struct nv50_pc *pc, const struct tgsi_full_instruction *insn)
 	mask = dst->WriteMask;
 
         if (dst->File == TGSI_FILE_TEMPORARY)
-                reg = pc->temp;
+		reg = pc->temp;
         else
-        if (dst->File == TGSI_FILE_OUTPUT)
-                reg = pc->result;
+	if (dst->File == TGSI_FILE_OUTPUT) {
+		reg = pc->result;
+
+		if (insn->Instruction.Opcode == TGSI_OPCODE_MOV &&
+		    dst->Index == pc->edgeflag_out &&
+		    insn->Src[0].Register.File == TGSI_FILE_INPUT)
+			pc->p->cfg.edgeflag_in = insn->Src[0].Register.Index;
+	}
 
 	if (reg) {
 		for (c = 0; c < 4; c++) {
@@ -2724,7 +2864,7 @@ nv50_tgsi_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 	}
 	pc->r_brdc = NULL;
 
-	if (!deqs)
+	if (!deqs || (!rdep[0] && !rdep[1] && !rdep[2] && !rdep[3]))
 		return nv50_program_tx_insn(pc, &insn);
 
 	deqs = nv50_revdep_reorder(m, rdep);
@@ -2775,7 +2915,7 @@ load_interpolant(struct nv50_pc *pc, struct nv50_reg *reg)
 		iv->rhw = popcnt4(pc->p->cfg.regs[1] >> 24) - 1;
 
 		emit_interp(pc, iv, NULL, mode & INTERP_CENTROID);
-		emit_flop(pc, 0, iv, iv);
+		emit_flop(pc, NV50_FLOP_RCP, iv, iv);
 
 		/* XXX: when loading interpolants dynamically, move these
 		 * to the program head, or make sure it can't be skipped.
@@ -2856,6 +2996,9 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 					if (p->cfg.io_nr > first)
 						p->cfg.io_nr = first;
 					break;
+				case TGSI_SEMANTIC_EDGEFLAG:
+					pc->edgeflag_out = first;
+					break;
 					/*
 				case TGSI_SEMANTIC_CLIP_DISTANCE:
 					p->cfg.clpd = MIN2(p->cfg.clpd, first);
@@ -3104,6 +3247,8 @@ ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
 	p->cfg.two_side[0].hw = 0x40;
 	p->cfg.two_side[1].hw = 0x40;
 
+	p->cfg.edgeflag_in = pc->edgeflag_out = 0xff;
+
 	switch (p->type) {
 	case PIPE_SHADER_VERTEX:
 		p->cfg.psiz = 0x40;
@@ -3192,16 +3337,6 @@ nv50_program_fixup_insns(struct nv50_pc *pc)
 		if (e->param.index >= 0 && !e->param.mask)
 			bra_list[n++] = e;
 
-	/* last instruction must be long so it can have the exit bit set */
-	if (!is_long(pc->p->exec_tail))
-		convert_to_long(pc, pc->p->exec_tail);
-	/* set exit bit */
-	pc->p->exec_tail->inst[1] |= 1;
-
-	/* !immd on exit insn simultaneously means !join */
-	assert(!is_immd(pc->p->exec_head));
-	assert(!is_immd(pc->p->exec_tail));
-
 	/* Make sure we don't have any single 32 bit instructions. */
 	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
 		pos += is_long(e) ? 2 : 1;
@@ -3210,12 +3345,24 @@ nv50_program_fixup_insns(struct nv50_pc *pc)
 			for (i = 0; i < n; ++i)
 				if (bra_list[i]->param.index >= pos)
 					bra_list[i]->param.index += 1;
+			for (i = 0; i < pc->insn_nr; ++i)
+				if (pc->insn_pos[i] >= pos)
+					pc->insn_pos[i] += 1;
 			convert_to_long(pc, e);
 			++pos;
 		}
 	}
 
 	FREE(bra_list);
+
+	if (!pc->p->info.opcode_count[TGSI_OPCODE_CAL])
+		return;
+
+	/* fill in CALL offsets */
+	for (e = pc->p->exec_head; e; e = e->next) {
+		if ((e->inst[0] & 2) && (e->inst[0] >> 28) == 0x2)
+			e->param.index = pc->insn_pos[e->param.index];
+	}
 }
 
 static boolean
@@ -3237,19 +3384,20 @@ nv50_program_tx(struct nv50_program *p)
 	if (ret == FALSE)
 		goto out_cleanup;
 
+	pc->insn_pos = MALLOC(pc->insn_nr * sizeof(unsigned));
+
 	tgsi_parse_init(&parse, pc->p->pipe.tokens);
 	while (!tgsi_parse_end_of_tokens(&parse)) {
 		const union tgsi_full_token *tok = &parse.FullToken;
 
-		/* don't allow half insn/immd on first and last instruction */
+		/* previously allow32 was FALSE for first & last instruction */
 		pc->allow32 = TRUE;
-		if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)
-			pc->allow32 = FALSE;
 
 		tgsi_parse_token(&parse);
 
 		switch (tok->Token.Type) {
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
+			pc->insn_pos[pc->insn_cur] = pc->p->exec_size;
 			++pc->insn_cur;
 			ret = nv50_tgsi_insn(pc, tok);
 			if (ret == FALSE)
@@ -3260,9 +3408,6 @@ nv50_program_tx(struct nv50_program *p)
 		}
 	}
 
-	if (pc->p->type == PIPE_SHADER_FRAGMENT)
-		nv50_fp_move_results(pc);
-
 	nv50_program_fixup_insns(pc);
 
 	p->param_nr = pc->param_nr * 4;
@@ -3480,7 +3625,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 	so_data  (so, p->cfg.high_temp);
 	so_method(so, tesla, NV50TCL_FP_RESULT_COUNT, 1);
 	so_data  (so, p->cfg.high_result);
-	so_method(so, tesla, NV50TCL_FP_CTRL_UNK19A8, 1);
+	so_method(so, tesla, NV50TCL_FP_CONTROL, 1);
 	so_data  (so, p->cfg.regs[2]);
 	so_method(so, tesla, NV50TCL_FP_CTRL_UNK196C, 1);
 	so_data  (so, p->cfg.regs[3]);
@@ -3652,7 +3797,7 @@ nv50_linkage_validate(struct nv50_context *nv50)
 	so_method(so, tesla, NV50TCL_FP_INTERPOLANT_CTRL, 1);
 	so_data  (so, reg[4]);
 
-	so_method(so, tesla, 0x1540, 4);
+	so_method(so, tesla, NV50TCL_NOPERSPECTIVE_BITMAP(0), 4);
 	so_datap (so, lin, 4);
 
 	if (nv50->rasterizer->pipe.point_sprite) {
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 4a90c372ce..461fec1d89 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -58,6 +58,7 @@ struct nv50_program {
 		/* VP only */
 		uint8_t clpd, clpd_nr;
 		uint8_t psiz;
+		uint8_t edgeflag_in;
 	} cfg;
 };
 
diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c
index 268c9823f7..5d9e18218a 100644
--- a/src/gallium/drivers/nv50/nv50_query.c
+++ b/src/gallium/drivers/nv50/nv50_query.c
@@ -77,9 +77,9 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nv50_query *q = nv50_query(pq);
 
-	BEGIN_RING(chan, tesla, 0x1530, 1);
+	BEGIN_RING(chan, tesla, NV50TCL_SAMPLECNT_RESET, 1);
 	OUT_RING  (chan, 1);
-	BEGIN_RING(chan, tesla, 0x1514, 1);
+	BEGIN_RING(chan, tesla, NV50TCL_SAMPLECNT_ENABLE, 1);
 	OUT_RING  (chan, 1);
 
 	q->ready = FALSE;
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index d443ca3ad0..7e039ea82e 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -128,7 +128,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, int param)
 	case PIPE_CAP_TEXTURE_MIRROR_REPEAT:
 		return 1;
 	case PIPE_CAP_TGSI_CONT_SUPPORTED:
-		return 0;
+		return 1;
 	case PIPE_CAP_BLEND_EQUATION_SEPARATE:
 		return 1;
 	case NOUVEAU_CAP_HW_VTXBUF:
@@ -165,6 +165,21 @@ static void
 nv50_screen_destroy(struct pipe_screen *pscreen)
 {
 	struct nv50_screen *screen = nv50_screen(pscreen);
+	unsigned i;
+
+	for (i = 0; i < 2; i++) {
+		if (screen->constbuf_parm[i])
+			nouveau_bo_ref(NULL, &screen->constbuf_parm[i]);
+	}
+
+	if (screen->constbuf_misc[0])
+		nouveau_bo_ref(NULL, &screen->constbuf_misc[0]);
+	if (screen->tic)
+		nouveau_bo_ref(NULL, &screen->tic);
+	if (screen->tsc)
+		nouveau_bo_ref(NULL, &screen->tsc);
+	if (screen->static_init)
+		so_ref(NULL, &screen->static_init);
 
 	nouveau_notifier_free(&screen->sync);
 	nouveau_grobj_free(&screen->tesla);
@@ -231,8 +246,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 		break;
 	case 0x80:
 	case 0x90:
-		/* this stupid name should be corrected. */
-		tesla_class = NV54TCL;
+		tesla_class = NV84TCL;
 		break;
 	case 0xa0:
 		switch (chipset) {
@@ -242,7 +256,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 			tesla_class = NVA0TCL;
 			break;
 		default:
-			tesla_class = 0x8597;
+			tesla_class = NVA8TCL;
 			break;
 		}
 		break;
@@ -287,7 +301,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	so_data  (so, chan->vram->handle);
 	so_method(so, screen->eng2d, NV50_2D_OPERATION, 1);
 	so_data  (so, NV50_2D_OPERATION_SRCCOPY);
-	so_method(so, screen->eng2d, 0x0290, 1);
+	so_method(so, screen->eng2d, NV50_2D_CLIP_ENABLE, 1);
 	so_data  (so, 0);
 	so_method(so, screen->eng2d, 0x0888, 1);
 	so_data  (so, 1);
@@ -297,34 +311,33 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	/* Static tesla init */
 	so = so_new(256, 20);
 
-	so_method(so, screen->tesla, 0x1558, 1);
-	so_data  (so, 1);
+	so_method(so, screen->tesla, NV50TCL_COND_MODE, 1);
+	so_data  (so, NV50TCL_COND_MODE_ALWAYS);
 	so_method(so, screen->tesla, NV50TCL_DMA_NOTIFY, 1);
 	so_data  (so, screen->sync->handle);
-	so_method(so, screen->tesla, NV50TCL_DMA_UNK0(0),
-				     NV50TCL_DMA_UNK0__SIZE);
-	for (i = 0; i < NV50TCL_DMA_UNK0__SIZE; i++)
+	so_method(so, screen->tesla, NV50TCL_DMA_ZETA, 11);
+	for (i = 0; i < 11; i++)
 		so_data(so, chan->vram->handle);
-	so_method(so, screen->tesla, NV50TCL_DMA_UNK1(0),
-				     NV50TCL_DMA_UNK1__SIZE);
-	for (i = 0; i < NV50TCL_DMA_UNK1__SIZE; i++)
+	so_method(so, screen->tesla, NV50TCL_DMA_COLOR(0),
+				     NV50TCL_DMA_COLOR__SIZE);
+	for (i = 0; i < NV50TCL_DMA_COLOR__SIZE; i++)
 		so_data(so, chan->vram->handle);
-	so_method(so, screen->tesla, 0x121c, 1);
+	so_method(so, screen->tesla, NV50TCL_RT_CONTROL, 1);
 	so_data  (so, 1);
 
 	/* activate all 32 lanes (threads) in a warp */
-	so_method(so, screen->tesla, 0x19a0, 1);
+	so_method(so, screen->tesla, NV50TCL_WARP_HALVES, 1);
 	so_data  (so, 0x2);
 	so_method(so, screen->tesla, 0x1400, 1);
 	so_data  (so, 0xf);
 
 	/* max TIC (bits 4:8) & TSC (ignored) bindings, per program type */
-	so_method(so, screen->tesla, 0x13b4, 1);
+	so_method(so, screen->tesla, NV50TCL_TEX_LIMITS(0), 1);
 	so_data  (so, 0x54);
-	so_method(so, screen->tesla, 0x13bc, 1);
+	so_method(so, screen->tesla, NV50TCL_TEX_LIMITS(2), 1);
 	so_data  (so, 0x54);
 	/* origin is top left (set to 1 for bottom left) */
-	so_method(so, screen->tesla, 0x13ac, 1);
+	so_method(so, screen->tesla, NV50TCL_Y_ORIGIN_BOTTOM, 1);
 	so_data  (so, 0);
 	so_method(so, screen->tesla, NV50TCL_VP_REG_ALLOC_RESULT, 1);
 	so_data  (so, 8);
@@ -360,7 +373,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 	//  B = buffer ID (maybe more than 1 byte)
 	//  N = CB index used in shader instruction
 	//  P = program type (0 = VP, 2 = GP, 3 = FP)
-	so_method(so, screen->tesla, 0x1694, 1);
+	so_method(so, screen->tesla, NV50TCL_SET_PROGRAM_CB, 1);
 	so_data  (so, 0x000BBNP1);
 	*/
 
@@ -424,23 +437,26 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
 
 	/* Vertex array limits - max them out */
 	for (i = 0; i < 16; i++) {
-		so_method(so, screen->tesla, NV50TCL_UNK1080_OFFSET_HIGH(i), 2);
+		so_method(so, screen->tesla, NV50TCL_VERTEX_ARRAY_LIMIT_HIGH(i), 2);
 		so_data  (so, 0x000000ff);
 		so_data  (so, 0xffffffff);
 	}
 
-	so_method(so, screen->tesla, NV50TCL_DEPTH_RANGE_NEAR, 2);
+	so_method(so, screen->tesla, NV50TCL_DEPTH_RANGE_NEAR(0), 2);
 	so_data  (so, fui(0.0));
 	so_data  (so, fui(1.0));
 
 	/* no dynamic combination of TIC & TSC entries => only BIND_TIC used */
-	so_method(so, screen->tesla, 0x1234, 1);
+	so_method(so, screen->tesla, NV50TCL_LINKED_TSC, 1);
 	so_data  (so, 1);
 
 	/* activate first scissor rectangle */
-	so_method(so, screen->tesla, NV50TCL_SCISSOR_ENABLE, 1);
+	so_method(so, screen->tesla, NV50TCL_SCISSOR_ENABLE(0), 1);
 	so_data  (so, 1);
 
+	so_method(so, screen->tesla, NV50TCL_EDGEFLAG_ENABLE, 1);
+	so_data  (so, 1); /* default edgeflag to TRUE */
+
 	so_emit(chan, so);
 	so_ref (so, &screen->static_init);
 	so_ref (NULL, &so);
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index 88aef52d08..30b2b0f91b 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -295,7 +295,7 @@ nv50_rasterizer_state_create(struct pipe_context *pipe,
 	so_method(so, tesla, NV50TCL_SHADE_MODEL, 1);
 	so_data  (so, cso->flatshade ? NV50TCL_SHADE_MODEL_FLAT :
 				       NV50TCL_SHADE_MODEL_SMOOTH);
-	so_method(so, tesla, 0x1684, 1);
+	so_method(so, tesla, NV50TCL_PROVOKING_VERTEX_LAST, 1);
 	so_data  (so, cso->flatshade_first ? 0 : 1);
 
 	so_method(so, tesla, NV50TCL_VERTEX_TWO_SIDE_ENABLE, 1);
@@ -392,7 +392,7 @@ nv50_rasterizer_state_create(struct pipe_context *pipe,
 		so_method(so, tesla, NV50TCL_POLYGON_OFFSET_FACTOR, 1);
 		so_data  (so, fui(cso->offset_scale));
 		so_method(so, tesla, NV50TCL_POLYGON_OFFSET_UNITS, 1);
-		so_data  (so, fui(cso->offset_units));
+		so_data  (so, fui(cso->offset_units * 2.0f));
 	}
 
 	rso->pipe = *cso;
@@ -439,9 +439,8 @@ nv50_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 		so_data  (so, 0);
 	}
 
-	/* XXX: keep hex values until header is updated (names reversed) */
 	if (cso->stencil[0].enabled) {
-		so_method(so, tesla, 0x1380, 8);
+		so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 8);
 		so_data  (so, 1);
 		so_data  (so, nvgl_stencil_op(cso->stencil[0].fail_op));
 		so_data  (so, nvgl_stencil_op(cso->stencil[0].zfail_op));
@@ -451,23 +450,23 @@ nv50_depth_stencil_alpha_state_create(struct pipe_context *pipe,
 		so_data  (so, cso->stencil[0].writemask);
 		so_data  (so, cso->stencil[0].valuemask);
 	} else {
-		so_method(so, tesla, 0x1380, 1);
+		so_method(so, tesla, NV50TCL_STENCIL_FRONT_ENABLE, 1);
 		so_data  (so, 0);
 	}
 
 	if (cso->stencil[1].enabled) {
-		so_method(so, tesla, 0x1594, 5);
+		so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 5);
 		so_data  (so, 1);
 		so_data  (so, nvgl_stencil_op(cso->stencil[1].fail_op));
 		so_data  (so, nvgl_stencil_op(cso->stencil[1].zfail_op));
 		so_data  (so, nvgl_stencil_op(cso->stencil[1].zpass_op));
 		so_data  (so, nvgl_comparison_op(cso->stencil[1].func));
-		so_method(so, tesla, 0x0f54, 3);
+		so_method(so, tesla, NV50TCL_STENCIL_BACK_FUNC_REF, 3);
 		so_data  (so, cso->stencil[1].ref_value);
 		so_data  (so, cso->stencil[1].writemask);
 		so_data  (so, cso->stencil[1].valuemask);
 	} else {
-		so_method(so, tesla, 0x1594, 1);
+		so_method(so, tesla, NV50TCL_STENCIL_BACK_ENABLE, 1);
 		so_data  (so, 0);
 	}
 
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index 871e8097b6..c8bdf9dc27 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -41,7 +41,7 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 	 * FP result 0 always goes to RT[0], bits 4 - 6 are ignored.
 	 * Ambiguous assignment results in no rendering (no DATA_ERROR).
 	 */
-	so_method(so, tesla, 0x121c, 1);
+	so_method(so, tesla, NV50TCL_RT_CONTROL, 1);
 	so_data  (so, fb->nr_cbufs |
 		  (0 <<  4) | (1 <<  7) | (2 << 10) | (3 << 13) |
 		  (4 << 16) | (5 << 19) | (6 << 22) | (7 << 25));
@@ -87,7 +87,7 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 				level[fb->cbufs[i]->level].tile_mode << 4);
 		so_data(so, 0x00000000);
 
-		so_method(so, tesla, 0x1224, 1);
+		so_method(so, tesla, NV50TCL_RT_ARRAY_MODE, 1);
 		so_data  (so, 1);
 	}
 
@@ -124,22 +124,22 @@ nv50_state_validate_fb(struct nv50_context *nv50)
 				level[fb->zsbuf->level].tile_mode << 4);
 		so_data(so, 0x00000000);
 
-		so_method(so, tesla, 0x1538, 1);
+		so_method(so, tesla, NV50TCL_ZETA_ENABLE, 1);
 		so_data  (so, 1);
 		so_method(so, tesla, NV50TCL_ZETA_HORIZ, 3);
 		so_data  (so, fb->zsbuf->width);
 		so_data  (so, fb->zsbuf->height);
 		so_data  (so, 0x00010001);
 	} else {
-		so_method(so, tesla, 0x1538, 1);
+		so_method(so, tesla, NV50TCL_ZETA_ENABLE, 1);
 		so_data  (so, 0);
 	}
 
-	so_method(so, tesla, NV50TCL_VIEWPORT_HORIZ, 2);
+	so_method(so, tesla, NV50TCL_VIEWPORT_HORIZ(0), 2);
 	so_data  (so, w << 16);
 	so_data  (so, h << 16);
 	/* set window lower left corner */
-	so_method(so, tesla, NV50TCL_WINDOW_LEFT, 2);
+	so_method(so, tesla, NV50TCL_WINDOW_OFFSET_X, 2);
 	so_data  (so, 0);
 	so_data  (so, 0);
 	/* set screen scissor rectangle */
@@ -325,7 +325,7 @@ nv50_state_validate(struct nv50_context *nv50)
 		nv50->state.scissor_enabled = rast->scissor;
 
 		so = so_new(3, 0);
-		so_method(so, tesla, NV50TCL_SCISSOR_HORIZ, 2);
+		so_method(so, tesla, NV50TCL_SCISSOR_HORIZ(0), 2);
 		if (nv50->state.scissor_enabled) {
 			so_data(so, (s->maxx << 16) | s->minx);
 			so_data(so, (s->maxy << 16) | s->miny);
@@ -355,11 +355,11 @@ scissor_uptodate:
 
 		so = so_new(14, 0);
 		if (!bypass) {
-			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSLATE(0), 3);
+			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSLATE_X(0), 3);
 			so_data  (so, fui(nv50->viewport.translate[0]));
 			so_data  (so, fui(nv50->viewport.translate[1]));
 			so_data  (so, fui(nv50->viewport.translate[2]));
-			so_method(so, tesla, NV50TCL_VIEWPORT_SCALE(0), 3);
+			so_method(so, tesla, NV50TCL_VIEWPORT_SCALE_X(0), 3);
 			so_data  (so, fui(nv50->viewport.scale[0]));
 			so_data  (so, fui(nv50->viewport.scale[1]));
 			so_data  (so, fui(nv50->viewport.scale[2]));
@@ -440,7 +440,7 @@ void nv50_so_init_sifc(struct nv50_context *nv50,
 	so_data  (so, 1);
 	so_reloc (so, bo, offset, reloc | NOUVEAU_BO_HIGH, 0, 0);
 	so_reloc (so, bo, offset, reloc | NOUVEAU_BO_LOW, 0, 0);
-	so_method(so, eng2d, NV50_2D_SIFC_UNK0800, 2);
+	so_method(so, eng2d, NV50_2D_SIFC_BITMAP_ENABLE, 2);
 	so_data  (so, 0);
 	so_data  (so, NV50_2D_SIFC_FORMAT_R8_UNORM);
 	so_method(so, eng2d, NV50_2D_SIFC_WIDTH, 10);
diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c
index 79655fc08d..6378132979 100644
--- a/src/gallium/drivers/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nv50/nv50_surface.c
@@ -176,11 +176,11 @@ nv50_surface_fill(struct pipe_context *pipe, struct pipe_surface *dest,
 	if (ret)
 		return;
 
-	BEGIN_RING(chan, eng2d, 0x0580, 3);
-	OUT_RING  (chan, 4);
+	BEGIN_RING(chan, eng2d, NV50_2D_DRAW_SHAPE, 3);
+	OUT_RING  (chan, NV50_2D_DRAW_SHAPE_RECTANGLES);
 	OUT_RING  (chan, format);
 	OUT_RING  (chan, value);
-	BEGIN_RING(chan, eng2d, NV50_2D_RECT_X1, 4);
+	BEGIN_RING(chan, eng2d, NV50_2D_DRAW_POINT32_X(0), 4);
 	OUT_RING  (chan, destx);
 	OUT_RING  (chan, desty);
 	OUT_RING  (chan, width);
diff --git a/src/gallium/drivers/nv50/nv50_transfer.c b/src/gallium/drivers/nv50/nv50_transfer.c
index 4d9afa6fed..a2f1db2914 100644
--- a/src/gallium/drivers/nv50/nv50_transfer.c
+++ b/src/gallium/drivers/nv50/nv50_transfer.c
@@ -47,7 +47,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
 			NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_IN, 1);
 		OUT_RING  (chan, 1);
 		BEGIN_RING(chan, m2mf,
-			NV50_MEMORY_TO_MEMORY_FORMAT_PITCH_IN, 1);
+			NV04_MEMORY_TO_MEMORY_FORMAT_PITCH_IN, 1);
 		OUT_RING  (chan, src_pitch);
 		src_offset += (sy * src_pitch) + (sx * cpp);
 	} else {
@@ -66,7 +66,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
 			NV50_MEMORY_TO_MEMORY_FORMAT_LINEAR_OUT, 1);
 		OUT_RING  (chan, 1);
 		BEGIN_RING(chan, m2mf,
-			NV50_MEMORY_TO_MEMORY_FORMAT_PITCH_OUT, 1);
+			NV04_MEMORY_TO_MEMORY_FORMAT_PITCH_OUT, 1);
 		OUT_RING  (chan, dst_pitch);
 		dst_offset += (dy * dst_pitch) + (dx * cpp);
 	} else {
@@ -89,7 +89,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
 		OUT_RELOCh(chan, src_bo, src_offset, src_reloc);
 		OUT_RELOCh(chan, dst_bo, dst_offset, dst_reloc);
 		BEGIN_RING(chan, m2mf,
-			NV50_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 2);
+			NV04_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 2);
 		OUT_RELOCl(chan, src_bo, src_offset, src_reloc);
 		OUT_RELOCl(chan, dst_bo, dst_offset, dst_reloc);
 		if (src_bo->tile_flags) {
@@ -107,7 +107,7 @@ nv50_transfer_rect_m2mf(struct pipe_screen *pscreen,
 			dst_offset += (line_count * dst_pitch);
 		}
 		BEGIN_RING(chan, m2mf,
-			NV50_MEMORY_TO_MEMORY_FORMAT_LINE_LENGTH_IN, 4);
+			NV04_MEMORY_TO_MEMORY_FORMAT_LINE_LENGTH_IN, 4);
 		OUT_RING  (chan, width * cpp);
 		OUT_RING  (chan, line_count);
 		OUT_RING  (chan, 0x00000101);
@@ -291,7 +291,7 @@ nv50_upload_sifc(struct nv50_context *nv50,
 
 	/* NV50_2D_OPERATION_SRCCOPY assumed already set */
 
-	BEGIN_RING(chan, eng2d, NV50_2D_SIFC_UNK0800, 2);
+	BEGIN_RING(chan, eng2d, NV50_2D_SIFC_BITMAP_ENABLE, 2);
 	OUT_RING  (chan, 0);
 	OUT_RING  (chan, src_format);
 	BEGIN_RING(chan, eng2d, NV50_2D_SIFC_WIDTH, 10);
@@ -334,6 +334,6 @@ nv50_upload_sifc(struct nv50_context *nv50,
 		src += src_pitch;
 	}
 
-	BEGIN_RING(chan, tesla, 0x1440, 1);
+	BEGIN_RING(chan, tesla, NV50TCL_CODE_CB_FLUSH, 1);
 	OUT_RING  (chan, 0);
 }
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index f7fa0659e8..602adfc50d 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -99,19 +99,19 @@ nv50_vbo_size_to_hw(unsigned size, unsigned nr_c)
 {
 	static const uint32_t hw_values[] = {
 		0, 0, 0, 0,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8_8,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8_8_8,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_8_8_8_8,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16_16,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16_16_16,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_16_16_16_16,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_8_8_8_8,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_16_16_16_16,
 		0, 0, 0, 0,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32_32,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32_32_32,
-		NV50TCL_VERTEX_ARRAY_ATTRIB_SIZE_32_32_32_32 };
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32,
+		NV50TCL_VERTEX_ARRAY_ATTRIB_FORMAT_32_32_32_32 };
 
 	/* we'd also have R11G11B10 and R10G10B10A2 */
 
@@ -198,7 +198,7 @@ nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map,
 		return nv50_push_elements_u08(nv50, map, count);
 
 	if (count & 1) {
-		BEGIN_RING(chan, tesla, 0x15e8, 1);
+		BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U32, 1);
 		OUT_RING  (chan, map[0]);
 		map++;
 		count--;
@@ -208,7 +208,7 @@ nv50_draw_elements_inline_u08(struct nv50_context *nv50, uint8_t *map,
 		unsigned nr = count > 2046 ? 2046 : count;
 		int i;
 
-		BEGIN_RING(chan, tesla, 0x400015f0, nr >> 1);
+		BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U16 | 0x40000000, nr >> 1);
 		for (i = 0; i < nr; i += 2)
 			OUT_RING  (chan, (map[i + 1] << 16) | map[i]);
 
@@ -231,7 +231,7 @@ nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map,
 		return nv50_push_elements_u16(nv50, map, count);
 
 	if (count & 1) {
-		BEGIN_RING(chan, tesla, 0x15e8, 1);
+		BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U32, 1);
 		OUT_RING  (chan, map[0]);
 		map++;
 		count--;
@@ -241,7 +241,7 @@ nv50_draw_elements_inline_u16(struct nv50_context *nv50, uint16_t *map,
 		unsigned nr = count > 2046 ? 2046 : count;
 		int i;
 
-		BEGIN_RING(chan, tesla, 0x400015f0, nr >> 1);
+		BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U16 | 0x40000000, nr >> 1);
 		for (i = 0; i < nr; i += 2)
 			OUT_RING  (chan, (map[i + 1] << 16) | map[i]);
 
@@ -266,7 +266,7 @@ nv50_draw_elements_inline_u32(struct nv50_context *nv50, uint32_t *map,
 	while (count) {
 		unsigned nr = count > 2047 ? 2047 : count;
 
-		BEGIN_RING(chan, tesla, 0x400015e8, nr);
+		BEGIN_RING(chan, tesla, NV50TCL_VB_ELEMENT_U32 | 0x40000000, nr);
 		OUT_RINGp (chan, map, nr);
 
 		count -= nr;
@@ -372,6 +372,10 @@ nv50_vbo_static_attrib(struct nv50_context *nv50, unsigned attrib,
 		so_data  (so, fui(v[1]));
 		break;
 	case 1:
+		if (attrib == nv50->vertprog->cfg.edgeflag_in) {
+			so_method(so, tesla, NV50TCL_EDGEFLAG_ENABLE, 1);
+			so_data  (so, v[0] ? 1 : 0);
+		}
 		so_method(so, tesla, NV50TCL_VTX_ATTR_1F(attrib), 1);
 		so_data  (so, fui(v[0]));
 		break;
@@ -401,6 +405,9 @@ nv50_vbo_validate(struct nv50_context *nv50)
 		    !(nv50->vtxbuf[i].buffer->usage & PIPE_BUFFER_USAGE_VERTEX))
 			nv50->vbo_fifo = 0xffff;
 
+	if (nv50->vertprog->cfg.edgeflag_in < 16)
+		nv50->vbo_fifo = 0xffff; /* vertprog can't set edgeflag */
+
 	n_ve = MAX2(nv50->vtxelt_nr, nv50->state.vtxelt_nr);
 
 	vtxattr = NULL;
@@ -445,7 +452,7 @@ nv50_vbo_validate(struct nv50_context *nv50)
 			  NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);
 
 		/* vertex array limits */
-		so_method(vtxbuf, tesla, 0x1080 + (i * 8), 2);
+		so_method(vtxbuf, tesla, NV50TCL_VERTEX_ARRAY_LIMIT_HIGH(i), 2);
 		so_reloc (vtxbuf, bo, vb->buffer->size - 1,
 			  NOUVEAU_BO_VRAM | NOUVEAU_BO_GART | NOUVEAU_BO_RD |
 			  NOUVEAU_BO_HIGH, 0, 0);
@@ -479,6 +486,9 @@ struct nv50_vbo_emitctx
 	unsigned nr_ve;
 	unsigned vtx_dwords;
 	unsigned vtx_max;
+
+	float edgeflag;
+	unsigned ve_edgeflag;
 };
 
 static INLINE void
@@ -622,6 +632,9 @@ emit_prepare(struct nv50_context *nv50, struct nv50_vbo_emitctx *emit,
 	if (nv50_map_vbufs(nv50) == FALSE)
 		return FALSE;
 
+	emit->ve_edgeflag = nv50->vertprog->cfg.edgeflag_in;
+
+	emit->edgeflag = 0.5f;
 	emit->nr_ve = 0;
 	emit->vtx_dwords = 0;
 
@@ -644,7 +657,8 @@ emit_prepare(struct nv50_context *nv50, struct nv50_vbo_emitctx *emit,
 		desc = util_format_description(ve->src_format);
 		assert(desc);
 
-		size = util_format_get_component_bits(ve->src_format, UTIL_FORMAT_COLORSPACE_RGB, 0);
+		size = util_format_get_component_bits(
+			ve->src_format, UTIL_FORMAT_COLORSPACE_RGB, 0);
 
 		assert(ve->nr_components > 0 && ve->nr_components <= 4);
 
@@ -686,10 +700,31 @@ emit_prepare(struct nv50_context *nv50, struct nv50_vbo_emitctx *emit,
 	}
 
 	emit->vtx_max = 512 / emit->vtx_dwords;
+	if (emit->ve_edgeflag < 16)
+		emit->vtx_max = 1;
 
 	return TRUE;
 }
 
+static INLINE void /* emits the edge flag of vertex 'index' when it changed */
+set_edgeflag(struct nouveau_channel *chan,
+	     struct nouveau_grobj *tesla,
+	     struct nv50_vbo_emitctx *emit, uint32_t index)
+{
+	unsigned i = emit->ve_edgeflag;
+
+	if (i < 16) { /* presumably: a vertex element carries the edge flag */
+		float f = *((float *)(emit->map[i] + index * emit->stride[i]));
+
+		if (emit->edgeflag != f) { /* skip the method if unchanged */
+			emit->edgeflag = f;
+
+			BEGIN_RING(chan, tesla, NV50TCL_EDGEFLAG_ENABLE, 1);
+			OUT_RING  (chan, f ? 1 : 0);
+		}
+	}
+}
+
 static boolean
 nv50_push_arrays(struct nv50_context *nv50, unsigned start, unsigned count)
 {
@@ -704,6 +739,8 @@ nv50_push_arrays(struct nv50_context *nv50, unsigned start, unsigned count)
 		unsigned i, dw, nr = MIN2(count, emit.vtx_max);
 	        dw = nr * emit.vtx_dwords;
 
+		set_edgeflag(chan, tesla, &emit, 0); /* nr will be 1 */
+
 		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
 		for (i = 0; i < nr; ++i)
 			emit_vtx_next(chan, &emit);
@@ -729,6 +766,8 @@ nv50_push_elements_u32(struct nv50_context *nv50, uint32_t *map, unsigned count)
 		unsigned i, dw, nr = MIN2(count, emit.vtx_max);
 	        dw = nr * emit.vtx_dwords;
 
+		set_edgeflag(chan, tesla, &emit, *map);
+
 		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
 		for (i = 0; i < nr; ++i)
 			emit_vtx(chan, &emit, *map++);
@@ -754,6 +793,8 @@ nv50_push_elements_u16(struct nv50_context *nv50, uint16_t *map, unsigned count)
 		unsigned i, dw, nr = MIN2(count, emit.vtx_max);
 	        dw = nr * emit.vtx_dwords;
 
+		set_edgeflag(chan, tesla, &emit, *map);
+
 		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
 		for (i = 0; i < nr; ++i)
 			emit_vtx(chan, &emit, *map++);
@@ -779,6 +820,8 @@ nv50_push_elements_u08(struct nv50_context *nv50, uint8_t *map, unsigned count)
 		unsigned i, dw, nr = MIN2(count, emit.vtx_max);
 	        dw = nr * emit.vtx_dwords;
 
+		set_edgeflag(chan, tesla, &emit, *map);
+
 		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_DATA | 0x40000000, dw);
 		for (i = 0; i < nr; ++i)
 			emit_vtx(chan, &emit, *map++);
diff --git a/src/gallium/drivers/r300/SConscript b/src/gallium/drivers/r300/SConscript
index 0d2de17be9..183aa17f9b 100644
--- a/src/gallium/drivers/r300/SConscript
+++ b/src/gallium/drivers/r300/SConscript
@@ -4,7 +4,12 @@ r300compiler = SConscript('#/src/mesa/drivers/dri/r300/compiler/SConscript')
 
 env = env.Clone()
 # add the paths for r300compiler
-env.Append(CPPPATH = ['#/src/mesa/drivers/dri/r300/compiler', '#/include', '#/src/mesa'])
+env.Append(CPPPATH = [
+    '#/src/mesa/drivers/dri/r300/compiler', 
+    '#/src/gallium/winsys/drm/radeon/core',
+    '#/include', 
+    '#/src/mesa',
+])
 
 r300 = env.ConvenienceLibrary(
     target = 'r300',
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index 199ce3a945..1dc9216a7b 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -997,7 +997,7 @@ validate:
             goto validate;
         }
     } else {
-        // debug_printf("No VBO while emitting dirty state!\n");
+        /* debug_printf("No VBO while emitting dirty state!\n"); */
     }
     if (!r300->winsys->validate(r300->winsys)) {
         r300->context.flush(&r300->context, 0, NULL);
@@ -1129,7 +1129,7 @@ validate:
     */
 
     /* Finally, emit the VBO. */
-    //r300_emit_vertex_buffer(r300);
+    /* r300_emit_vertex_buffer(r300); */
 
     r300->dirty_hw++;
 }
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index d8d08fbe26..0aa1da07f8 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -2638,7 +2638,7 @@ enum {
 	VE_COND_MUX_GTE			= 25,
 	VE_SET_GREATER_THAN		= 26,
 	VE_SET_EQUAL			= 27,
-	VE_SET_NOT_EQUAL		= 28,
+	VE_SET_NOT_EQUAL		= 28
 };
 
 enum {
@@ -2672,20 +2672,20 @@ enum {
 	ME_PRED_SET_CLR			= 25,
 	ME_PRED_SET_INV			= 26,
 	ME_PRED_SET_POP			= 27,
-	ME_PRED_SET_RESTORE		= 28,
+	ME_PRED_SET_RESTORE		= 28
 };
 
 enum {
 	/* R3XX */
 	PVS_MACRO_OP_2CLK_MADD		= 0,
-	PVS_MACRO_OP_2CLK_M2X_ADD	= 1,
+	PVS_MACRO_OP_2CLK_M2X_ADD	= 1
 };
 
 enum {
 	PVS_SRC_REG_TEMPORARY		= 0,	/* Intermediate Storage */
 	PVS_SRC_REG_INPUT		= 1,	/* Input Vertex Storage */
 	PVS_SRC_REG_CONSTANT		= 2,	/* Constant State Storage */
-	PVS_SRC_REG_ALT_TEMPORARY	= 3,	/* Alternate Intermediate Storage */
+	PVS_SRC_REG_ALT_TEMPORARY	= 3	/* Alternate Intermediate Storage */
 };
 
 enum {
@@ -2694,7 +2694,7 @@ enum {
 	PVS_DST_REG_OUT			= 2,	/* Output Memory. Used for all outputs */
 	PVS_DST_REG_OUT_REPL_X		= 3,	/* Output Memory & Replicate X to all channels */
 	PVS_DST_REG_ALT_TEMPORARY	= 4,	/* Alternate Intermediate Storage */
-	PVS_DST_REG_INPUT		= 5,	/* Output Memory & Replicate X to all channels */
+	PVS_DST_REG_INPUT		= 5	/* Output Memory & Replicate X to all channels */
 };
 
 enum {
@@ -2703,7 +2703,7 @@ enum {
 	PVS_SRC_SELECT_Z		= 2,	/* Select Z Component */
 	PVS_SRC_SELECT_W		= 3,	/* Select W Component */
 	PVS_SRC_SELECT_FORCE_0		= 4,	/* Force Component to 0.0 */
-	PVS_SRC_SELECT_FORCE_1		= 5,	/* Force Component to 1.0 */
+	PVS_SRC_SELECT_FORCE_1		= 5	/* Force Component to 1.0 */
 };
 
 /* PVS Opcode & Destination Operand Description */
@@ -2742,7 +2742,7 @@ enum {
 	PVS_DST_ADDR_SEL_MASK		= 0x3,
 	PVS_DST_ADDR_SEL_SHIFT		= 29,
 	PVS_DST_ADDR_MODE_0_MASK	= 0x1,
-	PVS_DST_ADDR_MODE_0_SHIFT	= 31,
+	PVS_DST_ADDR_MODE_0_SHIFT	= 31
 };
 
 /* PVS Source Operand Description */
@@ -2777,7 +2777,7 @@ enum {
 	PVS_SRC_ADDR_SEL_MASK		= 0x3,
 	PVS_SRC_ADDR_SEL_SHIFT		= 29,
 	PVS_SRC_ADDR_MODE_1_MASK	= 0x0,
-	PVS_SRC_ADDR_MODE_1_SHIFT	= 32,
+	PVS_SRC_ADDR_MODE_1_SHIFT	= 32
 };
 
 /*\}*/
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 2d70ec2ac9..a89cb633e0 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -335,8 +335,9 @@ boolean r300_swtcl_draw_arrays(struct pipe_context* pipe,
     draw_set_mapped_element_buffer(r300->draw, 0, NULL);
 
     draw_set_mapped_constant_buffer(r300->draw,
-            r300->shader_constants[PIPE_SHADER_VERTEX].constants,
-            r300->shader_constants[PIPE_SHADER_VERTEX].count *
+				    PIPE_SHADER_VERTEX,
+				    r300->shader_constants[PIPE_SHADER_VERTEX].constants,
+				    r300->shader_constants[PIPE_SHADER_VERTEX].count *
                 (sizeof(float) * 4));
 
     draw_arrays(r300->draw, mode, start, count);
@@ -361,6 +362,7 @@ boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
 {
     struct r300_context* r300 = r300_context(pipe);
     int i;
+    void* indices;
 
     if (!u_trim_pipe_prim(mode, &count)) {
         return FALSE;
@@ -377,12 +379,13 @@ boolean r300_swtcl_draw_range_elements(struct pipe_context* pipe,
         draw_set_mapped_vertex_buffer(r300->draw, i, buf);
     }
 
-    void* indices = pipe_buffer_map(pipe->screen, indexBuffer,
-                                    PIPE_BUFFER_USAGE_CPU_READ);
+    indices = pipe_buffer_map(pipe->screen, indexBuffer,
+                              PIPE_BUFFER_USAGE_CPU_READ);
     draw_set_mapped_element_buffer_range(r300->draw, indexSize,
                                          minIndex, maxIndex, indices);
 
     draw_set_mapped_constant_buffer(r300->draw,
+				    PIPE_SHADER_VERTEX,
             r300->shader_constants[PIPE_SHADER_VERTEX].constants,
             r300->shader_constants[PIPE_SHADER_VERTEX].count *
                 (sizeof(float) * 4));
@@ -474,7 +477,7 @@ static void* r300_render_map_vertices(struct vbuf_render* render)
     r300render->vbo_ptr = pipe_buffer_map(screen, r300render->vbo,
                                           PIPE_BUFFER_USAGE_CPU_WRITE);
 
-    return (r300render->vbo_ptr + r300render->vbo_offset);
+    return ((uint8_t*)r300render->vbo_ptr + r300render->vbo_offset);
 }
 
 static void r300_render_unmap_vertices(struct vbuf_render* render,
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index 29bc701a86..727ae7ade6 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -71,9 +71,9 @@ static void r300_draw_emit_attrib(struct r300_context* r300,
     struct tgsi_shader_info* info = &r300->vs->info;
     int output;
 
-    output = draw_find_vs_output(r300->draw,
-                                 info->output_semantic_name[index],
-                                 info->output_semantic_index[index]);
+    output = draw_find_shader_output(r300->draw,
+                                     info->output_semantic_name[index],
+                                     info->output_semantic_index[index]);
     draw_emit_vertex_attr(&r300->vertex_info->vinfo, emit, interp, output);
 }
 
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
index 096cdb20bb..a792c2cf98 100644
--- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -120,7 +120,7 @@ static unsigned translate_opcode(unsigned opcode)
      /* case TGSI_OPCODE_NOT: return RC_OPCODE_NOT; */
      /* case TGSI_OPCODE_TRUNC: return RC_OPCODE_TRUNC; */
      /* case TGSI_OPCODE_SHL: return RC_OPCODE_SHL; */
-     /* case TGSI_OPCODE_SHR: return RC_OPCODE_SHR; */
+     /* case TGSI_OPCODE_ISHR: return RC_OPCODE_SHR; */
      /* case TGSI_OPCODE_AND: return RC_OPCODE_AND; */
      /* case TGSI_OPCODE_OR: return RC_OPCODE_OR; */
      /* case TGSI_OPCODE_MOD: return RC_OPCODE_MOD; */
diff --git a/src/gallium/drivers/softpipe/sp_clear.c b/src/gallium/drivers/softpipe/sp_clear.c
index f98087deb8..5f130453c3 100644
--- a/src/gallium/drivers/softpipe/sp_clear.c
+++ b/src/gallium/drivers/softpipe/sp_clear.c
@@ -36,6 +36,7 @@
 #include "util/u_pack_color.h"
 #include "sp_clear.h"
 #include "sp_context.h"
+#include "sp_query.h"
 #include "sp_tile_cache.h"
 
 
@@ -55,6 +56,9 @@ softpipe_clear(struct pipe_context *pipe, unsigned buffers, const float *rgba,
    if (softpipe->no_rast)
       return;
 
+   if (!softpipe_check_render_cond(softpipe))
+      return;
+
 #if 0
    softpipe_update_derived(softpipe); /* not needed?? */
 #endif
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index 969d69d6b4..3ac807d4b5 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -176,6 +176,19 @@ softpipe_is_buffer_referenced( struct pipe_context *pipe,
 }
 
 
+static void  /* installed as pipe.render_condition; only records the predicate */
+softpipe_render_condition( struct pipe_context *pipe,
+                           struct pipe_query *query,
+                           uint mode )
+{
+   struct softpipe_context *softpipe = softpipe_context( pipe );
+
+   softpipe->render_cond_query = query;  /* read by softpipe_check_render_cond() */
+   softpipe->render_cond_mode = mode;    /* compared there against PIPE_RENDER_COND_* */
+}
+
+
+
 struct pipe_context *
 softpipe_create( struct pipe_screen *screen )
 {
@@ -191,6 +204,7 @@ softpipe_create( struct pipe_screen *screen )
 #endif
 
    softpipe->dump_fs = debug_get_bool_option( "GALLIUM_DUMP_FS", FALSE );
+   softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE );
 
    softpipe->pipe.winsys = screen->winsys;
    softpipe->pipe.screen = screen;
@@ -222,6 +236,10 @@ softpipe_create( struct pipe_screen *screen )
    softpipe->pipe.bind_vs_state   = softpipe_bind_vs_state;
    softpipe->pipe.delete_vs_state = softpipe_delete_vs_state;
 
+   softpipe->pipe.create_gs_state = softpipe_create_gs_state;
+   softpipe->pipe.bind_gs_state   = softpipe_bind_gs_state;
+   softpipe->pipe.delete_gs_state = softpipe_delete_gs_state;
+
    softpipe->pipe.set_blend_color = softpipe_set_blend_color;
    softpipe->pipe.set_clip_state = softpipe_set_clip_state;
    softpipe->pipe.set_constant_buffer = softpipe_set_constant_buffer;
@@ -249,6 +267,8 @@ softpipe_create( struct pipe_screen *screen )
 
    softpipe_init_query_funcs( softpipe );
 
+   softpipe->pipe.render_condition = softpipe_render_condition;
+
    /*
     * Alloc caches for accessing drawing surfaces and textures.
     * Must be before quad stage setup!
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 8ce20c5744..73fa744f9d 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -58,6 +58,7 @@ struct softpipe_context {
    struct pipe_rasterizer_state *rasterizer;
    struct sp_fragment_shader *fs;
    struct sp_vertex_shader *vs;
+   struct sp_geometry_shader *gs;
 
    /** Other rendering state */
    struct pipe_blend_color blend_color;
@@ -115,6 +116,10 @@ struct softpipe_context {
 
    unsigned line_stipple_counter;
 
+   /** Conditional query object and mode */
+   struct pipe_query *render_cond_query;
+   uint render_cond_mode;
+
    /** Software quad rendering pipeline */
    struct {
       struct quad_stage *shade;
@@ -147,6 +152,7 @@ struct softpipe_context {
 
    unsigned use_sse : 1;
    unsigned dump_fs : 1;
+   unsigned dump_gs : 1;
    unsigned no_rast : 1;
 };
 
diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c
index 14cb1322e1..87312ae151 100644
--- a/src/gallium/drivers/softpipe/sp_draw_arrays.c
+++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c
@@ -38,6 +38,7 @@
 #include "util/u_prim.h"
 
 #include "sp_context.h"
+#include "sp_query.h"
 #include "sp_state.h"
 
 #include "draw/draw_context.h"
@@ -48,7 +49,7 @@ static void
 softpipe_map_constant_buffers(struct softpipe_context *sp)
 {
    struct pipe_winsys *ws = sp->pipe.winsys;
-   uint i, size;
+   uint i, vssize, gssize;
 
    for (i = 0; i < PIPE_SHADER_TYPES; i++) {
       if (sp->constants[i].buffer && sp->constants[i].buffer->size)
@@ -57,13 +58,21 @@ softpipe_map_constant_buffers(struct softpipe_context *sp)
    }
 
    if (sp->constants[PIPE_SHADER_VERTEX].buffer)
-      size = sp->constants[PIPE_SHADER_VERTEX].buffer->size;
+      vssize = sp->constants[PIPE_SHADER_VERTEX].buffer->size;
    else
-      size = 0;
+      vssize = 0;
 
-   draw_set_mapped_constant_buffer(sp->draw,
+   if (sp->constants[PIPE_SHADER_GEOMETRY].buffer)
+      gssize = sp->constants[PIPE_SHADER_GEOMETRY].buffer->size;
+   else
+      gssize = 0;
+
+   draw_set_mapped_constant_buffer(sp->draw, PIPE_SHADER_VERTEX,
                                    sp->mapped_constants[PIPE_SHADER_VERTEX],
-                                   size);
+                                   vssize);
+   draw_set_mapped_constant_buffer(sp->draw, PIPE_SHADER_GEOMETRY,
+                                   sp->mapped_constants[PIPE_SHADER_GEOMETRY],
+                                   gssize);
 }
 
 
@@ -78,9 +87,10 @@ softpipe_unmap_constant_buffers(struct softpipe_context *sp)
     */
    draw_flush(sp->draw);
 
-   draw_set_mapped_constant_buffer(sp->draw, NULL, 0);
+   draw_set_mapped_constant_buffer(sp->draw, PIPE_SHADER_VERTEX, NULL, 0);
+   draw_set_mapped_constant_buffer(sp->draw, PIPE_SHADER_GEOMETRY, NULL, 0);
 
-   for (i = 0; i < 2; i++) {
+   for (i = 0; i < PIPE_SHADER_TYPES; i++) {
       if (sp->constants[i].buffer && sp->constants[i].buffer->size)
          ws->buffer_unmap(ws, sp->constants[i].buffer);
       sp->mapped_constants[i] = NULL;
@@ -220,6 +230,9 @@ softpipe_draw_range_elements_instanced(struct pipe_context *pipe,
    struct draw_context *draw = sp->draw;
    unsigned i;
 
+   if (!softpipe_check_render_cond(sp))
+      return TRUE;
+
    sp->reduced_api_prim = u_reduced_prim(mode);
 
    if (sp->dirty) {
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index 5fbac06a53..7f573aef3c 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -128,6 +128,7 @@ sp_vbuf_unmap_vertices(struct vbuf_render *vbr,
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    assert( cvbr->vertex_buffer_size >= (max_index+1) * cvbr->vertex_size );
+   (void) cvbr;
    /* do nothing */
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c
index fe6b6cec35..d9babe81da 100644
--- a/src/gallium/drivers/softpipe/sp_quad_blend.c
+++ b/src/gallium/drivers/softpipe/sp_quad_blend.c
@@ -229,7 +229,7 @@ blend_quad(struct quad_stage *qs,
    static const float zero[4] = { 0, 0, 0, 0 };
    static const float one[4] = { 1, 1, 1, 1 };
    struct softpipe_context *softpipe = qs->softpipe;
-   float source[4][QUAD_SIZE];
+   float source[4][QUAD_SIZE] = { { 0 } };
 
    /*
     * Compute src/first term RGB
diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c
index 379cf4ad06..4ef5d9f7b1 100644
--- a/src/gallium/drivers/softpipe/sp_query.c
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -99,6 +99,32 @@ softpipe_get_query_result(struct pipe_context *pipe,
 }
 
 
+/**
+ * Called by rendering functions to check whether conditional rendering allows drawing.
+ * \return TRUE if we should render, FALSE if we should skip rendering
+ */
+boolean
+softpipe_check_render_cond(struct softpipe_context *sp)
+{
+   struct pipe_context *pipe = &sp->pipe;
+   boolean b, wait;
+   uint64_t result;
+
+   if (!sp->render_cond_query) {
+      return TRUE;  /* no query predicate, draw normally */
+   }
+
+   wait = (sp->render_cond_mode == PIPE_RENDER_COND_WAIT ||
+           sp->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT);
+
+   b = pipe->get_query_result(pipe, sp->render_cond_query, wait, &result);
+   if (b)
+      return result > 0;  /* render only when the query result is non-zero */
+   else
+      return TRUE;  /* no result obtained (presumably not ready yet): draw anyway */
+}
+
+
 void softpipe_init_query_funcs(struct softpipe_context *softpipe )
 {
    softpipe->pipe.create_query = softpipe_create_query;
diff --git a/src/gallium/drivers/softpipe/sp_query.h b/src/gallium/drivers/softpipe/sp_query.h
index 05060a4575..736c033897 100644
--- a/src/gallium/drivers/softpipe/sp_query.h
+++ b/src/gallium/drivers/softpipe/sp_query.h
@@ -32,6 +32,10 @@
 #ifndef SP_QUERY_H
 #define SP_QUERY_H
 
+extern boolean
+softpipe_check_render_cond(struct softpipe_context *sp);
+
+
 struct softpipe_context;
 extern void softpipe_init_query_funcs(struct softpipe_context * );
 
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index 615581b95f..3da75364c5 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -1268,7 +1268,7 @@ void sp_setup_prepare( struct setup_context *setup )
    }
 
    /* Note: nr_attrs is only used for debugging (vertex printing) */
-   setup->nr_vertex_attrs = draw_num_vs_outputs(sp->draw);
+   setup->nr_vertex_attrs = draw_num_shader_outputs(sp->draw);
 
    sp->quad.first->begin( sp->quad.first );
 
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 00da41b985..f8886565e9 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -50,6 +50,7 @@
 #define SP_NEW_VERTEX        0x1000
 #define SP_NEW_VS            0x2000
 #define SP_NEW_QUERY         0x4000
+#define SP_NEW_GS            0x8000
 
 
 struct tgsi_sampler;
@@ -90,6 +91,11 @@ struct sp_vertex_shader {
    int max_sampler;             /* -1 if no samplers */
 };
 
+/** Subclass of pipe_shader_state */
+struct sp_geometry_shader {
+   struct pipe_shader_state shader;  /* .tokens is a private copy made with tgsi_dup_tokens() */
+   struct draw_geometry_shader *draw_data;  /* from draw_create_geometry_shader() */
+};
 
 
 void *
@@ -143,6 +149,10 @@ void *softpipe_create_vs_state(struct pipe_context *,
                                const struct pipe_shader_state *);
 void softpipe_bind_vs_state(struct pipe_context *, void *);
 void softpipe_delete_vs_state(struct pipe_context *, void *);
+void *softpipe_create_gs_state(struct pipe_context *,
+                               const struct pipe_shader_state *);
+void softpipe_bind_gs_state(struct pipe_context *, void *);
+void softpipe_delete_gs_state(struct pipe_context *, void *);
 
 void softpipe_set_polygon_stipple( struct pipe_context *,
 				  const struct pipe_poly_stipple * );
diff --git a/src/gallium/drivers/softpipe/sp_state_blend.c b/src/gallium/drivers/softpipe/sp_state_blend.c
index efed082f82..95ab323433 100644
--- a/src/gallium/drivers/softpipe/sp_state_blend.c
+++ b/src/gallium/drivers/softpipe/sp_state_blend.c
@@ -29,6 +29,7 @@
  */
 
 #include "util/u_memory.h"
+#include "draw/draw_context.h"
 #include "sp_context.h"
 #include "sp_state.h"
 
@@ -45,6 +46,8 @@ void softpipe_bind_blend_state( struct pipe_context *pipe,
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
+   draw_flush(softpipe->draw);
+
    softpipe->blend = (struct pipe_blend_state *)blend;
 
    softpipe->dirty |= SP_NEW_BLEND;
@@ -62,6 +65,8 @@ void softpipe_set_blend_color( struct pipe_context *pipe,
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
+   draw_flush(softpipe->draw);
+
    softpipe->blend_color = *blend_color;
 
    softpipe->dirty |= SP_NEW_BLEND;
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index c24a737d07..f6856a5f69 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -67,7 +67,7 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
       /* compute vertex layout now */
       const struct sp_fragment_shader *spfs = softpipe->fs;
       struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
-      const uint num = draw_num_vs_outputs(softpipe->draw);
+      const uint num = draw_current_shader_outputs(softpipe->draw);
       uint i;
 
       /* Tell draw_vbuf to simply emit the whole post-xform vertex
@@ -117,13 +117,13 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
          }
 
          /* this includes texcoords and varying vars */
-         src = draw_find_vs_output(softpipe->draw,
-                                   spfs->info.input_semantic_name[i],
-                                   spfs->info.input_semantic_index[i]);
+         src = draw_find_shader_output(softpipe->draw,
+                                       spfs->info.input_semantic_name[i],
+                                       spfs->info.input_semantic_index[i]);
          draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
       }
 
-      softpipe->psize_slot = draw_find_vs_output(softpipe->draw,
+      softpipe->psize_slot = draw_find_shader_output(softpipe->draw,
                                                  TGSI_SEMANTIC_PSIZE, 0);
       if (softpipe->psize_slot > 0) {
          draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT,
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
index b41f7e8ab7..aa12bb215a 100644
--- a/src/gallium/drivers/softpipe/sp_state_fs.c
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -69,7 +69,14 @@ softpipe_bind_fs_state(struct pipe_context *pipe, void *fs)
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   softpipe->fs = (struct sp_fragment_shader *) fs;
+   draw_flush(softpipe->draw);
+
+   if (softpipe->fs == fs)
+      return;
+
+   draw_flush(softpipe->draw);
+
+   softpipe->fs = fs;
 
    softpipe->dirty |= SP_NEW_FS;
 }
@@ -159,9 +166,75 @@ softpipe_set_constant_buffer(struct pipe_context *pipe,
    assert(shader < PIPE_SHADER_TYPES);
    assert(index == 0);
 
+   draw_flush(softpipe->draw);
+
    /* note: reference counting */
    pipe_buffer_reference(&softpipe->constants[shader].buffer,
 			 buf ? buf->buffer : NULL);
 
    softpipe->dirty |= SP_NEW_CONSTANTS;
 }
+
+/**
+ * Create a geometry shader state object from a shader template.
+ *
+ * Duplicates templ->tokens and creates the matching draw-module shader.
+ * \param templ  shader template whose tokens are duplicated here
+ * \return the new sp_geometry_shader, or NULL if any step failed
+ */
+void *
+softpipe_create_gs_state(struct pipe_context *pipe,
+                         const struct pipe_shader_state *templ)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+   struct sp_geometry_shader *gs = CALLOC_STRUCT(sp_geometry_shader);
+
+   if (!gs)
+      return NULL;
+
+   /* debug */
+   if (sp->dump_gs)
+      tgsi_dump(templ->tokens, 0);
+
+   /* keep a private copy of the tokens; the caller's will go away */
+   gs->shader.tokens = tgsi_dup_tokens(templ->tokens);
+   if (gs->shader.tokens != NULL)
+      gs->draw_data = draw_create_geometry_shader(sp->draw, templ);
+
+   if (gs->draw_data != NULL)
+      return gs;
+
+   /* one of the two steps above failed: release whatever exists */
+   FREE( (void *)gs->shader.tokens );
+   FREE( gs->draw_data );
+   FREE( gs );
+   return NULL;
+}
+
+
+/** Bind (or, with NULL, unbind) a geometry shader and flag SP_NEW_GS. */
+void
+softpipe_bind_gs_state(struct pipe_context *pipe, void *gs)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+   struct sp_geometry_shader *shader = (struct sp_geometry_shader *)gs;
+
+   sp->gs = shader;
+   /* the draw module is handed NULL when no geometry shader is bound */
+   draw_bind_geometry_shader(sp->draw, shader ? shader->draw_data : NULL);
+   sp->dirty |= SP_NEW_GS;
+}
+
+
+/** Destroy a geometry shader state object, including its token copy. */
+void
+softpipe_delete_gs_state(struct pipe_context *pipe, void *gs)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   struct sp_geometry_shader *state = (struct sp_geometry_shader *)gs;
+
+   draw_delete_geometry_shader(softpipe->draw, state ? state->draw_data : NULL);
+   if (state)   /* tokens were tgsi_dup_tokens()'d by softpipe_create_gs_state */
+      FREE( (void *)state->shader.tokens );
+   FREE(state);
+}
diff --git a/src/gallium/drivers/softpipe/sp_state_rasterizer.c b/src/gallium/drivers/softpipe/sp_state_rasterizer.c
index 87b7219683..a5b00336d4 100644
--- a/src/gallium/drivers/softpipe/sp_state_rasterizer.c
+++ b/src/gallium/drivers/softpipe/sp_state_rasterizer.c
@@ -41,14 +41,17 @@ softpipe_create_rasterizer_state(struct pipe_context *pipe,
 }
 
 void softpipe_bind_rasterizer_state(struct pipe_context *pipe,
-                                    void *setup)
+                                    void *rasterizer)
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
+   if (softpipe->rasterizer == rasterizer)
+      return;
+
    /* pass-through to draw module */
-   draw_set_rasterizer_state(softpipe->draw, setup);
+   draw_set_rasterizer_state(softpipe->draw, rasterizer);
 
-   softpipe->rasterizer = (struct pipe_rasterizer_state *)setup;
+   softpipe->rasterizer = rasterizer;
 
    softpipe->dirty |= SP_NEW_RASTERIZER;
 }
diff --git a/src/gallium/drivers/softpipe/sp_state_surface.c b/src/gallium/drivers/softpipe/sp_state_surface.c
index a518248bb1..f6154109ea 100644
--- a/src/gallium/drivers/softpipe/sp_state_surface.c
+++ b/src/gallium/drivers/softpipe/sp_state_surface.c
@@ -51,6 +51,8 @@ softpipe_set_framebuffer_state(struct pipe_context *pipe,
    struct softpipe_context *sp = softpipe_context(pipe);
    uint i;
 
+   draw_flush(sp->draw);
+
    for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
       /* check if changing cbuf */
       if (sp->framebuffer.cbufs[i] != fb->cbufs[i]) {
diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
index c3de12b4a3..af99c9de37 100644
--- a/src/gallium/drivers/svga/svga_context.c
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -29,6 +29,7 @@
 #include "pipe/p_inlines.h"
 #include "pipe/p_screen.h"
 #include "util/u_memory.h"
+#include "util/u_bitmask.h"
 #include "util/u_upload_mgr.h"
 
 #include "svga_context.h"
@@ -61,6 +62,9 @@ static void svga_destroy( struct pipe_context *pipe )
    u_upload_destroy( svga->upload_vb );
    u_upload_destroy( svga->upload_ib );
 
+   util_bitmask_destroy( svga->vs_bm );
+   util_bitmask_destroy( svga->fs_bm );
+
    for(shader = 0; shader < PIPE_SHADER_TYPES; ++shader)
       pipe_buffer_reference( &svga->curr.cb[shader], NULL );
 
@@ -130,7 +134,7 @@ struct pipe_context *svga_context_create( struct pipe_screen *screen )
 
    svga = CALLOC_STRUCT(svga_context);
    if (svga == NULL)
-      goto error1;
+      goto no_svga;
 
    svga->pipe.winsys = screen->winsys;
    svga->pipe.screen = screen;
@@ -142,7 +146,7 @@ struct pipe_context *svga_context_create( struct pipe_screen *screen )
 
    svga->swc = svgascreen->sws->context_create(svgascreen->sws);
    if(!svga->swc)
-      goto error2;
+      goto no_swc;
 
    svga_init_blend_functions(svga);
    svga_init_blit_functions(svga);
@@ -165,32 +169,40 @@ struct pipe_context *svga_context_create( struct pipe_screen *screen )
    svga->debug.disable_shader = debug_get_num_option("SVGA_DISABLE_SHADER", ~0);
 
    if (!svga_init_swtnl(svga))
-      goto error3;
+      goto no_swtnl;
+
+   svga->fs_bm = util_bitmask_create();
+   if (svga->fs_bm == NULL)
+      goto no_fs_bm;
+
+   svga->vs_bm = util_bitmask_create();
+   if (svga->vs_bm == NULL)
+      goto no_vs_bm;
 
    svga->upload_ib = u_upload_create( svga->pipe.screen,
                                       32 * 1024,
                                       16,
                                       PIPE_BUFFER_USAGE_INDEX );
    if (svga->upload_ib == NULL)
-      goto error4;
+      goto no_upload_ib;
 
    svga->upload_vb = u_upload_create( svga->pipe.screen,
                                       128 * 1024,
                                       16,
                                       PIPE_BUFFER_USAGE_VERTEX );
    if (svga->upload_vb == NULL)
-      goto error5;
+      goto no_upload_vb;
 
    svga->hwtnl = svga_hwtnl_create( svga,
                                     svga->upload_ib,
                                     svga->swc );
    if (svga->hwtnl == NULL)
-      goto error6;
+      goto no_hwtnl;
 
 
    ret = svga_emit_initial_state( svga );
    if (ret)
-      goto error7;
+      goto no_state;
    
    /* Avoid shortcircuiting state with initial value of zero.
     */
@@ -209,19 +221,23 @@ struct pipe_context *svga_context_create( struct pipe_screen *screen )
 
    return &svga->pipe;
 
-error7:
+no_state:
    svga_hwtnl_destroy( svga->hwtnl );
-error6:
+no_hwtnl:
    u_upload_destroy( svga->upload_vb );
-error5:
+no_upload_vb:
    u_upload_destroy( svga->upload_ib );
-error4:
+no_upload_ib:
+   util_bitmask_destroy( svga->vs_bm );
+no_vs_bm:
+   util_bitmask_destroy( svga->fs_bm );
+no_fs_bm:
    svga_destroy_swtnl(svga);
-error3:
+no_swtnl:
    svga->swc->destroy(svga->swc);
-error2:
+no_swc:
    FREE(svga);
-error1:
+no_svga:
    return NULL;
 }
 
diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index 0885d9ca74..fa7f6cb3bb 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -41,6 +41,7 @@
 struct draw_vertex_shader;
 struct svga_shader_result;
 struct SVGACmdMemory;
+struct util_bitmask;
 struct u_upload_mgr;
 
 
@@ -319,12 +320,14 @@ struct svga_context
       boolean new_vdecl;
    } swtnl;
 
+   /* Bitmask of used shader IDs */
+   struct util_bitmask *fs_bm;
+   struct util_bitmask *vs_bm;
+
    struct {
       unsigned dirty[4];
 
       unsigned texture_timestamp;
-      unsigned next_fs_id;
-      unsigned next_vs_id;
 
       /* Internally generated shaders:
        */
diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
index 8db40d0fd5..ca73cf9d5a 100644
--- a/src/gallium/drivers/svga/svga_draw.c
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -164,7 +164,8 @@ svga_hwtnl_flush( struct svga_hwtnl *hwtnl )
       }
 
       SVGA_DBG(DEBUG_DMA, "draw to sid %p, %d prims\n",
-               svga_surface(svga->curr.framebuffer.cbufs[0])->handle,
+               svga->curr.framebuffer.cbufs[0] ?
+               svga_surface(svga->curr.framebuffer.cbufs[0])->handle : NULL,
                hwtnl->cmd.prim_count);
 
       ret = SVGA3D_BeginDrawPrimitives(swc, 
diff --git a/src/gallium/drivers/svga/svga_pipe_fs.c b/src/gallium/drivers/svga/svga_pipe_fs.c
index e3be840d92..a461a86dd3 100644
--- a/src/gallium/drivers/svga/svga_pipe_fs.c
+++ b/src/gallium/drivers/svga/svga_pipe_fs.c
@@ -26,6 +26,7 @@
 #include "pipe/p_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_bitmask.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_text.h"
 
@@ -107,6 +108,8 @@ void svga_delete_fs_state(struct pipe_context *pipe, void *shader)
          assert(ret == PIPE_OK);
       }
 
+      util_bitmask_clear( svga->fs_bm, result->id );
+
       svga_destroy_shader_result( result );
    }
 
diff --git a/src/gallium/drivers/svga/svga_pipe_vs.c b/src/gallium/drivers/svga/svga_pipe_vs.c
index c104c41f5f..e82d10c259 100644
--- a/src/gallium/drivers/svga/svga_pipe_vs.c
+++ b/src/gallium/drivers/svga/svga_pipe_vs.c
@@ -27,6 +27,7 @@
 #include "pipe/p_inlines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "util/u_bitmask.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_text.h"
 
@@ -172,6 +173,8 @@ static void svga_delete_vs_state(struct pipe_context *pipe, void *shader)
          assert(ret == PIPE_OK);
       }
 
+      util_bitmask_clear( svga->vs_bm, result->id );
+
       svga_destroy_shader_result( result );
    }
 
diff --git a/src/gallium/drivers/svga/svga_state_fs.c b/src/gallium/drivers/svga/svga_state_fs.c
index 6ec38ed3e4..1902b0106b 100644
--- a/src/gallium/drivers/svga/svga_state_fs.c
+++ b/src/gallium/drivers/svga/svga_state_fs.c
@@ -26,6 +26,7 @@
 #include "pipe/p_inlines.h"
 #include "pipe/p_defines.h"
 #include "util/u_math.h"
+#include "util/u_bitmask.h"
 
 #include "svga_context.h"
 #include "svga_state.h"
@@ -74,9 +75,12 @@ static enum pipe_error compile_fs( struct svga_context *svga,
       goto fail;
    }
 
+   result->id = util_bitmask_add(svga->fs_bm);
+   if(result->id == UTIL_BITMASK_INVALID_INDEX)
+      goto fail;
 
    ret = SVGA3D_DefineShader(svga->swc, 
-                             svga->state.next_fs_id,
+                             result->id,
                              SVGA3D_SHADERTYPE_PS,
                              result->tokens, 
                              result->nr_tokens * sizeof result->tokens[0]);
@@ -84,14 +88,16 @@ static enum pipe_error compile_fs( struct svga_context *svga,
       goto fail;
 
    *out_result = result;
-   result->id = svga->state.next_fs_id++;
    result->next = fs->base.results;
    fs->base.results = result;
    return PIPE_OK;
 
 fail:
-   if (result)
+   if (result) {
+      if (result->id != UTIL_BITMASK_INVALID_INDEX)
+         util_bitmask_clear( svga->fs_bm, result->id );
       svga_destroy_shader_result( result );
+   }
    return ret;
 }
 
@@ -116,7 +122,7 @@ fail:
  */
 static int emit_white_fs( struct svga_context *svga )
 {
-   int ret;
+   int ret = PIPE_ERROR;
 
    /* ps_3_0
     * def c0, 1.000000, 0.000000, 0.000000, 1.000000
@@ -137,16 +143,26 @@ static int emit_white_fs( struct svga_context *svga )
       0x0000ffff,
    };
 
+   assert(SVGA3D_INVALID_ID == UTIL_BITMASK_INVALID_INDEX);
+   svga->state.white_fs_id = util_bitmask_add(svga->fs_bm);
+   if(svga->state.white_fs_id == SVGA3D_INVALID_ID)
+      goto no_fs_id;
+
    ret = SVGA3D_DefineShader(svga->swc, 
-                             svga->state.next_fs_id,
+                             svga->state.white_fs_id,
                              SVGA3D_SHADERTYPE_PS,
                              white_tokens, 
                              sizeof(white_tokens));
    if (ret)
-      return ret;
+      goto no_definition;
 
-   svga->state.white_fs_id = svga->state.next_fs_id++;
    return 0;
+
+no_definition:
+   util_bitmask_clear(svga->fs_bm, svga->state.white_fs_id);
+   svga->state.white_fs_id = SVGA3D_INVALID_ID;
+no_fs_id:
+   return ret;
 }
 
 
@@ -251,12 +267,14 @@ static int emit_hw_fs( struct svga_context *svga,
 
    assert(id != SVGA3D_INVALID_ID);
 
-   if (id != svga->state.hw_draw.shader_id[PIPE_SHADER_FRAGMENT]) {
-      ret = SVGA3D_SetShader(svga->swc, 
-                             SVGA3D_SHADERTYPE_PS, 
-                             id );
-      if (ret)
-         return ret;
+   if (result != svga->state.hw_draw.fs) {
+      if (id != svga->state.hw_draw.shader_id[PIPE_SHADER_FRAGMENT]) {
+         ret = SVGA3D_SetShader(svga->swc,
+                                SVGA3D_SHADERTYPE_PS,
+                                id );
+         if (ret)
+            return ret;
+      }
 
       svga->dirty |= SVGA_NEW_FS_RESULT;
       svga->state.hw_draw.shader_id[PIPE_SHADER_FRAGMENT] = id;
diff --git a/src/gallium/drivers/svga/svga_state_vs.c b/src/gallium/drivers/svga/svga_state_vs.c
index 82e7874e2a..c614281858 100644
--- a/src/gallium/drivers/svga/svga_state_vs.c
+++ b/src/gallium/drivers/svga/svga_state_vs.c
@@ -27,6 +27,7 @@
 #include "pipe/p_defines.h"
 #include "util/u_format.h"
 #include "util/u_math.h"
+#include "util/u_bitmask.h"
 #include "translate/translate.h"
 
 #include "svga_context.h"
@@ -78,8 +79,12 @@ static enum pipe_error compile_vs( struct svga_context *svga,
       goto fail;
    }
 
+   result->id = util_bitmask_add(svga->vs_bm);
+   if(result->id == UTIL_BITMASK_INVALID_INDEX)
+      goto fail;
+
    ret = SVGA3D_DefineShader(svga->swc, 
-                             svga->state.next_vs_id,
+                             result->id,
                              SVGA3D_SHADERTYPE_VS,
                              result->tokens, 
                              result->nr_tokens * sizeof result->tokens[0]);
@@ -87,14 +92,16 @@ static enum pipe_error compile_vs( struct svga_context *svga,
       goto fail;
 
    *out_result = result;
-   result->id = svga->state.next_vs_id++;
    result->next = vs->base.results;
    vs->base.results = result;
    return PIPE_OK;
 
 fail:
-   if (result)
+   if (result) {
+      if (result->id != UTIL_BITMASK_INVALID_INDEX)
+         util_bitmask_clear( svga->vs_bm, result->id );
       svga_destroy_shader_result( result );
+   }
    return ret;
 }
 
@@ -142,12 +149,14 @@ static int emit_hw_vs( struct svga_context *svga,
       id = result->id;
    }
 
-   if (id != svga->state.hw_draw.shader_id[PIPE_SHADER_VERTEX]) {
-      ret = SVGA3D_SetShader(svga->swc, 
-                             SVGA3D_SHADERTYPE_VS, 
-                             id );
-      if (ret)
-         return ret;
+   if (result != svga->state.hw_draw.vs) {
+      if (id != svga->state.hw_draw.shader_id[PIPE_SHADER_VERTEX]) {
+         ret = SVGA3D_SetShader(svga->swc,
+                                SVGA3D_SHADERTYPE_VS,
+                                id );
+         if (ret)
+            return ret;
+      }
 
       svga->dirty |= SVGA_NEW_VS_RESULT;
       svga->state.hw_draw.shader_id[PIPE_SHADER_VERTEX] = id;
diff --git a/src/gallium/drivers/svga/svga_swtnl_draw.c b/src/gallium/drivers/svga/svga_swtnl_draw.c
index 8b14c913f7..7655121bec 100644
--- a/src/gallium/drivers/svga/svga_swtnl_draw.c
+++ b/src/gallium/drivers/svga/svga_swtnl_draw.c
@@ -90,7 +90,7 @@ svga_swtnl_draw_range_elements(struct svga_context *svga,
                             PIPE_BUFFER_USAGE_CPU_READ);
       assert(map);
       draw_set_mapped_constant_buffer(
-         draw, 
+         draw, PIPE_SHADER_VERTEX,
          map,
          svga->curr.cb[PIPE_SHADER_VERTEX]->size);
    }
diff --git a/src/gallium/drivers/svga/svga_swtnl_state.c b/src/gallium/drivers/svga/svga_swtnl_state.c
index 25b8c2af3a..94b6ccc62d 100644
--- a/src/gallium/drivers/svga/svga_swtnl_state.c
+++ b/src/gallium/drivers/svga/svga_swtnl_state.c
@@ -156,7 +156,7 @@ int svga_swtnl_update_vdecl( struct svga_context *svga )
    memset(vdecl, 0, sizeof(vdecl));
 
    /* always add position */
-   src = draw_find_vs_output(draw, TGSI_SEMANTIC_POSITION, 0);
+   src = draw_find_shader_output(draw, TGSI_SEMANTIC_POSITION, 0);
    draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_LINEAR, src);
    vinfo->attrib[0].emit = EMIT_4F;
    vdecl[0].array.offset = offset;
@@ -169,7 +169,7 @@ int svga_swtnl_update_vdecl( struct svga_context *svga )
    for (i = 0; i < fs->base.info.num_inputs; i++) {
       unsigned name = fs->base.info.input_semantic_name[i];
       unsigned index = fs->base.info.input_semantic_index[i];
-      src = draw_find_vs_output(draw, name, index);
+      src = draw_find_shader_output(draw, name, index);
       vdecl[nr_decls].array.offset = offset;
       vdecl[nr_decls].identity.usageIndex = fs->base.info.input_semantic_index[i];
 
diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c
index b8ef137c01..0cd620189b 100644
--- a/src/gallium/drivers/svga/svga_tgsi.c
+++ b/src/gallium/drivers/svga/svga_tgsi.c
@@ -31,6 +31,7 @@
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_scan.h"
 #include "util/u_memory.h"
+#include "util/u_bitmask.h"
 
 #include "svgadump/svga_shader_dump.h"
 
@@ -221,6 +222,7 @@ svga_tgsi_translate( const struct svga_shader *shader,
    result->tokens = (const unsigned *)emit.buf;
    result->nr_tokens = (emit.ptr - emit.buf) / sizeof(unsigned);
    memcpy(&result->key, &key, sizeof key);
+   result->id = UTIL_BITMASK_INVALID_INDEX;
 
    if (SVGA_DEBUG & DEBUG_TGSI) 
    {
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index 1670da8bfa..dc5eb8fc60 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -2109,7 +2109,7 @@ static boolean svga_emit_instruction( struct svga_shader_emitter *emit,
    case TGSI_OPCODE_I2F:
    case TGSI_OPCODE_NOT:
    case TGSI_OPCODE_SHL:
-   case TGSI_OPCODE_SHR:
+   case TGSI_OPCODE_ISHR:
    case TGSI_OPCODE_XOR:
       return FALSE;
 
diff --git a/src/gallium/drivers/svga/svgadump/svga_dump.c b/src/gallium/drivers/svga/svgadump/svga_dump.c
index e6d4a74e86..d59fb89a58 100644
--- a/src/gallium/drivers/svga/svgadump/svga_dump.c
+++ b/src/gallium/drivers/svga/svgadump/svga_dump.c
@@ -1444,6 +1444,312 @@ dump_SVGA3dCmdBlitSurfaceToScreen(const SVGA3dCmdBlitSurfaceToScreen *cmd)
 
 
 void            
+svga_dump_command(uint32_t cmd_id, const void *data, uint32_t size)
+{
+   const uint8_t *body = (const uint8_t *)data;
+   const uint8_t *next = body + size;
+  
+   switch(cmd_id) {
+   case SVGA_3D_CMD_SURFACE_DEFINE:
+      _debug_printf("\tSVGA_3D_CMD_SURFACE_DEFINE\n");
+      {
+         const SVGA3dCmdDefineSurface *cmd = (const SVGA3dCmdDefineSurface *)body;
+         dump_SVGA3dCmdDefineSurface(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGA3dSize) <= next) {
+            dump_SVGA3dSize((const SVGA3dSize *)body);
+            body += sizeof(SVGA3dSize);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_SURFACE_DESTROY:
+      _debug_printf("\tSVGA_3D_CMD_SURFACE_DESTROY\n");
+      {
+         const SVGA3dCmdDestroySurface *cmd = (const SVGA3dCmdDestroySurface *)body;
+         dump_SVGA3dCmdDestroySurface(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SURFACE_COPY:
+      _debug_printf("\tSVGA_3D_CMD_SURFACE_COPY\n");
+      {
+         const SVGA3dCmdSurfaceCopy *cmd = (const SVGA3dCmdSurfaceCopy *)body;
+         dump_SVGA3dCmdSurfaceCopy(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGA3dCopyBox) <= next) {
+            dump_SVGA3dCopyBox((const SVGA3dCopyBox *)body);
+            body += sizeof(SVGA3dCopyBox);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_SURFACE_STRETCHBLT:
+      _debug_printf("\tSVGA_3D_CMD_SURFACE_STRETCHBLT\n");
+      {
+         const SVGA3dCmdSurfaceStretchBlt *cmd = (const SVGA3dCmdSurfaceStretchBlt *)body;
+         dump_SVGA3dCmdSurfaceStretchBlt(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SURFACE_DMA:
+      _debug_printf("\tSVGA_3D_CMD_SURFACE_DMA\n");
+      {
+         const SVGA3dCmdSurfaceDMA *cmd = (const SVGA3dCmdSurfaceDMA *)body;
+         dump_SVGA3dCmdSurfaceDMA(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGA3dCopyBox) <= next) {
+            dump_SVGA3dCopyBox((const SVGA3dCopyBox *)body);
+            body += sizeof(SVGA3dCopyBox);
+         }
+         while(body + sizeof(SVGA3dCmdSurfaceDMASuffix) <= next) {
+            dump_SVGA3dCmdSurfaceDMASuffix((const SVGA3dCmdSurfaceDMASuffix *)body);
+            body += sizeof(SVGA3dCmdSurfaceDMASuffix);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_CONTEXT_DEFINE:
+      _debug_printf("\tSVGA_3D_CMD_CONTEXT_DEFINE\n");
+      {
+         const SVGA3dCmdDefineContext *cmd = (const SVGA3dCmdDefineContext *)body;
+         dump_SVGA3dCmdDefineContext(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_CONTEXT_DESTROY:
+      _debug_printf("\tSVGA_3D_CMD_CONTEXT_DESTROY\n");
+      {
+         const SVGA3dCmdDestroyContext *cmd = (const SVGA3dCmdDestroyContext *)body;
+         dump_SVGA3dCmdDestroyContext(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETTRANSFORM:
+      _debug_printf("\tSVGA_3D_CMD_SETTRANSFORM\n");
+      {
+         const SVGA3dCmdSetTransform *cmd = (const SVGA3dCmdSetTransform *)body;
+         dump_SVGA3dCmdSetTransform(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETZRANGE:
+      _debug_printf("\tSVGA_3D_CMD_SETZRANGE\n");
+      {
+         const SVGA3dCmdSetZRange *cmd = (const SVGA3dCmdSetZRange *)body;
+         dump_SVGA3dCmdSetZRange(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETRENDERSTATE:
+      _debug_printf("\tSVGA_3D_CMD_SETRENDERSTATE\n");
+      {
+         const SVGA3dCmdSetRenderState *cmd = (const SVGA3dCmdSetRenderState *)body;
+         dump_SVGA3dCmdSetRenderState(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGA3dRenderState) <= next) {
+            dump_SVGA3dRenderState((const SVGA3dRenderState *)body);
+            body += sizeof(SVGA3dRenderState);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_SETRENDERTARGET:
+      _debug_printf("\tSVGA_3D_CMD_SETRENDERTARGET\n");
+      {
+         const SVGA3dCmdSetRenderTarget *cmd = (const SVGA3dCmdSetRenderTarget *)body;
+         dump_SVGA3dCmdSetRenderTarget(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETTEXTURESTATE:
+      _debug_printf("\tSVGA_3D_CMD_SETTEXTURESTATE\n");
+      {
+         const SVGA3dCmdSetTextureState *cmd = (const SVGA3dCmdSetTextureState *)body;
+         dump_SVGA3dCmdSetTextureState(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGA3dTextureState) <= next) {
+            dump_SVGA3dTextureState((const SVGA3dTextureState *)body);
+            body += sizeof(SVGA3dTextureState);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_SETMATERIAL:
+      _debug_printf("\tSVGA_3D_CMD_SETMATERIAL\n");
+      {
+         const SVGA3dCmdSetMaterial *cmd = (const SVGA3dCmdSetMaterial *)body;
+         dump_SVGA3dCmdSetMaterial(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETLIGHTDATA:
+      _debug_printf("\tSVGA_3D_CMD_SETLIGHTDATA\n");
+      {
+         const SVGA3dCmdSetLightData *cmd = (const SVGA3dCmdSetLightData *)body;
+         dump_SVGA3dCmdSetLightData(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETLIGHTENABLED:
+      _debug_printf("\tSVGA_3D_CMD_SETLIGHTENABLED\n");
+      {
+         const SVGA3dCmdSetLightEnabled *cmd = (const SVGA3dCmdSetLightEnabled *)body;
+         dump_SVGA3dCmdSetLightEnabled(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETVIEWPORT:
+      _debug_printf("\tSVGA_3D_CMD_SETVIEWPORT\n");
+      {
+         const SVGA3dCmdSetViewport *cmd = (const SVGA3dCmdSetViewport *)body;
+         dump_SVGA3dCmdSetViewport(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SETCLIPPLANE:
+      _debug_printf("\tSVGA_3D_CMD_SETCLIPPLANE\n");
+      {
+         const SVGA3dCmdSetClipPlane *cmd = (const SVGA3dCmdSetClipPlane *)body;
+         dump_SVGA3dCmdSetClipPlane(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_CLEAR:
+      _debug_printf("\tSVGA_3D_CMD_CLEAR\n");
+      {
+         const SVGA3dCmdClear *cmd = (const SVGA3dCmdClear *)body;
+         dump_SVGA3dCmdClear(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGA3dRect) <= next) {
+            dump_SVGA3dRect((const SVGA3dRect *)body);
+            body += sizeof(SVGA3dRect);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_PRESENT:
+      _debug_printf("\tSVGA_3D_CMD_PRESENT\n");
+      {
+         const SVGA3dCmdPresent *cmd = (const SVGA3dCmdPresent *)body;
+         dump_SVGA3dCmdPresent(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGA3dCopyRect) <= next) {
+            dump_SVGA3dCopyRect((const SVGA3dCopyRect *)body);
+            body += sizeof(SVGA3dCopyRect);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_SHADER_DEFINE:
+      _debug_printf("\tSVGA_3D_CMD_SHADER_DEFINE\n");
+      {
+         const SVGA3dCmdDefineShader *cmd = (const SVGA3dCmdDefineShader *)body;
+         dump_SVGA3dCmdDefineShader(cmd);
+         body = (const uint8_t *)&cmd[1];
+         svga_shader_dump((const uint32_t *)body, 
+                      (unsigned)(next - body)/sizeof(uint32_t),
+                      FALSE );
+         body = next;
+      }
+      break;
+   case SVGA_3D_CMD_SHADER_DESTROY:
+      _debug_printf("\tSVGA_3D_CMD_SHADER_DESTROY\n");
+      {
+         const SVGA3dCmdDestroyShader *cmd = (const SVGA3dCmdDestroyShader *)body;
+         dump_SVGA3dCmdDestroyShader(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SET_SHADER:
+      _debug_printf("\tSVGA_3D_CMD_SET_SHADER\n");
+      {
+         const SVGA3dCmdSetShader *cmd = (const SVGA3dCmdSetShader *)body;
+         dump_SVGA3dCmdSetShader(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_SET_SHADER_CONST:
+      _debug_printf("\tSVGA_3D_CMD_SET_SHADER_CONST\n");
+      {
+         const SVGA3dCmdSetShaderConst *cmd = (const SVGA3dCmdSetShaderConst *)body;
+         dump_SVGA3dCmdSetShaderConst(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_DRAW_PRIMITIVES:
+      _debug_printf("\tSVGA_3D_CMD_DRAW_PRIMITIVES\n");
+      {
+         const SVGA3dCmdDrawPrimitives *cmd = (const SVGA3dCmdDrawPrimitives *)body;
+         unsigned i, j;
+         dump_SVGA3dCmdDrawPrimitives(cmd);
+         body = (const uint8_t *)&cmd[1];
+         for(i = 0; i < cmd->numVertexDecls; ++i) {
+            dump_SVGA3dVertexDecl((const SVGA3dVertexDecl *)body);
+            body += sizeof(SVGA3dVertexDecl);
+         }
+         for(j = 0; j < cmd->numRanges; ++j) {
+            dump_SVGA3dPrimitiveRange((const SVGA3dPrimitiveRange *)body);
+            body += sizeof(SVGA3dPrimitiveRange);
+         }
+         while(body + sizeof(SVGA3dVertexDivisor) <= next) {
+            dump_SVGA3dVertexDivisor((const SVGA3dVertexDivisor *)body);
+            body += sizeof(SVGA3dVertexDivisor);
+         }
+      }
+      break;
+   case SVGA_3D_CMD_SETSCISSORRECT:
+      _debug_printf("\tSVGA_3D_CMD_SETSCISSORRECT\n");
+      {
+         const SVGA3dCmdSetScissorRect *cmd = (const SVGA3dCmdSetScissorRect *)body;
+         dump_SVGA3dCmdSetScissorRect(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_BEGIN_QUERY:
+      _debug_printf("\tSVGA_3D_CMD_BEGIN_QUERY\n");
+      {
+         const SVGA3dCmdBeginQuery *cmd = (const SVGA3dCmdBeginQuery *)body;
+         dump_SVGA3dCmdBeginQuery(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_END_QUERY:
+      _debug_printf("\tSVGA_3D_CMD_END_QUERY\n");
+      {
+         const SVGA3dCmdEndQuery *cmd = (const SVGA3dCmdEndQuery *)body;
+         dump_SVGA3dCmdEndQuery(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_WAIT_FOR_QUERY:
+      _debug_printf("\tSVGA_3D_CMD_WAIT_FOR_QUERY\n");
+      {
+         const SVGA3dCmdWaitForQuery *cmd = (const SVGA3dCmdWaitForQuery *)body;
+         dump_SVGA3dCmdWaitForQuery(cmd);
+         body = (const uint8_t *)&cmd[1];
+      }
+      break;
+   case SVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN:
+      _debug_printf("\tSVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN\n");
+      {
+         const SVGA3dCmdBlitSurfaceToScreen *cmd = (const SVGA3dCmdBlitSurfaceToScreen *)body;
+         dump_SVGA3dCmdBlitSurfaceToScreen(cmd);
+         body = (const uint8_t *)&cmd[1];
+         while(body + sizeof(SVGASignedRect) <= next) {
+            dump_SVGASignedRect((const SVGASignedRect *)body);
+            body += sizeof(SVGASignedRect);
+         }
+      }
+      break;
+   default:
+      _debug_printf("\t0x%08x\n", cmd_id);
+      break;
+   }
+
+   while(body + sizeof(uint32_t) <= next) {
+      _debug_printf("\t\t0x%08x\n", *(const uint32_t *)body);
+      body += sizeof(uint32_t);
+   }
+   while(body + sizeof(uint32_t) <= next)
+      _debug_printf("\t\t0x%02x\n", *body++);
+}
+
+
+void            
 svga_dump_commands(const void *commands, uint32_t size)
 {
    const uint8_t *next = commands;
@@ -1458,307 +1764,11 @@ svga_dump_commands(const void *commands, uint32_t size)
          const SVGA3dCmdHeader *header = (const SVGA3dCmdHeader *)next;
          const uint8_t *body = (const uint8_t *)&header[1];
 
-         next = (const uint8_t *)body + header->size;
+         next = body + header->size;
          if(next > last)
             break;
 
-         switch(cmd_id) {
-         case SVGA_3D_CMD_SURFACE_DEFINE:
-            _debug_printf("\tSVGA_3D_CMD_SURFACE_DEFINE\n");
-            {
-               const SVGA3dCmdDefineSurface *cmd = (const SVGA3dCmdDefineSurface *)body;
-               dump_SVGA3dCmdDefineSurface(cmd);
-               body = (const uint8_t *)&cmd[1];
-               while(body + sizeof(SVGA3dSize) <= next) {
-                  dump_SVGA3dSize((const SVGA3dSize *)body);
-                  body += sizeof(SVGA3dSize);
-               }
-            }
-            break;
-         case SVGA_3D_CMD_SURFACE_DESTROY:
-            _debug_printf("\tSVGA_3D_CMD_SURFACE_DESTROY\n");
-            {
-               const SVGA3dCmdDestroySurface *cmd = (const SVGA3dCmdDestroySurface *)body;
-               dump_SVGA3dCmdDestroySurface(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_SURFACE_COPY:
-            _debug_printf("\tSVGA_3D_CMD_SURFACE_COPY\n");
-            {
-               const SVGA3dCmdSurfaceCopy *cmd = (const SVGA3dCmdSurfaceCopy *)body;
-               dump_SVGA3dCmdSurfaceCopy(cmd);
-               body = (const uint8_t *)&cmd[1];
-               while(body + sizeof(SVGA3dCopyBox) <= next) {
-                  dump_SVGA3dCopyBox((const SVGA3dCopyBox *)body);
-                  body += sizeof(SVGA3dCopyBox);
-               }
-            }
-            break;
-         case SVGA_3D_CMD_SURFACE_STRETCHBLT:
-            _debug_printf("\tSVGA_3D_CMD_SURFACE_STRETCHBLT\n");
-            {
-               const SVGA3dCmdSurfaceStretchBlt *cmd = (const SVGA3dCmdSurfaceStretchBlt *)body;
-               dump_SVGA3dCmdSurfaceStretchBlt(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_SURFACE_DMA:
-            _debug_printf("\tSVGA_3D_CMD_SURFACE_DMA\n");
-            {
-               const SVGA3dCmdSurfaceDMA *cmd = (const SVGA3dCmdSurfaceDMA *)body;
-               dump_SVGA3dCmdSurfaceDMA(cmd);
-               body = (const uint8_t *)&cmd[1];
-               while(body + sizeof(SVGA3dCopyBox) <= next) {
-                  dump_SVGA3dCopyBox((const SVGA3dCopyBox *)body);
-                  body += sizeof(SVGA3dCopyBox);
-               }
-               while(body + sizeof(SVGA3dCmdSurfaceDMASuffix) <= next) {
-                  dump_SVGA3dCmdSurfaceDMASuffix((const SVGA3dCmdSurfaceDMASuffix *)body);
-                  body += sizeof(SVGA3dCmdSurfaceDMASuffix);
-               }
-            }
-            break;
-         case SVGA_3D_CMD_CONTEXT_DEFINE:
-            _debug_printf("\tSVGA_3D_CMD_CONTEXT_DEFINE\n");
-            {
-               const SVGA3dCmdDefineContext *cmd = (const SVGA3dCmdDefineContext *)body;
-               dump_SVGA3dCmdDefineContext(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_CONTEXT_DESTROY:
-            _debug_printf("\tSVGA_3D_CMD_CONTEXT_DESTROY\n");
-            {
-               const SVGA3dCmdDestroyContext *cmd = (const SVGA3dCmdDestroyContext *)body;
-               dump_SVGA3dCmdDestroyContext(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_SETTRANSFORM:
-            _debug_printf("\tSVGA_3D_CMD_SETTRANSFORM\n");
-            {
-               const SVGA3dCmdSetTransform *cmd = (const SVGA3dCmdSetTransform *)body;
-               dump_SVGA3dCmdSetTransform(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_SETZRANGE:
-            _debug_printf("\tSVGA_3D_CMD_SETZRANGE\n");
-            {
-               const SVGA3dCmdSetZRange *cmd = (const SVGA3dCmdSetZRange *)body;
-               dump_SVGA3dCmdSetZRange(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_SETRENDERSTATE:
-            _debug_printf("\tSVGA_3D_CMD_SETRENDERSTATE\n");
-            {
-               const SVGA3dCmdSetRenderState *cmd = (const SVGA3dCmdSetRenderState *)body;
-               dump_SVGA3dCmdSetRenderState(cmd);
-               body = (const uint8_t *)&cmd[1];
-               while(body + sizeof(SVGA3dRenderState) <= next) {
-                  dump_SVGA3dRenderState((const SVGA3dRenderState *)body);
-                  body += sizeof(SVGA3dRenderState);
-               }
-            }
-            break;
-         case SVGA_3D_CMD_SETRENDERTARGET:
-            _debug_printf("\tSVGA_3D_CMD_SETRENDERTARGET\n");
-            {
-               const SVGA3dCmdSetRenderTarget *cmd = (const SVGA3dCmdSetRenderTarget *)body;
-               dump_SVGA3dCmdSetRenderTarget(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_SETTEXTURESTATE:
-            _debug_printf("\tSVGA_3D_CMD_SETTEXTURESTATE\n");
-            {
-               const SVGA3dCmdSetTextureState *cmd = (const SVGA3dCmdSetTextureState *)body;
-               dump_SVGA3dCmdSetTextureState(cmd);
-               body = (const uint8_t *)&cmd[1];
-               while(body + sizeof(SVGA3dTextureState) <= next) {
-                  dump_SVGA3dTextureState((const SVGA3dTextureState *)body);
-                  body += sizeof(SVGA3dTextureState);
-               }
-            }
-            break;
-         case SVGA_3D_CMD_SETMATERIAL:
-            _debug_printf("\tSVGA_3D_CMD_SETMATERIAL\n");
-            {
-               const SVGA3dCmdSetMaterial *cmd = (const SVGA3dCmdSetMaterial *)body;
-               dump_SVGA3dCmdSetMaterial(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_SETLIGHTDATA:
-            _debug_printf("\tSVGA_3D_CMD_SETLIGHTDATA\n");
-            {
-               const SVGA3dCmdSetLightData *cmd = (const SVGA3dCmdSetLightData *)body;
-               dump_SVGA3dCmdSetLightData(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_SETLIGHTENABLED:
-            _debug_printf("\tSVGA_3D_CMD_SETLIGHTENABLED\n");
-            {
-               const SVGA3dCmdSetLightEnabled *cmd = (const SVGA3dCmdSetLightEnabled *)body;
-               dump_SVGA3dCmdSetLightEnabled(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_SETVIEWPORT:
-            _debug_printf("\tSVGA_3D_CMD_SETVIEWPORT\n");
-            {
-               const SVGA3dCmdSetViewport *cmd = (const SVGA3dCmdSetViewport *)body;
-               dump_SVGA3dCmdSetViewport(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_SETCLIPPLANE:
-            _debug_printf("\tSVGA_3D_CMD_SETCLIPPLANE\n");
-            {
-               const SVGA3dCmdSetClipPlane *cmd = (const SVGA3dCmdSetClipPlane *)body;
-               dump_SVGA3dCmdSetClipPlane(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_CLEAR:
-            _debug_printf("\tSVGA_3D_CMD_CLEAR\n");
-            {
-               const SVGA3dCmdClear *cmd = (const SVGA3dCmdClear *)body;
-               dump_SVGA3dCmdClear(cmd);
-               body = (const uint8_t *)&cmd[1];
-               while(body + sizeof(SVGA3dRect) <= next) {
-                  dump_SVGA3dRect((const SVGA3dRect *)body);
-                  body += sizeof(SVGA3dRect);
-               }
-            }
-            break;
-         case SVGA_3D_CMD_PRESENT:
-            _debug_printf("\tSVGA_3D_CMD_PRESENT\n");
-            {
-               const SVGA3dCmdPresent *cmd = (const SVGA3dCmdPresent *)body;
-               dump_SVGA3dCmdPresent(cmd);
-               body = (const uint8_t *)&cmd[1];
-               while(body + sizeof(SVGA3dCopyRect) <= next) {
-                  dump_SVGA3dCopyRect((const SVGA3dCopyRect *)body);
-                  body += sizeof(SVGA3dCopyRect);
-               }
-            }
-            break;
-         case SVGA_3D_CMD_SHADER_DEFINE:
-            _debug_printf("\tSVGA_3D_CMD_SHADER_DEFINE\n");
-            {
-               const SVGA3dCmdDefineShader *cmd = (const SVGA3dCmdDefineShader *)body;
-               dump_SVGA3dCmdDefineShader(cmd);
-               body = (const uint8_t *)&cmd[1];
-               svga_shader_dump((const uint32_t *)body, 
-                            (unsigned)(next - body)/sizeof(uint32_t),
-                            FALSE );
-               body = next;
-            }
-            break;
-         case SVGA_3D_CMD_SHADER_DESTROY:
-            _debug_printf("\tSVGA_3D_CMD_SHADER_DESTROY\n");
-            {
-               const SVGA3dCmdDestroyShader *cmd = (const SVGA3dCmdDestroyShader *)body;
-               dump_SVGA3dCmdDestroyShader(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_SET_SHADER:
-            _debug_printf("\tSVGA_3D_CMD_SET_SHADER\n");
-            {
-               const SVGA3dCmdSetShader *cmd = (const SVGA3dCmdSetShader *)body;
-               dump_SVGA3dCmdSetShader(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_SET_SHADER_CONST:
-            _debug_printf("\tSVGA_3D_CMD_SET_SHADER_CONST\n");
-            {
-               const SVGA3dCmdSetShaderConst *cmd = (const SVGA3dCmdSetShaderConst *)body;
-               dump_SVGA3dCmdSetShaderConst(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_DRAW_PRIMITIVES:
-            _debug_printf("\tSVGA_3D_CMD_DRAW_PRIMITIVES\n");
-            {
-               const SVGA3dCmdDrawPrimitives *cmd = (const SVGA3dCmdDrawPrimitives *)body;
-               unsigned i, j;
-               dump_SVGA3dCmdDrawPrimitives(cmd);
-               body = (const uint8_t *)&cmd[1];
-               for(i = 0; i < cmd->numVertexDecls; ++i) {
-                  dump_SVGA3dVertexDecl((const SVGA3dVertexDecl *)body);
-                  body += sizeof(SVGA3dVertexDecl);
-               }
-               for(j = 0; j < cmd->numRanges; ++j) {
-                  dump_SVGA3dPrimitiveRange((const SVGA3dPrimitiveRange *)body);
-                  body += sizeof(SVGA3dPrimitiveRange);
-               }
-               while(body + sizeof(SVGA3dVertexDivisor) <= next) {
-                  dump_SVGA3dVertexDivisor((const SVGA3dVertexDivisor *)body);
-                  body += sizeof(SVGA3dVertexDivisor);
-               }
-            }
-            break;
-         case SVGA_3D_CMD_SETSCISSORRECT:
-            _debug_printf("\tSVGA_3D_CMD_SETSCISSORRECT\n");
-            {
-               const SVGA3dCmdSetScissorRect *cmd = (const SVGA3dCmdSetScissorRect *)body;
-               dump_SVGA3dCmdSetScissorRect(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_BEGIN_QUERY:
-            _debug_printf("\tSVGA_3D_CMD_BEGIN_QUERY\n");
-            {
-               const SVGA3dCmdBeginQuery *cmd = (const SVGA3dCmdBeginQuery *)body;
-               dump_SVGA3dCmdBeginQuery(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_END_QUERY:
-            _debug_printf("\tSVGA_3D_CMD_END_QUERY\n");
-            {
-               const SVGA3dCmdEndQuery *cmd = (const SVGA3dCmdEndQuery *)body;
-               dump_SVGA3dCmdEndQuery(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_WAIT_FOR_QUERY:
-            _debug_printf("\tSVGA_3D_CMD_WAIT_FOR_QUERY\n");
-            {
-               const SVGA3dCmdWaitForQuery *cmd = (const SVGA3dCmdWaitForQuery *)body;
-               dump_SVGA3dCmdWaitForQuery(cmd);
-               body = (const uint8_t *)&cmd[1];
-            }
-            break;
-         case SVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN:
-            _debug_printf("\tSVGA_3D_CMD_BLIT_SURFACE_TO_SCREEN\n");
-            {
-               const SVGA3dCmdBlitSurfaceToScreen *cmd = (const SVGA3dCmdBlitSurfaceToScreen *)body;
-               dump_SVGA3dCmdBlitSurfaceToScreen(cmd);
-               body = (const uint8_t *)&cmd[1];
-               while(body + sizeof(SVGASignedRect) <= next) {
-                  dump_SVGASignedRect((const SVGASignedRect *)body);
-                  body += sizeof(SVGASignedRect);
-               }
-            }
-            break;
-         default:
-            _debug_printf("\t0x%08x\n", cmd_id);
-            break;
-         }
-
-         while(body + sizeof(uint32_t) <= next) {
-            _debug_printf("\t\t0x%08x\n", *(const uint32_t *)body);
-            body += sizeof(uint32_t);
-         }
-         while(body + sizeof(uint32_t) <= next)
-            _debug_printf("\t\t0x%02x\n", *body++);
+         svga_dump_command(cmd_id, body, header->size);
       }
       else if(cmd_id == SVGA_CMD_FENCE) {
          _debug_printf("\tSVGA_CMD_FENCE\n");
diff --git a/src/gallium/drivers/svga/svgadump/svga_dump.h b/src/gallium/drivers/svga/svgadump/svga_dump.h
index 69a8702087..ca0154361c 100644
--- a/src/gallium/drivers/svga/svgadump/svga_dump.h
+++ b/src/gallium/drivers/svga/svgadump/svga_dump.h
@@ -28,6 +28,9 @@
 
 #include "pipe/p_compiler.h"
 
+void            
+svga_dump_command(uint32_t cmd_id, const void *data, uint32_t size);
+
 void
 svga_dump_commands(const void *commands, uint32_t size);
 
diff --git a/src/gallium/drivers/svga/svgadump/svga_dump.py b/src/gallium/drivers/svga/svgadump/svga_dump.py
index a1ada29ef8..0bc0b3ae31 100755
--- a/src/gallium/drivers/svga/svgadump/svga_dump.py
+++ b/src/gallium/drivers/svga/svgadump/svga_dump.py
@@ -208,6 +208,56 @@ cmds = [
 def dump_cmds():
     print r'''
 void            
+svga_dump_command(uint32_t cmd_id, const void *data, uint32_t size)
+{
+   const uint8_t *body = (const uint8_t *)data;
+   const uint8_t *next = body + size;
+'''
+    print '   switch(cmd_id) {'
+    indexes = 'ijklmn'
+    for id, header, body, footer in cmds:
+        print '   case %s:' % id
+        print '      _debug_printf("\\t%s\\n");' % id
+        print '      {'
+        print '         const %s *cmd = (const %s *)body;' % (header, header)
+        if len(body):
+            print '         unsigned ' + ', '.join(indexes[:len(body)]) + ';'
+        print '         dump_%s(cmd);' % header
+        print '         body = (const uint8_t *)&cmd[1];'
+        for i in range(len(body)):
+            struct, count = body[i]
+            idx = indexes[i]
+            print '         for(%s = 0; %s < cmd->%s; ++%s) {' % (idx, idx, count, idx)
+            print '            dump_%s((const %s *)body);' % (struct, struct)
+            print '            body += sizeof(%s);' % struct
+            print '         }'
+        if footer is not None:
+            print '         while(body + sizeof(%s) <= next) {' % footer
+            print '            dump_%s((const %s *)body);' % (footer, footer)
+            print '            body += sizeof(%s);' % footer
+            print '         }'
+        if id == 'SVGA_3D_CMD_SHADER_DEFINE':
+            print '         svga_shader_dump((const uint32_t *)body,'
+            print '                          (unsigned)(next - body)/sizeof(uint32_t),'
+            print '                          FALSE);'
+            print '         body = next;'
+        print '      }'
+        print '      break;'
+    print '   default:'
+    print '      _debug_printf("\\t0x%08x\\n", cmd_id);'
+    print '      break;'
+    print '   }'
+    print r'''
+   while(body + sizeof(uint32_t) <= next) {
+      _debug_printf("\t\t0x%08x\n", *(const uint32_t *)body);
+      body += sizeof(uint32_t);
+   }
+   while(body + sizeof(uint32_t) <= next)
+      _debug_printf("\t\t0x%02x\n", *body++);
+}
+'''
+    print r'''
+void            
 svga_dump_commands(const void *commands, uint32_t size)
 {
    const uint8_t *next = commands;
@@ -222,51 +272,11 @@ svga_dump_commands(const void *commands, uint32_t size)
          const SVGA3dCmdHeader *header = (const SVGA3dCmdHeader *)next;
          const uint8_t *body = (const uint8_t *)&header[1];
 
-         next = (const uint8_t *)body + header->size;
+         next = body + header->size;
          if(next > last)
             break;
-'''
 
-    print '         switch(cmd_id) {'
-    indexes = 'ijklmn'
-    for id, header, body, footer in cmds:
-        print '         case %s:' % id
-        print '            _debug_printf("\\t%s\\n");' % id
-        print '            {'
-        print '               const %s *cmd = (const %s *)body;' % (header, header)
-        if len(body):
-            print '               unsigned ' + ', '.join(indexes[:len(body)]) + ';'
-        print '               dump_%s(cmd);' % header
-        print '               body = (const uint8_t *)&cmd[1];'
-        for i in range(len(body)):
-            struct, count = body[i]
-            idx = indexes[i]
-            print '               for(%s = 0; %s < cmd->%s; ++%s) {' % (idx, idx, count, idx)
-            print '                  dump_%s((const %s *)body);' % (struct, struct)
-            print '                  body += sizeof(%s);' % struct
-            print '               }'
-        if footer is not None:
-            print '               while(body + sizeof(%s) <= next) {' % footer
-            print '                  dump_%s((const %s *)body);' % (footer, footer)
-            print '                  body += sizeof(%s);' % footer
-            print '               }'
-        if id == 'SVGA_3D_CMD_SHADER_DEFINE':
-            print '               sh_svga_dump((const uint32_t *)body, (unsigned)(next - body)/sizeof(uint32_t));'
-            print '               body = next;'
-        print '            }'
-        print '            break;'
-    print '         default:'
-    print '            _debug_printf("\\t0x%08x\\n", cmd_id);'
-    print '            break;'
-    print '         }'
-            
-    print r'''
-         while(body + sizeof(uint32_t) <= next) {
-            _debug_printf("\t\t0x%08x\n", *(const uint32_t *)body);
-            body += sizeof(uint32_t);
-         }
-         while(body + sizeof(uint32_t) <= next)
-            _debug_printf("\t\t0x%02x\n", *body++);
+         svga_dump_command(cmd_id, body, header->size);
       }
       else if(cmd_id == SVGA_CMD_FENCE) {
          _debug_printf("\tSVGA_CMD_FENCE\n");
diff --git a/src/gallium/drivers/trace/README b/src/gallium/drivers/trace/README
index 1000c31e49..203c3851bc 100644
--- a/src/gallium/drivers/trace/README
+++ b/src/gallium/drivers/trace/README
@@ -24,11 +24,10 @@ ensure the right libGL.so is being picked by doing
 
  ldd progs/trivial/tri 
 
-== Traceing ==
+== Tracing ==
 
-For traceing then do
+For tracing then do
 
- export XMESA_TRACE=y
  GALLIUM_TRACE=tri.trace progs/trivial/tri
 
 which should create a tri.trace file, which is an XML file. You can view copying 
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index 80f4874b78..ad47a56fba 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -52,6 +52,7 @@ trace_buffer_unwrap(struct trace_context *tr_ctx,
 
    assert(tr_buf->buffer);
    assert(tr_buf->buffer->screen == tr_scr->screen);
+   (void) tr_scr;
    return tr_buf->buffer;
 }
 
@@ -90,6 +91,7 @@ trace_surface_unwrap(struct trace_context *tr_ctx,
 
    assert(tr_surf->surface);
    assert(tr_surf->surface->texture->screen == tr_scr->screen);
+   (void) tr_scr;
    return tr_surf->surface;
 }
 
diff --git a/src/gallium/drivers/trace/tr_dump.c b/src/gallium/drivers/trace/tr_dump.c
index 7e2ccbcfdc..0f45e211a3 100644
--- a/src/gallium/drivers/trace/tr_dump.c
+++ b/src/gallium/drivers/trace/tr_dump.c
@@ -40,7 +40,7 @@
 
 #include "pipe/p_config.h"
 
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE)
 #include <stdlib.h>
 #endif
 
@@ -258,7 +258,7 @@ boolean trace_dump_trace_begin()
       trace_dump_writes("<?xml-stylesheet type='text/xsl' href='trace.xsl'?>\n");
       trace_dump_writes("<trace version='0.1'>\n");
 
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_APPLE)
       /* Linux applications rarely cleanup GL / Gallium resources so catch
        * application exit here */
       atexit(trace_dump_trace_close);
diff --git a/src/gallium/drivers/trace/tr_rbug.c b/src/gallium/drivers/trace/tr_rbug.c
index c31b1d8698..0546aad9b5 100644
--- a/src/gallium/drivers/trace/tr_rbug.c
+++ b/src/gallium/drivers/trace/tr_rbug.c
@@ -45,7 +45,7 @@
 
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 #  define sleep Sleep
-#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD)
+#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_APPLE)
 void usleep(int);
 #  define sleep usleep
 #else
@@ -180,7 +180,7 @@ static int
 trace_rbug_texture_info(struct trace_rbug *tr_rbug, struct rbug_header *header, uint32_t serial)
 {
    struct trace_screen *tr_scr = tr_rbug->tr_scr;
-   struct trace_texture *tr_tex;
+   struct trace_texture *tr_tex = NULL;
    struct rbug_proto_texture_info *gpti = (struct rbug_proto_texture_info *)header;
    struct tr_list *ptr;
    struct pipe_texture *t;
@@ -223,7 +223,7 @@ trace_rbug_texture_read(struct trace_rbug *tr_rbug, struct rbug_header *header,
    struct rbug_proto_texture_read *gptr = (struct rbug_proto_texture_read *)header;
 
    struct trace_screen *tr_scr = tr_rbug->tr_scr;
-   struct trace_texture *tr_tex;
+   struct trace_texture *tr_tex = NULL;
    struct tr_list *ptr;
 
    struct pipe_screen *screen = tr_scr->screen;
diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c
index ac20a47af1..117503aaff 100644
--- a/src/gallium/drivers/trace/tr_screen.c
+++ b/src/gallium/drivers/trace/tr_screen.c
@@ -426,7 +426,7 @@ trace_screen_transfer_unmap(struct pipe_screen *_screen,
    struct pipe_transfer *transfer = tr_trans->transfer;
 
    if(tr_trans->map) {
-      size_t size = util_format_get_nblocksy(transfer->texture->format, transfer->width) * transfer->stride;
+      size_t size = util_format_get_nblocksy(transfer->texture->format, transfer->height) * transfer->stride;
 
       trace_dump_call_begin("pipe_screen", "transfer_write");
 
diff --git a/src/gallium/drivers/trace/tr_state.h b/src/gallium/drivers/trace/tr_state.h
index 1c16042ee5..e2f981d051 100644
--- a/src/gallium/drivers/trace/tr_state.h
+++ b/src/gallium/drivers/trace/tr_state.h
@@ -32,7 +32,7 @@ struct tgsi_token;
 enum trace_shader_type {
    TRACE_SHADER_FRAGMENT = 0,
    TRACE_SHADER_VERTEX   = 1,
-   TRACE_SHADER_GEOMETRY = 2,
+   TRACE_SHADER_GEOMETRY = 2
 };
 
 struct trace_shader
author	Michal Krol <michal@vmware.com>	2010-01-05 11:04:50 +0100
committer	Michal Krol <michal@vmware.com>	2010-01-05 11:04:50 +0100
commit	9b21b3c52a8a7d58d08151d1a6bf25c472dec213 (patch)
tree	d9083b6af4e2e9b70a7fa6cd31bac45a36e0f6b6 /src/gallium/drivers
parent	543b9566bdaa48fea2df1866fa1310c1cdbcde27 (diff)
parent	1f9aa38f4e2be47229d92be2c1189c2b8d9c7133 (diff)